Skip to content

Commit 993d8c7

Browse files
Storages: support building vector index for ColumnFileTiny (Part 3) (#9547)
ref #9032 Signed-off-by: Lloyd-Pottiger <[email protected]>
1 parent c10340d commit 993d8c7

22 files changed

+1090
-102
lines changed

dbms/src/Storages/DeltaMerge/BitmapFilter/BitmapFilterView.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ class BitmapFilterView
4646
return BitmapFilterView(std::make_shared<BitmapFilter>(size, default_value), 0, size);
4747
}
4848

49+
// Creates a view over a sub-range of this view. The new view shares the same
// underlying `filter`; no data is copied.
//   offset: start of the sub-range, relative to the beginning of this view.
//   size:   number of rows covered by the sub-view.
// The range [offset, offset + size) must lie inside this view (`filter_size`),
// otherwise RUNTIME_CHECK fails.
BitmapFilterView createSubView(UInt32 offset, UInt32 size) const
{
    // Do not compute `offset + size` directly: the sum is evaluated in UInt32 and
    // may wrap around, which would let an out-of-range request pass the check.
    RUNTIME_CHECK(offset <= filter_size && size <= filter_size - offset, offset, size, filter_size);
    return BitmapFilterView(filter, filter_offset + offset, size);
}
54+
4955
// Caller should ensure n in [0, size).
5056
// Returns the value stored for row `n` of this view, which is read from the
// underlying filter at position `filter_offset + n`. No range check is done here.
inline bool get(UInt32 n) const { return filter->get(filter_offset + n); }
5157

dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetInputStream.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,9 @@ size_t ColumnFileSetInputStream::skipNextBlock()
3939
return 0;
4040
}
4141

42-
Block ColumnFileSetInputStream::read()
42+
Block ColumnFileSetInputStream::read(FilterPtr & res_filter, bool)
4343
{
44+
res_filter = nullptr;
4445
while (cur_column_file_reader != reader.column_file_readers.end())
4546
{
4647
if (*cur_column_file_reader == nullptr)

dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetInputStream.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,13 @@ class ColumnFileSetInputStream : public SkippableBlockInputStream
4747

4848
size_t skipNextBlock() override;
4949

50-
Block read() override;
50+
// Convenience overload for callers that do not care about a filter: forwards to
// read(FilterPtr &, bool) with return_filter == false and discards the filter output.
Block read() override
{
    FilterPtr unused_filter{nullptr};
    return read(unused_filter, /* return_filter= */ false);
}
55+
56+
Block read(FilterPtr & res_filter, bool return_filter) override;
5157

5258
Block readWithFilter(const IColumn::Filter & filter) override;
5359
};

dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ namespace DB::DM
2525
class ColumnFileSetReader
2626
{
2727
friend class ColumnFileSetInputStream;
28+
friend class ColumnFileSetWithVectorIndexInputStream;
2829

2930
private:
3031
const DMContext & context;
Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// Copyright 2024 PingCAP, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include <Columns/ColumnsCommon.h>
16+
#include <Storages/DeltaMerge/ColumnFile/ColumnFileSetWithVectorIndexInputStream.h>
17+
#include <Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h>
18+
#include <Storages/DeltaMerge/Filter/WithANNQueryInfo.h>
19+
20+
21+
namespace DB::DM
22+
{
23+
24+
// Builds the input stream used to read the column files of `delta_snap`.
//
// A ColumnFileSetWithVectorIndexInputStream is returned only when all of the
// following hold; otherwise a plain ColumnFileSetInputStream is returned:
//   - both `rs_operator` and `bitmap_filter` are non-null,
//   - `rs_operator` is a WithANNQueryInfo carrying a non-null `ann_query_info`,
//   - `col_defs` contains the column referenced by `ann_query_info->column_id()`.
//
// `offset` is the position inside `bitmap_filter` at which the view handed to
// the vector index stream starts; the view covers `delta_snap->getRows()` rows.
ColumnFileSetInputStreamPtr ColumnFileSetWithVectorIndexInputStream::tryBuild(
    const DMContext & context,
    const ColumnFileSetSnapshotPtr & delta_snap,
    const ColumnDefinesPtr & col_defs,
    const RowKeyRange & segment_range_,
    const IColumnFileDataProviderPtr & data_provider,
    const RSOperatorPtr & rs_operator,
    const BitmapFilterPtr & bitmap_filter,
    size_t offset,
    ReadTag read_tag_)
{
    // The plain stream, used whenever reading via vector index is not applicable.
    const auto make_plain_stream = [&]() {
        return std::make_shared<ColumnFileSetInputStream>(context, delta_snap, col_defs, segment_range_, read_tag_);
    };

    if (!rs_operator || !bitmap_filter)
        return make_plain_stream();

    const auto ann_filter = std::dynamic_pointer_cast<WithANNQueryInfo>(rs_operator);
    if (!ann_filter || !ann_filter->ann_query_info)
        return make_plain_stream();

    auto ann_query_info = ann_filter->ann_query_info;
    const auto vec_column_id = ann_query_info->column_id();

    // Fast check: ANNQueryInfo is available in the whole read path, but the
    // vector column may not be among the columns being read right now.
    const bool reads_vec_column = std::any_of(col_defs->cbegin(), col_defs->cend(), [&](const auto & cd) {
        return cd.id == vec_column_id;
    });
    if (!reads_vec_column)
        return make_plain_stream();

    // Split the requested columns into the vector column and everything else.
    std::optional<ColumnDefine> vec_cd;
    auto rest_columns = std::make_shared<ColumnDefines>();
    rest_columns->reserve(col_defs->size() - 1);
    for (const auto & cd : *col_defs)
    {
        if (cd.id != vec_column_id)
        {
            rest_columns->emplace_back(cd);
            continue;
        }
        vec_cd.emplace(cd);
    }

    // No vector index column is specified, just use the normal logic.
    if (!vec_cd)
        return make_plain_stream();

    // All checks passed. Read via vector index.
    return std::make_shared<ColumnFileSetWithVectorIndexInputStream>(
        context,
        delta_snap,
        col_defs,
        segment_range_,
        data_provider,
        ann_query_info,
        BitmapFilterView(bitmap_filter, offset, delta_snap->getRows()),
        std::move(*vec_cd),
        rest_columns,
        read_tag_);
}
91+
92+
// Reads the next block.
//   res_filter:    output. When `return_filter` is true it is filled by readImpl();
//                  a null value means every row of the returned block is valid.
//                  When `return_filter` is false it is always set to nullptr.
//   return_filter: true  -> the block is returned unfiltered together with the filter;
//                  false -> the filter is applied here and only passing rows are returned.
Block ColumnFileSetWithVectorIndexInputStream::read(FilterPtr & res_filter, bool return_filter)
{
    if (return_filter)
        return readImpl(res_filter);

    // If return_filter == false, we must filter by ourselves. The caller gets no
    // filter back, so reset the out-parameter instead of leaving it untouched
    // (ColumnFileSetInputStream::read resets it as well).
    res_filter = nullptr;

    FilterPtr filter = nullptr;
    auto res = readImpl(filter);

    // filter == nullptr means all rows are valid and no need to filter.
    if (filter == nullptr)
        return res;

    const auto passed_count = countBytesInFilter(*filter);
    for (auto & col : res)
        col.column = col.column->filter(*filter, passed_count);
    return res;
}
110+
111+
// Reads the next block of the non-vector columns (`rest_col_defs`) from the
// current column file, through a reader freshly created from the current one.
// NOTE(review): the new reader is always created with ReadTag::Query, not with
// the tag passed to tryBuild() — confirm this is intended.
Block ColumnFileSetWithVectorIndexInputStream::readOtherColumns()
{
    const auto rest_columns_reader = (*cur_column_file_reader)->createNewReader(rest_col_defs, ReadTag::Query);
    return rest_columns_reader->readNextBlock();
}
117+
118+
void ColumnFileSetWithVectorIndexInputStream::toNextFile(size_t current_file_index, size_t current_file_rows)
119+
{
120+
(*cur_column_file_reader).reset();
121+
++cur_column_file_reader;
122+
read_rows += current_file_rows;
123+
tiny_readers[current_file_index].reset();
124+
}
125+
126+
// Reads the next block from the column file set.
//
// For a column file that has a vector index reader (see load()), the whole file
// is returned as one block: the vector column is produced by the index reader,
// the remaining columns by readOtherColumns(), and `res_filter` marks which rows
// of the block were selected. Files whose selection is empty are skipped.
// For any other column file, blocks are read through the ordinary reader and
// `res_filter` is set to nullptr.
//
//   res_filter: output. nullptr means every row of the returned block is valid;
//               otherwise it points to the member `filter`, which is overwritten
//               by the next call that needs a filter.
// Returns an empty Block when all column files have been consumed.
Block ColumnFileSetWithVectorIndexInputStream::readImpl(FilterPtr & res_filter)
{
    load();

    // Orders a selected row against a row offset; shared by both lower_bound calls.
    const auto key_less_than = [](const auto & row, UInt32 offset) {
        return row.key < offset;
    };

    while (cur_column_file_reader != reader.column_file_readers.end())
    {
        // Skip ColumnFileDeleteRange
        if (*cur_column_file_reader == nullptr)
        {
            ++cur_column_file_reader;
            continue;
        }
        auto current_file_index = std::distance(reader.column_file_readers.begin(), cur_column_file_reader);
        // If has index, we can read the column by vector index.
        if (tiny_readers[current_file_index] != nullptr)
        {
            const auto file_rows = column_files[current_file_index]->getRows();
            // `selected_rows` is sorted by key (see load()), so the rows that belong to this
            // file, i.e. keys in [read_rows, read_rows + file_rows), form a contiguous range.
            auto selected_row_begin
                = std::lower_bound(selected_rows.cbegin(), selected_rows.cend(), read_rows, key_less_than);
            auto selected_row_end
                = std::lower_bound(selected_row_begin, selected_rows.cend(), read_rows + file_rows, key_less_than);
            // Named differently from the member `selected_rows` to avoid shadowing it.
            const size_t selected_row_count = std::distance(selected_row_begin, selected_row_end);
            // If all rows are filtered out, skip this file.
            if (selected_row_count == 0)
            {
                toNextFile(current_file_index, file_rows);
                continue;
            }

            // read vector type column by vector index
            auto tiny_reader = tiny_readers[current_file_index];
            auto vec_column = vec_cd.type->createColumn();
            const std::span file_selected_rows{selected_row_begin, selected_row_end};
            tiny_reader->read(vec_column, file_selected_rows, /* rowid_start_offset= */ read_rows, file_rows);
            // The column is expected to cover the whole file, not only the selected rows.
            assert(vec_column->size() == file_rows);

            Block block;
            if (!rest_col_defs->empty())
            {
                block = readOtherColumns();
                assert(block.rows() == vec_column->size());
            }

            // Put the vector column back at the position it has in the header.
            auto index = header.getPositionByName(vec_cd.name);
            block.insert(index, ColumnWithTypeAndName(std::move(vec_column), vec_cd.type, vec_cd.name));

            // Fill res_filter
            if (selected_row_count == file_rows)
            {
                res_filter = nullptr;
            }
            else
            {
                filter.clear();
                filter.resize_fill(file_rows, 0);
                // Keys are offsets within the whole file set; convert to offsets within this file.
                for (const auto & [rowid, _] : file_selected_rows)
                    filter[rowid - read_rows] = 1;
                res_filter = &filter;
            }

            // All rows in this ColumnFileTiny have been read.
            block.setStartOffset(read_rows);
            toNextFile(current_file_index, file_rows);
            return block;
        }

        auto block = (*cur_column_file_reader)->readNextBlock();
        if (block)
        {
            block.setStartOffset(read_rows);
            read_rows += block.rows();
            res_filter = nullptr;
            return block;
        }

        // The current file is exhausted, move on to the next one.
        (*cur_column_file_reader).reset();
        ++cur_column_file_reader;
    }

    // End of stream: do not leave a filter from a previous block in the out-parameter.
    res_filter = nullptr;
    return {};
}
213+
214+
void ColumnFileSetWithVectorIndexInputStream::load()
215+
{
216+
if (loaded)
217+
return;
218+
219+
tiny_readers.reserve(column_files.size());
220+
UInt32 precedes_rows = 0;
221+
for (const auto & column_file : column_files)
222+
{
223+
if (auto * tiny_file = column_file->tryToTinyFile();
224+
tiny_file && tiny_file->hasIndex(ann_query_info->index_id()))
225+
{
226+
auto tiny_reader = std::make_shared<ColumnFileTinyVectorIndexReader>(
227+
*tiny_file,
228+
data_provider,
229+
ann_query_info,
230+
valid_rows.createSubView(precedes_rows, tiny_file->getRows()),
231+
vec_cd,
232+
vec_index_cache);
233+
auto sr = tiny_reader->load();
234+
for (auto & row : sr)
235+
row.key += precedes_rows;
236+
selected_rows.insert(selected_rows.end(), sr.begin(), sr.end());
237+
tiny_readers.push_back(tiny_reader);
238+
// avoid virutal function call
239+
precedes_rows += tiny_file->getRows();
240+
}
241+
else
242+
{
243+
tiny_readers.push_back(nullptr);
244+
precedes_rows += column_file->getRows();
245+
}
246+
}
247+
// Keep the top k minimum distances rows.
248+
auto select_size = selected_rows.size() > ann_query_info->top_k() ? ann_query_info->top_k() : selected_rows.size();
249+
auto top_k_end = selected_rows.begin() + select_size;
250+
std::nth_element(selected_rows.begin(), top_k_end, selected_rows.end(), [](const auto & lhs, const auto & rhs) {
251+
return lhs.distance < rhs.distance;
252+
});
253+
selected_rows.resize(select_size);
254+
// Sort by key again.
255+
std::sort(selected_rows.begin(), selected_rows.end(), [](const auto & lhs, const auto & rhs) {
256+
return lhs.key < rhs.key;
257+
});
258+
259+
loaded = true;
260+
}
261+
262+
} // namespace DB::DM

0 commit comments

Comments
 (0)