Skip to content

Commit c71cb2d

Browse files
authored
[fix] Implementing match_phrase_edge without index query method (#41658) (#43397)
pick #41658
1 parent beb0192 commit c71cb2d

File tree

4 files changed

+189
-4
lines changed

4 files changed

+189
-4
lines changed

be/src/vec/functions/match.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,72 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s
506506
return Status::OK();
507507
}
508508

509+
/// Evaluates MATCH_PHRASE_EDGE for each input row by tokenizing the row's value and
/// comparing the tokens against the tokenized query string.
///
/// A row matches when some run of consecutive data tokens lines up with the query tokens:
///   - one query token: it is a substring of a data token;
///   - several query tokens: the first is a suffix of its data token, the last is a prefix
///     of its data token, and every token in between is equal to its data token.
///
/// `result[i]` is set to true for each matching row; rows that do not match are left as the
/// caller initialised them. If the query produces no tokens, no row is marked and OK is
/// returned.
///
/// @return the error from `check(context, name)` if it fails, otherwise Status::OK().
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const std::vector<std::string> query_tokens =
            analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    const size_t query_size = query_tokens.size();

    // Returns true when the query tokens match the data tokens starting at `start`.
    // The caller guarantees that `start + query_size <= data_tokens.size()`.
    const auto window_matches = [&query_tokens, query_size](const auto& data_tokens,
                                                            size_t start) {
        if (query_size == 1) {
            // A lone token may sit anywhere inside a data token.
            return data_tokens[start].find(query_tokens.front()) != std::string::npos;
        }
        for (size_t k = 0; k < query_size; ++k) {
            const std::string& data_token = data_tokens[start + k];
            const std::string& query_token = query_tokens[k];
            bool token_ok = false;
            if (k == 0) {
                // Leading edge: the phrase may begin in the middle of a data token.
                token_ok = data_token.ends_with(query_token);
            } else if (k == query_size - 1) {
                // Trailing edge: the phrase may end in the middle of a data token.
                token_ok = data_token.starts_with(query_token);
            } else {
                token_ok = (data_token == query_token);
            }
            if (!token_ok) {
                return false;
            }
        }
        return true;
    };

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        // Tokenize every row, including rows that are too short to match, so the array
        // offset handed to analyse_data_token is advanced exactly once per row.
        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        // Compare sizes before subtracting so the unsigned difference cannot wrap around.
        const size_t data_size = data_tokens.size();
        if (data_size < query_size) {
            continue;
        }

        const size_t last_start = data_size - query_size;
        for (size_t start = 0; start <= last_start; ++start) {
            if (window_matches(data_tokens, start)) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
574+
509575
void register_function_match(SimpleFunctionFactory& factory) {
510576
factory.register_function<FunctionMatchAny>();
511577
factory.register_function<FunctionMatchAll>();

be/src/vec/functions/match.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,10 +180,7 @@ class FunctionMatchPhraseEdge : public FunctionMatchBase {
180180
const std::string& match_query_str, size_t input_rows_count,
181181
const ColumnString* string_col, InvertedIndexCtx* inverted_index_ctx,
182182
const ColumnArray::Offsets64* array_offsets,
183-
ColumnUInt8::Container& result) const override {
184-
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
185-
"FunctionMatchPhraseEdge not support execute_match");
186-
}
183+
ColumnUInt8::Container& result) const override;
187184
};
188185

189186
} // namespace doris::vectorized

regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,27 @@
4141
-- !sql --
4242
6
4343

44+
-- !sql --
45+
0
46+
47+
-- !sql --
48+
874
49+
50+
-- !sql --
51+
150
52+
53+
-- !sql --
54+
20
55+
56+
-- !sql --
57+
0
58+
59+
-- !sql --
60+
874
61+
62+
-- !sql --
63+
150
64+
65+
-- !sql --
66+
20
67+

regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,102 @@ suite("test_index_match_phrase_edge", "nonConcurrent"){
7979
} finally {
8080
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
8181
}
82+
83+
def indexTbName2 = "test_index_match_phrase_edge2"
84+
def indexTbName3 = "test_index_match_phrase_edge3"
85+
86+
sql "DROP TABLE IF EXISTS ${indexTbName2}"
87+
sql "DROP TABLE IF EXISTS ${indexTbName3}"
88+
89+
sql """
90+
CREATE TABLE ${indexTbName2} (
91+
`@timestamp` int(11) NULL COMMENT "",
92+
`clientip` varchar(20) NULL COMMENT "",
93+
`request` text NULL COMMENT "",
94+
`status` int(11) NULL COMMENT "",
95+
`size` int(11) NULL COMMENT "",
96+
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
97+
) ENGINE=OLAP
98+
DUPLICATE KEY(`@timestamp`)
99+
COMMENT "OLAP"
100+
DISTRIBUTED BY RANDOM BUCKETS 1
101+
PROPERTIES (
102+
"replication_allocation" = "tag.location.default: 1"
103+
);
104+
"""
105+
106+
sql """
107+
CREATE TABLE ${indexTbName3} (
108+
`@timestamp` int(11) NULL COMMENT "",
109+
`clientip` varchar(20) NULL COMMENT "",
110+
`request` text NULL COMMENT "",
111+
`status` int(11) NULL COMMENT "",
112+
`size` int(11) NULL COMMENT ""
113+
) ENGINE=OLAP
114+
DUPLICATE KEY(`@timestamp`)
115+
COMMENT "OLAP"
116+
DISTRIBUTED BY RANDOM BUCKETS 1
117+
PROPERTIES (
118+
"replication_allocation" = "tag.location.default: 1"
119+
);
120+
"""
121+
122+
// Stream-loads `file_name` into `table_name` and verifies the load result.
//
//   label                 - label prefix; a random UUID is appended to it
//   read_flag             - value of the 'read_json_by_line' header
//   format_flag           - value of the 'format' header
//   ignore_failure        - when true and expected_succ_rows < 0, the result is not checked
//   expected_succ_rows    - when >= 0, 'max_filter_ratio' is set to '1' and the number of
//                           loaded rows must equal this value
//   load_to_single_tablet - accepted but not used by this closure
def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
                    expected_succ_rows = -1, load_to_single_tablet = 'true' ->

    // load the json data
    streamLoad {
        table "${table_name}"

        // set http request header params
        set 'label', label + "_" + UUID.randomUUID().toString()
        set 'read_json_by_line', read_flag
        set 'format', format_flag
        file file_name // import json file
        time 10000 // limit inflight 10s
        if (expected_succ_rows >= 0) {
            set 'max_filter_ratio', '1'
        }

        // Declaring a check callback replaces the default check,
        // so every condition must be verified here.
        check { result, exception, startTime, endTime ->
            if (ignore_failure && expected_succ_rows < 0) { return }
            if (exception != null) {
                throw exception
            }
            log.info("Stream load result: ${result}".toString())
            def json = parseJson(result)
            assertEquals("success", json.Status.toLowerCase())
            if (expected_succ_rows >= 0) {
                // Expected value first, consistent with the status assertion above.
                assertEquals(expected_succ_rows, json.NumberLoadedRows)
            } else {
                assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows)
                assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
            }
        }
    }
}
158+
159+
try {
160+
load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 'documents-1000.json')
161+
load_httplogs_data.call(indexTbName3, indexTbName3, 'true', 'json', 'documents-1000.json')
162+
163+
sql "sync"
164+
sql """ set enable_common_expr_pushdown = true; """
165+
166+
GetDebugPoint().enableDebugPointForAllBEs("VMatchPredicate.execute")
167+
qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge ''; """
168+
qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'age'; """
169+
qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'es/na'; """
170+
qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'ets/images/ti'; """
171+
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
172+
173+
qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge ''; """
174+
qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'age'; """
175+
qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'es/na'; """
176+
qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'ets/images/ti'; """
177+
} finally {
178+
GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
179+
}
82180
}

0 commit comments

Comments
 (0)