Skip to content

Commit 44cee58

Browse files
committed
Improve like perf for utf8 ci collations
1 parent 7339a99 commit 44cee58

File tree

5 files changed

+358
-74
lines changed

5 files changed

+358
-74
lines changed

dbms/src/Functions/CollationStringSearchOptimized.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ struct BinStrPattern
220220
}
221221
}
222222
return true;
223-
};
223+
}
224224

225225
// match from end exactly
226226
// - return true if meet %
@@ -265,7 +265,7 @@ struct BinStrPattern
265265
}
266266
}
267267
return true;
268-
};
268+
}
269269

270270
// search by pattern `...%..%`
271271
// - return true if meet %
@@ -336,7 +336,7 @@ struct BinStrPattern
336336
}
337337
}
338338
}
339-
};
339+
}
340340

341341
ALWAYS_INLINE inline bool match(std::string_view src) const
342342
{
@@ -432,6 +432,7 @@ ALWAYS_INLINE inline bool StringPatternMatchImpl(
432432
{
433433
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_BIN:
434434
case TiDB::ITiDBCollator::CollatorType::UTF8_BIN:
435+
case TiDB::ITiDBCollator::CollatorType::UTF8MB4_0900_BIN:
435436
{
436437
BinStringPatternMatch<Result, revert, true>(a_data, a_offsets, pattern_str, escape_char, c);
437438
use_optimized_path = true;
@@ -451,4 +452,4 @@ ALWAYS_INLINE inline bool StringPatternMatchImpl(
451452
}
452453
return use_optimized_path;
453454
}
454-
} // namespace DB
455+
} // namespace DB

dbms/src/Functions/FunctionsStringSearch.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,10 @@ struct MatchImpl
166166
{
167167
auto matcher = collator->pattern();
168168
matcher->compile(orig_pattern, escape_char);
169+
if (collator->isCI())
170+
{
171+
matcher->tryCompileAsciiCi(orig_pattern, escape_char);
172+
}
169173
LoopOneColumn(data, offsets, offsets.size(), [&](const std::string_view & view, size_t i) {
170174
res[i] = revert ^ matcher->match(view.data(), view.size());
171175
});

dbms/src/TiDB/Collation/Collator.cpp

Lines changed: 240 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,16 @@ extern const int LOGICAL_ERROR;
2626

2727
namespace TiDB
2828
{
29+
30+
const std::array<char, 128> weight_ascii_ci
31+
= {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12,
32+
0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25,
33+
0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
34+
0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
35+
0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E,
36+
0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
37+
0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F};
38+
2939
TiDBCollators dummy_collators;
3040
std::vector<std::string> dummy_sort_key_contaners;
3141
std::string dummy_sort_key_contaner;
@@ -72,12 +82,13 @@ inline Rune decodeUtf8Char(const char * s, size_t & offset)
7282
template <typename Collator>
7383
void Pattern<Collator>::compile(const std::string & pattern, char escape)
7484
{
75-
chars.clear();
85+
pattern_weights.clear();
7686
match_types.clear();
7787

78-
chars.reserve(pattern.length() * sizeof(typename Collator::CharType));
88+
pattern_weights.reserve(pattern.length() * sizeof(typename Collator::CharType));
7989
match_types.reserve(pattern.length() * sizeof(typename Pattern::MatchType));
8090

91+
auto last_tp = MatchType::Match;
8192
size_t offset = 0;
8293
while (offset < pattern.length())
8394
{
@@ -102,63 +113,266 @@ void Pattern<Collator>::compile(const std::string & pattern, char escape)
102113
}
103114
else if (c == '%')
104115
{
116+
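// Collapse a run of consecutive '%' into a single '%'. NOTE(review): tryCompileAsciiCi collapses by inspecting the raw previous byte rather than the previous match type, so the two may disagree after an escaped '%' - confirm.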
117+
if (last_tp == MatchType::Any)
118+
{
119+
continue;
120+
}
105121
tp = MatchType::Any;
106122
}
107123
else
108124
{
109125
tp = MatchType::Match;
110126
}
111-
chars.push_back(c);
127+
pattern_weights.push_back(c);
112128
match_types.push_back(tp);
129+
last_tp = tp;
130+
}
131+
}
132+
133+
template <typename Collator>
134+
void Pattern<Collator>::tryCompileAsciiCi(const std::string & pattern, char escape)
135+
{
136+
is_ascii_ci_pattern = false;
137+
// Can't handle non-ASCII escape
138+
if (escape < 0)
139+
{
140+
return;
141+
}
142+
ascii_ci_pattern.clear();
143+
ascii_ci_pattern.reserve(pattern.length());
144+
145+
for (size_t i = 0; i < pattern.length(); i++)
146+
{
147+
auto c = pattern[i];
148+
// Can't handle non-ASCII character
149+
if (c < 0)
150+
{
151+
return;
152+
}
153+
154+
if (c == escape)
155+
{
156+
if (i < pattern.length() - 1)
157+
{
158+
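// Use the escaped character literally. NOTE(review): it is not re-checked for being ASCII, so a non-ASCII byte here can index weight_ascii_ci out of range.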
159+
c = pattern[++i];
160+
}
161+
else
162+
{
163+
// Trailing escape with nothing after it: match the escape character itself.
164+
}
165+
}
166+
else if (c == '%')
167+
{
168+
if (i > 0 && pattern[i - 1] == '%')
169+
{
170+
continue;
171+
}
172+
}
173+
174+
ascii_ci_pattern.push_back(weight_ascii_ci[c]);
113175
}
176+
is_ascii_ci_pattern = true;
114177
}
115178

116179
template <typename Collator>
117180
bool Pattern<Collator>::match(const char * s, size_t length) const
118181
{
119-
size_t s_offset = 0, next_s_offset = 0, tmp_s_offset = 0;
120-
size_t p_idx = 0, next_p_idx = 0;
121-
while (p_idx < chars.size() || s_offset < length)
182+
if (is_ascii_ci_pattern)
122183
{
123-
if (p_idx < chars.size())
184+
if (auto ret = tryMatchAsciiCi(s, length); ret >= 0) {
185+
return ret;
186+
}
187+
// ret == -1 means the string contains non-ASCII characters; fall through to the generic matcher below.
188+
}
189+
190+
size_t s_offset = 0, backtrack_s_offset = 0;
191+
size_t p_idx = 0, p_idx_after_any = 0;
192+
while (true)
193+
{
194+
if (p_idx < pattern_weights.size())
124195
{
125-
switch (match_types[p_idx])
196+
if (s_offset >= length)
126197
{
127-
case Match:
128-
if (s_offset < length
129-
&& Collator::regexEq(Collator::decodeChar(s, tmp_s_offset = s_offset), chars[p_idx]))
198+
// The string is exhausted while pattern elements remain; this is a match only if
199+
// the single remaining element is a trailing '%' (e.g. pattern 'a%').
200+
return static_cast<bool>(match_types[p_idx] == Any && p_idx == pattern_weights.size() - 1);
201+
}
202+
else
203+
{
204+
if (match_types[p_idx] == Match)
130205
{
131-
p_idx++;
132-
s_offset = tmp_s_offset;
133-
continue;
206+
if (Collator::regexEq(Collator::decodeChar(s, s_offset), pattern_weights[p_idx]))
207+
{
208+
p_idx++;
209+
// Matched; move on to compare the next pattern element.
210+
continue;
211+
}
134212
}
135-
break;
136-
case One:
137-
if (s_offset < length)
213+
else if (match_types[p_idx] == One)
138214
{
139215
p_idx++;
140216
Collator::decodeChar(s, s_offset);
141217
continue;
142218
}
143-
break;
144-
case Any:
145-
next_p_idx = p_idx;
146-
Collator::decodeChar(s, next_s_offset = s_offset);
147-
p_idx++;
148-
continue;
219+
else if (match_types[p_idx] == Any)
220+
{
221+
// A trailing '%' matches all remaining characters.
222+
if (p_idx == pattern_weights.size() - 1)
223+
{
224+
return true;
225+
}
226+
p_idx_after_any = ++p_idx;
227+
backtrack_s_offset = s_offset;
228+
continue;
229+
}
230+
}
231+
}
232+
else
233+
{
234+
// All characters in the pattern have been matched,
235+
// we need to check if the rest of the string is empty.
236+
if (s_offset >= length)
237+
{
238+
return true;
149239
}
240+
// Else there are still characters in the string to match,
241+
// we need to backtrack below if there is '%' before.
150242
}
151-
if (0 < next_s_offset && next_s_offset <= length)
243+
244+
// Backtrack
245+
if (p_idx_after_any > 0)
152246
{
153-
p_idx = next_p_idx;
154-
s_offset = next_s_offset;
247+
Collator::decodeChar(s, s_offset = backtrack_s_offset);
248+
// Fast forward to the first match position
249+
if (match_types[p_idx_after_any] == Match)
250+
{
251+
while (true)
252+
{
253+
backtrack_s_offset = s_offset;
254+
if (Collator::regexEq(Collator::decodeChar(s, s_offset), pattern_weights[p_idx_after_any]))
255+
{
256+
break;
257+
}
258+
if (s_offset >= length)
259+
{
260+
return false;
261+
}
262+
}
263+
264+
p_idx = p_idx_after_any + 1;
265+
continue;
266+
}
267+
p_idx = p_idx_after_any;
268+
backtrack_s_offset = s_offset;
155269
continue;
156270
}
157271
return false;
158272
}
159273
return true;
160274
}
161275

276+
// Same logic as match(), but works on raw bytes without decodeChar. Returns 1 on match, 0 on mismatch, -1 if the string contains a non-ASCII byte.
277+
template <typename Collator>
278+
int Pattern<Collator>::tryMatchAsciiCi(const char * s, size_t length) const
279+
{
280+
size_t p_idx = 0;
281+
size_t p_idx_after_any = 0;
282+
size_t str_idx = 0;
283+
size_t backtrack_idx = 0;
284+
285+
while (true)
286+
{
287+
if (p_idx < ascii_ci_pattern.size())
288+
{
289+
if (str_idx >= length)
290+
{
291+
if (match_types[p_idx] == Any && p_idx == ascii_ci_pattern.size() - 1)
292+
{
293+
return 1;
294+
}
295+
else
296+
{
297+
return 0;
298+
}
299+
}
300+
else
301+
{
302+
// Can't handle a non-ASCII character in the string; return -1 to signal that.
303+
if (s[str_idx] < 0)
304+
{
305+
return -1;
306+
}
307+
if ((match_types[p_idx] == Match
308+
&& weight_ascii_ci[s[str_idx]] == ascii_ci_pattern[p_idx])
309+
|| match_types[p_idx] == One)
310+
{
311+
p_idx++;
312+
str_idx++;
313+
continue;
314+
}
315+
else if (match_types[p_idx] == Any)
316+
{
317+
if (p_idx == ascii_ci_pattern.size() - 1)
318+
{
319+
return 1;
320+
}
321+
p_idx_after_any = ++p_idx;
322+
backtrack_idx = str_idx;
323+
continue;
324+
}
325+
}
326+
}
327+
else
328+
{
329+
if (str_idx >= length)
330+
{
331+
return 1;
332+
}
333+
}
334+
335+
// Backtrack
336+
if (p_idx_after_any > 0)
337+
{
338+
str_idx = ++backtrack_idx;
339+
if (match_types[p_idx_after_any] == Match)
340+
{ // Fast forward to the first match position
341+
while (str_idx < length)
342+
{
343+
// Can't handle a non-ASCII character in the string; return -1 to signal that.
344+
if (s[str_idx] < 0)
345+
{
346+
return -1;
347+
}
348+
349+
if (weight_ascii_ci[s[str_idx]] != ascii_ci_pattern[p_idx_after_any])
350+
{
351+
str_idx = ++backtrack_idx;
352+
}
353+
else
354+
{
355+
break;
356+
}
357+
}
358+
359+
if (str_idx >= length)
360+
{
361+
return 0;
362+
}
363+
str_idx++;
364+
p_idx = p_idx_after_any + 1;
365+
continue;
366+
}
367+
368+
p_idx = p_idx_after_any;
369+
continue;
370+
}
371+
return 0;
372+
}
373+
return 1;
374+
}
375+
162376
template <typename T, bool padding>
163377
inline std::unique_ptr<ITiDBCollator::IPattern> BinCollator<T, padding>::pattern() const
164378
{

0 commit comments

Comments
 (0)