@@ -26,6 +26,16 @@ extern const int LOGICAL_ERROR;
26
26
27
27
namespace TiDB
28
28
{
29
+
30
// Weight table for case-insensitive comparison of 7-bit ASCII bytes, indexed by byte value.
// 'A'..'Z' (0x41..0x5A) share the weight of 'a'..'z' (0x61..0x7A); every other byte maps to itself.
const std::array<char, 128> weight_ascii_ci = {
    // 0x00 - 0x0F
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    // 0x10 - 0x1F
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    // 0x20 - 0x2F
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    // 0x30 - 0x3F
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
    // 0x40 ('@'), then 'A'..'O' folded to 'a'..'o'
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    // 'P'..'Z' folded to 'p'..'z', then 0x5B - 0x5F unchanged
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    // 0x60 ('`'), then 'a'..'o'
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    // 'p'..'z', then 0x7B - 0x7F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F};
38
+
29
39
TiDBCollators dummy_collators;
30
40
std::vector<std::string> dummy_sort_key_contaners;
31
41
std::string dummy_sort_key_contaner;
@@ -72,12 +82,13 @@ inline Rune decodeUtf8Char(const char * s, size_t & offset)
72
82
template <typename Collator>
73
83
void Pattern<Collator>::compile(const std::string & pattern, char escape)
74
84
{
75
- chars .clear ();
85
+ pattern_weights .clear ();
76
86
match_types.clear ();
77
87
78
- chars .reserve (pattern.length () * sizeof (typename Collator::CharType));
88
+ pattern_weights .reserve (pattern.length () * sizeof (typename Collator::CharType));
79
89
match_types.reserve (pattern.length () * sizeof (typename Pattern::MatchType));
80
90
91
+ auto last_tp = MatchType::Match;
81
92
size_t offset = 0 ;
82
93
while (offset < pattern.length ())
83
94
{
@@ -102,63 +113,266 @@ void Pattern<Collator>::compile(const std::string & pattern, char escape)
102
113
}
103
114
else if (c == ' %' )
104
115
{
116
+ // Only keep one '%' for continuous '%'s
117
+ if (last_tp == MatchType::Any)
118
+ {
119
+ continue ;
120
+ }
105
121
tp = MatchType::Any;
106
122
}
107
123
else
108
124
{
109
125
tp = MatchType::Match;
110
126
}
111
- chars .push_back (c);
127
+ pattern_weights .push_back (c);
112
128
match_types.push_back (tp);
129
+ last_tp = tp;
130
+ }
131
+ }
132
+
133
+ template <typename Collator>
134
+ void Pattern<Collator>::tryCompileAsciiCi(const std::string & pattern, char escape)
135
+ {
136
+ is_ascii_ci_pattern = false ;
137
+ // Can't handle non-ASCII escape
138
+ if (escape < 0 )
139
+ {
140
+ return ;
141
+ }
142
+ ascii_ci_pattern.clear ();
143
+ ascii_ci_pattern.reserve (pattern.length ());
144
+
145
+ for (size_t i = 0 ; i < pattern.length (); i++)
146
+ {
147
+ auto c = pattern[i];
148
+ // Can't handle non-ASCII character
149
+ if (c < 0 )
150
+ {
151
+ return ;
152
+ }
153
+
154
+ if (c == escape)
155
+ {
156
+ if (i < pattern.length () - 1 )
157
+ {
158
+ // use next char to match
159
+ c = pattern[++i];
160
+ }
161
+ else
162
+ {
163
+ // use `escape` to match
164
+ }
165
+ }
166
+ else if (c == ' %' )
167
+ {
168
+ if (i > 0 && pattern[i - 1 ] == ' %' )
169
+ {
170
+ continue ;
171
+ }
172
+ }
173
+
174
+ ascii_ci_pattern.push_back (weight_ascii_ci[c]);
113
175
}
176
+ is_ascii_ci_pattern = true ;
114
177
}
115
178
116
179
template <typename Collator>
117
180
bool Pattern<Collator>::match(const char * s, size_t length) const
118
181
{
119
- size_t s_offset = 0 , next_s_offset = 0 , tmp_s_offset = 0 ;
120
- size_t p_idx = 0 , next_p_idx = 0 ;
121
- while (p_idx < chars.size () || s_offset < length)
182
+ if (is_ascii_ci_pattern)
122
183
{
123
- if (p_idx < chars.size ())
184
+ if (auto ret = tryMatchAsciiCi (s, length); ret >= 0 ) {
185
+ return ret;
186
+ }
187
+ // if ret == -1, means the string contains non-ASCII characters, continue to check
188
+ }
189
+
190
+ size_t s_offset = 0 , backtrack_s_offset = 0 ;
191
+ size_t p_idx = 0 , p_idx_after_any = 0 ;
192
+ while (true )
193
+ {
194
+ if (p_idx < pattern_weights.size ())
124
195
{
125
- switch (match_types[p_idx] )
196
+ if (s_offset >= length )
126
197
{
127
- case Match:
128
- if (s_offset < length
129
- && Collator::regexEq (Collator::decodeChar (s, tmp_s_offset = s_offset), chars[p_idx]))
198
+ // If the last character is '%', it means the pattern is like 'a%',
199
+ // we can match the rest of the string with '%'.
200
+ return static_cast <bool >(match_types[p_idx] == Any && p_idx == pattern_weights.size () - 1 );
201
+ }
202
+ else
203
+ {
204
+ if (match_types[p_idx] == Match)
130
205
{
131
- p_idx++;
132
- s_offset = tmp_s_offset;
133
- continue ;
206
+ if (Collator::regexEq (Collator::decodeChar (s, s_offset), pattern_weights[p_idx]))
207
+ {
208
+ p_idx++;
209
+ // To compare the next
210
+ continue ;
211
+ }
134
212
}
135
- break ;
136
- case One:
137
- if (s_offset < length)
213
+ else if (match_types[p_idx] == One)
138
214
{
139
215
p_idx++;
140
216
Collator::decodeChar (s, s_offset);
141
217
continue ;
142
218
}
143
- break ;
144
- case Any:
145
- next_p_idx = p_idx;
146
- Collator::decodeChar (s, next_s_offset = s_offset);
147
- p_idx++;
148
- continue ;
219
+ else if (match_types[p_idx] == Any)
220
+ {
221
+ // Last '%' can match all left characters
222
+ if (p_idx == pattern_weights.size () - 1 )
223
+ {
224
+ return true ;
225
+ }
226
+ p_idx_after_any = ++p_idx;
227
+ backtrack_s_offset = s_offset;
228
+ continue ;
229
+ }
230
+ }
231
+ }
232
+ else
233
+ {
234
+ // All characters in the pattern have been matched,
235
+ // we need to check if the rest of the string is empty.
236
+ if (s_offset >= length)
237
+ {
238
+ return true ;
149
239
}
240
+ // Else there are still characters in the string to match,
241
+ // we need to backtrack below if there is '%' before.
150
242
}
151
- if (0 < next_s_offset && next_s_offset <= length)
243
+
244
+ // Backtrack
245
+ if (p_idx_after_any > 0 )
152
246
{
153
- p_idx = next_p_idx;
154
- s_offset = next_s_offset;
247
+ Collator::decodeChar (s, s_offset = backtrack_s_offset);
248
+ // Fast forward to the first match position
249
+ if (match_types[p_idx_after_any] == Match)
250
+ {
251
+ while (true )
252
+ {
253
+ backtrack_s_offset = s_offset;
254
+ if (Collator::regexEq (Collator::decodeChar (s, s_offset), pattern_weights[p_idx_after_any]))
255
+ {
256
+ break ;
257
+ }
258
+ if (s_offset >= length)
259
+ {
260
+ return false ;
261
+ }
262
+ }
263
+
264
+ p_idx = p_idx_after_any + 1 ;
265
+ continue ;
266
+ }
267
+ p_idx = p_idx_after_any;
268
+ backtrack_s_offset = s_offset;
155
269
continue ;
156
270
}
157
271
return false ;
158
272
}
159
273
return true ;
160
274
}
161
275
276
+ // Similar logical like match, but don't need to decodeChar
277
+ template <typename Collator>
278
+ int Pattern<Collator>::tryMatchAsciiCi(const char * s, size_t length) const
279
+ {
280
+ size_t p_idx = 0 ;
281
+ size_t p_idx_after_any = 0 ;
282
+ size_t str_idx = 0 ;
283
+ size_t backtrack_idx = 0 ;
284
+
285
+ while (true )
286
+ {
287
+ if (p_idx < ascii_ci_pattern.size ())
288
+ {
289
+ if (str_idx >= length)
290
+ {
291
+ if (match_types[p_idx] == Any && p_idx == ascii_ci_pattern.size () - 1 )
292
+ {
293
+ return 1 ;
294
+ }
295
+ else
296
+ {
297
+ return 0 ;
298
+ }
299
+ }
300
+ else
301
+ {
302
+ // Can't handle non-ASCII escape
303
+ if (s[str_idx] < 0 )
304
+ {
305
+ return -1 ;
306
+ }
307
+ if ((match_types[p_idx] == Match
308
+ && weight_ascii_ci[s[str_idx]] == ascii_ci_pattern[p_idx])
309
+ || match_types[p_idx] == One)
310
+ {
311
+ p_idx++;
312
+ str_idx++;
313
+ continue ;
314
+ }
315
+ else if (match_types[p_idx] == Any)
316
+ {
317
+ if (p_idx == ascii_ci_pattern.size () - 1 )
318
+ {
319
+ return 1 ;
320
+ }
321
+ p_idx_after_any = ++p_idx;
322
+ backtrack_idx = str_idx;
323
+ continue ;
324
+ }
325
+ }
326
+ }
327
+ else
328
+ {
329
+ if (str_idx >= length)
330
+ {
331
+ return 1 ;
332
+ }
333
+ }
334
+
335
+ // Backtrack
336
+ if (p_idx_after_any > 0 )
337
+ {
338
+ str_idx = ++backtrack_idx;
339
+ if (match_types[p_idx_after_any] == Match)
340
+ { // Fast forward to the first match position
341
+ while (str_idx < length)
342
+ {
343
+ // Can't handle non-ASCII escape
344
+ if (s[str_idx] < 0 )
345
+ {
346
+ return -1 ;
347
+ }
348
+
349
+ if (weight_ascii_ci[s[str_idx]] != ascii_ci_pattern[p_idx_after_any])
350
+ {
351
+ str_idx = ++backtrack_idx;
352
+ }
353
+ else
354
+ {
355
+ break ;
356
+ }
357
+ }
358
+
359
+ if (str_idx >= length)
360
+ {
361
+ return 0 ;
362
+ }
363
+ str_idx++;
364
+ p_idx = p_idx_after_any + 1 ;
365
+ continue ;
366
+ }
367
+
368
+ p_idx = p_idx_after_any;
369
+ continue ;
370
+ }
371
+ return 0 ;
372
+ }
373
+ return 1 ;
374
+ }
375
+
162
376
template <typename T, bool padding>
163
377
inline std::unique_ptr<ITiDBCollator::IPattern> BinCollator<T, padding>::pattern() const
164
378
{
0 commit comments