@@ -242,9 +242,11 @@ struct SubstringUtil {
242
242
const char * str_data = (char *)chars.data () + offsets[i - 1 ];
243
243
int start_value = is_const ? start[0 ] : start[i];
244
244
int len_value = is_const ? len[0 ] : len[i];
245
-
245
+ // Unsigned numbers cannot be used here because start_value can be negative.
246
+ int char_len = simd::VStringFunctions::get_char_len (str_data, str_size);
246
247
// return empty string if start > src.length
247
- if (start_value > str_size || str_size == 0 || start_value == 0 || len_value <= 0 ) {
248
+ // Here, start_value is compared against the string's length in characters (char_len), not its length in bytes (str_size).
249
+ if (start_value > char_len || str_size == 0 || start_value == 0 || len_value <= 0 ) {
248
250
StringOP::push_empty_string (i, res_chars, res_offsets);
249
251
continue ;
250
252
}
@@ -3386,8 +3388,6 @@ class FunctionSubReplace : public IFunction {
3386
3388
return get_variadic_argument_types_impl ().size ();
3387
3389
}
3388
3390
3389
- bool use_default_implementation_for_nulls () const override { return false ; }
3390
-
3391
3391
Status execute_impl (FunctionContext* context, Block& block, const ColumnNumbers& arguments,
3392
3392
size_t result, size_t input_rows_count) const override {
3393
3393
return Impl::execute_impl (context, block, arguments, result, input_rows_count);
@@ -3398,59 +3398,116 @@ struct SubReplaceImpl {
3398
3398
/// Runs the four-argument replace over `input_rows_count` rows and stores the result at
/// `block[result]` as a ColumnNullable wrapping a ColumnString.
///
/// Arguments, by position in `arguments`:
///   0: source string column      1: replacement string column
///   2: start position (Int32)    3: length (Int32)
///
/// Every argument column is passed through unpack_if_const; the resulting per-column
/// const flags are turned into compile-time template arguments via make_bool_variant +
/// std::visit and forwarded to the row kernels.
///
/// NOTE(review): the assert_casts below assume the argument columns are already
/// non-nullable when this function is called — confirm against the caller.
///
/// Always returns Status::OK().
static Status replace_execute(Block& block, const ColumnNumbers& arguments, size_t result,
                              size_t input_rows_count) {
    auto res_column = ColumnString::create();
    auto* result_column = assert_cast<ColumnString*>(res_column.get());
    // Starts as all zeros; the row kernels set an entry to 1 for rows they turn into NULL.
    auto args_null_map = ColumnUInt8::create(input_rows_count, 0);

    ColumnPtr argument_columns[4];
    bool col_const[4];
    for (int i = 0; i < 4; ++i) {
        std::tie(argument_columns[i], col_const[i]) =
                unpack_if_const(block.get_by_position(arguments[i]).column);
    }

    const auto* data_column = assert_cast<const ColumnString*>(argument_columns[0].get());
    const auto* mask_column = assert_cast<const ColumnString*>(argument_columns[1].get());
    const auto* start_column =
            assert_cast<const ColumnVector<Int32>*>(argument_columns[2].get());
    const auto* length_column =
            assert_cast<const ColumnVector<Int32>*>(argument_columns[3].get());

    // The ASCII probe must cover the whole character buffer. Its length is
    // get_chars().size() (bytes); data_column->size() is the number of rows and would
    // leave most of the buffer unchecked, routing non-ASCII data to the byte-indexed
    // ASCII kernel.
    const auto& data_chars = data_column->get_chars();
    const bool all_ascii = simd::VStringFunctions::is_ascii(
            StringRef {data_chars.data(), data_chars.size()});

    std::visit(
            [&](auto origin_str_const, auto new_str_const, auto start_const, auto len_const) {
                if (all_ascii) {
                    vector_ascii<origin_str_const, new_str_const, start_const, len_const>(
                            data_column, mask_column, start_column->get_data(),
                            length_column->get_data(), args_null_map->get_data(), result_column,
                            input_rows_count);
                } else {
                    vector_utf8<origin_str_const, new_str_const, start_const, len_const>(
                            data_column, mask_column, start_column->get_data(),
                            length_column->get_data(), args_null_map->get_data(), result_column,
                            input_rows_count);
                }
            },
            vectorized::make_bool_variant(col_const[0]),
            vectorized::make_bool_variant(col_const[1]),
            vectorized::make_bool_variant(col_const[2]),
            vectorized::make_bool_variant(col_const[3]));

    block.get_by_position(result).column =
            ColumnNullable::create(std::move(res_column), std::move(args_null_map));
    return Status::OK();
}
3429
3439
3430
3440
private:
3431
- static void vector (const ColumnString* data_column, const ColumnString* mask_column,
3432
- const PaddedPODArray<Int32>& start, const PaddedPODArray<Int32>& length,
3433
- NullMap& args_null_map, ColumnString* result_column,
3434
- size_t input_rows_count) {
3441
+ template <bool origin_str_const, bool new_str_const, bool start_const, bool len_const>
3442
+ static void vector_ascii (const ColumnString* data_column, const ColumnString* mask_column,
3443
+ const PaddedPODArray<Int32>& args_start,
3444
+ const PaddedPODArray<Int32>& args_length, NullMap& args_null_map,
3445
+ ColumnString* result_column, size_t input_rows_count) {
3435
3446
ColumnString::Chars& res_chars = result_column->get_chars ();
3436
3447
ColumnString::Offsets& res_offsets = result_column->get_offsets ();
3437
3448
for (size_t row = 0 ; row < input_rows_count; ++row) {
3438
- StringRef origin_str = data_column->get_data_at (row);
3439
- StringRef new_str = mask_column->get_data_at (row);
3440
- size_t origin_str_len = origin_str.size ;
3449
+ StringRef origin_str =
3450
+ data_column->get_data_at (index_check_const<origin_str_const>(row));
3451
+ StringRef new_str = mask_column->get_data_at (index_check_const<new_str_const>(row));
3452
+ const auto start = args_start[index_check_const<start_const>(row)];
3453
+ const auto length = args_length[index_check_const<len_const>(row)];
3454
+ const size_t origin_str_len = origin_str.size ;
3441
3455
// input is null, start < 0, len < 0, str_size <= start. return NULL
3442
- if (args_null_map[row] || start[row] < 0 || length[row] < 0 ||
3443
- origin_str_len <= start[row]) {
3456
+ if (args_null_map[row] || start < 0 || length < 0 || origin_str_len <= start) {
3444
3457
res_offsets.push_back (res_chars.size ());
3445
3458
args_null_map[row] = 1 ;
3446
3459
} else {
3447
3460
std::string_view replace_str = new_str.to_string_view ();
3448
3461
std::string result = origin_str.to_string ();
3449
- result.replace (start[row] , length[row] , replace_str);
3462
+ result.replace (start, length, replace_str);
3450
3463
result_column->insert_data (result.data (), result.length ());
3451
3464
}
3452
3465
}
3453
3466
}
3467
+
3468
+ template <bool origin_str_const, bool new_str_const, bool start_const, bool len_const>
3469
+ static void vector_utf8 (const ColumnString* data_column, const ColumnString* mask_column,
3470
+ const PaddedPODArray<Int32>& args_start,
3471
+ const PaddedPODArray<Int32>& args_length, NullMap& args_null_map,
3472
+ ColumnString* result_column, size_t input_rows_count) {
3473
+ ColumnString::Chars& res_chars = result_column->get_chars ();
3474
+ ColumnString::Offsets& res_offsets = result_column->get_offsets ();
3475
+
3476
+ for (size_t row = 0 ; row < input_rows_count; ++row) {
3477
+ StringRef origin_str =
3478
+ data_column->get_data_at (index_check_const<origin_str_const>(row));
3479
+ StringRef new_str = mask_column->get_data_at (index_check_const<new_str_const>(row));
3480
+ const auto start = args_start[index_check_const<start_const>(row)];
3481
+ const auto length = args_length[index_check_const<len_const>(row)];
3482
+ // input is null, start < 0, len < 0 return NULL
3483
+ if (args_null_map[row] || start < 0 || length < 0 ) {
3484
+ res_offsets.push_back (res_chars.size ());
3485
+ args_null_map[row] = 1 ;
3486
+ continue ;
3487
+ }
3488
+
3489
+ const auto [start_byte_len, start_char_len] =
3490
+ simd::VStringFunctions::iterate_utf8_with_limit_length (origin_str.begin (),
3491
+ origin_str.end (), start);
3492
+
3493
+ // start >= orgin.size
3494
+ DCHECK (start_char_len <= start);
3495
+ if (start_byte_len == origin_str.size ) {
3496
+ res_offsets.push_back (res_chars.size ());
3497
+ args_null_map[row] = 1 ;
3498
+ continue ;
3499
+ }
3500
+
3501
+ auto [end_byte_len, end_char_len] =
3502
+ simd::VStringFunctions::iterate_utf8_with_limit_length (
3503
+ origin_str.begin () + start_byte_len, origin_str.end (), length);
3504
+ DCHECK (end_char_len <= length);
3505
+ std::string_view replace_str = new_str.to_string_view ();
3506
+ std::string result = origin_str.to_string ();
3507
+ result.replace (start_byte_len, end_byte_len, replace_str);
3508
+ result_column->insert_data (result.data (), result.length ());
3509
+ }
3510
+ }
3454
3511
};
3455
3512
3456
3513
struct SubReplaceThreeImpl {
@@ -3467,13 +3524,14 @@ struct SubReplaceThreeImpl {
3467
3524
3468
3525
auto str_col =
3469
3526
block.get_by_position (arguments[1 ]).column ->convert_to_full_column_if_const ();
3470
- if (auto * nullable = check_and_get_column<const ColumnNullable>(*str_col)) {
3527
+ if (const auto * nullable = check_and_get_column<const ColumnNullable>(*str_col)) {
3471
3528
str_col = nullable->get_nested_column_ptr ();
3472
3529
}
3473
- auto & str_offset = assert_cast<const ColumnString*>(str_col.get ())-> get_offsets ( );
3474
-
3530
+ const auto * str_column = assert_cast<const ColumnString*>(str_col.get ());
3531
+ // Use the UTF-8 character count of each string rather than its byte length.
3475
3532
for (int i = 0 ; i < input_rows_count; ++i) {
3476
- strlen_data[i] = str_offset[i] - str_offset[i - 1 ];
3533
+ StringRef str_ref = str_column->get_data_at (i);
3534
+ strlen_data[i] = simd::VStringFunctions::get_char_len (str_ref.data , str_ref.size );
3477
3535
}
3478
3536
3479
3537
block.insert ({std::move (params), std::make_shared<DataTypeInt32>(), " strlen" });
0 commit comments