Skip to content

Commit a5a6ffe

Browse files
committed
[opt](serde) Optimize the filling of fixed values into block columns without repeated deserialization. (apache#37377)
## Proposed changes

Since the value of a partition column is fixed when querying a partitioned table, we can deserialize the value only once and then repeatedly insert that value into the block.

```sql
-- In Hive:
CREATE TABLE parquet_partition_tb (
    col1 STRING,
    col2 INT,
    col3 DOUBLE
) PARTITIONED BY (
    partition_col1 STRING,
    partition_col2 INT
)
STORED AS PARQUET;

insert into parquet_partition_tb partition (partition_col1="hello",partition_col2=1) values("word",2,2.3);
insert into parquet_partition_tb partition(partition_col1="hello",partition_col2=1 ) select col1,col2,col3 from parquet_partition_tb where partition_col1="hello" and partition_col2=1;
-- Repeat the `insert into xxx select xxx` operation several times.

-- In Doris, before:
mysql> select count(partition_col1) from parquet_partition_tb;
+-----------------------+
| count(partition_col1) |
+-----------------------+
|              33554432 |
+-----------------------+
1 row in set (3.24 sec)

mysql> select count(partition_col2) from parquet_partition_tb;
+-----------------------+
| count(partition_col2) |
+-----------------------+
|              33554432 |
+-----------------------+
1 row in set (3.34 sec)

-- In Doris, after:
mysql> select count(partition_col1) from parquet_partition_tb ;
+-----------------------+
| count(partition_col1) |
+-----------------------+
|              33554432 |
+-----------------------+
1 row in set (0.79 sec)

mysql> select count(partition_col2) from parquet_partition_tb;
+-----------------------+
| count(partition_col2) |
+-----------------------+
|              33554432 |
+-----------------------+
1 row in set (0.51 sec)
```

## Summary

Test SQL: `select count(partition_col) from tbl;`

Number of rows: 33554432

| type | before (sec) | after (sec) |
|---|---|---|
| boolean | 3.96 | 0.47 |
| tinyint | 3.39 | 0.47 |
| smallint | 3.14 | 0.50 |
| int | 3.34 | 0.51 |
| bigint | 3.61 | 0.51 |
| float | 4.59 | 0.51 |
| double | 4.60 | 0.55 |
| decimal(5,2) | 3.96 | 0.61 |
| date | 5.80 | 0.52 |
| timestamp | 7.68 | 0.52 |
| string | 3.24 | 0.79 |

Issue Number: close #xxx

<!--Describe your changes.-->
1 parent 9f4e734 commit a5a6ffe

15 files changed

+191
-18
lines changed

be/src/vec/data_types/serde/data_type_datetimev2_serde.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,4 +247,25 @@ Status DataTypeDateTimeV2SerDe::write_column_to_orc(const std::string& timezone,
247247
return Status::OK();
248248
}
249249

250+
// Fills `rows` rows of `column` with the single value encoded in `slice`.
//
// The text is parsed exactly once through deserialize_one_cell_from_json(); the
// element that call appended is then replicated for the remaining `rows - 1` rows
// instead of re-parsing the same text per row.
//
// @param column           destination column; the new rows are appended to it.
// @param slice            serialized form of the fixed value.
// @param rows             number of rows to append.
// @param num_deserialized out: set to `rows` on success, to 0 when `rows < 1`;
//                         left untouched when parsing fails.
// @param options          forwarded unchanged to deserialize_one_cell_from_json().
// @return the parse error if the value cannot be deserialized, otherwise OK.
Status DataTypeDateTimeV2SerDe::deserialize_column_from_fixed_json(
        IColumn& column, Slice& slice, int rows, int* num_deserialized,
        const FormatOptions& options) const {
    if (rows < 1) {
        // Nothing was requested. Parsing a cell here would append a row the caller
        // did not ask for, and `rows - 1` would become a negative repeat count.
        *num_deserialized = 0;
        return Status::OK();
    }

    Status st = deserialize_one_cell_from_json(column, slice, options);
    if (!st.ok()) {
        return st;
    }

    // Class-qualified call: invokes this class's implementation directly.
    DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(column, rows - 1);
    *num_deserialized = rows;
    return Status::OK();
}
262+
263+
// Appends `times` copies of the current last element of `column`.
//
// `column` is cast (unchecked) to ColumnVector<UInt64>. When `times >= 1` the
// column must already contain at least one element, because the value to repeat
// is read from index `size() - 1`. A `times` below 1 is a no-op.
void DataTypeDateTimeV2SerDe::insert_column_last_value_multiple_times(IColumn& column,
                                                                      int times) const {
    if (times < 1) {
        // Avoid touching element `size() - 1` (invalid on an empty column) and
        // avoid handing a negative count to insert_many_vals().
        return;
    }
    auto& col = static_cast<ColumnVector<UInt64>&>(column);
    const UInt64 last_value = col.get_element(col.size() - 1);
    col.insert_many_vals(last_value, times);
}
270+
250271
} // namespace doris::vectorized

be/src/vec/data_types/serde/data_type_datetimev2_serde.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ class DataTypeDateTimeV2SerDe : public DataTypeNumberSerDe<UInt64> {
7777
int start, int end,
7878
std::vector<StringRef>& buffer_list) const override;
7979

80+
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
81+
int* num_deserialized,
82+
const FormatOptions& options) const override;
83+
void insert_column_last_value_multiple_times(IColumn& column, int times) const override;
84+
8085
private:
8186
template <bool is_binary_format>
8287
Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,

be/src/vec/data_types/serde/data_type_datev2_serde.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,5 +175,26 @@ Status DataTypeDateV2SerDe::write_column_to_orc(const std::string& timezone, con
175175
return Status::OK();
176176
}
177177

178+
// Fills `rows` rows of `column` with the single value encoded in `slice`.
//
// The text is parsed exactly once through deserialize_one_cell_from_json(); the
// element that call appended is then replicated for the remaining `rows - 1` rows
// instead of re-parsing the same text per row.
//
// @param column           destination column; the new rows are appended to it.
// @param slice            serialized form of the fixed value.
// @param rows             number of rows to append.
// @param num_deserialized out: set to `rows` on success, to 0 when `rows < 1`;
//                         left untouched when parsing fails.
// @param options          forwarded unchanged to deserialize_one_cell_from_json().
// @return the parse error if the value cannot be deserialized, otherwise OK.
Status DataTypeDateV2SerDe::deserialize_column_from_fixed_json(IColumn& column, Slice& slice,
                                                               int rows, int* num_deserialized,
                                                               const FormatOptions& options) const {
    if (rows < 1) {
        // Nothing was requested. Parsing a cell here would append a row the caller
        // did not ask for, and `rows - 1` would become a negative repeat count.
        *num_deserialized = 0;
        return Status::OK();
    }

    Status st = deserialize_one_cell_from_json(column, slice, options);
    if (!st.ok()) {
        return st;
    }

    // Class-qualified call: invokes this class's implementation directly.
    DataTypeDateV2SerDe::insert_column_last_value_multiple_times(column, rows - 1);
    *num_deserialized = rows;
    return Status::OK();
}
189+
190+
// Appends `times` copies of the current last element of `column`.
//
// `column` is cast (unchecked) to ColumnVector<UInt32>. When `times >= 1` the
// column must already contain at least one element, because the value to repeat
// is read from index `size() - 1`. A `times` below 1 is a no-op.
void DataTypeDateV2SerDe::insert_column_last_value_multiple_times(IColumn& column,
                                                                  int times) const {
    if (times < 1) {
        // Avoid touching element `size() - 1` (invalid on an empty column) and
        // avoid handing a negative count to insert_many_vals().
        return;
    }
    auto& col = static_cast<ColumnVector<UInt32>&>(column);
    const UInt32 last_value = col.get_element(col.size() - 1);
    col.insert_many_vals(last_value, times);
}
198+
178199
} // namespace vectorized
179200
} // namespace doris

be/src/vec/data_types/serde/data_type_datev2_serde.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ class DataTypeDateV2SerDe : public DataTypeNumberSerDe<UInt32> {
7474
int start, int end,
7575
std::vector<StringRef>& buffer_list) const override;
7676

77+
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
78+
int* num_deserialized,
79+
const FormatOptions& options) const override;
80+
81+
void insert_column_last_value_multiple_times(IColumn& column, int times) const override;
82+
7783
private:
7884
template <bool is_binary_format>
7985
Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,

be/src/vec/data_types/serde/data_type_decimal_serde.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,32 @@ Status DataTypeDecimalSerDe<T>::write_column_to_orc(const std::string& timezone,
275275
}
276276
return Status::OK();
277277
}
278+
template <typename T>
279+
280+
Status DataTypeDecimalSerDe<T>::deserialize_column_from_fixed_json(
281+
IColumn& column, Slice& slice, int rows, int* num_deserialized,
282+
const FormatOptions& options) const {
283+
Status st = deserialize_one_cell_from_json(column, slice, options);
284+
if (!st.ok()) {
285+
return st;
286+
}
287+
288+
DataTypeDecimalSerDe::insert_column_last_value_multiple_times(column, rows - 1);
289+
*num_deserialized = rows;
290+
return Status::OK();
291+
}
292+
293+
template <typename T>
294+
void DataTypeDecimalSerDe<T>::insert_column_last_value_multiple_times(IColumn& column,
295+
int times) const {
296+
auto& col = static_cast<ColumnDecimal<T>&>(column);
297+
auto sz = col.size();
298+
299+
T val = col.get_element(sz - 1);
300+
for (int i = 0; i < times; i++) {
301+
col.insert_value(val);
302+
}
303+
}
278304

279305
template class DataTypeDecimalSerDe<Decimal32>;
280306
template class DataTypeDecimalSerDe<Decimal64>;

be/src/vec/data_types/serde/data_type_decimal_serde.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,12 @@ class DataTypeDecimalSerDe : public DataTypeSerDe {
114114
int start, int end,
115115
std::vector<StringRef>& buffer_list) const override;
116116

117+
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
118+
int* num_deserialized,
119+
const FormatOptions& options) const override;
120+
121+
void insert_column_last_value_multiple_times(IColumn& column, int times) const override;
122+
117123
private:
118124
template <bool is_binary_format>
119125
Status _write_column_to_mysql(const IColumn& column, MysqlRowBuffer<is_binary_format>& result,

be/src/vec/data_types/serde/data_type_nullable_serde.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,26 @@ Status DataTypeNullableSerDe::deserialize_column_from_hive_text_vector(
127127
return Status::OK();
128128
}
129129

130+
// Fills `rows` rows of the nullable `column` with the single value encoded in `slice`.
//
// The text is parsed exactly once through deserialize_one_cell_from_json(), which
// appends one row. That row is then replicated `rows - 1` times: the null map is
// extended with the last null flag, and the nested column is extended through the
// nested serde's insert_column_last_value_multiple_times().
//
// @param column           destination column, cast (unchecked) to ColumnNullable.
// @param slice            serialized form of the fixed value.
// @param rows             number of rows to append.
// @param num_deserialized out: set to `rows` on success, to 0 when `rows < 1`;
//                         left untouched when parsing fails.
// @param options          forwarded unchanged to deserialize_one_cell_from_json().
// @return the parse error if the value cannot be deserialized, otherwise OK.
Status DataTypeNullableSerDe::deserialize_column_from_fixed_json(
        IColumn& column, Slice& slice, int rows, int* num_deserialized,
        const FormatOptions& options) const {
    if (rows < 1) {
        // Nothing was requested. Parsing a cell here would append a row the caller
        // did not ask for.
        *num_deserialized = 0;
        return Status::OK();
    }

    auto& col = static_cast<ColumnNullable&>(column);
    Status st = deserialize_one_cell_from_json(column, slice, options);
    if (!st.ok()) {
        return st;
    }

    if (rows > 1) {
        auto& null_map = col.get_null_map_data();
        auto& nested_column = col.get_nested_column();

        // Copy the flag before resizing, in case the resize reallocates the storage
        // that back() refers to.
        const auto last_null_flag = null_map.back();
        // Grow relative to the current size: the column may already hold earlier
        // rows, so the target is "current size + (rows - 1)", not an absolute `rows`.
        // This is the null-map counterpart of insert_column_last_value_multiple_times().
        null_map.resize_fill(null_map.size() + (rows - 1), last_null_flag);
        nested_serde->insert_column_last_value_multiple_times(nested_column, rows - 1);
    }

    *num_deserialized = rows;
    return Status::OK();
}
149+
130150
Status DataTypeNullableSerDe::deserialize_one_cell_from_json(IColumn& column, Slice& slice,
131151
const FormatOptions& options) const {
132152
auto& null_column = assert_cast<ColumnNullable&>(column);

be/src/vec/data_types/serde/data_type_nullable_serde.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,9 @@ class DataTypeNullableSerDe : public DataTypeSerDe {
4747
int* num_deserialized,
4848
const FormatOptions& options) const override;
4949

50+
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
51+
int* num_deserialized,
52+
const FormatOptions& options) const override;
5053
Status deserialize_one_cell_from_hive_text(
5154
IColumn& column, Slice& slice, const FormatOptions& options,
5255
int hive_text_complex_type_delimiter_level = 1) const override;

be/src/vec/data_types/serde/data_type_number_serde.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,28 @@ void DataTypeNumberSerDe<T>::read_column_from_arrow(IColumn& column,
224224
const auto* raw_data = reinterpret_cast<const T*>(buffer->data()) + start;
225225
col_data.insert(raw_data, raw_data + row_count);
226226
}
227+
template <typename T>
228+
Status DataTypeNumberSerDe<T>::deserialize_column_from_fixed_json(
229+
IColumn& column, Slice& slice, int rows, int* num_deserialized,
230+
const FormatOptions& options) const {
231+
Status st = deserialize_one_cell_from_json(column, slice, options);
232+
if (!st.ok()) {
233+
return st;
234+
}
235+
236+
DataTypeNumberSerDe::insert_column_last_value_multiple_times(column, rows - 1);
237+
*num_deserialized = rows;
238+
return Status::OK();
239+
}
240+
241+
template <typename T>
242+
void DataTypeNumberSerDe<T>::insert_column_last_value_multiple_times(IColumn& column,
243+
int times) const {
244+
auto& col = static_cast<ColumnVector<T>&>(column);
245+
auto sz = col.size();
246+
T val = col.get_element(sz - 1);
247+
col.insert_many_vals(val, times);
248+
}
227249

228250
template <typename T>
229251
template <bool is_binary_format>

be/src/vec/data_types/serde/data_type_number_serde.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ class DataTypeNumberSerDe : public DataTypeSerDe {
7070
int* num_deserialized,
7171
const FormatOptions& options) const override;
7272

73+
Status deserialize_column_from_fixed_json(IColumn& column, Slice& slice, int rows,
74+
int* num_deserialized,
75+
const FormatOptions& options) const override;
76+
77+
void insert_column_last_value_multiple_times(IColumn& column, int times) const override;
78+
7379
Status write_column_to_pb(const IColumn& column, PValues& result, int start,
7480
int end) const override;
7581
Status read_column_from_pb(IColumn& column, const PValues& arg) const override;

0 commit comments

Comments
 (0)