Skip to content

Commit 87962aa

Browse files
authored
[bugfix](paimon)Fixed the reading of timestamp with time zone type data (#37716)
## Proposed changes 1. When using jni to read timestamps with time zones, the time needs to be converted to local time 2. In version 0.8 of paimon, the time zone (isAdjustToUTC) information of parquet files is added, and doris can parse data directly according to the time zone information
1 parent 0816969 commit 87962aa

File tree

7 files changed

+400
-51
lines changed

7 files changed

+400
-51
lines changed

be/src/vec/exec/scan/vfile_scanner.cpp

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -785,20 +785,9 @@ Status VFileScanner::_get_next_reader() {
785785
break;
786786
}
787787
case TFileFormatType::FORMAT_PARQUET: {
788-
static const cctz::time_zone utc0 = cctz::utc_time_zone();
789-
cctz::time_zone* tz;
790-
if (range.__isset.table_format_params &&
791-
range.table_format_params.table_format_type == "paimon") {
792-
// The timestmap generated by paimon does not carry metadata information (e.g., isAdjustToUTC, etc.),
793-
// and the stored data is UTC0 by default, so it is directly set to the UTC time zone.
794-
// In version 0.7, paimon fixed this issue and can remove the judgment here
795-
tz = const_cast<cctz::time_zone*>(&utc0);
796-
} else {
797-
tz = const_cast<cctz::time_zone*>(&_state->timezone_obj());
798-
}
799788
std::unique_ptr<ParquetReader> parquet_reader = ParquetReader::create_unique(
800-
_profile, *_params, range, _state->query_options().batch_size, tz,
801-
_io_ctx.get(), _state,
789+
_profile, *_params, range, _state->query_options().batch_size,
790+
const_cast<cctz::time_zone*>(&_state->timezone_obj()), _io_ctx.get(), _state,
802791
_shoudl_enable_file_meta_cache() ? ExecEnv::GetInstance()->file_meta_cache()
803792
: nullptr,
804793
_state->query_options().enable_parquet_lazy_mat);

docker/thirdparties/docker-compose/iceberg/tools/save_docker.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,7 @@ docker exec iceberg-rest bash -c 'cp /tmp/iceberg_rest_mode\=memory /mnt/data/in
2121

2222
# save iceberg from s3
2323
docker exec mc bash -c 'mc cp -r minio/warehouse /mnt/data/input/minio'
24+
25+
# package zip
26+
cp -r data iceberg_data
27+
zip -rq iceberg_data.zip iceberg_data

fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonColumnValue.java

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,33 +24,43 @@
2424
import org.apache.paimon.data.InternalArray;
2525
import org.apache.paimon.data.InternalMap;
2626
import org.apache.paimon.data.InternalRow;
27+
import org.apache.paimon.data.Timestamp;
28+
import org.apache.paimon.types.ArrayType;
29+
import org.apache.paimon.types.DataType;
30+
import org.apache.paimon.types.LocalZonedTimestampType;
31+
import org.apache.paimon.types.MapType;
32+
import org.apache.paimon.types.RowType;
2733
import org.slf4j.Logger;
2834
import org.slf4j.LoggerFactory;
2935

3036
import java.math.BigDecimal;
3137
import java.math.BigInteger;
3238
import java.time.LocalDate;
3339
import java.time.LocalDateTime;
40+
import java.time.ZoneId;
3441
import java.util.List;
3542

3643
public class PaimonColumnValue implements ColumnValue {
3744
private static final Logger LOG = LoggerFactory.getLogger(PaimonColumnValue.class);
3845
private int idx;
3946
private DataGetters record;
4047
private ColumnType dorisType;
48+
private DataType dataType;
4149

4250
public PaimonColumnValue() {
4351
}
4452

45-
public PaimonColumnValue(DataGetters record, int idx, ColumnType columnType) {
53+
public PaimonColumnValue(DataGetters record, int idx, ColumnType columnType, DataType dataType) {
4654
this.idx = idx;
4755
this.record = record;
4856
this.dorisType = columnType;
57+
this.dataType = dataType;
4958
}
5059

51-
public void setIdx(int idx, ColumnType dorisType) {
60+
public void setIdx(int idx, ColumnType dorisType, DataType dataType) {
5261
this.idx = idx;
5362
this.dorisType = dorisType;
63+
this.dataType = dataType;
5464
}
5565

5666
public void setOffsetRow(InternalRow record) {
@@ -124,7 +134,12 @@ public LocalDate getDate() {
124134

125135
@Override
126136
public LocalDateTime getDateTime() {
127-
return record.getTimestamp(idx, dorisType.getPrecision()).toLocalDateTime();
137+
Timestamp ts = record.getTimestamp(idx, dorisType.getPrecision());
138+
if (dataType instanceof LocalZonedTimestampType) {
139+
return LocalDateTime.ofInstant(ts.toInstant(), ZoneId.systemDefault());
140+
} else {
141+
return ts.toLocalDateTime();
142+
}
128143
}
129144

130145
@Override
@@ -142,7 +157,7 @@ public void unpackArray(List<ColumnValue> values) {
142157
InternalArray recordArray = record.getArray(idx);
143158
for (int i = 0; i < recordArray.size(); i++) {
144159
PaimonColumnValue arrayColumnValue = new PaimonColumnValue((DataGetters) recordArray, i,
145-
dorisType.getChildTypes().get(0));
160+
dorisType.getChildTypes().get(0), ((ArrayType) dataType).getElementType());
146161
values.add(arrayColumnValue);
147162
}
148163
}
@@ -153,13 +168,13 @@ public void unpackMap(List<ColumnValue> keys, List<ColumnValue> values) {
153168
InternalArray key = map.keyArray();
154169
for (int i = 0; i < key.size(); i++) {
155170
PaimonColumnValue keyColumnValue = new PaimonColumnValue((DataGetters) key, i,
156-
dorisType.getChildTypes().get(0));
171+
dorisType.getChildTypes().get(0), ((MapType) dataType).getKeyType());
157172
keys.add(keyColumnValue);
158173
}
159174
InternalArray value = map.valueArray();
160175
for (int i = 0; i < value.size(); i++) {
161176
PaimonColumnValue valueColumnValue = new PaimonColumnValue((DataGetters) value, i,
162-
dorisType.getChildTypes().get(1));
177+
dorisType.getChildTypes().get(1), ((MapType) dataType).getValueType());
163178
values.add(valueColumnValue);
164179
}
165180
}
@@ -169,7 +184,8 @@ public void unpackStruct(List<Integer> structFieldIndex, List<ColumnValue> value
169184
// todo: support pruned struct fields
170185
InternalRow row = record.getRow(idx, structFieldIndex.size());
171186
for (int i : structFieldIndex) {
172-
values.add(new PaimonColumnValue(row, i, dorisType.getChildTypes().get(i)));
187+
values.add(new PaimonColumnValue(row, i, dorisType.getChildTypes().get(i),
188+
((RowType) dataType).getFields().get(i).type()));
173189
}
174190
}
175191
}

fe/be-java-extensions/paimon-scanner/src/main/java/org/apache/doris/paimon/PaimonJniScanner.java

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ public class PaimonJniScanner extends JniScanner {
5353
private RecordReader<InternalRow> reader;
5454
private final PaimonColumnValue columnValue = new PaimonColumnValue();
5555
private List<String> paimonAllFieldNames;
56+
private List<DataType> paimonDataTypeList;
5657

5758
private long ctlId;
5859
private long dbId;
@@ -99,7 +100,7 @@ public void open() throws IOException {
99100
initTable();
100101
initReader();
101102
resetDatetimeV2Precision();
102-
} catch (Exception e) {
103+
} catch (Throwable e) {
103104
LOG.warn("Failed to open paimon_scanner: " + e.getMessage(), e);
104105
throw e;
105106
}
@@ -114,9 +115,12 @@ private void initReader() throws IOException {
114115
+ " Please refresh table and try again",
115116
fields.length, paimonAllFieldNames.size()));
116117
}
117-
readBuilder.withProjection(getProjected());
118+
int[] projected = getProjected();
119+
readBuilder.withProjection(projected);
118120
readBuilder.withFilter(getPredicates());
119121
reader = readBuilder.newRead().createReader(getSplit());
122+
paimonDataTypeList =
123+
Arrays.stream(projected).mapToObj(i -> table.rowType().getTypeAt(i)).collect(Collectors.toList());
120124
}
121125

122126
private int[] getProjected() {
@@ -175,7 +179,7 @@ protected int getNext() throws IOException {
175179
while ((record = recordIterator.next()) != null) {
176180
columnValue.setOffsetRow(record);
177181
for (int i = 0; i < fields.length; i++) {
178-
columnValue.setIdx(i, types[i]);
182+
columnValue.setIdx(i, types[i], paimonDataTypeList.get(i));
179183
appendData(i, columnValue);
180184
}
181185
rows++;
@@ -189,8 +193,8 @@ protected int getNext() throws IOException {
189193
} catch (Exception e) {
190194
close();
191195
LOG.warn("Failed to get the next batch of paimon. "
192-
+ "split: {}, requiredFieldNames: {}, paimonAllFieldNames: {}",
193-
getSplit(), params.get("required_fields"), paimonAllFieldNames, e);
196+
+ "split: {}, requiredFieldNames: {}, paimonAllFieldNames: {}, dataType: {}",
197+
getSplit(), params.get("required_fields"), paimonAllFieldNames, paimonDataTypeList, e);
194198
throw new IOException(e);
195199
}
196200
return rows;

fe/fe-core/src/main/java/org/apache/doris/datasource/paimon/PaimonExternalTable.java

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,7 @@ public Optional<SchemaCacheValue> initSchema() {
8888
}
8989

9090
private Type paimonPrimitiveTypeToDorisType(org.apache.paimon.types.DataType dataType) {
91+
int tsScale = 3; // default
9192
switch (dataType.getTypeRoot()) {
9293
case BOOLEAN:
9394
return Type.BOOLEAN;
@@ -114,20 +115,26 @@ private Type paimonPrimitiveTypeToDorisType(org.apache.paimon.types.DataType dat
114115
case DATE:
115116
return ScalarType.createDateV2Type();
116117
case TIMESTAMP_WITHOUT_TIME_ZONE:
117-
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
118-
int scale = 3; // default
119118
if (dataType instanceof org.apache.paimon.types.TimestampType) {
120-
scale = ((org.apache.paimon.types.TimestampType) dataType).getPrecision();
121-
if (scale > 6) {
122-
scale = 6;
119+
tsScale = ((org.apache.paimon.types.TimestampType) dataType).getPrecision();
120+
if (tsScale > 6) {
121+
tsScale = 6;
123122
}
124123
} else if (dataType instanceof org.apache.paimon.types.LocalZonedTimestampType) {
125-
scale = ((org.apache.paimon.types.LocalZonedTimestampType) dataType).getPrecision();
126-
if (scale > 6) {
127-
scale = 6;
124+
tsScale = ((org.apache.paimon.types.LocalZonedTimestampType) dataType).getPrecision();
125+
if (tsScale > 6) {
126+
tsScale = 6;
127+
}
128+
}
129+
return ScalarType.createDatetimeV2Type(tsScale);
130+
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
131+
if (dataType instanceof org.apache.paimon.types.LocalZonedTimestampType) {
132+
tsScale = ((org.apache.paimon.types.LocalZonedTimestampType) dataType).getPrecision();
133+
if (tsScale > 6) {
134+
tsScale = 6;
128135
}
129136
}
130-
return ScalarType.createDatetimeV2Type(scale);
137+
return ScalarType.createDatetimeV2Type(tsScale);
131138
case ARRAY:
132139
ArrayType arrayType = (ArrayType) dataType;
133140
Type innerType = paimonPrimitiveTypeToDorisType(arrayType.getElementType());
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !c1 --
3+
1 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456
4+
5+
-- !c2 --
6+
1 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456
7+
8+
-- !ltz_ntz_simple2 --
9+
1 {"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"} {"2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456", "2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456"} ["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"] ["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"] {"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 10:12:34.123456"}
10+
11+
-- !ltz_ntz_simple3 --
12+
{"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"}
13+
14+
-- !ltz_ntz_simple4 --
15+
2024-01-02T10:12:34.123456
16+
17+
-- !ltz_ntz_simple5 --
18+
2024-01-04T10:12:34.123456
19+
20+
-- !ltz_ntz_simple6 --
21+
{"2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456", "2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456"}
22+
23+
-- !ltz_ntz_simple7 --
24+
2024-01-02T10:12:34.123456
25+
26+
-- !ltz_ntz_simple8 --
27+
2024-01-04T10:12:34.123456
28+
29+
-- !ltz_ntz_simple9 --
30+
["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"]
31+
32+
-- !ltz_ntz_simple10 --
33+
2024-01-02T10:12:34.123456
34+
35+
-- !ltz_ntz_simple11 --
36+
["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"]
37+
38+
-- !ltz_ntz_simple12 --
39+
2024-01-02T10:12:34.123456
40+
41+
-- !ltz_ntz_simple13 --
42+
{"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 10:12:34.123456"}
43+
44+
-- !ltz_ntz_simple14 --
45+
2024-01-01T10:12:34.123456
46+
47+
-- !ltz_ntz_simple15 --
48+
2024-01-02T10:12:34.123456
49+
50+
-- !c1 --
51+
1 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T02:04:05.100 2024-01-02T02:04:05.120 2024-01-02T02:04:05.123 2024-01-02T02:04:05.123400 2024-01-02T02:04:05.123450 2024-01-02T02:04:05.123456 2024-01-02T02:04:05.123456 2024-01-02T02:04:05.123456 2024-01-02T02:04:05.123456
52+
53+
-- !c2 --
54+
1 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T18:04:05.123456 2024-01-02T18:04:05.123456 2024-01-02T18:04:05.123456 2024-01-02T10:04:05.100 2024-01-02T10:04:05.120 2024-01-02T10:04:05.123 2024-01-02T10:04:05.123400 2024-01-02T10:04:05.123450 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456 2024-01-02T10:04:05.123456
55+
56+
-- !ltz_ntz_simple2 --
57+
1 {"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"} {"2024-01-03 02:12:34.123456":"2024-01-04 02:12:34.123456", "2024-01-01 02:12:34.123456":"2024-01-02 02:12:34.123456"} ["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"] ["2024-01-01 02:12:34.123456", "2024-01-02 02:12:34.123456", "2024-01-03 02:12:34.123456"] {"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 02:12:34.123456"}
58+
59+
-- !ltz_ntz_simple3 --
60+
{"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"}
61+
62+
-- !ltz_ntz_simple4 --
63+
2024-01-02T10:12:34.123456
64+
65+
-- !ltz_ntz_simple5 --
66+
2024-01-04T10:12:34.123456
67+
68+
-- !ltz_ntz_simple6 --
69+
{"2024-01-03 02:12:34.123456":"2024-01-04 02:12:34.123456", "2024-01-01 02:12:34.123456":"2024-01-02 02:12:34.123456"}
70+
71+
-- !ltz_ntz_simple7 --
72+
\N
73+
74+
-- !ltz_ntz_simple8 --
75+
\N
76+
77+
-- !ltz_ntz_simple9 --
78+
["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"]
79+
80+
-- !ltz_ntz_simple10 --
81+
2024-01-02T10:12:34.123456
82+
83+
-- !ltz_ntz_simple11 --
84+
["2024-01-01 02:12:34.123456", "2024-01-02 02:12:34.123456", "2024-01-03 02:12:34.123456"]
85+
86+
-- !ltz_ntz_simple12 --
87+
2024-01-02T02:12:34.123456
88+
89+
-- !ltz_ntz_simple13 --
90+
{"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 02:12:34.123456"}
91+
92+
-- !ltz_ntz_simple14 --
93+
2024-01-01T10:12:34.123456
94+
95+
-- !ltz_ntz_simple15 --
96+
2024-01-02T02:12:34.123456
97+
98+
-- !ltz_ntz_simple2 --
99+
1 {"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"} {"2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456", "2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456"} ["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"] ["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"] {"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 10:12:34.123456"}
100+
101+
-- !ltz_ntz_simple3 --
102+
{"2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456"}
103+
104+
-- !ltz_ntz_simple4 --
105+
2024-01-02T10:12:34.123456
106+
107+
-- !ltz_ntz_simple5 --
108+
2024-01-04T10:12:34.123456
109+
110+
-- !ltz_ntz_simple6 --
111+
{"2024-01-03 10:12:34.123456":"2024-01-04 10:12:34.123456", "2024-01-01 10:12:34.123456":"2024-01-02 10:12:34.123456"}
112+
113+
-- !ltz_ntz_simple7 --
114+
2024-01-02T10:12:34.123456
115+
116+
-- !ltz_ntz_simple8 --
117+
2024-01-04T10:12:34.123456
118+
119+
-- !ltz_ntz_simple9 --
120+
["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"]
121+
122+
-- !ltz_ntz_simple10 --
123+
2024-01-02T10:12:34.123456
124+
125+
-- !ltz_ntz_simple11 --
126+
["2024-01-01 10:12:34.123456", "2024-01-02 10:12:34.123456", "2024-01-03 10:12:34.123456"]
127+
128+
-- !ltz_ntz_simple12 --
129+
2024-01-02T10:12:34.123456
130+
131+
-- !ltz_ntz_simple13 --
132+
{"crow1":"2024-01-01 10:12:34.123456", "crow2":"2024-01-02 10:12:34.123456"}
133+
134+
-- !ltz_ntz_simple14 --
135+
2024-01-01T10:12:34.123456
136+
137+
-- !ltz_ntz_simple15 --
138+
2024-01-02T10:12:34.123456
139+

0 commit comments

Comments
 (0)