Skip to content

Commit cb9f0e2

Browse files
committed
[fix](ES Catalog) Do not extract doc_values of fields with the ignore_above setting (apache#40314)
1 parent cb0613e commit cb9f0e2

File tree

12 files changed

+321
-119
lines changed

12 files changed

+321
-119
lines changed
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"test1": "string_ignore_above_10",
3+
"test2": "text_ignore_above_10",
4+
"test3": 5.0,
5+
"test4": "2022-08-08",
6+
"test5": 3333.22,
7+
"test6": "2022-08-08T12:10:10.151",
8+
"c_bool": [true, false, true, true],
9+
"c_byte": [1, -2, -3, 4],
10+
"c_short": [128, 129, -129, -130],
11+
"c_integer": [32768, 32769, -32769, -32770],
12+
"c_long": [-1, 0, 1, 2],
13+
"c_unsigned_long": [0, 1, 2, 3],
14+
"c_float": [1.0, 1.1, 1.2, 1.3],
15+
"c_half_float": [1, 2, 3, 4],
16+
"c_double": [1, 2, 3, 4],
17+
"c_scaled_float": [1, 2, 3, 4],
18+
"c_date": ["2020-01-01", "2020-01-02"],
19+
"c_datetime": ["2020-01-01 12:00:00", "2020-01-02 13:01:01"],
20+
"c_keyword": ["a", "b", "c"],
21+
"c_text": ["d", "e", "f"],
22+
"c_ip": ["192.168.0.1", "127.0.0.1"],
23+
"c_person": [
24+
{"name": "Andy", "age": 18},
25+
{"name": "Tim", "age": 28}
26+
],
27+
"message": "I'm not null or empty",
28+
"c_user": [
29+
{"first": "John", "last": "Smith"},
30+
{"first": "Alice", "last": "White"}
31+
]
32+
}
Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,35 @@
11
{
2-
"field1": "value1",
3-
"field2": "value2"
2+
"test1": "string_ignore_above_10",
3+
"test2": "text_ignore_above_10",
4+
"test3": 6.0,
5+
"test4": "2022-08-08",
6+
"test5": "2022-08-11 12:10:10",
7+
"test6": 1660191010000,
8+
"test7": "2022-08-11 12:10:10",
9+
"test8": "2022-08-11T12:10:10+09:00",
10+
"test9": "4444.22",
11+
"test10": "2022-08-08T12:10:10.151",
12+
"c_bool": [true, false, true, true],
13+
"c_byte": [1, -2, -3, 4],
14+
"c_short": [128, 129, -129, -130],
15+
"c_integer": [32768, 32769, -32769, -32770],
16+
"c_long": [-1, 0, 1, 2],
17+
"c_unsigned_long": [0, 1, 2, 3],
18+
"c_float": [1.0, 1.1, 1.2, 1.3],
19+
"c_half_float": [1, 2, 3, 4],
20+
"c_double": [1, 2, 3, 4],
21+
"c_scaled_float": [1, 2, 3, 4],
22+
"c_date": ["2020-01-01", "2020-01-02"],
23+
"c_datetime": ["2020-01-01 12:00:00", "2020-01-02 13:01:01"],
24+
"c_keyword": ["a", "b", "c"],
25+
"c_text": ["d", "e", "f"],
26+
"c_ip": ["192.168.0.1", "127.0.0.1"],
27+
"c_person": [
28+
{"name": "Andy", "age": 18},
29+
{"name": "Tim", "age": 28}
30+
],
31+
"c_user": [
32+
{"first": "John", "last": "Smith"},
33+
{"first": "Alice", "last": "White"}
34+
]
435
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"field1": "value1",
3+
"field2": "value2"
4+
}

docker/thirdparties/docker-compose/elasticsearch/scripts/es_init.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ curl "http://${ES_5_HOST}:9200/test1/doc/1" -H "Content-Type:application/json" -
5353
curl "http://${ES_5_HOST}:9200/test1/doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2_es6.json'
5454
# only difference between es5 and es6
5555
curl "http://${ES_5_HOST}:9200/test1/doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3_es5.json'
56+
curl "http://${ES_5_HOST}:9200/test1/doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4_es6.json'
5657
# put data for test2_20220808
5758
curl "http://${ES_5_HOST}:9200/test2_20220808/doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1_es6.json'
5859
curl "http://${ES_5_HOST}:9200/test2_20220808/doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2_es6.json'
@@ -88,6 +89,7 @@ curl "http://${ES_6_HOST}:9200/test2_20220809" -H "Content-Type:application/json
8889
curl "http://${ES_6_HOST}:9200/test1/doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1_es6.json'
8990
curl "http://${ES_6_HOST}:9200/test1/doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2_es6.json'
9091
curl "http://${ES_6_HOST}:9200/test1/doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3_es6.json'
92+
curl "http://${ES_6_HOST}:9200/test1/doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4_es6.json'
9193
# put data for test2_20220808
9294
curl "http://${ES_6_HOST}:9200/test2_20220808/doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1_es6.json'
9395
curl "http://${ES_6_HOST}:9200/test2_20220808/doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2_es6.json'
@@ -125,6 +127,7 @@ curl "http://${ES_7_HOST}:9200/test1/_doc/1" -H "Content-Type:application/json"
125127
curl "http://${ES_7_HOST}:9200/test1/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json'
126128
curl "http://${ES_7_HOST}:9200/test1/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json'
127129
curl "http://${ES_7_HOST}:9200/test1/_doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4.json'
130+
curl "http://${ES_7_HOST}:9200/test1/_doc/5" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data5.json'
128131
# put data for test2_20220808
129132
curl "http://${ES_7_HOST}:9200/test2_20220808/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1.json'
130133
curl "http://${ES_7_HOST}:9200/test2_20220808/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json'
@@ -136,7 +139,7 @@ curl "http://${ES_7_HOST}:9200/test2_20220809/_doc/2" -H "Content-Type:applicati
136139
curl "http://${ES_7_HOST}:9200/test2_20220809/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json'
137140
curl "http://${ES_7_HOST}:9200/test2_20220809/_doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4.json'
138141
# put data for test3_20231005
139-
curl "http://${ES_7_HOST}:9200/test3_20231005/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data5.json'
142+
curl "http://${ES_7_HOST}:9200/test3_20231005/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data6.json'
140143

141144
# put _meta for array
142145
curl "http://${ES_7_HOST}:9200/test1/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"
@@ -170,6 +173,7 @@ curl "http://${ES_8_HOST}:9200/test1/_doc/1" -H "Content-Type:application/json"
170173
curl "http://${ES_8_HOST}:9200/test1/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json'
171174
curl "http://${ES_8_HOST}:9200/test1/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json'
172175
curl "http://${ES_8_HOST}:9200/test1/_doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4.json'
176+
curl "http://${ES_8_HOST}:9200/test1/_doc/5" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data5.json'
173177
# put data for test2_20220808
174178
curl "http://${ES_8_HOST}:9200/test2_20220808/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data1.json'
175179
curl "http://${ES_8_HOST}:9200/test2_20220808/_doc/2" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data2.json'
@@ -181,7 +185,7 @@ curl "http://${ES_8_HOST}:9200/test2_20220809/_doc/2" -H "Content-Type:applicati
181185
curl "http://${ES_8_HOST}:9200/test2_20220809/_doc/3" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data3.json'
182186
curl "http://${ES_8_HOST}:9200/test2_20220809/_doc/4" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data4.json'
183187
# put data for test3_20231005
184-
curl "http://${ES_8_HOST}:9200/test3_20231005/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data5.json'
188+
curl "http://${ES_8_HOST}:9200/test3_20231005/_doc/1" -H "Content-Type:application/json" -X POST -d '@/mnt/scripts/data/data6.json'
185189

186190
# put _meta for array
187191
curl "http://${ES_8_HOST}:9200/test1/_mapping" -H "Content-Type:application/json" -X PUT -d "@/mnt/scripts/index/array_meta.json"

docker/thirdparties/docker-compose/elasticsearch/scripts/index/es6_test1.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
"doc": {
88
"properties": {
99
"test1": {
10-
"type": "keyword"
10+
"type": "keyword",
11+
"ignore_above": 10
1112
},
1213
"test2": {
1314
"type": "text",
1415
"fields": {
1516
"keyword": {
1617
"type": "keyword",
17-
"ignore_above": 256
18+
"ignore_above": 10
1819
}
1920
}
2021
},

docker/thirdparties/docker-compose/elasticsearch/scripts/index/es7_test1.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
"mappings": {
77
"properties": {
88
"test1": {
9-
"type": "keyword"
9+
"type": "keyword",
10+
"ignore_above": 10
1011
},
1112
"test2": {
1213
"type": "text",
1314
"fields": {
1415
"keyword": {
1516
"type": "keyword",
16-
"ignore_above": 256
17+
"ignore_above": 10
1718
}
1819
}
1920
},

fe/fe-core/src/main/java/org/apache/doris/catalog/EsTable.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@
2424
import org.apache.doris.datasource.es.EsRestClient;
2525
import org.apache.doris.datasource.es.EsTablePartitions;
2626
import org.apache.doris.datasource.es.EsUtil;
27+
import org.apache.doris.persist.gson.GsonPostProcessable;
2728
import org.apache.doris.thrift.TEsTable;
2829
import org.apache.doris.thrift.TTableDescriptor;
2930
import org.apache.doris.thrift.TTableType;
3031

32+
import com.google.gson.annotations.SerializedName;
3133
import lombok.Getter;
3234
import lombok.Setter;
3335
import org.apache.commons.codec.digest.DigestUtils;
@@ -38,7 +40,7 @@
3840
import java.io.DataInput;
3941
import java.io.DataOutput;
4042
import java.io.IOException;
41-
import java.util.Collections;
43+
import java.util.Arrays;
4244
import java.util.HashMap;
4345
import java.util.HashSet;
4446
import java.util.List;
@@ -51,7 +53,10 @@
5153
@Getter
5254
@Setter
5355
public class EsTable extends Table {
54-
public static final Set<String> DEFAULT_DOCVALUE_DISABLED_FIELDS = new HashSet<>(Collections.singletonList("text"));
56+
// reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/doc-values.html
57+
// https://www.elastic.co/guide/en/elasticsearch/reference/current/text.html
58+
public static final Set<String> DEFAULT_DOCVALUE_DISABLED_FIELDS =
59+
new HashSet<>(Arrays.asList("text", "annotated_text", "match_only_text"));
5560

5661
private static final Logger LOG = LogManager.getLogger(EsTable.class);
5762
// Solr doc_values vs stored_fields performance-smackdown indicate:

fe/fe-core/src/main/java/org/apache/doris/datasource/es/MappingPhase.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,16 @@ private static void resolveDocValuesFields(SearchContext searchContext, ObjectNo
130130
if (docValue) {
131131
docValueField = colName;
132132
}
133+
} else if (innerTypeObject.has("ignore_above")) {
134+
// reference:
135+
// https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#keyword-params
136+
// > ignore_above
137+
// > Do not index any string longer than this value. Defaults to 2147483647 so that all values
138+
// > would be accepted. Please however note that default dynamic mapping rules create a sub
139+
// > keyword field that overrides this default by setting ignore_above: 256.
140+
// this field has `ignore_above` param
141+
// Strings longer than the ignore_above setting will not be indexed or stored
142+
// so we cannot rely on its doc_values
133143
} else {
134144
// a : {c : {}} -> a -> a.c
135145
docValueField = colName + "." + fieldName;
@@ -146,6 +156,17 @@ private static void resolveDocValuesFields(SearchContext searchContext, ObjectNo
146156
} else if (fieldType == null || "nested".equals(fieldType)) {
147157
// The object field has no type, and nested not support doc value.
148158
return;
159+
} else if (fieldObject.has("ignore_above")) {
160+
// reference:
161+
// https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#keyword-params
162+
// > ignore_above
163+
// > Do not index any string longer than this value. Defaults to 2147483647 so that all values
164+
// > would be accepted. Please however note that default dynamic mapping rules create a sub
165+
// > keyword field that overrides this default by setting ignore_above: 256.
166+
// this field has `ignore_above` param
167+
// Strings longer than the ignore_above setting will not be indexed or stored
168+
// so we cannot rely on its doc_values
169+
return;
149170
}
150171
docValueField = colName;
151172
}

0 commit comments

Comments
 (0)