@@ -3,6 +3,7 @@ package array
3
3
import (
4
4
"encoding/json"
5
5
"fmt"
6
+ "sort"
6
7
"strconv"
7
8
"strings"
8
9
"sync/atomic"
36
37
ValueType : arrow .BinaryTypes .String ,
37
38
Ordered : false ,
38
39
}
40
+ StringREEType = arrow .RunEndEncodedOf (arrow .PrimitiveTypes .Int32 , arrow .BinaryTypes .String )
39
41
)
40
42
41
43
// Array represents an immutable sequence of values.
@@ -113,21 +115,24 @@ type Builder interface {
113
115
}
114
116
115
117
// String holds an array of flux string values. The arrow data must be
116
- // either a `utf8` or `dictionary<value=utf8, indices=int32, ordered=false>`.
118
+ // either a `utf8`, a `dictionary<value=utf8, indices=int32, ordered=false>`,
119
+ // or a `run_end_encoded<run_ends:int32, values:utf8>`.
117
120
// Internally the string data is stored in an array.Binary value.
118
121
type String struct {
119
122
refCount int64
120
123
data * array.Data
121
124
nullBitmapBytes []byte
122
125
123
126
indices * array.Int32
127
+ runEnds * array.Int32
124
128
values * array.Binary
125
129
}
126
130
127
131
// Create a new String array from an arrow.ArrayData that contains
128
- // either a `utf8` or a `dictionary<values=utf8, indices=int32, ordered=false>`
129
- // set of data buffers. NewStringData will panic if the array data is of
130
- // an unsupported type.
132
+ // either a `utf8`, a `dictionary<values=utf8, indices=int32, ordered=false>`,
133
+ // or a `run_end_encoded<run_ends:int32, values:utf8>` set of data
134
+ // buffers. NewStringData will panic if the array data is of an
135
+ // unsupported type.
131
136
func NewStringData (data arrow.ArrayData ) * String {
132
137
a := String {
133
138
refCount : 1 ,
@@ -136,19 +141,32 @@ func NewStringData(data arrow.ArrayData) *String {
136
141
return & a
137
142
}
138
143
139
- // validateStringDataType checks that the datatype is supported for
140
- // using to create a String array .
141
- func validateStringDataType (dt arrow.DataType ) {
144
+ // isStringDataType checks if the given arrow.DataType is a string type
145
+ // supported by flux .
146
+ func isStringDataType (dt arrow.DataType ) bool {
142
147
switch dt := dt .(type ) {
143
148
case * arrow.DictionaryType :
144
149
if dt .IndexType .ID () == arrow .INT32 && dt .ValueType .ID () == arrow .STRING {
145
- return
150
+ return true
151
+ }
152
+ case * arrow.RunEndEncodedType :
153
+ if dt .RunEnds ().ID () == arrow .INT32 && dt .Encoded ().ID () == arrow .STRING {
154
+ return true
146
155
}
147
156
default :
148
157
if dt .ID () == arrow .STRING {
149
- return
158
+ return true
150
159
}
151
160
}
161
+ return false
162
+ }
163
+
164
+ // validateStringDataType checks that the datatype is supported for
165
+ // using to create a String array.
166
+ func validateStringDataType (dt arrow.DataType ) {
167
+ if isStringDataType (dt ) {
168
+ return
169
+ }
152
170
panic (errors .Newf (codes .Internal , "incorrect data type for String (%s)" , dt ))
153
171
}
154
172
@@ -167,45 +185,101 @@ func (a *String) setData(data *array.Data) {
167
185
}
168
186
169
187
var indices * array.Int32
188
+ var runEnds * array.Int32
170
189
var values * array.Binary
171
190
172
191
if data .DataType ().ID () == arrow .DICTIONARY {
173
192
idxData := array .NewData (arrow .PrimitiveTypes .Int32 , data .Len (), data .Buffers (), nil , data .NullN (), data .Offset ())
174
193
indices = array .NewInt32Data (idxData )
175
194
idxData .Release ()
176
195
values = array .NewBinaryData (data .Dictionary ())
196
+ } else if data .DataType ().ID () == arrow .RUN_END_ENCODED {
197
+ runEnds = array .NewInt32Data (data .Children ()[0 ])
198
+ values = array .NewBinaryData (data .Children ()[1 ])
177
199
} else {
178
200
values = array .NewBinaryData (data )
179
201
}
180
202
if a .indices != nil {
181
203
a .indices .Release ()
182
204
}
205
+ if a .runEnds != nil {
206
+ a .runEnds .Release ()
207
+ }
183
208
if a .values != nil {
184
209
a .values .Release ()
185
210
}
186
211
a .indices = indices
212
+ a .runEnds = runEnds
187
213
a .values = values
188
214
a .data = data
189
215
}
190
216
217
+ func (a * String ) valuesIndex (i int ) (int , bool ) {
218
+ if a .indices != nil {
219
+ if a .indices .IsNull (i ) {
220
+ return 0 , false
221
+ }
222
+ return int (a .indices .Value (i )), true
223
+ } else if a .runEnds != nil {
224
+ return sort .Search (a .runEnds .Len (), func (j int ) bool {
225
+ return a .runEnds .Value (j ) > int32 (i + a .data .Offset ())
226
+ }), true
227
+ }
228
+ return i , true
229
+ }
230
+
191
231
// DataType returns the arrow data type of the underlying array data.
func (a *String) DataType() arrow.DataType {
	dt := a.data.DataType()
	return dt
}
194
234
195
235
func (a * String ) NullN () int {
236
+ if a .runEnds != nil {
237
+ nbm := a .NullBitmapBytes ()
238
+ if nbm == nil {
239
+ return 0
240
+ }
241
+ sz := a .data .Len ()
242
+ return sz - bitutil .CountSetBits (nbm , 0 , sz )
243
+ }
196
244
return a .data .NullN ()
197
245
}
198
246
199
247
func (a * String ) NullBitmapBytes () []byte {
248
+ if a .runEnds == nil {
249
+ return a .nullBitmapBytes
250
+ }
251
+ if a .values .NullN () == 0 {
252
+ return nil
253
+ }
254
+ if a .nullBitmapBytes == nil {
255
+ a .nullBitmapBytes = make ([]byte , bitutil .BytesForBits (int64 (a .data .Len ())))
256
+ last := int64 (a .data .Offset ())
257
+ end := last + int64 (a .data .Len ())
258
+ for i , _ := a .valuesIndex (0 ); i < a .runEnds .Len () && last < end ; i ++ {
259
+ runEnd := int64 (a .runEnds .Value (i ))
260
+ if runEnd > end {
261
+ runEnd = end
262
+ }
263
+ count := runEnd - last
264
+ bitutil .SetBitsTo (a .nullBitmapBytes , last , count , a .values .IsValid (i ))
265
+ last += count
266
+ }
267
+ }
200
268
return a .nullBitmapBytes
201
269
}
202
270
203
271
func (a * String ) IsNull (i int ) bool {
204
- return len (a .nullBitmapBytes ) != 0 && bitutil .BitIsNotSet (a .nullBitmapBytes , a .data .Offset ()+ i )
272
+ if i , ok := a .valuesIndex (i ); ok {
273
+ return a .values .IsNull (i )
274
+ }
275
+ return true
205
276
}
206
277
207
278
func (a * String ) IsValid (i int ) bool {
208
- return len (a .nullBitmapBytes ) == 0 || bitutil .BitIsSet (a .nullBitmapBytes , a .data .Offset ()+ i )
279
+ if i , ok := a .valuesIndex (i ); ok {
280
+ return a .values .IsValid (i )
281
+ }
282
+ return false
209
283
}
210
284
211
285
func (a * String ) ValueStr (i int ) string {
@@ -264,6 +338,10 @@ func (a *String) Release() {
264
338
a .indices .Release ()
265
339
a .indices = nil
266
340
}
341
+ if a .runEnds != nil {
342
+ a .runEnds .Release ()
343
+ a .runEnds = nil
344
+ }
267
345
if a .values != nil {
268
346
a .values .Release ()
269
347
a .values = nil
@@ -277,28 +355,14 @@ func (a *String) Release() {
277
355
func (a * String ) String () string {
278
356
var sb strings.Builder
279
357
sb .WriteByte ('[' )
280
- if a .indices != nil {
281
- for i := 0 ; i < a .Len (); i ++ {
282
- if i > 0 {
283
- sb .WriteByte (' ' )
284
- }
285
- if a .indices .IsValid (i ) {
286
- idx := int (a .indices .Value (i ))
287
- fmt .Fprintf (& sb , "%q" , a .values .ValueString (idx ))
288
- } else {
289
- sb .WriteString (array .NullValueStr )
290
- }
358
+ for i := 0 ; i < a .Len (); i ++ {
359
+ if i > 0 {
360
+ sb .WriteByte (' ' )
291
361
}
292
- } else {
293
- for i := 0 ; i < a .Len (); i ++ {
294
- if i > 0 {
295
- sb .WriteByte (' ' )
296
- }
297
- if a .values .IsValid (i ) {
298
- fmt .Fprintf (& sb , "%q" , a .values .ValueString (i ))
299
- } else {
300
- sb .WriteString (array .NullValueStr )
301
- }
362
+ if a .IsValid (i ) {
363
+ fmt .Fprintf (& sb , "%q" , a .Value (i ))
364
+ } else {
365
+ sb .WriteString (array .NullValueStr )
302
366
}
303
367
}
304
368
sb .WriteByte (']' )
@@ -309,20 +373,20 @@ func (a *String) String() string {
309
373
// is only valid for the lifetime of the array. Care should be taken not
310
374
// to store this string without also retaining the array.
311
375
func (a * String ) Value (i int ) string {
312
- if a .indices != nil {
313
- if a .indices .IsNull (i ) {
314
- // Flux relies on a NULL entry in the String array returning
315
- // the empty string.
316
- return ""
317
- }
318
- i = int (a .indices .Value (i ))
376
+ i , ok := a .valuesIndex (i )
377
+ if ! ok {
378
+ // Flux relies on a NULL entry in the String array returning
379
+ // the empty string.
380
+ return ""
319
381
}
320
382
return a .values .ValueString (i )
321
383
}
322
384
323
385
func (a * String ) ValueLen (i int ) int {
324
- if a .indices != nil {
325
- i = int (a .indices .Value (i ))
386
+ i , ok := a .valuesIndex (i )
387
+ if ! ok {
388
+ // Null values are zero length.
389
+ return 0
326
390
}
327
391
return a .values .ValueLen (i )
328
392
}
@@ -374,11 +438,14 @@ func MakeFromData(data arrow.ArrayData) Array {
374
438
return array .NewInt64Data (data )
375
439
case arrow .UINT64 :
376
440
return array .NewUint64Data (data )
377
- case arrow .STRING , arrow . DICTIONARY :
441
+ case arrow .STRING :
378
442
return NewStringData (data )
379
- default :
380
- panic (errors .Newf (codes .Internal , "invalid data type for flux array (%s)" , data .DataType ()))
443
+ case arrow .DICTIONARY , arrow .RUN_END_ENCODED :
444
+ if isStringDataType (data .DataType ()) {
445
+ return NewStringData (data )
446
+ }
381
447
}
448
+ panic (errors .Newf (codes .Internal , "invalid data type for flux array (%s)" , data .DataType ()))
382
449
}
383
450
384
451
func ToFloatConv (mem memory.Allocator , arr Array ) (* Float , error ) {
0 commit comments