Skip to content

Commit 2642ecd

Browse files
feat: Add Run-End Encoded array casting with overflow protection
Implement casting between Run-End Encoded (REE) arrays and other Arrow types. REE-to-REE casting permits only widening run-end conversions (Int16→Int32, Int16→Int64, Int32→Int64), so that run ends cannot overflow into an invalid sequence. (Rebased changes.)
1 parent 0452360 commit 2642ecd

File tree

2 files changed

+111
-26
lines changed

2 files changed

+111
-26
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 97 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ use crate::cast::dictionary::*;
4848
use crate::cast::list::*;
4949
use crate::cast::map::*;
5050
use crate::cast::run_array::{
51-
can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast,
51+
can_cast_to_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast,
5252
};
5353
use crate::cast::string::*;
5454

@@ -142,7 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
142142
}
143143
(Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
144144
(RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type),
145-
(_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type),
145+
(_, RunEndEncoded(_, _value_type)) => can_cast_to_run_end_encoded(from_type, to_type),
146146
(_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),
147147
(List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => {
148148
can_cast_types(list_from.data_type(), list_to.data_type())
@@ -10716,13 +10716,13 @@ mod tests {
1071610716
)) as ArrayRef;
1071710717
assert_eq!(*fixed_array, *r);
1071810718
}
10719+
1071910720
#[cfg(test)]
1072010721
mod run_end_encoded_tests {
1072110722
use super::*;
1072210723
use arrow_schema::{DataType, Field};
1072310724
use std::sync::Arc;
1072410725

10725-
/// Test casting FROM RunEndEncoded to primitive types
1072610726
#[test]
1072710727
fn test_run_end_encoded_to_primitive() {
1072810728
// Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3]
@@ -10740,10 +10740,8 @@ mod tests {
1074010740
);
1074110741
}
1074210742

10743-
/// Test casting FROM RunEndEncoded to string
1074410743
#[test]
1074510744
fn test_run_end_encoded_to_string() {
10746-
// Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30]
1074710745
let run_ends = Int32Array::from(vec![2, 3, 5]);
1074810746
let values = Int32Array::from(vec![10, 20, 30]);
1074910747
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
@@ -10760,7 +10758,6 @@ mod tests {
1076010758
assert_eq!(result_array.value(2), "20");
1076110759
}
1076210760

10763-
/// Test casting TO RunEndEncoded from primitive types
1076410761
#[test]
1076510762
fn test_primitive_to_run_end_encoded() {
1076610763
// Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3]
@@ -10788,7 +10785,94 @@ mod tests {
1078810785
assert_eq!(values_array.values(), &[1, 2, 3]);
1078910786
}
1079010787

10791-
/// Test casting TO RunEndEncoded from string
10788+
#[test]
10789+
fn test_primitive_to_run_end_encoded_with_nulls() {
10790+
let source_array = Int32Array::from(vec![
10791+
Some(1),
10792+
Some(1),
10793+
None,
10794+
None,
10795+
Some(2),
10796+
Some(2),
10797+
Some(3),
10798+
Some(3),
10799+
None,
10800+
None,
10801+
Some(4),
10802+
Some(4),
10803+
Some(5),
10804+
Some(5),
10805+
None,
10806+
None,
10807+
]);
10808+
let array_ref = Arc::new(source_array) as ArrayRef;
10809+
let target_type = DataType::RunEndEncoded(
10810+
Arc::new(Field::new("run_ends", DataType::Int32, false)),
10811+
Arc::new(Field::new("values", DataType::Int32, true)),
10812+
);
10813+
let cast_result = cast(&array_ref, &target_type).unwrap();
10814+
let result_run_array = cast_result
10815+
.as_any()
10816+
.downcast_ref::<RunArray<Int32Type>>()
10817+
.unwrap();
10818+
assert_eq!(
10819+
result_run_array.run_ends().values(),
10820+
&[2, 4, 6, 8, 10, 12, 14, 16]
10821+
);
10822+
assert_eq!(
10823+
result_run_array
10824+
.values()
10825+
.as_primitive::<Int32Type>()
10826+
.values(),
10827+
&[1, 0, 2, 3, 0, 4, 5, 0]
10828+
);
10829+
assert_eq!(result_run_array.values().null_count(), 3);
10830+
}
10831+
10832+
#[test]
10833+
fn test_primitive_to_run_end_encoded_with_nulls_consecutive() {
10834+
let source_array = Int64Array::from(vec![
10835+
Some(1),
10836+
Some(1),
10837+
None,
10838+
None,
10839+
None,
10840+
None,
10841+
None,
10842+
None,
10843+
None,
10844+
None,
10845+
Some(4),
10846+
Some(20),
10847+
Some(500),
10848+
Some(500),
10849+
None,
10850+
None,
10851+
]);
10852+
let array_ref = Arc::new(source_array) as ArrayRef;
10853+
let target_type = DataType::RunEndEncoded(
10854+
Arc::new(Field::new("run_ends", DataType::Int16, false)),
10855+
Arc::new(Field::new("values", DataType::Int64, true)),
10856+
);
10857+
let cast_result = cast(&array_ref, &target_type).unwrap();
10858+
let result_run_array = cast_result
10859+
.as_any()
10860+
.downcast_ref::<RunArray<Int16Type>>()
10861+
.unwrap();
10862+
assert_eq!(
10863+
result_run_array.run_ends().values(),
10864+
&[2, 10, 11, 12, 14, 16]
10865+
);
10866+
assert_eq!(
10867+
result_run_array
10868+
.values()
10869+
.as_primitive::<Int64Type>()
10870+
.values(),
10871+
&[1, 0, 4, 20, 500, 0]
10872+
);
10873+
assert_eq!(result_run_array.values().null_count(), 2);
10874+
}
10875+
1079210876
#[test]
1079310877
fn test_string_to_run_end_encoded() {
1079410878
// Create a String array with repeated values: ["a", "a", "b", "c", "c"]
@@ -10818,7 +10902,6 @@ mod tests {
1081810902
assert_eq!(values_array.value(2), "c");
1081910903
}
1082010904

10821-
/// Test casting with type conversion (Int32 -> RunEndEncoded<Int32, String>)
1082210905
#[test]
1082310906
fn test_cast_with_type_conversion() {
1082410907
// Create an Int32 array: [1, 1, 2, 2, 3]
@@ -10851,7 +10934,6 @@ mod tests {
1085110934
assert_eq!(values_array.value(2), "3");
1085210935
}
1085310936

10854-
/// Test casting empty array to RunEndEncoded
1085510937
#[test]
1085610938
fn test_empty_array_to_run_end_encoded() {
1085710939
// Create an empty Int32 array
@@ -10876,7 +10958,6 @@ mod tests {
1087610958
assert_eq!(result_run_array.values().len(), 0);
1087710959
}
1087810960

10879-
/// Test casting RunEndEncoded with nulls
1088010961
#[test]
1088110962
fn test_run_end_encoded_with_nulls() {
1088210963
// Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2]
@@ -10895,7 +10976,6 @@ mod tests {
1089510976
assert_eq!(result_run_array.value(4), "2");
1089610977
}
1089710978

10898-
/// Test different index types (Int16, Int64)
1089910979
#[test]
1090010980
fn test_different_index_types() {
1090110981
// Test with Int16 index type
@@ -10917,6 +10997,7 @@ mod tests {
1091710997
let cast_result = cast(&array_ref, &target_type).unwrap();
1091810998
assert_eq!(cast_result.data_type(), &target_type);
1091910999
}
11000+
1092011001
#[test]
1092111002
fn test_unsupported_cast_to_run_end_encoded() {
1092211003
// Create a Struct array - complex nested type that might not be supported
@@ -10935,8 +11016,10 @@ mod tests {
1093511016
// Expect this to fail
1093611017
assert!(cast_result.is_err());
1093711018
}
11019+
1093811020
#[test]
1093911021
fn test_cast_run_end_encoded_int64_to_int16_should_fail() {
11022+
// Casting RunEndEncoded<Int64, String> to RunEndEncoded<Int16, String> must fail (run-end downcast).
1094011023
use arrow_array::{Int64Array, RunArray, StringArray};
1094111024
use arrow_schema::{DataType, Field};
1094211025
use std::sync::Arc;
@@ -10973,8 +11056,10 @@ mod tests {
1097311056
}
1097411057
}
1097511058
}
11059+
1097611060
#[test]
1097711061
fn test_cast_run_end_encoded_int16_to_int64_should_succeed() {
11062+
// Casting RunEndEncoded<Int16, String> to RunEndEncoded<Int64, String> must succeed (run-end upcast).
1097811063
use arrow_array::{Int16Array, RunArray, StringArray};
1097911064
use arrow_schema::{DataType, Field};
1098011065
use std::sync::Arc;
@@ -11023,6 +11108,7 @@ mod tests {
1102311108

1102411109
#[test]
1102511110
fn test_cast_run_end_encoded_int32_to_int16_should_fail() {
11111+
// Casting RunEndEncoded<Int32, String> to RunEndEncoded<Int16, String> must fail (run-end downcast).
1102611112
use arrow_array::{Int32Array, RunArray, StringArray};
1102711113
use arrow_schema::{DataType, Field};
1102811114
use std::sync::Arc;
@@ -11031,9 +11117,6 @@ mod tests {
1103111117
let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16
1103211118
let values = StringArray::from(vec!["x", "y", "z"]);
1103311119

11034-
println!("Original run_ends null count: {}", run_ends.null_count());
11035-
println!("Original run_ends values: {:?}", run_ends.values());
11036-
1103711120
let ree_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1103811121
let array_ref = Arc::new(ree_array) as ArrayRef;
1103911122

arrow-cast/src/cast/run_array.rs

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use crate::cast::*;
2+
23
/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE
34
/// and REE-to-other type conversions with proper validation and error handling.
45
///
@@ -171,7 +172,7 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
171172
values_indices.push(0);
172173
// Step 3: Identify runs of consecutive equal values
173174
for i in 1..cast_array.len() {
174-
// For simplicity, we'll use a basic comparison approach
175+
// A simple comparison is sufficient here because the type was already validated in [`can_cast_to_run_end_encoded`]
175176
let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) {
176177
(true, true) => true, // Both null
177178
(false, false) => {
@@ -190,18 +191,14 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
190191
}
191192

192193
// Add the final run end
193-
run_ends_vec.push(cast_array.len() as usize);
194+
run_ends_vec.push(cast_array.len());
194195

195196
// Step 4: Build the run_ends array
196197
for run_end in run_ends_vec {
197-
run_ends_builder.append_value(match K::Native::from_usize(run_end) {
198-
Some(value) => value,
199-
None => {
200-
return Err(ArrowError::CastError(
201-
"Run end index out of range".to_string(),
202-
))
203-
}
204-
});
198+
run_ends_builder.append_value(
199+
K::Native::from_usize(run_end)
200+
.ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?,
201+
);
205202
}
206203
let run_ends_array = run_ends_builder.finish();
207204

@@ -216,8 +213,13 @@ pub(crate) fn cast_to_run_end_encoded<K: RunEndIndexType>(
216213
Ok(Arc::new(run_array))
217214
}
218215

219-
// There might be a cleaner way to handle this but for now this works
220-
pub(crate) fn can_cast_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool {
216+
/// Checks if a given data type can be cast to a RunEndEncoded array.
217+
///
218+
/// # Arguments
219+
/// * `from_type` - The source data type to be checked
220+
/// * `to_type` - The target data type to be checked
221+
///
222+
pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool {
221223
match to_type {
222224
DataType::RunEndEncoded(_, _) => {
223225
// Check if from_type supports equality (can be REE-encoded)

0 commit comments

Comments
 (0)