Skip to content

Commit 7d6df4f

Browse files
feat: Add Run-End Encoded array casting with overflow protection
Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences.
1 parent 5307851 commit 7d6df4f

File tree

2 files changed

+164
-25
lines changed

2 files changed

+164
-25
lines changed

arrow-cast/src/cast/mod.rs

Lines changed: 142 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -745,14 +745,17 @@ pub fn cast_with_options(
745745
| Map(_, _)
746746
| Dictionary(_, _),
747747
) => Ok(new_null_array(to_type, array.len())),
748-
(RunEndEncoded(index_type, _), _) => match index_type.data_type() {
749-
Int16 => run_end_encoded_cast::<Int16Type>(array, to_type, cast_options),
750-
Int32 => run_end_encoded_cast::<Int32Type>(array, to_type, cast_options),
751-
Int64 => run_end_encoded_cast::<Int64Type>(array, to_type, cast_options),
748+
(RunEndEncoded(index_type, _), _) => {
749+
let mut new_cast_options = cast_options.clone();
750+
new_cast_options.safe = false;
751+
match index_type.data_type() {
752+
Int16 => run_end_encoded_cast::<Int16Type>(array, to_type, &new_cast_options),
753+
Int32 => run_end_encoded_cast::<Int32Type>(array, to_type, &new_cast_options),
754+
Int64 => run_end_encoded_cast::<Int64Type>(array, to_type, &new_cast_options),
752755
_ => Err(ArrowError::CastError(format!(
753756
"Casting from run end encoded type {from_type:?} to {to_type:?} not supported",
754757
))),
755-
},
758+
}},
756759
(_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() {
757760
Int16 => {
758761
cast_to_run_end_encoded::<Int16Type>(array, value_type.data_type(), cast_options)
@@ -10726,16 +10729,14 @@ mod tests {
1072610729
let values = Int32Array::from(vec![1, 2, 3]);
1072710730
let run_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
1072810731
let array_ref = Arc::new(run_array) as ArrayRef;
10729-
println!("1");
1073010732
// Cast to Int64
1073110733
let cast_result = cast(&array_ref, &DataType::Int64).unwrap();
10732-
println!("2");
1073310734
// Verify the result is a plain Int64Array holding the expanded (run-decoded) values
10734-
let result_run_array = cast_result
10735-
.as_any()
10736-
.downcast_ref::<Int64Array>()
10737-
.unwrap();
10738-
assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]);
10735+
let result_run_array = cast_result.as_any().downcast_ref::<Int64Array>().unwrap();
10736+
assert_eq!(
10737+
result_run_array.values(),
10738+
&[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]
10739+
);
1073910740
}
1074010741

1074110742
/// Test casting FROM RunEndEncoded to string
@@ -10751,10 +10752,7 @@ mod tests {
1075110752
let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
1075210753

1075310754
// Verify the result is a plain StringArray holding the expanded (run-decoded) values
10754-
let result_array = cast_result
10755-
.as_any()
10756-
.downcast_ref::<StringArray>()
10757-
.unwrap();
10755+
let result_array = cast_result.as_any().downcast_ref::<StringArray>().unwrap();
1075810756
// Check that values are correct
1075910757
assert_eq!(result_array.value(0), "10");
1076010758
assert_eq!(result_array.value(1), "10");
@@ -10890,10 +10888,7 @@ mod tests {
1089010888
let cast_result = cast(&array_ref, &DataType::Utf8).unwrap();
1089110889

1089210890
// Verify the result preserves nulls
10893-
let result_run_array = cast_result
10894-
.as_any()
10895-
.downcast_ref::<StringArray>()
10896-
.unwrap();
10891+
let result_run_array = cast_result.as_any().downcast_ref::<StringArray>().unwrap();
1089710892
assert_eq!(result_run_array.value(0), "1");
1089810893
assert!(result_run_array.is_null(2));
1089910894
assert_eq!(result_run_array.value(4), "2");
@@ -10939,5 +10934,132 @@ mod tests {
1093910934
// Expect this to fail
1094010935
assert!(cast_result.is_err());
1094110936
}
10937+
#[test]
10938+
fn test_cast_run_end_encoded_int64_to_int16_should_fail() {
10939+
use arrow_array::{Int64Array, RunArray, StringArray};
10940+
use arrow_schema::{DataType, Field};
10941+
use std::sync::Arc;
10942+
10943+
// Construct a valid REE array with Int64 run-ends
10944+
let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16
10945+
let values = StringArray::from(vec!["a", "b", "c"]);
10946+
10947+
let ree_array = RunArray::<Int64Type>::try_new(&run_ends, &values).unwrap();
10948+
let array_ref = Arc::new(ree_array) as ArrayRef;
10949+
10950+
// Attempt to cast to RunEndEncoded<Int16, Utf8>
10951+
let target_type = DataType::RunEndEncoded(
10952+
Arc::new(Field::new("run_ends", DataType::Int16, false)),
10953+
Arc::new(Field::new("values", DataType::Utf8, true)),
10954+
);
10955+
let cast_options = CastOptions {
10956+
safe: false, // This should make it fail instead of returning nulls
10957+
format_options: FormatOptions::default(),
10958+
};
10959+
10960+
// This should fail due to run-end overflow
10961+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
10962+
cast_with_options(&array_ref, &target_type,&cast_options);
10963+
10964+
match result {
10965+
Err(e) => {
10966+
assert!(e.to_string().contains("Cast error: Can't cast value 100000 to type Int16"));
10967+
}
10968+
Ok(_array_ref) => {
10969+
panic!("This should not happen");
10970+
}
10971+
}
10972+
}
10973+
#[test]
10974+
fn test_cast_run_end_encoded_int16_to_int64_should_succeed() {
10975+
use arrow_array::{Int16Array, RunArray, StringArray};
10976+
use arrow_schema::{DataType, Field};
10977+
use std::sync::Arc;
10978+
10979+
// Construct a valid REE array with Int16 run-ends
10980+
let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16
10981+
let values = StringArray::from(vec!["a", "b", "c"]);
10982+
10983+
10984+
let ree_array = RunArray::<Int16Type>::try_new(&run_ends, &values).unwrap();
10985+
let array_ref = Arc::new(ree_array) as ArrayRef;
10986+
10987+
// Attempt to cast to RunEndEncoded<Int64, Utf8> (upcast should succeed)
10988+
let target_type = DataType::RunEndEncoded(
10989+
Arc::new(Field::new("run_ends", DataType::Int64, false)),
10990+
Arc::new(Field::new("values", DataType::Utf8, true)),
10991+
);
10992+
let cast_options = CastOptions {
10993+
safe: false,
10994+
format_options: FormatOptions::default(),
10995+
};
10996+
10997+
// This should succeed due to valid upcast
10998+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
10999+
cast_with_options(&array_ref, &target_type, &cast_options);
11000+
11001+
match result {
11002+
Ok(array_ref) => {
11003+
// Downcast to RunArray<Int64Type>
11004+
let run_array = array_ref
11005+
.as_any()
11006+
.downcast_ref::<RunArray<Int64Type>>()
11007+
.unwrap();
11008+
11009+
// Verify the cast worked correctly
11010+
// Assert the values were cast correctly
11011+
assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]);
11012+
assert_eq!(run_array.values().as_string::<i32>().value(0), "a");
11013+
assert_eq!(run_array.values().as_string::<i32>().value(1), "b");
11014+
assert_eq!(run_array.values().as_string::<i32>().value(2), "c");
11015+
}
11016+
Err(e) => {
11017+
panic!("Cast should have succeeded but failed: {}", e);
11018+
}
11019+
}
11020+
}
11021+
11022+
#[test]
11023+
fn test_cast_run_end_encoded_int32_to_int16_should_fail() {
11024+
use arrow_array::{Int32Array, RunArray, StringArray};
11025+
use arrow_schema::{DataType, Field};
11026+
use std::sync::Arc;
11027+
11028+
// Construct a valid REE array with Int32 run-ends
11029+
let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16
11030+
let values = StringArray::from(vec!["x", "y", "z"]);
11031+
11032+
println!("Original run_ends null count: {}", run_ends.null_count());
11033+
println!("Original run_ends values: {:?}", run_ends.values());
11034+
11035+
let ree_array = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();
11036+
let array_ref = Arc::new(ree_array) as ArrayRef;
11037+
11038+
// Attempt to cast to RunEndEncoded<Int16, Utf8> (downcast should fail)
11039+
let target_type = DataType::RunEndEncoded(
11040+
Arc::new(Field::new("run_ends", DataType::Int16, false)),
11041+
Arc::new(Field::new("values", DataType::Utf8, true)),
11042+
);
11043+
let cast_options = CastOptions {
11044+
safe: false,
11045+
format_options: FormatOptions::default(),
11046+
};
11047+
11048+
// This should fail due to run-end overflow
11049+
let result: Result<Arc<dyn Array + 'static>, ArrowError> =
11050+
cast_with_options(&array_ref, &target_type, &cast_options);
11051+
11052+
match result {
11053+
Ok(_) => {
11054+
panic!("Cast should have failed due to overflow but succeeded");
11055+
}
11056+
Err(e) => {
11057+
// Verify the error is about overflow/out of range
11058+
assert!(
11059+
e.to_string().contains("Can't cast value")
11060+
);
11061+
}
11062+
}
11063+
}
1094211064
}
1094311065
}

arrow-cast/src/cast/run_array.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
66
cast_options: &CastOptions,
77
) -> Result<ArrayRef, ArrowError> {
88
match array.data_type() {
9-
DataType::RunEndEncoded(_run_end_field, _values_field) => {
9+
DataType::RunEndEncoded(_, _) => {
1010
let run_array = array
1111
.as_any()
1212
.downcast_ref::<RunArray<K>>()
@@ -16,16 +16,33 @@ pub(crate) fn run_end_encoded_cast<K: RunEndIndexType>(
1616

1717
match to_type {
1818
// CASE 1: Stay as RunEndEncoded, casting both the run ends and the values to the target types
19-
DataType::RunEndEncoded(_target_run_end_field, target_value_field) => {
19+
DataType::RunEndEncoded(target_index_field, target_value_field) => {
2020
let cast_values =
2121
cast_with_options(values, target_value_field.data_type(), cast_options)?;
2222

2323
let run_ends_array = PrimitiveArray::<K>::from_iter_values(
2424
run_array.run_ends().values().iter().copied(),
2525
);
26-
27-
let new_run_array =
28-
RunArray::<K>::try_new(&run_ends_array, cast_values.as_ref())?;
26+
let cast_run_ends = cast_with_options(
27+
&run_ends_array,
28+
target_index_field.data_type(),
29+
cast_options,
30+
)?;
31+
let new_run_array: ArrayRef = match target_index_field.data_type() {
32+
DataType::Int16 => {
33+
let re = cast_run_ends.as_primitive::<Int16Type>();
34+
Arc::new(RunArray::<Int16Type>::try_new(re, cast_values.as_ref())?)
35+
}
36+
DataType::Int32 => {
37+
let re = cast_run_ends.as_primitive::<Int32Type>();
38+
Arc::new(RunArray::<Int32Type>::try_new(re, cast_values.as_ref())?)
39+
}
40+
DataType::Int64 => {
41+
let re = cast_run_ends.as_primitive::<Int64Type>();
42+
Arc::new(RunArray::<Int64Type>::try_new(re, cast_values.as_ref())?)
43+
}
44+
_ => unreachable!("Run-end type must be i16, i32, or i64"),
45+
};
2946
Ok(Arc::new(new_run_array))
3047
}
3148

0 commit comments

Comments
 (0)