|
15 | 15 | // specific language governing permissions and limitations
|
16 | 16 | // under the License.
|
17 | 17 |
|
18 |
| -use std::sync::Arc; |
| 18 | +use std::sync::{Arc, Mutex}; |
19 | 19 |
|
20 | 20 | use arrow_schema::{DataType, Fields, SchemaBuilder};
|
21 | 21 |
|
22 | 22 | use crate::arrow::array_reader::byte_view_array::make_byte_view_array_reader;
|
| 23 | +use crate::arrow::array_reader::cached_array_reader::CacheRole; |
| 24 | +use crate::arrow::array_reader::cached_array_reader::CachedArrayReader; |
23 | 25 | use crate::arrow::array_reader::empty_array::make_empty_array_reader;
|
24 | 26 | use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
|
| 27 | +use crate::arrow::array_reader::row_group_cache::RowGroupCache; |
25 | 28 | use crate::arrow::array_reader::{
|
26 | 29 | make_byte_array_dictionary_reader, make_byte_array_reader, ArrayReader,
|
27 | 30 | FixedSizeListArrayReader, ListArrayReader, MapArrayReader, NullArrayReader,
|
28 | 31 | PrimitiveArrayReader, RowGroups, StructArrayReader,
|
29 | 32 | };
|
| 33 | +use crate::arrow::arrow_reader::metrics::ArrowReaderMetrics; |
30 | 34 | use crate::arrow::schema::{ParquetField, ParquetFieldType};
|
31 | 35 | use crate::arrow::ProjectionMask;
|
32 | 36 | use crate::basic::Type as PhysicalType;
|
33 | 37 | use crate::data_type::{BoolType, DoubleType, FloatType, Int32Type, Int64Type, Int96Type};
|
34 | 38 | use crate::errors::{ParquetError, Result};
|
35 | 39 | use crate::schema::types::{ColumnDescriptor, ColumnPath, Type};
|
36 | 40 |
|
| 41 | +/// Builder for [`CacheOptions`] |
| 42 | +#[derive(Debug, Clone)] |
| 43 | +pub struct CacheOptionsBuilder<'a> { |
| 44 | + /// Projection mask to apply to the cache |
| 45 | + pub projection_mask: &'a ProjectionMask, |
| 46 | + /// Cache to use for storing row groups |
| 47 | + pub cache: Arc<Mutex<RowGroupCache>>, |
| 48 | +} |
| 49 | + |
| 50 | +impl<'a> CacheOptionsBuilder<'a> { |
| 51 | + /// create a new cache options builder |
| 52 | + pub fn new(projection_mask: &'a ProjectionMask, cache: Arc<Mutex<RowGroupCache>>) -> Self { |
| 53 | + Self { |
| 54 | + projection_mask, |
| 55 | + cache, |
| 56 | + } |
| 57 | + } |
| 58 | + |
| 59 | + /// Return a new [`CacheOptions`] for producing (populating) the cache |
| 60 | + pub fn producer(self) -> CacheOptions<'a> { |
| 61 | + CacheOptions { |
| 62 | + projection_mask: self.projection_mask, |
| 63 | + cache: self.cache, |
| 64 | + role: CacheRole::Producer, |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + /// return a new [`CacheOptions`] for consuming (reading) the cache |
| 69 | + pub fn consumer(self) -> CacheOptions<'a> { |
| 70 | + CacheOptions { |
| 71 | + projection_mask: self.projection_mask, |
| 72 | + cache: self.cache, |
| 73 | + role: CacheRole::Consumer, |
| 74 | + } |
| 75 | + } |
| 76 | +} |
| 77 | + |
| 78 | +/// Cache options containing projection mask, cache, and role |
| 79 | +#[derive(Clone)] |
| 80 | +pub struct CacheOptions<'a> { |
| 81 | + pub projection_mask: &'a ProjectionMask, |
| 82 | + pub cache: Arc<Mutex<RowGroupCache>>, |
| 83 | + pub role: CacheRole, |
| 84 | +} |
| 85 | + |
37 | 86 | /// Builds [`ArrayReader`]s from parquet schema, projection mask, and RowGroups reader
|
38 | 87 | pub struct ArrayReaderBuilder<'a> {
|
| 88 | + /// Source of row group data |
39 | 89 | row_groups: &'a dyn RowGroups,
|
| 90 | + /// Optional cache options for the array reader |
| 91 | + cache_options: Option<&'a CacheOptions<'a>>, |
| 92 | + /// metrics |
| 93 | + metrics: &'a ArrowReaderMetrics, |
40 | 94 | }
|
41 | 95 |
|
42 | 96 | impl<'a> ArrayReaderBuilder<'a> {
|
43 |
| - pub fn new(row_groups: &'a dyn RowGroups) -> Self { |
44 |
| - Self { row_groups } |
| 97 | + pub fn new(row_groups: &'a dyn RowGroups, metrics: &'a ArrowReaderMetrics) -> Self { |
| 98 | + Self { |
| 99 | + row_groups, |
| 100 | + cache_options: None, |
| 101 | + metrics, |
| 102 | + } |
| 103 | + } |
| 104 | + |
| 105 | + /// Add cache options to the builder |
| 106 | + pub fn with_cache_options(mut self, cache_options: Option<&'a CacheOptions<'a>>) -> Self { |
| 107 | + self.cache_options = cache_options; |
| 108 | + self |
45 | 109 | }
|
46 | 110 |
|
47 | 111 | /// Create [`ArrayReader`] from parquet schema, projection mask, and parquet file reader.
|
@@ -69,7 +133,26 @@ impl<'a> ArrayReaderBuilder<'a> {
|
69 | 133 | mask: &ProjectionMask,
|
70 | 134 | ) -> Result<Option<Box<dyn ArrayReader>>> {
|
71 | 135 | match field.field_type {
|
72 |
| - ParquetFieldType::Primitive { .. } => self.build_primitive_reader(field, mask), |
| 136 | + ParquetFieldType::Primitive { col_idx, .. } => { |
| 137 | + let Some(reader) = self.build_primitive_reader(field, mask)? else { |
| 138 | + return Ok(None); |
| 139 | + }; |
| 140 | + let Some(cache_options) = self.cache_options.as_ref() else { |
| 141 | + return Ok(Some(reader)); |
| 142 | + }; |
| 143 | + |
| 144 | + if cache_options.projection_mask.leaf_included(col_idx) { |
| 145 | + Ok(Some(Box::new(CachedArrayReader::new( |
| 146 | + reader, |
| 147 | + Arc::clone(&cache_options.cache), |
| 148 | + col_idx, |
| 149 | + cache_options.role, |
| 150 | + self.metrics.clone(), // cheap clone |
| 151 | + )))) |
| 152 | + } else { |
| 153 | + Ok(Some(reader)) |
| 154 | + } |
| 155 | + } |
73 | 156 | ParquetFieldType::Group { .. } => match &field.arrow_type {
|
74 | 157 | DataType::Map(_, _) => self.build_map_reader(field, mask),
|
75 | 158 | DataType::Struct(_) => self.build_struct_reader(field, mask),
|
@@ -375,7 +458,8 @@ mod tests {
|
375 | 458 | )
|
376 | 459 | .unwrap();
|
377 | 460 |
|
378 |
| - let array_reader = ArrayReaderBuilder::new(&file_reader) |
| 461 | + let metrics = ArrowReaderMetrics::disabled(); |
| 462 | + let array_reader = ArrayReaderBuilder::new(&file_reader, &metrics) |
379 | 463 | .build_array_reader(fields.as_ref(), &mask)
|
380 | 464 | .unwrap();
|
381 | 465 |
|
|
0 commit comments