Allow tensors in `tf.data.Dataset`s to have different dimensions across batches.

hertschuh · hertschuh · commit 7da5b95fbf71 · 2024-03-15T12:16:38.000-07:00
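The shape of the `tf.TensorSpec` used to build the `tf.data.Dataset` is now determined by inspecting the first few batches (up to `NUM_SAMPLES_FOR_TENSOR_SPEC`, currently 4) and keeping only the dimensions whose size is the same in all of them; any dimension that differs between batches is set to `None`. Fixes #19124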
diff --git a/keras/trainers/data_adapters/data_adapter_utils.py b/keras/trainers/data_adapters/data_adapter_utils.py
@@ -4,6 +4,8 @@
 from keras.api_export import keras_export
 from keras.utils import tree
 
+NUM_SAMPLES_FOR_TENSOR_SPEC = 4
+
 
 @keras_export("keras.utils.unpack_x_y_sample_weight")
 def unpack_x_y_sample_weight(data):
@@ -125,6 +127,54 @@ def class_weight_to_sample_weights(y, class_weight):
     return sample_weight
 
 
+def get_tensor_spec(batches):
+    """Return the common tensor spec for a list of batches.
+
+    Args:
+        batches: list of structures of tensors. The structures must be
+            identical, but the shape at each leaf may be different.
+    Returns: the common tensor spec for all the batches.
+    """
+    from keras.utils.module_utils import tensorflow as tf
+
+    def get_single_tensor_spec(*tensors):
+        x = tensors[0]
+        rank = len(x.shape)
+        if rank < 1:
+            raise ValueError(
+                "When passing a dataset to a Keras model, the arrays must "
+                f"be at least rank 1. Received: {x} of rank {len(x.shape)}."
+            )
+        for t in tensors:
+            if len(t.shape) != rank:
+                raise ValueError(
+                    "When passing a dataset to a Keras model, the "
+                    "corresponding arrays in each batch must have the same "
+                    f"rank. Received: {x} and {t}"
+                )
+        shape = []
+        # Merge shapes: go through each dimension one by one and keep the
+        # common values
+        for dims in zip(*[list(x.shape) for x in tensors]):
+            dims_set = set(dims)
+            shape.append(dims_set.pop() if len(dims_set) == 1 else None)
+        shape[0] = None  # batch size may not be static
+
+        dtype = backend.standardize_dtype(x.dtype)
+        if isinstance(x, tf.RaggedTensor):
+            return tf.RaggedTensorSpec(shape=shape, dtype=dtype)
+        if (
+            isinstance(x, tf.SparseTensor)
+            or is_scipy_sparse(x)
+            or is_jax_sparse(x)
+        ):
+            return tf.SparseTensorSpec(shape=shape, dtype=dtype)
+        else:
+            return tf.TensorSpec(shape=shape, dtype=dtype)
+
+    return tree.map_structure(get_single_tensor_spec, *batches)
+
+
 def get_jax_iterator(iterable):
     from keras.backend.jax.core import convert_to_tensor
 
diff --git a/keras/trainers/data_adapters/generator_data_adapter.py b/keras/trainers/data_adapters/generator_data_adapter.py
@@ -1,6 +1,5 @@
 import itertools
 
-from keras import backend
 from keras.trainers.data_adapters import data_adapter_utils
 from keras.trainers.data_adapters.data_adapter import DataAdapter
 from keras.utils import tree
@@ -10,49 +9,19 @@ class GeneratorDataAdapter(DataAdapter):
     """Adapter for Python generators."""
 
     def __init__(self, generator):
-        first_batch, generator = peek_and_restore(generator)
+        first_batches, generator = peek_and_restore(generator)
         self.generator = generator
-        self._first_batch = first_batch
+        self._first_batches = first_batches
         self._output_signature = None
-        if not isinstance(first_batch, tuple):
+        if not isinstance(first_batches[0], tuple):
             raise ValueError(
                 "When passing a Python generator to a Keras model, "
                 "the generator must return a tuple, either "
                 "(input,) or (inputs, targets) or "
                 "(inputs, targets, sample_weights). "
-                f"Received: {first_batch}"
+                f"Received: {first_batches[0]}"
             )
 
-    def _set_tf_output_signature(self):
-        from keras.utils.module_utils import tensorflow as tf
-
-        def get_tensor_spec(x):
-            shape = x.shape
-            if len(shape) < 1:
-                raise ValueError(
-                    "When passing a Python generator to a Keras model, "
-                    "the arrays returned by the generator "
-                    "must be at least rank 1. Received: "
-                    f"{x} of rank {len(x.shape)}"
-                )
-            shape = list(shape)
-            shape[0] = None  # The batch size is not guaranteed to be static.
-            dtype = backend.standardize_dtype(x.dtype)
-            if isinstance(x, tf.RaggedTensor):
-                return tf.RaggedTensorSpec(shape=shape, dtype=dtype)
-            if (
-                isinstance(x, tf.SparseTensor)
-                or data_adapter_utils.is_scipy_sparse(x)
-                or data_adapter_utils.is_jax_sparse(x)
-            ):
-                return tf.SparseTensorSpec(shape=shape, dtype=dtype)
-            else:
-                return tf.TensorSpec(shape=shape, dtype=dtype)
-
-        self._output_signature = tree.map_structure(
-            get_tensor_spec, self._first_batch
-        )
-
     def get_numpy_iterator(self):
         return data_adapter_utils.get_numpy_iterator(self.generator)
 
@@ -85,7 +54,9 @@ def get_tf_iterator():
                 yield batch
 
         if self._output_signature is None:
-            self._set_tf_output_signature()
+            self._output_signature = data_adapter_utils.get_tensor_spec(
+                self._first_batches
+            )
         ds = tf.data.Dataset.from_generator(
             get_tf_iterator,
             output_signature=self._output_signature,
@@ -106,5 +77,9 @@ def batch_size(self):
 
 
 def peek_and_restore(generator):
-    element = next(generator)
-    return element, itertools.chain([element], generator)
+    batches = list(
+        itertools.islice(
+            generator, data_adapter_utils.NUM_SAMPLES_FOR_TENSOR_SPEC
+        )
+    )
+    return batches, itertools.chain(batches, generator)
diff --git a/keras/trainers/data_adapters/py_dataset_adapter.py b/keras/trainers/data_adapters/py_dataset_adapter.py
@@ -9,11 +9,9 @@
 
 import numpy as np
 
-from keras import backend
 from keras.api_export import keras_export
 from keras.trainers.data_adapters import data_adapter_utils
 from keras.trainers.data_adapters.data_adapter import DataAdapter
-from keras.utils import tree
 
 
 @keras_export(["keras.utils.PyDataset", "keras.utils.Sequence"])
@@ -188,28 +186,6 @@ def __init__(
         self.shuffle = shuffle
         self._output_signature = None
 
-    def _set_tf_output_signature(self):
-        from keras.utils.module_utils import tensorflow as tf
-
-        def get_tensor_spec(x):
-            shape = x.shape
-            if len(shape) < 1:
-                raise ValueError(
-                    "The arrays returned by PyDataset.__getitem__() "
-                    "must be at least rank 1. Received: "
-                    f"{x} of rank {len(x.shape)}"
-                )
-            shape = list(shape)
-            shape[0] = None  # The batch size is not guaranteed to be static.
-            dtype = backend.standardize_dtype(x.dtype)
-            return tf.TensorSpec(shape=shape, dtype=dtype)
-
-        # Grab the first example
-        batch = self.py_dataset[0]
-        # Run checks on it and format it
-        batch = self._standardize_batch(batch)
-        self._output_signature = tree.map_structure(get_tensor_spec, batch)
-
     def _standardize_batch(self, batch):
         if isinstance(batch, dict):
             return batch
@@ -287,7 +263,15 @@ def get_tf_dataset(self):
         from keras.utils.module_utils import tensorflow as tf
 
         if self._output_signature is None:
-            self._set_tf_output_signature()
+            num_samples = min(
+                data_adapter_utils.NUM_SAMPLES_FOR_TENSOR_SPEC,
+                len(self.py_dataset),
+            )
+            batches = [
+                self._standardize_batch(self.py_dataset[i])
+                for i in range(num_samples)
+            ]
+            self._output_signature = data_adapter_utils.get_tensor_spec(batches)
 
         ds = tf.data.Dataset.from_generator(
             self._get_iterator,
diff --git a/keras/trainers/data_adapters/torch_data_loader_adapter.py b/keras/trainers/data_adapters/torch_data_loader_adapter.py
@@ -1,6 +1,7 @@
+import itertools
+
 import numpy as np
 
-from keras import backend
 from keras.trainers.data_adapters import data_adapter_utils
 from keras.trainers.data_adapters.data_adapter import DataAdapter
 from keras.utils import tree
@@ -19,6 +20,7 @@ def __init__(self, dataloader):
             )
 
         self._dataloader = dataloader
+        self._output_signature = None
         self._batch_size = dataloader.batch_size
         self._num_batches = None
         self._partial_batch_size = None
@@ -44,36 +46,24 @@ def get_jax_iterator(self):
     def get_tf_dataset(self):
         from keras.utils.module_utils import tensorflow as tf
 
-        output_signature = self.peek_and_get_tensor_spec()
+        if self._output_signature is None:
+            batches = list(
+                itertools.islice(
+                    self._dataloader,
+                    data_adapter_utils.NUM_SAMPLES_FOR_TENSOR_SPEC,
+                )
+            )
+            self._output_signature = tuple(
+                data_adapter_utils.get_tensor_spec(batches)
+            )
         return tf.data.Dataset.from_generator(
             self.get_numpy_iterator,
-            output_signature=output_signature,
+            output_signature=self._output_signature,
         )
 
     def get_torch_dataloader(self):
         return self._dataloader
 
-    def peek_and_get_tensor_spec(self):
-        from keras.utils.module_utils import tensorflow as tf
-
-        batch_data = next(iter(self._dataloader))
-
-        def get_tensor_spec(x):
-            shape = x.shape
-            if len(shape) < 1:
-                raise ValueError(
-                    "When passing a Pytorch DataLoader to a Keras model, "
-                    "the arrays returned by the generator "
-                    "must be at least rank 1. Received: "
-                    f"{x} of rank {len(x.shape)}"
-                )
-            shape = list(shape)
-            shape[0] = None  # The batch size is not guaranteed to be static.
-            dtype = backend.standardize_dtype(x.dtype)
-            return tf.TensorSpec(shape=shape, dtype=dtype)
-
-        return tuple(tree.map_structure(get_tensor_spec, batch_data))
-
     @property
     def num_batches(self):
         return self._num_batches