# Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

## Capabilities

### Dataset Creation

Create datasets from various data sources.

```python { .api }
class Dataset:
    """A potentially large set of elements."""

    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.

        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation

        Returns: A Dataset
        """

    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.

        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation

        Returns: A Dataset
        """

    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.

        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator

        Returns: A Dataset
        """

    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.

        Parameters:
        - *args: follows the same semantics as Python's built-in range
        - **kwargs: optional keyword arguments

        Returns: A RangeDataset
        """

    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.

        Parameters:
        - datasets: A nested structure of datasets

        Returns: A Dataset
        """
```

### Dataset Transformation

Transform and manipulate dataset elements.
```python { .api }
def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.

    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Parameters:
    - predicate: A function mapping a dataset element to a boolean
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """
```

### Dataset Batching and Sampling

Operations for batching and sampling data.
```python { .api }
def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None, deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None, drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector tensor-like objects representing the shape to which the respective component of each input element should be padded prior to batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor, representing the padding values to use for the respective components
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.
    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample
    - seed: Optional tf.int64 scalar tf.Tensor, representing the random seed that will be used to create the distribution
    - reshuffle_each_iteration: If true, the dataset will be reshuffled each time it is iterated over
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of times the dataset should be repeated
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be taken to form the new dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be skipped to form the new dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """
```

### Performance Optimization

Operations for optimizing dataset performance.

```python { .api }
def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor, representing the name of a directory on the filesystem to use for caching elements in this Dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.
    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the maximum number of elements that will be buffered when prefetching
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1, sloppy=False, buffer_output_elements=None, prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation.

    NOTE(review): this appears to be the tf.data.experimental.parallel_interleave API, which is deprecated in TensorFlow 2.x in favor of Dataset.interleave with the num_parallel_calls argument — confirm before relying on it.

    Parameters:
    - map_func: A function mapping a nested structure of tensors to a Dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this transformation is deterministic
    - buffer_output_elements: The number of elements each iterator being interleaved should buffer
    - prefetch_input_elements: The number of input elements to transform to iterators in parallel and keep buffered

    Returns: A Dataset transformation function
    """
```

### Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

```python { .api }
@property
def element_spec(self):
    """
    The type specification of an element of this dataset.

    Returns: A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns: A scalar tf.int64 Tensor representing the cardinality of the dataset
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor, representing the start value for enumeration
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.
    Parameters:
    - dataset: Dataset to be concatenated

    Returns: A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the reduction
    - reduce_func: A function that maps (old_state, input_element) to new_state
    - name: Optional name for the tf.data operation

    Returns: A dataset element
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function that takes one Dataset argument and returns a Dataset

    Returns: The Dataset returned by applying transformation_func to this dataset
    """
```

## Usage Examples

```python
import tensorflow as tf
import numpy as np

# Create datasets from different sources

# From tensor slices
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors (single element)
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From generator
def gen():
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset
range_dataset = tf.data.Dataset.range(10)

# Dataset transformations

# Map transformation
squared_dataset = dataset.map(lambda x: x ** 2)

# Filter transformation
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Batch transformation
batched_dataset = range_dataset.batch(3)

# Shuffle and repeat
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# Complex pipeline example
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# Performance optimizations

# Cache dataset
cached_dataset = train_dataset.cache()

# Prefetch for performance
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map
parallel_mapped = range_dataset.map(
    lambda x: x * 2,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Text processing example
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# Split text into words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Iterate through dataset
for element in range_dataset.take(5):
    print(element.numpy())

# Convert dataset to list (for small datasets)
dataset_list = list(range_dataset.take(5).as_numpy_iterator())
```