# Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

## Capabilities

### Dataset Creation

Create datasets from various data sources.

```python { .api }
class Dataset:
    """A potentially large set of elements."""

    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.

        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation

        Returns: A Dataset
        """

    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.

        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation

        Returns: A Dataset
        """

    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.

        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator

        Returns: A Dataset
        """

    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.

        Parameters:
        - *args: follows the same semantics as Python's built-in range
        - **kwargs: optional keyword arguments

        Returns: A RangeDataset
        """

    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.

        Parameters:
        - datasets: A nested structure of datasets

        Returns: A Dataset
        """
```

### Dataset Transformation

Transform and manipulate dataset elements.
```python { .api }
def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.

    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Parameters:
    - predicate: A function mapping a dataset element to a boolean
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """
```

### Dataset Batching and Sampling

Operations for batching and sampling data.
```python { .api }
def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None, deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None, drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector tensor-like objects representing the shape to which the respective component of each input element should be padded prior to batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor, representing the padding values to use for the respective components
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.
    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample
    - seed: Optional tf.int64 scalar tf.Tensor, representing the random seed that will be used to create the distribution
    - reshuffle_each_iteration: If true, the dataset will be reshuffled each time it is iterated over
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of times the dataset should be repeated
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be taken to form the new dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be skipped to form the new dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """
```

### Performance Optimization

Operations for optimizing dataset performance.

```python { .api }
def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor, representing the name of a directory on the filesystem to use for caching elements in this Dataset
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.
    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the maximum number of elements that will be buffered when prefetching
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1, sloppy=False, buffer_output_elements=None, prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation.

    NOTE(review): this appears to be the tf.data.experimental.parallel_interleave API, which is deprecated in TensorFlow 2.x in favor of Dataset.interleave with the num_parallel_calls argument — confirm before relying on it.

    Parameters:
    - map_func: A function mapping a nested structure of tensors to a Dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this transformation is deterministic
    - buffer_output_elements: The number of elements each iterator being interleaved should buffer
    - prefetch_input_elements: The number of input elements to transform to iterators in parallel and keep buffered

    Returns: A Dataset transformation function
    """
```

### Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

```python { .api }
@property
def element_spec(self):
    """
    The type specification of an element of this dataset.

    Returns: A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns: A scalar tf.int64 Tensor representing the cardinality of the dataset
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor, representing the start value for enumeration
    - name: Optional name for the tf.data operation

    Returns: A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.
    Parameters:
    - dataset: Dataset to be concatenated

    Returns: A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the reduction
    - reduce_func: A function that maps (old_state, input_element) to new_state
    - name: Optional name for the tf.data operation

    Returns: A dataset element
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function that takes one Dataset argument and returns a Dataset

    Returns: The Dataset returned by applying transformation_func to this dataset
    """
```

## Usage Examples

```python
import tensorflow as tf
import numpy as np

# Create datasets from different sources

# From tensor slices
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors (single element)
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From generator
def gen():
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset
range_dataset = tf.data.Dataset.range(10)

# Dataset transformations

# Map transformation
squared_dataset = dataset.map(lambda x: x ** 2)

# Filter transformation
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Batch transformation
batched_dataset = range_dataset.batch(3)

# Shuffle and repeat
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# Complex pipeline example
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)
train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# Performance optimizations

# Cache dataset
cached_dataset = train_dataset.cache()

# Prefetch for performance
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map
parallel_mapped = range_dataset.map(
    lambda x: x * 2,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Text processing example
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# Split text into words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Iterate through dataset
for element in range_dataset.take(5):
    print(element.numpy())

# Convert dataset to list (for small datasets)
dataset_list = list(range_dataset.take(5).as_numpy_iterator())
```