<!-- mirror of https://github.com/codeflash-ai/codeflash-agent.git, synced 2026-05-04 18:25:19 +00:00 -->
# Data Processing

Dataset creation, transformation, and preprocessing pipeline operations for efficient data handling and training workflows. The tf.data API provides powerful tools for building scalable input pipelines.

## Capabilities

### Dataset Creation

Create datasets from various data sources.
```python { .api }
class Dataset:
    """A potentially large set of elements."""

    @staticmethod
    def from_tensor_slices(tensors, name=None):
        """
        Creates a Dataset whose elements are slices of the given tensors.

        Parameters:
        - tensors: A dataset element, whose components have the same first dimension
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_tensors(tensors, name=None):
        """
        Creates a Dataset with a single element, comprising the given tensors.

        Parameters:
        - tensors: A dataset element
        - name: Optional name for the tf.data operation

        Returns:
        A Dataset
        """

    @staticmethod
    def from_generator(generator, output_signature, args=None):
        """
        Creates a Dataset whose elements are generated by generator.

        Parameters:
        - generator: A callable object that returns an object that supports the iter() protocol
        - output_signature: A nested structure of tf.TypeSpec objects corresponding to each component of an element yielded by generator
        - args: A tf.Tensor object or a tuple of tf.Tensor objects to pass as arguments to generator

        Returns:
        A Dataset
        """

    @staticmethod
    def range(*args, **kwargs):
        """
        Creates a Dataset of a step-separated range of values.

        Parameters:
        - *args: follows the same semantics as python's xrange
        - **kwargs: optional keyword arguments

        Returns:
        A RangeDataset
        """

    @staticmethod
    def zip(datasets):
        """
        Creates a Dataset by zipping together the given datasets.

        Parameters:
        - datasets: A nested structure of datasets

        Returns:
        A Dataset
        """
```

### Dataset Transformation

Transform and manipulate dataset elements.

```python { .api }
def map(self, map_func, num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across the elements of this dataset.

    Parameters:
    - map_func: A function mapping a dataset element to another dataset element
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process asynchronously in parallel
    - deterministic: A boolean controlling whether the map is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def filter(self, predicate, name=None):
    """
    Filters this dataset according to predicate.

    Parameters:
    - predicate: A function mapping a dataset element to a boolean
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def flat_map(self, map_func, name=None):
    """
    Maps map_func across this dataset and flattens the result.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def interleave(self, map_func, cycle_length=None, block_length=None,
               num_parallel_calls=None, deterministic=None, name=None):
    """
    Maps map_func across this dataset, and interleaves the results.

    Parameters:
    - map_func: A function mapping a dataset element to a dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - num_parallel_calls: The number of parallel calls for map_func
    - deterministic: A boolean controlling whether the interleave is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```

### Dataset Batching and Sampling

Operations for batching and sampling data.

```python { .api }
def batch(self, batch_size, drop_remainder=False, num_parallel_calls=None,
          deterministic=None, name=None):
    """
    Combines consecutive elements of this dataset into batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - num_parallel_calls: A tf.int32 scalar tf.Tensor, representing the number of elements to process in parallel
    - deterministic: A boolean controlling whether the batch is allowed to return elements out of order
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def padded_batch(self, batch_size, padded_shapes=None, padding_values=None,
                 drop_remainder=False, name=None):
    """
    Combines consecutive elements of this dataset into padded batches.

    Parameters:
    - batch_size: A tf.int64 scalar tf.Tensor, representing the number of consecutive elements of this dataset to combine in a single batch
    - padded_shapes: A nested structure of tf.TensorShape or tf.int64 vector tensor-like objects representing the shape to which the respective component of each input element should be padded prior to batching
    - padding_values: A nested structure of scalar-shaped tf.Tensor, representing the padding values to use for the respective components
    - drop_remainder: A tf.bool scalar tf.Tensor, representing whether the last batch should be dropped in the case it has fewer than batch_size elements
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def unbatch(self, name=None):
    """
    Splits elements of a dataset into multiple elements on the batch dimension.

    Parameters:
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None, name=None):
    """
    Randomly shuffles the elements of this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the number of elements from this dataset from which the new dataset will sample
    - seed: Optional tf.int64 scalar tf.Tensor, representing the random seed that will be used to create the distribution
    - reshuffle_each_iteration: If true, the dataset will be reshuffled each time it is iterated over
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def repeat(self, count=None, name=None):
    """
    Repeats this dataset so each original value is seen count times.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of times the dataset should be repeated
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def take(self, count, name=None):
    """
    Creates a Dataset with at most count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be taken to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def skip(self, count, name=None):
    """
    Creates a Dataset that skips count elements from this dataset.

    Parameters:
    - count: A tf.int64 scalar tf.Tensor, representing the number of elements of this dataset that should be skipped to form the new dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """
```

### Performance Optimization

Operations for optimizing dataset performance.

```python { .api }
def cache(self, filename="", name=None):
    """
    Caches the elements in this dataset.

    Parameters:
    - filename: A tf.string scalar tf.Tensor, representing the name of a directory on the filesystem to use for caching elements in this Dataset
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def prefetch(self, buffer_size, name=None):
    """
    Creates a Dataset that prefetches elements from this dataset.

    Parameters:
    - buffer_size: A tf.int64 scalar tf.Tensor, representing the maximum number of elements that will be buffered when prefetching
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def parallel_interleave(map_func, cycle_length, block_length=1,
                        sloppy=False, buffer_output_elements=None,
                        prefetch_input_elements=None):
    """
    A parallel version of the Dataset.interleave() transformation.

    Parameters:
    - map_func: A function mapping a nested structure of tensors to a Dataset
    - cycle_length: The number of input elements that will be processed concurrently
    - block_length: The number of consecutive elements to produce from each input element before cycling to another input element
    - sloppy: If false, the relative order of records produced by this transformation is deterministic
    - buffer_output_elements: The number of elements each iterator being interleaved should buffer
    - prefetch_input_elements: The number of input elements to transform to iterators in parallel and keep buffered

    Returns:
    A Dataset transformation function
    """
```

### Dataset Properties and Utilities

Utility methods for inspecting and manipulating datasets.

```python { .api }
@property
def element_spec(self):
    """
    The type specification of an element of this dataset.

    Returns:
    A nested structure of tf.TypeSpec objects matching the structure of an element of this dataset
    """

def cardinality(self):
    """
    Returns the cardinality of the dataset, if known.

    Returns:
    A scalar tf.int64 Tensor representing the cardinality of the dataset
    """

def enumerate(self, start=0, name=None):
    """
    Enumerates the elements of this dataset.

    Parameters:
    - start: A tf.int64 scalar tf.Tensor, representing the start value for enumeration
    - name: Optional name for the tf.data operation

    Returns:
    A Dataset
    """

def concatenate(self, dataset):
    """
    Creates a Dataset by concatenating the given dataset with this dataset.

    Parameters:
    - dataset: Dataset to be concatenated

    Returns:
    A Dataset
    """

def reduce(self, initial_state, reduce_func, name=None):
    """
    Reduces the input dataset to a single element.

    Parameters:
    - initial_state: An element representing the initial state of the reduction
    - reduce_func: A function that maps (old_state, input_element) to new_state
    - name: Optional name for the tf.data operation

    Returns:
    A dataset element
    """

def apply(self, transformation_func):
    """
    Applies a transformation function to this dataset.

    Parameters:
    - transformation_func: A function that takes one Dataset argument and returns a Dataset

    Returns:
    The Dataset returned by applying transformation_func to this dataset
    """
```

## Usage Examples

```python
import tensorflow as tf
import numpy as np

# Create datasets from different sources
# From tensor slices
data = np.array([1, 2, 3, 4, 5])
dataset = tf.data.Dataset.from_tensor_slices(data)

# From tensors (single element)
single_element = tf.data.Dataset.from_tensors([1, 2, 3, 4, 5])

# From generator
def gen():
    for i in range(100):
        yield i

dataset_gen = tf.data.Dataset.from_generator(
    gen,
    output_signature=tf.TensorSpec(shape=(), dtype=tf.int32)
)

# Range dataset
range_dataset = tf.data.Dataset.range(10)

# Dataset transformations
# Map transformation
squared_dataset = dataset.map(lambda x: x ** 2)

# Filter transformation
even_dataset = range_dataset.filter(lambda x: x % 2 == 0)

# Batch transformation
batched_dataset = range_dataset.batch(3)

# Shuffle and repeat
shuffled_dataset = range_dataset.shuffle(buffer_size=10).repeat(2)

# Complex pipeline example
(train_images, train_labels) = np.random.random((1000, 28, 28, 1)), np.random.randint(0, 10, 1000)

train_dataset = tf.data.Dataset.from_tensor_slices((train_images, train_labels))
train_dataset = (train_dataset
                 .map(lambda x, y: (tf.cast(x, tf.float32) / 255.0, y))  # Normalize
                 .shuffle(buffer_size=100)
                 .batch(32)
                 .prefetch(tf.data.AUTOTUNE))

# Performance optimizations
# Cache dataset
cached_dataset = train_dataset.cache()

# Prefetch for performance
prefetched_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)

# Parallel map
parallel_mapped = range_dataset.map(
    lambda x: x * 2,
    num_parallel_calls=tf.data.AUTOTUNE
)

# Text processing example
text_data = ["hello world", "tensorflow data", "machine learning"]
text_dataset = tf.data.Dataset.from_tensor_slices(text_data)

# Split text into words
word_dataset = text_dataset.flat_map(
    lambda x: tf.data.Dataset.from_tensor_slices(tf.strings.split(x))
)

# Iterate through dataset
for element in range_dataset.take(5):
    print(element.numpy())

# Convert dataset to list (for small datasets)
dataset_list = list(range_dataset.take(5).as_numpy_iterator())
```