# Core Data Structures

Fundamental data containers that form the foundation of PyArrow's columnar data processing capabilities. These structures provide efficient storage and manipulation of typed data in memory-optimized columnar layouts.

## Capabilities

### Arrays

One-dimensional sequences of values with a specific data type. Arrays are immutable and provide the basic building blocks for all other data structures in PyArrow.

```python { .api }
def array(obj, type=None, mask=None, size=None, from_pandas=None, safe=True):
    """
    Create Arrow array from Python sequence, NumPy array, or pandas data.

    Parameters:
    - obj: sequence, NumPy array, or pandas Series to convert
    - type: DataType, explicit type for the array
    - mask: array-like, boolean mask for null values
    - size: int, length of array if obj is scalar
    - from_pandas: bool, interpret pandas-specific data
    - safe: bool, check for overflow/truncation during conversion

    Returns:
    Array: Arrow array with specified type
    """

def chunked_array(arrays, type=None):
    """
    Create chunked array from list of arrays.

    Parameters:
    - arrays: sequence of Array objects
    - type: DataType, explicit type (must match all arrays)

    Returns:
    ChunkedArray: Chunked array composed of input arrays
    """

def nulls(size, type=None):
    """
    Create array of null values.

    Parameters:
    - size: int, length of array
    - type: DataType, type of nulls (default: null type)

    Returns:
    Array: Array of null values
    """

def repeat(value, size):
    """
    Create array by repeating a single value.

    Parameters:
    - value: scalar value to repeat
    - size: int, number of repetitions

    Returns:
    Array: Array with repeated value
    """

def arange(start, stop=None, step=1, dtype=None):
    """
    Create array with range of values.

    Parameters:
    - start: int, start value (or stop if stop is None)
    - stop: int, stop value (exclusive)
    - step: int, step size
    - dtype: DataType, array data type

    Returns:
    Array: Array with range values
    """

class Array:
    """
    Base class for all Arrow arrays.
    Attributes:
    - type: DataType of the array
    - length: Number of elements
    - null_count: Number of null values
    - is_valid: Boolean array indicating non-null values
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    def to_pylist(self): """Convert to Python list."""
    def to_pandas(self, **kwargs): """Convert to pandas Series."""
    def to_numpy(self, **kwargs): """Convert to NumPy array."""
    def slice(self, offset=0, length=None): """Return slice of array."""
    def take(self, indices): """Select elements by indices."""
    def filter(self, mask): """Filter array by boolean mask."""
    def sort(self, **kwargs): """Return sorted array."""
    def unique(self): """Return array of unique values."""
    def value_counts(self): """Return struct array of value counts."""

class ChunkedArray:
    """
    Array composed of multiple contiguous arrays (chunks).

    Attributes:
    - type: DataType of the chunked array
    - length: Total number of elements across chunks
    - null_count: Total number of null values
    - num_chunks: Number of chunks
    - chunks: List of Array chunks
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    def chunk(self, i): """Get chunk at index i."""
    def to_pylist(self): """Convert to Python list."""
    def to_pandas(self, **kwargs): """Convert to pandas Series."""
    def slice(self, offset=0, length=None): """Return slice of chunked array."""
    def take(self, indices): """Select elements by indices."""
    def filter(self, mask): """Filter by boolean mask."""
    def combine_chunks(self): """Combine chunks into single array."""
```

### Tables

Two-dimensional datasets with named columns, similar to SQL tables or pandas DataFrames. Tables provide the primary interface for working with tabular data in PyArrow.

```python { .api }
def table(data, names=None, schema=None, metadata=None):
    """
    Create Arrow table from various data sources.
    Parameters:
    - data: dict, list of arrays, pandas DataFrame, or RecordBatch
    - names: list of str, column names (when data is a list of arrays)
    - schema: Schema, explicit schema for the table
    - metadata: dict, key-value metadata

    Returns:
    Table: Arrow table with specified schema
    """

def record_batch(data, schema=None, metadata=None):
    """
    Create RecordBatch from data.

    Parameters:
    - data: dict, list of arrays, or sequence
    - schema: Schema, explicit schema
    - metadata: dict, key-value metadata

    Returns:
    RecordBatch: Single batch of columnar data
    """

def concat_tables(tables, promote=False):
    """
    Concatenate tables vertically.

    Parameters:
    - tables: sequence of Table objects
    - promote: bool, promote schemas to compatible type

    Returns:
    Table: Concatenated table
    """

def concat_arrays(arrays):
    """
    Concatenate arrays into single array.

    Parameters:
    - arrays: sequence of Array objects with same type

    Returns:
    Array: Concatenated array
    """

def concat_batches(batches):
    """
    Concatenate record batches sharing the same schema.

    Parameters:
    - batches: sequence of RecordBatch objects

    Returns:
    RecordBatch: Single batch created from the concatenated batches
    """

class Table:
    """
    Two-dimensional table of columnar data.

    Attributes:
    - schema: Schema of the table
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of ChunkedArray columns
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
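# --- Usage sketch (illustrative, not part of the API listing above) ---
# Build a Table from a dict of columns, then exercise filter() and
# to_pydict(); assumes only that pyarrow is importable.
import pyarrow as pa

t = pa.table({"x": [1, 2, 3], "y": ["a", "b", "c"]})
mask = pa.array([True, False, True])
filtered = t.filter(mask)
# filtered.to_pydict() -> {'x': [1, 3], 'y': ['a', 'c']}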
    def column(self, i): """Get column by index or name."""
    def select(self, columns): """Select subset of columns."""
    def slice(self, offset=0, length=None): """Return slice of table."""
    def filter(self, mask): """Filter rows by boolean mask."""
    def take(self, indices): """Select rows by indices."""
    def sort_by(self, sorting): """Sort table by columns."""
    def group_by(self, keys): """Group table by columns."""
    def join(self, right_table, **kwargs): """Join with another table."""
    def to_pandas(self, **kwargs): """Convert to pandas DataFrame."""
    def to_pydict(self): """Convert to dictionary of Python lists."""
    def to_batches(self, max_chunksize=None): """Convert to iterator of RecordBatch objects."""
    def add_column(self, i, field, column): """Add column at position i."""
    def append_column(self, field, column): """Append column to table."""
    def remove_column(self, i): """Remove column at position i."""
    def rename_columns(self, names): """Rename columns."""
    def drop(self, columns): """Drop columns by name."""
    def replace_schema_metadata(self, metadata): """Replace table metadata."""

class RecordBatch:
    """
    Collection of arrays with shared length representing a single batch.

    Attributes:
    - schema: Schema of the batch
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of Array columns
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
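# --- Usage sketch (illustrative, not part of the API listing above) ---
# A RecordBatch holds one contiguous batch of columns; batches with the
# same schema can be stitched into a Table. Assumes pyarrow is importable.
import pyarrow as pa

batch = pa.record_batch({"a": pa.array([1, 2]), "b": pa.array(["x", "y"])})
table = pa.Table.from_batches([batch, batch])
# table.num_rows -> 4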
    def column(self, i): """Get column by index or name."""
    def select(self, columns): """Select subset of columns."""
    def slice(self, offset=0, length=None): """Return slice of batch."""
    def filter(self, mask): """Filter rows by boolean mask."""
    def take(self, indices): """Select rows by indices."""
    def to_pandas(self, **kwargs): """Convert to pandas DataFrame."""
    def to_pydict(self): """Convert to dictionary of Python lists."""
    def add_column(self, i, field, column): """Add column at position i."""
    def remove_column(self, i): """Remove column at position i."""
    def rename_columns(self, names): """Rename columns."""

class RecordBatchReader:
    """
    Interface for reading stream of record batches.
    """
    def __iter__(self): ...
    def read_next_batch(self): """Read next batch from stream."""
    def read_all(self): """Read all batches into table."""
    def schema(self): """Get schema of batches."""

class TableGroupBy:
    """
    Grouped table operations.
    """
    def aggregate(self, aggregations): """Perform aggregations on groups."""
```

### Schemas and Fields

Schema definitions that describe table structure, column types, and metadata. Schemas provide type safety and enable efficient data processing by defining the expected structure of tabular data.

```python { .api }
def schema(fields, metadata=None):
    """
    Create schema from list of fields.

    Parameters:
    - fields: sequence of Field objects or (name, type) tuples
    - metadata: dict, key-value metadata for schema

    Returns:
    Schema: Schema object with specified fields
    """

def field(name, type, nullable=True, metadata=None):
    """
    Create field with name and type.

    Parameters:
    - name: str, field name
    - type: DataType, field data type
    - nullable: bool, whether field can contain nulls
    - metadata: dict, key-value metadata for field

    Returns:
    Field: Field object with specified properties
    """

def unify_schemas(schemas):
    """
    Unify multiple schemas into compatible schema.
    Parameters:
    - schemas: sequence of Schema objects

    Returns:
    Schema: Unified schema compatible with all input schemas
    """

class Schema:
    """
    Schema defining structure of tabular data.

    Attributes:
    - names: List of field names
    - types: List of field types
    - metadata: Key-value metadata
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    def field(self, i): """Get field by index or name."""
    def get_field_index(self, name): """Get index of field by name."""
    def select(self, names): """Select subset of fields."""
    def insert(self, i, field): """Insert field at position i."""
    def append(self, field): """Append field to schema."""
    def remove(self, i): """Remove field at position i."""
    def with_metadata(self, metadata): """Return schema with new metadata."""
    def equals(self, other, check_metadata=True): """Check equality with another schema."""
    def to_string(self, **kwargs): """String representation of schema."""

class Field:
    """
    Named field in a schema with type and metadata.

    Attributes:
    - name: Field name
    - type: DataType of field
    - nullable: Whether field can contain nulls
    - metadata: Key-value metadata
    """
    def with_name(self, name): """Return field with new name."""
    def with_type(self, type): """Return field with new type."""
    def with_nullable(self, nullable): """Return field with new nullable setting."""
    def with_metadata(self, metadata): """Return field with new metadata."""
    def equals(self, other, check_metadata=True): """Check equality with another field."""
    def to_string(self, **kwargs): """String representation of field."""

class KeyValueMetadata:
    """
    Key-value metadata container.
    """
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
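# --- Usage sketch (illustrative, not part of the API listing above) ---
# Declare a schema from Field objects and read its properties back;
# assumes only that pyarrow is importable.
import pyarrow as pa

sch = pa.schema([
    pa.field("id", pa.int64(), nullable=False),
    pa.field("name", pa.string()),
])
# sch.names -> ['id', 'name']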
    def get(self, key, default=None): """Get value by key."""
    def keys(self): """Get all keys."""
    def values(self): """Get all values."""
    def items(self): """Get key-value pairs."""
    def to_dict(self): """Convert to Python dictionary."""
```

### Scalars

Single typed values that provide a consistent interface for working with individual data elements. Scalars maintain type information and null state, enabling type-safe operations on individual values.

```python { .api }
def scalar(value, type=None):
    """
    Create scalar from Python value.

    Parameters:
    - value: Python value to wrap
    - type: DataType, explicit type for scalar

    Returns:
    Scalar: Typed scalar value
    """

# Scalar constants
NA = ...    # Not Available scalar
NULL = ...  # Null scalar

class Scalar:
    """
    Base class for typed scalar values.

    Attributes:
    - type: DataType of scalar
    - is_valid: Whether scalar is non-null
    """
    def __eq__(self, other): ...
    def __hash__(self): ...
    def as_py(self): """Convert to Python value."""
    def cast(self, target_type, safe=True): """Cast to different type."""
    def equals(self, other): """Check equality with another scalar."""

# Specific scalar types are available for all Arrow data types:
# NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
# UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar,
# FloatScalar, DoubleScalar, Decimal128Scalar, StringScalar, BinaryScalar,
# Date32Scalar, Date64Scalar, TimestampScalar, Time32Scalar, Time64Scalar,
# DurationScalar, ListScalar, StructScalar, MapScalar, DictionaryScalar, etc.
```

### Tensors and Sparse Data

Multi-dimensional arrays and sparse data structures for advanced numerical computing and machine learning applications.

```python { .api }
class Tensor:
    """
    Multi-dimensional array with Arrow data.

    Attributes:
    - type: DataType of tensor elements
    - shape: Shape tuple of tensor dimensions
    - strides: Strides tuple for memory layout
    - is_mutable: Whether tensor data is mutable
    """
    def __getitem__(self, key): ...
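# --- Usage sketch (illustrative, not part of the API listing above) ---
# Round-trip between a NumPy ndarray and an Arrow Tensor; assumes numpy
# and pyarrow are importable.
import numpy as np
import pyarrow as pa

arr = np.arange(6, dtype=np.int64).reshape(2, 3)
tensor = pa.Tensor.from_numpy(arr)
# tuple(tensor.shape) -> (2, 3)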
    def to_numpy(self): """Convert to NumPy array."""
    def equals(self, other): """Check equality with another tensor."""

class SparseCOOTensor:
    """Sparse tensor in COOrdinate format."""

class SparseCSRMatrix:
    """Sparse matrix in Compressed Sparse Row format."""

class SparseCSCMatrix:
    """Sparse matrix in Compressed Sparse Column format."""

class SparseCSFTensor:
    """Sparse tensor in Compressed Sparse Fiber format."""
```

## Type Definitions

### Memory Management

```python { .api }
class DictionaryMemo:
    """
    Memo for dictionary encoding to ensure consistent dictionaries.
    """
    def __init__(self): ...
    def get_dictionary(self, type): """Get dictionary for type."""
    def set_dictionary(self, type, dictionary): """Set dictionary for type.