Export Handler

Handles exporting of dataset sequences and batches to JSONL or CSV files, with optional detokenization and document details.

Source code in tokensmith/export/handler.py
def __init__(self, manager: 'DatasetManager'):
    self.manager = manager

manager instance-attribute

manager = manager

The DatasetManager instance this handler uses to retrieve samples and batches.
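
For the examples on this page, assume a setup along these lines. This is a minimal sketch: the ExportHandler class name and import paths are inferred from this page, the DatasetManager construction is not shown here, and the GPT-2 tokenizer is just one choice.

from transformers import AutoTokenizer

from tokensmith.export.handler import ExportHandler  # class name assumed from this page
from tokensmith.manager import DatasetManager        # hypothetical import path

# Configure the manager for an existing dataset; constructor arguments are omitted here.
manager = DatasetManager(...)

# A Hugging Face tokenizer is one option for detokenization.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

exporter = ExportHandler(manager)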

export_batches

export_batches(batch_ids, batch_size, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, flatten_batches=False)

Export specific batches to a file.

Parameters:

    batch_ids (List[int], required): List of batch IDs to export.
    batch_size (int, required): The size of each batch.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    flatten_batches (bool, default False): If True, flattens all batches into a single list of samples.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_batches(
    self,
    batch_ids: List[int],
    batch_size: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    flatten_batches: bool = False
) -> None:
    """
    Export specific batches to a file.

    Parameters:
        batch_ids (List[int]): List of batch IDs to export.
        batch_size (int): The size of each batch.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        flatten_batches (bool): If True, flattens all batches into a single list of samples.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get batches using the sample handler
    batches = self.manager.sample.get_batches_by_ids(
        batch_ids=batch_ids,
        batch_size=batch_size,
        return_doc_details=include_doc_details,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer
    )

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if format_type == "jsonl":
        self._export_to_jsonl(batches, output_path, flatten_batches, include_doc_details, "batch")
    elif format_type == "csv":
        self._export_to_csv(batches, output_path, flatten_batches, include_doc_details, "batch")
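
A usage sketch, continuing the assumed setup above (batch IDs, batch size, and paths are illustrative):

# Export batches 0, 5, and 7 (32 sequences each) as detokenized JSONL
exporter.export_batches(
    batch_ids=[0, 5, 7],
    batch_size=32,
    output_path="exports/batches.jsonl",
    tokenizer=tokenizer,
)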

export_sequences

export_sequences(sequence_indices, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False)

Export specific sequences to a file.

Parameters:

    sequence_indices (List[int], required): List of sequence indices to export.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_sequences(
    self,
    sequence_indices: List[int],
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False
) -> None:
    """
    Export specific sequences to a file.

    Parameters:
        sequence_indices (List[int]): List of sequence indices to export.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get samples using the sample handler
    samples = self.manager.sample.get_samples_by_indices(
        indices=sequence_indices,
        return_doc_details=include_doc_details,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer
    )

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if format_type == "jsonl":
        self._export_to_jsonl([samples], output_path, True, include_doc_details, "sequence")
    elif format_type == "csv":
        self._export_to_csv([samples], output_path, True, include_doc_details, "sequence")
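
For example, continuing the setup above, raw token arrays can be exported without a tokenizer by disabling detokenization:

# Export three specific sequences as token arrays in CSV form
exporter.export_sequences(
    sequence_indices=[10, 42, 99],
    output_path="exports/sequences.csv",
    format_type="csv",
    return_detokenized=False,
)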

export_entire_dataset

export_entire_dataset(output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, chunk_size=1000)

Export the entire dataset to a file.

Parameters:

    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    chunk_size (int, default 1000): Number of samples to process at a time to manage memory usage.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_entire_dataset(
    self,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    chunk_size: int = 1000
) -> None:
    """
    Export the entire dataset to a file.

    Parameters:
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        chunk_size (int): Number of samples to process at a time to manage memory usage.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get total number of samples
    total_samples = len(self.manager.WriteableMMapIndexedDataset.batch_info.shuffle_idx)

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export in chunks to manage memory
    if format_type == "jsonl":
        with open(output_path, 'w', encoding='utf-8') as f:
            for start_idx in range(0, total_samples, chunk_size):
                end_idx = min(start_idx + chunk_size, total_samples)
                chunk_indices = list(range(start_idx, end_idx))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                self._write_chunk_to_jsonl(chunk_samples, f, include_doc_details, start_idx)

    elif format_type == "csv":
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = None
            for start_idx in range(0, total_samples, chunk_size):
                end_idx = min(start_idx + chunk_size, total_samples)
                chunk_indices = list(range(start_idx, end_idx))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                writer = self._write_chunk_to_csv(chunk_samples, f, writer, include_doc_details, start_idx)
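
A sketch, continuing the setup above; chunk_size trades throughput against peak memory:

# Stream the full dataset to JSONL with document details, 500 samples at a time
exporter.export_entire_dataset(
    output_path="exports/full_dataset.jsonl",
    tokenizer=tokenizer,
    include_doc_details=True,
    chunk_size=500,
)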

export_sequence_range

export_sequence_range(start_idx, end_idx, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False)

Export a range of sequences to a file.

Parameters:

    start_idx (int, required): Starting sequence index (inclusive).
    end_idx (int, required): Ending sequence index (exclusive).
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_idx >= end_idx, or if either index is negative.

Source code in tokensmith/export/handler.py
def export_sequence_range(
    self,
    start_idx: int,
    end_idx: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False
) -> None:
    """
    Export a range of sequences to a file.

    Parameters:
        start_idx (int): Starting sequence index (inclusive).
        end_idx (int): Ending sequence index (exclusive).
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_idx >= end_idx or indices are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_idx, int) or not isinstance(end_idx, int):
        raise ValueError("start_idx and end_idx must be integers")

    if start_idx < 0 or end_idx < 0:
        raise ValueError("start_idx and end_idx must be non-negative")

    if start_idx >= end_idx:
        raise ValueError("start_idx must be less than end_idx")

    # Generate sequence indices for the range
    sequence_indices = list(range(start_idx, end_idx))

    # Use the existing export_sequences method
    self.export_sequences(
        sequence_indices=sequence_indices,
        output_path=output_path,
        format_type=format_type,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer,
        include_doc_details=include_doc_details
    )
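
For example (continuing the setup above), since end_idx is exclusive this exports sequences 100 through 199:

exporter.export_sequence_range(
    start_idx=100,
    end_idx=200,
    output_path="exports/sequence_range.jsonl",
    tokenizer=tokenizer,
)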

export_batch_range

export_batch_range(start_batch, end_batch, batch_size, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, flatten_batches=False)

Export a range of batches to a file.

Parameters:

    start_batch (int, required): Starting batch ID (inclusive).
    end_batch (int, required): Ending batch ID (exclusive).
    batch_size (int, required): The size of each batch.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    flatten_batches (bool, default False): If True, flattens all batches into a single list of samples.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_batch >= end_batch, or if either batch ID is negative.

Source code in tokensmith/export/handler.py
def export_batch_range(
    self,
    start_batch: int,
    end_batch: int,
    batch_size: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    flatten_batches: bool = False
) -> None:
    """
    Export a range of batches to a file.

    Parameters:
        start_batch (int): Starting batch ID (inclusive).
        end_batch (int): Ending batch ID (exclusive).
        batch_size (int): The size of each batch.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        flatten_batches (bool): If True, flattens all batches into a single list of samples.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_batch >= end_batch or batch IDs are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_batch, int) or not isinstance(end_batch, int):
        raise ValueError("start_batch and end_batch must be integers")

    if start_batch < 0 or end_batch < 0:
        raise ValueError("start_batch and end_batch must be non-negative")

    if start_batch >= end_batch:
        raise ValueError("start_batch must be less than end_batch")

    # Generate batch IDs for the range
    batch_ids = list(range(start_batch, end_batch))

    # Use the existing export_batches method
    self.export_batches(
        batch_ids=batch_ids,
        batch_size=batch_size,
        output_path=output_path,
        format_type=format_type,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer,
        include_doc_details=include_doc_details,
        flatten_batches=flatten_batches
    )
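
A sketch, continuing the setup above (end_batch is exclusive, so this covers batches 2 through 4):

exporter.export_batch_range(
    start_batch=2,
    end_batch=5,
    batch_size=16,
    output_path="exports/batch_range.jsonl",
    tokenizer=tokenizer,
    flatten_batches=True,  # write one flat list of samples instead of nested batches
)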

export_dataset_range

export_dataset_range(start_idx, end_idx, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, chunk_size=1000)

Export a range of the dataset to a file with memory-efficient chunking.

Parameters:

    start_idx (int, required): Starting sequence index (inclusive).
    end_idx (int, required): Ending sequence index (exclusive).
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    chunk_size (int, default 1000): Number of samples to process at a time to manage memory usage.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_idx >= end_idx, or if either index is negative.

Source code in tokensmith/export/handler.py
def export_dataset_range(
    self,
    start_idx: int,
    end_idx: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    chunk_size: int = 1000
) -> None:
    """
    Export a range of the dataset to a file with memory-efficient chunking.

    Parameters:
        start_idx (int): Starting sequence index (inclusive).
        end_idx (int): Ending sequence index (exclusive).
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        chunk_size (int): Number of samples to process at a time to manage memory usage.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_idx >= end_idx or indices are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_idx, int) or not isinstance(end_idx, int):
        raise ValueError("start_idx and end_idx must be integers")

    if start_idx < 0 or end_idx < 0:
        raise ValueError("start_idx and end_idx must be non-negative")

    if start_idx >= end_idx:
        raise ValueError("start_idx must be less than end_idx")

    # Get total number of samples to validate range
    # total_samples = len(self.manager.WriteableMMapIndexedDataset.batch_info.shuffle_idx)
    # if end_idx > total_samples:
    #     raise ValueError(f"end_idx ({end_idx}) exceeds dataset size ({total_samples})")

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export in chunks to manage memory
    if format_type == "jsonl":
        with open(output_path, 'w', encoding='utf-8') as f:
            current_idx = start_idx
            while current_idx < end_idx:
                chunk_end = min(current_idx + chunk_size, end_idx)
                chunk_indices = list(range(current_idx, chunk_end))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                self._write_chunk_to_jsonl(chunk_samples, f, include_doc_details, current_idx)
                current_idx = chunk_end

    elif format_type == "csv":
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = None
            current_idx = start_idx
            while current_idx < end_idx:
                chunk_end = min(current_idx + chunk_size, end_idx)
                chunk_indices = list(range(current_idx, chunk_end))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                writer = self._write_chunk_to_csv(chunk_samples, f, writer, include_doc_details, current_idx)
                current_idx = chunk_end
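
For example (continuing the setup above), to stream a 10,000-sequence slice to CSV as raw token arrays:

exporter.export_dataset_range(
    start_idx=0,
    end_idx=10_000,
    output_path="exports/dataset_slice.csv",
    format_type="csv",
    return_detokenized=False,
    chunk_size=1000,
)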