Export Handler

Handles exporting of dataset sequences and batches to JSONL or CSV files, with optional detokenization and document details.

Source code in tokensmith/export/handler.py
def __init__(self, manager: 'DatasetManager'):
    self.manager = manager

manager instance-attribute

manager = manager

The DatasetManager instance this handler uses to retrieve samples and batches.
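
For the examples on this page, assume a setup along these lines. This is a minimal sketch: the ExportHandler class name and import paths are inferred from this page, the DatasetManager construction is not shown here, and the GPT-2 tokenizer is just one choice.

from transformers import AutoTokenizer

from tokensmith.export.handler import ExportHandler  # class name assumed from this page
from tokensmith.manager import DatasetManager        # hypothetical import path

# Configure the manager for an existing dataset; constructor arguments are omitted here.
manager = DatasetManager(...)

# A Hugging Face tokenizer is one option for detokenization.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

exporter = ExportHandler(manager)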

export_batches

export_batches(batch_ids, batch_size, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, flatten_batches=False)

Export specific batches to a file.

Parameters:

    batch_ids (List[int], required): List of batch IDs to export.
    batch_size (int, required): The size of each batch.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    flatten_batches (bool, default False): If True, flattens all batches into a single list of samples.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_batches(
    self,
    batch_ids: List[int],
    batch_size: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    flatten_batches: bool = False
) -> None:
    """
    Export specific batches to a file.

    Parameters:
        batch_ids (List[int]): List of batch IDs to export.
        batch_size (int): The size of each batch.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        flatten_batches (bool): If True, flattens all batches into a single list of samples.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get batches using the sample handler
    batches = self.manager.sample.get_batches_by_ids(
        batch_ids=batch_ids,
        batch_size=batch_size,
        return_doc_details=include_doc_details,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer
    )

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if format_type == "jsonl":
        self._export_to_jsonl(batches, output_path, flatten_batches, include_doc_details, "batch")
    elif format_type == "csv":
        self._export_to_csv(batches, output_path, flatten_batches, include_doc_details, "batch")
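
A usage sketch, continuing the assumed setup above (batch IDs, batch size, and paths are illustrative):

# Export batches 0, 5, and 7 (32 sequences each) as detokenized JSONL
exporter.export_batches(
    batch_ids=[0, 5, 7],
    batch_size=32,
    output_path="exports/batches.jsonl",
    tokenizer=tokenizer,
)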

export_sequences

export_sequences(sequence_indices, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False)

Export specific sequences to a file.

Parameters:

    sequence_indices (List[int], required): List of sequence indices to export.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_sequences(
    self,
    sequence_indices: List[int],
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False
) -> None:
    """
    Export specific sequences to a file.

    Parameters:
        sequence_indices (List[int]): List of sequence indices to export.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get samples using the sample handler
    samples = self.manager.sample.get_samples_by_indices(
        indices=sequence_indices,
        return_doc_details=include_doc_details,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer
    )

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if format_type == "jsonl":
        self._export_to_jsonl([samples], output_path, True, include_doc_details, "sequence")
    elif format_type == "csv":
        self._export_to_csv([samples], output_path, True, include_doc_details, "sequence")
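
For example, continuing the setup above, raw token arrays can be exported without a tokenizer by disabling detokenization:

# Export three specific sequences as token arrays in CSV form
exporter.export_sequences(
    sequence_indices=[10, 42, 99],
    output_path="exports/sequences.csv",
    format_type="csv",
    return_detokenized=False,
)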

export_entire_dataset

export_entire_dataset(output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, chunk_size=1000)

Export the entire dataset to a file.

Parameters:

    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    chunk_size (int, default 1000): Number of samples to process at a time to manage memory usage.

Raises:

    ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.

Source code in tokensmith/export/handler.py
def export_entire_dataset(
    self,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    chunk_size: int = 1000
) -> None:
    """
    Export the entire dataset to a file.

    Parameters:
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        chunk_size (int): Number of samples to process at a time to manage memory usage.

    Raises:
        ValueError: If format_type is not supported or tokenizer is None when return_detokenized is True.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    # Get total number of samples
    total_samples = len(self.manager.WriteableMMapIndexedDataset.batch_info.shuffle_idx)

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export in chunks to manage memory
    if format_type == "jsonl":
        with open(output_path, 'w', encoding='utf-8') as f:
            for start_idx in range(0, total_samples, chunk_size):
                end_idx = min(start_idx + chunk_size, total_samples)
                chunk_indices = list(range(start_idx, end_idx))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                self._write_chunk_to_jsonl(chunk_samples, f, include_doc_details, start_idx)

    elif format_type == "csv":
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = None
            for start_idx in range(0, total_samples, chunk_size):
                end_idx = min(start_idx + chunk_size, total_samples)
                chunk_indices = list(range(start_idx, end_idx))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                writer = self._write_chunk_to_csv(chunk_samples, f, writer, include_doc_details, start_idx)
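
A sketch, continuing the setup above; chunk_size trades throughput against peak memory:

# Stream the full dataset to JSONL with document details, 500 samples at a time
exporter.export_entire_dataset(
    output_path="exports/full_dataset.jsonl",
    tokenizer=tokenizer,
    include_doc_details=True,
    chunk_size=500,
)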

export_sequence_range

export_sequence_range(start_idx, end_idx, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False)

Export a range of sequences to a file.

Parameters:

    start_idx (int, required): Starting sequence index (inclusive).
    end_idx (int, required): Ending sequence index (exclusive).
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_idx >= end_idx, or if either index is negative.

Source code in tokensmith/export/handler.py
def export_sequence_range(
    self,
    start_idx: int,
    end_idx: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False
) -> None:
    """
    Export a range of sequences to a file.

    Parameters:
        start_idx (int): Starting sequence index (inclusive).
        end_idx (int): Ending sequence index (exclusive).
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_idx >= end_idx or indices are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_idx, int) or not isinstance(end_idx, int):
        raise ValueError("start_idx and end_idx must be integers")

    if start_idx < 0 or end_idx < 0:
        raise ValueError("start_idx and end_idx must be non-negative")

    if start_idx >= end_idx:
        raise ValueError("start_idx must be less than end_idx")

    # Generate sequence indices for the range
    sequence_indices = list(range(start_idx, end_idx))

    # Use the existing export_sequences method
    self.export_sequences(
        sequence_indices=sequence_indices,
        output_path=output_path,
        format_type=format_type,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer,
        include_doc_details=include_doc_details
    )
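
For example (continuing the setup above), since end_idx is exclusive this exports sequences 100 through 199:

exporter.export_sequence_range(
    start_idx=100,
    end_idx=200,
    output_path="exports/sequence_range.jsonl",
    tokenizer=tokenizer,
)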

export_batch_range

export_batch_range(start_batch, end_batch, batch_size, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, flatten_batches=False)

Export a range of batches to a file.

Parameters:

    start_batch (int, required): Starting batch ID (inclusive).
    end_batch (int, required): Ending batch ID (exclusive).
    batch_size (int, required): The size of each batch.
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    flatten_batches (bool, default False): If True, flattens all batches into a single list of samples.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_batch >= end_batch, or if either batch ID is negative.

Source code in tokensmith/export/handler.py
def export_batch_range(
    self,
    start_batch: int,
    end_batch: int,
    batch_size: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    flatten_batches: bool = False
) -> None:
    """
    Export a range of batches to a file.

    Parameters:
        start_batch (int): Starting batch ID (inclusive).
        end_batch (int): Ending batch ID (exclusive).
        batch_size (int): The size of each batch.
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        flatten_batches (bool): If True, flattens all batches into a single list of samples.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_batch >= end_batch or batch IDs are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_batch, int) or not isinstance(end_batch, int):
        raise ValueError("start_batch and end_batch must be integers")

    if start_batch < 0 or end_batch < 0:
        raise ValueError("start_batch and end_batch must be non-negative")

    if start_batch >= end_batch:
        raise ValueError("start_batch must be less than end_batch")

    # Generate batch IDs for the range
    batch_ids = list(range(start_batch, end_batch))

    # Use the existing export_batches method
    self.export_batches(
        batch_ids=batch_ids,
        batch_size=batch_size,
        output_path=output_path,
        format_type=format_type,
        return_detokenized=return_detokenized,
        tokenizer=tokenizer,
        include_doc_details=include_doc_details,
        flatten_batches=flatten_batches
    )
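
A sketch, continuing the setup above (end_batch is exclusive, so this covers batches 2 through 4):

exporter.export_batch_range(
    start_batch=2,
    end_batch=5,
    batch_size=16,
    output_path="exports/batch_range.jsonl",
    tokenizer=tokenizer,
    flatten_batches=True,  # write one flat list of samples instead of nested batches
)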

export_dataset_range

export_dataset_range(start_idx, end_idx, output_path, format_type='jsonl', return_detokenized=True, tokenizer=None, include_doc_details=False, chunk_size=1000)

Export a range of the dataset to a file with memory-efficient chunking.

Parameters:

    start_idx (int, required): Starting sequence index (inclusive).
    end_idx (int, required): Ending sequence index (exclusive).
    output_path (str, required): Path to the output file.
    format_type (str, default 'jsonl'): Format to export ("jsonl" or "csv").
    return_detokenized (bool, default True): If True, exports detokenized text; otherwise exports token arrays.
    tokenizer (Optional[Any], default None): The tokenizer to use for detokenization (required if return_detokenized is True).
    include_doc_details (bool, default False): If True, includes document details in the export.
    chunk_size (int, default 1000): Number of samples to process at a time to manage memory usage.

Raises:

    ValueError: If format_type is not supported, if tokenizer is None when return_detokenized is True, if start_idx >= end_idx, or if either index is negative.

Source code in tokensmith/export/handler.py
def export_dataset_range(
    self,
    start_idx: int,
    end_idx: int,
    output_path: str,
    format_type: str = "jsonl",
    return_detokenized: bool = True,
    tokenizer: Optional[Any] = None,
    include_doc_details: bool = False,
    chunk_size: int = 1000
) -> None:
    """
    Export a range of the dataset to a file with memory-efficient chunking.

    Parameters:
        start_idx (int): Starting sequence index (inclusive).
        end_idx (int): Ending sequence index (exclusive).
        output_path (str): Path to the output file.
        format_type (str): Format to export ("jsonl" or "csv").
        return_detokenized (bool): If True, exports detokenized text; otherwise exports token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        include_doc_details (bool): If True, includes document details in the export.
        chunk_size (int): Number of samples to process at a time to manage memory usage.

    Raises:
        ValueError: If format_type is not supported, tokenizer is None when return_detokenized is True,
                   or if start_idx >= end_idx or indices are negative.
    """
    if format_type not in ["jsonl", "csv"]:
        raise ValueError("format_type must be 'jsonl' or 'csv'")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True")

    if not isinstance(start_idx, int) or not isinstance(end_idx, int):
        raise ValueError("start_idx and end_idx must be integers")

    if start_idx < 0 or end_idx < 0:
        raise ValueError("start_idx and end_idx must be non-negative")

    if start_idx >= end_idx:
        raise ValueError("start_idx must be less than end_idx")

    # Get total number of samples to validate range
    # total_samples = len(self.manager.WriteableMMapIndexedDataset.batch_info.shuffle_idx)
    # if end_idx > total_samples:
    #     raise ValueError(f"end_idx ({end_idx}) exceeds dataset size ({total_samples})")

    # Create the output directory if needed (dirname is empty for bare filenames)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Export in chunks to manage memory
    if format_type == "jsonl":
        with open(output_path, 'w', encoding='utf-8') as f:
            current_idx = start_idx
            while current_idx < end_idx:
                chunk_end = min(current_idx + chunk_size, end_idx)
                chunk_indices = list(range(current_idx, chunk_end))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                self._write_chunk_to_jsonl(chunk_samples, f, include_doc_details, current_idx)
                current_idx = chunk_end

    elif format_type == "csv":
        with open(output_path, 'w', newline='', encoding='utf-8') as f:
            writer = None
            current_idx = start_idx
            while current_idx < end_idx:
                chunk_end = min(current_idx + chunk_size, end_idx)
                chunk_indices = list(range(current_idx, chunk_end))

                chunk_samples = self.manager.sample.get_samples_by_indices(
                    indices=chunk_indices,
                    return_doc_details=include_doc_details,
                    return_detokenized=return_detokenized,
                    tokenizer=tokenizer
                )

                writer = self._write_chunk_to_csv(chunk_samples, f, writer, include_doc_details, current_idx)
                current_idx = chunk_end
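
For example (continuing the setup above), to stream a 10,000-sequence slice to CSV as raw token arrays:

exporter.export_dataset_range(
    start_idx=0,
    end_idx=10_000,
    output_path="exports/dataset_slice.csv",
    format_type="csv",
    return_detokenized=False,
    chunk_size=1000,
)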