DatasetManager¶

Source code in tokensmith/manager.py

def __init__(self):
    """
    Creates a manager in which only the ingest handler is ready to use.
    All other handlers start as None and are filled in by the setup_* methods.
    """
    # Populated together by setup_edit_inspect_sample_export(); None until then.
    self.edit: Optional[EditHandler] = None
    self.inspect: Optional[InspectHandler] = None
    self.sample: Optional[SampleHandler] = None
    self.export: Optional[ExportHandler] = None

    # Populated by setup_search(); None until then.
    self.search: Optional[SearchHandler] = None

    # Built eagerly, and last, so it receives a manager whose other handler slots already exist.
    self.ingest: IngestHandler = IngestHandler(self)

edit `instance-attribute` ¶

edit = None

inspect `instance-attribute` ¶

inspect = None

sample `instance-attribute` ¶

sample = None

export `instance-attribute` ¶

export = None

search `instance-attribute` ¶

search = None

ingest `instance-attribute` ¶

ingest = IngestHandler(self)

setup_search ¶

setup_search(bin_file_path, search_index_save_path, vocab, verbose=False, reuse=True)

Initializes the SearchHandler by building or loading the index. Should be called explicitly if search functionality is required. Not done automatically to avoid unnecessary overhead.

Parameters:

Name	Type	Description	Default
`bin_file_path` ¶	`str`	Path to the binary file containing the dataset.	required
`search_index_save_path` ¶	`str`	Path to save the search index.	required
`vocab` ¶	`int`	Vocabulary size; must be 2**16 or 2**32 (any other value raises ValueError).	required
`verbose` ¶	`bool`	If True, enables verbose output during index building.	`False`
`reuse` ¶	`bool`	If True, reuses the existing index if available.	`True`

Raises:

Type	Description
`ValueError`	If SearchHandler is already initialized. If vocab is not 2**16 or 2**32. If reuse is True but the index path does not exist.

Returns:

Type	Description
	None

Source code in tokensmith/manager.py

def setup_search(self, bin_file_path: str, search_index_save_path: str, vocab: int, verbose: bool = False, reuse: bool = True) -> None:
    """
    Initializes the SearchHandler by building or loading the index.
    Should be called explicitly if search functionality is required.
    Not done automatically to avoid unnecessary overhead.

    Parameters:
        bin_file_path (str): Path to the binary file containing the dataset.
        search_index_save_path (str): Path to save the search index.
        vocab (int): Vocabulary size; expected to be 2**16 or 2**32.
        verbose (bool): If True, enables verbose output during index building.
        reuse (bool): If True, reuses the existing index if available.

    Raises:
        ValueError:
            - If SearchHandler is already initialized.
            - If vocab is not 2**16 or 2**32.
            - If reuse is True but the index path does not exist.
            Only the first condition is checked in this method; the other two
            are presumably raised by SearchHandler itself (confirm there).

    Returns:
        None
    """
    # Refuse to silently replace an existing handler; the caller must opt in
    # by creating a new manager or clearing `search` first.
    if self.search is not None:
        raise ValueError(
            "SearchHandler already initialized. "
            "Create a new DatasetManager instance or reset `search` manually."
        )

    self.search = SearchHandler(
        bin_file_path=bin_file_path,
        index_save_path=search_index_save_path,
        vocab=vocab,
        verbose=verbose,
        reuse=reuse,
    )

setup_edit_inspect_sample_export ¶

setup_edit_inspect_sample_export(dataset_prefix, batch_info_save_prefix, train_iters, train_batch_size, train_seq_len, seed, splits_string='969,30,1', packing_impl='packed', allow_chopped=True, add_extra_token_to_seq=1)

Initializes the EditHandler, InspectHandler, SampleHandler, and ExportHandler. This method is called to set up the handlers with the provided bin file path.

Parameters:

Name	Type	Description	Default
`dataset_prefix` ¶	`str`	Prefix for the dataset files. This is used to locate the {dataset_prefix}.bin and {dataset_prefix}.idx files.	required
`batch_info_save_prefix` ¶	`str`	Prefix for the batch information files. This is used to locate the doc/sample/shuffle indexes with the given prefix/save path if the files are not found.	required
`train_iters` ¶	`int`	Number of training iterations for simulated training.	required
`train_batch_size` ¶	`int`	Size of each training batch for simulated training.	required
`train_seq_len` ¶	`int`	Length of the training sequences.	required
`seed` ¶	`int`	Random seed for simulated training.	required
`splits_string` ¶	`str`	Comma-separated string of train/val/test splits. (defaults to '969,30,1' which means 96.9% train, 3% val, and 0.1% test).	`'969,30,1'`
`packing_impl` ¶	`str`	Implementation for packing sequences. One of 'packed', 'pack_until_overflow', 'unpacked'. (defaults to 'packed').	`'packed'`
`allow_chopped` ¶	`bool`	WARNING: if your packing impl is packed, this is ignored. Allow chopped samples in the dataset. E.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024 (defaults to True).	`True`
`add_extra_token_to_seq` ¶	`int`	Number of extra tokens to add to each sequence (defaults to 1 to account for causal language modeling).	`1`

Raises:

Type	Description
`ValueError`	If any of the handlers are already initialized. If train_seq_len is not a positive integer. If add_extra_token_to_seq is not a non-negative integer.

Returns:

Type	Description
	None

Source code in tokensmith/manager.py

def setup_edit_inspect_sample_export(self, dataset_prefix: str, batch_info_save_prefix: str,
                                     train_iters: int, train_batch_size: int, train_seq_len: int, seed: int, splits_string: str = '969,30,1',
                                     packing_impl: str = 'packed',
                                     allow_chopped: bool = True,
                                     add_extra_token_to_seq: int = 1) -> None:
    """
    Initializes the EditHandler, InspectHandler, SampleHandler, and ExportHandler.
    Builds the shared WriteableMMapIndexedDataset from the given settings, stores it on the
    manager, and then creates the four handlers with a reference to this manager.

    All four handler slots are validated before any state is changed, so a rejected call
    leaves the manager (including its existing dataset) untouched.

    Parameters:
        dataset_prefix (str): Prefix for the dataset files. This is used to locate the {dataset_prefix}.bin and {dataset_prefix}.idx files.
        batch_info_save_prefix (str): Prefix for the batch information files (doc/sample/shuffle indexes). Presumably also the save location when those files are not found (confirm in WriteableMMapIndexedDataset).
        train_iters (int): Number of training iterations for simulated training.
        train_batch_size (int): Size of each training batch for simulated training.
        train_seq_len (int): Length of the training sequences.
        seed (int): Random seed for simulated training.
        splits_string (str): Comma-separated string of train/val/test splits. (defaults to '969,30,1' which means 96.9% train, 3% val, and 0.1% test).
        packing_impl (str): Implementation for packing sequences. One of 'packed', 'pack_until_overflow', 'unpacked'. (defaults to 'packed').
        allow_chopped (bool): WARNING: if your packing impl is packed, this is ignored. Allow chopped samples in the dataset. E.g if your sequence length is 1024 and you have a sample of length 1026, it will be chopped to 1024 (defaults to True).
        add_extra_token_to_seq (int): Number of extra tokens to add to each sequence (defaults to 1 to account for causal language modeling).

    Raises:
        ValueError:
            - If any of the handlers are already initialized.
            - If train_seq_len is not a positive integer.
            - If add_extra_token_to_seq is not a non-negative integer.
            Only the first condition is checked in this method; the other two
            are presumably raised by WriteableMMapIndexedDataset (confirm there).

    Returns:
        None
    """
    # Validate every slot up front, in the same order and with the same messages
    # as before, so a repeated call fails before the dataset is rebuilt or any
    # handler is partially assigned.
    handler_slots = (
        ("edit", "EditHandler"),
        ("inspect", "InspectHandler"),
        ("sample", "SampleHandler"),
        ("export", "ExportHandler"),
    )
    for attr_name, handler_name in handler_slots:
        if getattr(self, attr_name) is not None:
            raise ValueError(
                f"{handler_name} already initialized. "
                f"Create a new DatasetManager instance or reset `{attr_name}` manually."
            )

    # The dataset must exist on the manager before the handlers are created,
    # since each handler only receives the manager itself.
    self.WriteableMMapIndexedDataset = WriteableMMapIndexedDataset(
        dataset_prefix=dataset_prefix,
        batch_info_save_prefix=batch_info_save_prefix,
        train_iters=train_iters,
        train_batch_size=train_batch_size,
        seed=seed,
        splits_string=splits_string,
        packing_impl=packing_impl,
        allow_chopped=allow_chopped,
        train_seq_len=train_seq_len,
        add_extra_token_to_seq=add_extra_token_to_seq,
    )

    self.edit = EditHandler(self)
    self.inspect = InspectHandler(self)
    self.sample = SampleHandler(self)
    self.export = ExportHandler(self)

DatasetManager¶

edit `instance-attribute` ¶

inspect `instance-attribute` ¶

sample `instance-attribute` ¶

export `instance-attribute` ¶

search `instance-attribute` ¶

ingest `instance-attribute` ¶

setup_search ¶

`bin_file_path` ¶

`search_index_save_path` ¶

`vocab` ¶

`verbose` ¶

`reuse` ¶

setup_edit_inspect_sample_export ¶

`dataset_prefix` ¶

`batch_info_save_prefix` ¶

`train_iters` ¶

`train_batch_size` ¶

`train_seq_len` ¶

`seed` ¶

`splits_string` ¶

`packing_impl` ¶

`allow_chopped` ¶

`add_extra_token_to_seq` ¶

DatasetManager¶

edit instance-attribute ¶

inspect instance-attribute ¶

sample instance-attribute ¶

export instance-attribute ¶

search instance-attribute ¶

ingest instance-attribute ¶

setup_search ¶

bin_file_path ¶

search_index_save_path ¶

vocab ¶

verbose ¶

reuse ¶

setup_edit_inspect_sample_export ¶

dataset_prefix ¶

batch_info_save_prefix ¶

train_iters ¶

train_batch_size ¶

train_seq_len ¶

seed ¶

splits_string ¶

packing_impl ¶

allow_chopped ¶

add_extra_token_to_seq ¶

edit `instance-attribute` ¶

inspect `instance-attribute` ¶

sample `instance-attribute` ¶

export `instance-attribute` ¶

search `instance-attribute` ¶

ingest `instance-attribute` ¶

`bin_file_path` ¶

`search_index_save_path` ¶

`vocab` ¶

`verbose` ¶

`reuse` ¶

`dataset_prefix` ¶

`batch_info_save_prefix` ¶

`train_iters` ¶

`train_batch_size` ¶

`train_seq_len` ¶

`seed` ¶

`splits_string` ¶

`packing_impl` ¶

`allow_chopped` ¶

`add_extra_token_to_seq` ¶