Skip to content

Sample Handler

Source code in tokensmith/sample/handler.py
def __init__(self, manager: 'DatasetManager'):
    """
    Binds this handler to a dataset manager.

    Parameters:
        manager (DatasetManager): Object kept on the instance as ``manager``; the sample
            retrieval methods reach the dataset through it.
    """
    self.manager = manager

manager instance-attribute

manager = manager

get_samples_by_indices

get_samples_by_indices(indices, return_doc_details=False, return_detokenized=False, tokenizer=None)

Returns a list of samples by their indices, optionally with document details and/or detokenized.

Parameters:

Name Type Description Default

indices

List[int]

List of sample indices to retrieve.

required

return_doc_details

bool

If True, includes associated document details.

False

return_detokenized

bool

If True, returns detokenized text instead of token arrays.

False

tokenizer

Optional[Any]

The tokenizer to use for detokenization (required if return_detokenized is True).

None

Raises:

Type Description
ValueError

If indices is not a list of non-negative integers or if tokenizer is None when return_detokenized is True.

Returns:

Type Description
Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[List[np.ndarray]]: A list of samples, where each sample is a list of token arrays (if return_detokenized is False and return_doc_details is False).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[str]: A list of detokenized text samples (if return_detokenized is True and return_doc_details is False).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[Tuple[List[np.ndarray], Dict]]: A list of tuples containing token sequences and document details (if return_detokenized is False and return_doc_details is True).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[Tuple[str, Dict]]: A list of tuples containing detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/sample/handler.py
def get_samples_by_indices(
    self, 
    indices: List[int], 
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
) -> Union[List[List[np.ndarray]], List[str], List[Tuple[List[np.ndarray], Dict]], List[Tuple[str, Dict]]]:
    """
    Fetches the samples stored at the given indices, in the order the indices are given.

    Parameters:
        indices (List[int]): Sample indices to look up; must be a list of non-negative integers.
        return_doc_details (bool): If True, each result is paired with its document details.
        return_detokenized (bool): If True, each sample is detokenized with ``tokenizer`` before being returned.
        tokenizer: Tokenizer used for detokenization (required if return_detokenized is True).

    Raises:
        ValueError: If indices is not a list of non-negative integers or if tokenizer is None when return_detokenized is True.

    Returns:
        List[List[np.ndarray]]: Token-array samples (return_detokenized False, return_doc_details False).
        List[str]: Detokenized text samples (return_detokenized True, return_doc_details False).
        List[Tuple[List[np.ndarray], Dict]]: (token sequence, document details) pairs (return_detokenized False, return_doc_details True).
        List[Tuple[str, Dict]]: (detokenized text, document details) pairs (return_detokenized True, return_doc_details True).
    """
    # Validation runs in a fixed order so the first failing rule decides the message.
    if not isinstance(indices, list):
        raise ValueError("indices must be a list.")
    if any(not isinstance(idx, int) for idx in indices):
        raise ValueError("All elements in indices must be integers.")
    if any(idx < 0 for idx in indices):
        raise ValueError("All elements in indices must be non-negative integers.")

    if tokenizer is None and return_detokenized:
        raise ValueError("tokenizer must be provided if return_detokenized is True.")

    collected = []
    for idx in indices:
        fetched = self.manager.WriteableMMapIndexedDataset.get_example_by_id(
            example_loc=idx,
            return_doc_details=return_doc_details,
        )

        # With document details requested the dataset hands back a (sequence, details) pair.
        if return_doc_details:
            sequence, details = fetched
        else:
            sequence, details = fetched, None

        sample = generate_training_sample(sequence, tokenizer) if return_detokenized else sequence
        collected.append((sample, details) if return_doc_details else sample)

    return collected

get_batches_by_ids

get_batches_by_ids(batch_ids, batch_size, return_doc_details=False, return_detokenized=False, tokenizer=None)

Returns samples from multiple batches by their batch IDs, organized by batch, optionally with document details and/or detokenized.

Parameters:

Name Type Description Default

batch_ids

List[int]

List of batch IDs to retrieve.

required

batch_size

int

The size of each batch.

required

return_doc_details

bool

If True, includes associated document details.

False

return_detokenized

bool

If True, returns detokenized text instead of token arrays.

False

tokenizer

Optional[Any]

The tokenizer to use for detokenization (required if return_detokenized is True).

None

Raises:

Type Description
ValueError

If batch_ids is not a list of non-negative integers or if tokenizer is None when return_detokenized is True.

Returns:

Type Description
Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[List[np.ndarray]]]: A list of batches, where each batch is a list of samples (if return_detokenized is False and return_doc_details is False).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[str]]: A list of batches, where each batch is a list of detokenized text samples (if return_detokenized is True and return_doc_details is False).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[Tuple[List[np.ndarray], Dict]]]: A list of batches with token sequences and document details (if return_detokenized is False and return_doc_details is True).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[Tuple[str, Dict]]]: A list of batches with detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/sample/handler.py
def get_batches_by_ids(
    self,
    batch_ids: List[int],
    batch_size: int,
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
) -> Union[List[List[List[np.ndarray]]], List[List[str]], List[List[Tuple[List[np.ndarray], Dict]]], List[List[Tuple[str, Dict]]]]:
    """
    Returns samples from multiple batches by their batch IDs, organized by batch, optionally with document details and/or detokenized.

    Batch ``b`` covers the sample indices ``b * batch_size`` up to, but not including, ``(b + 1) * batch_size``.

    Parameters:
        batch_ids (List[int]): List of batch IDs to retrieve.
        batch_size (int): The size of each batch; must be a positive integer.
        return_doc_details (bool): If True, includes associated document details.
        return_detokenized (bool): If True, returns detokenized text instead of token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).

    Raises:
        ValueError: If batch_ids is not a list of non-negative integers, if batch_size is not a positive integer, or if tokenizer is None when return_detokenized is True.

    Returns:
        List[List[List[np.ndarray]]]: A list of batches, where each batch is a list of samples (if return_detokenized is False and return_doc_details is False).
        List[List[str]]: A list of batches, where each batch is a list of detokenized text samples (if return_detokenized is True and return_doc_details is False).
        List[List[Tuple[List[np.ndarray], Dict]]]: A list of batches with token sequences and document details (if return_detokenized is False and return_doc_details is True).
        List[List[Tuple[str, Dict]]]: A list of batches with detokenized text and document details (if return_detokenized is True and return_doc_details is True).
    """
    if not isinstance(batch_ids, list):
        raise ValueError("batch_ids must be a list.")
    if not all(isinstance(i, int) for i in batch_ids):
        raise ValueError("All elements in batch_ids must be integers.")
    if not all(i >= 0 for i in batch_ids):
        raise ValueError("All elements in batch_ids must be non-negative integers.")

    # Without this check a non-integer size surfaces as a TypeError from range(),
    # and a size <= 0 silently yields empty batches instead of reporting the mistake.
    if not isinstance(batch_size, int) or batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True.")

    # Collect samples organized by batch
    batches = []
    for batch_id in batch_ids:
        first_index = batch_id * batch_size
        batch_indices = list(range(first_index, first_index + batch_size))
        batch_samples = self.get_samples_by_indices(
            indices=batch_indices,
            return_doc_details=return_doc_details,
            return_detokenized=return_detokenized,
            tokenizer=tokenizer
        )
        batches.append(batch_samples)

    return batches

get_samples_by_policy

get_samples_by_policy(policy_fn, *policy_args, return_doc_details=False, return_detokenized=False, tokenizer=None, **policy_kwargs)

Returns samples based on a sampling policy function that generates indices.

Parameters:

Name Type Description Default

policy_fn

callable

A function that returns a list of sample indices.

required

*policy_args

Positional arguments to pass to the policy function.

()

return_doc_details

bool

If True, includes associated document details.

False

return_detokenized

bool

If True, returns detokenized text instead of token arrays.

False

tokenizer

Optional[Any]

The tokenizer to use for detokenization (required if return_detokenized is True).

None

**policy_kwargs

Keyword arguments to pass to the policy function.

{}

Raises:

Type Description
ValueError

If policy_fn is not callable or doesn't return a list of integers.

Returns:

Type Description
Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[List[np.ndarray]]: A list of samples, where each sample is a list of token arrays (if return_detokenized is False and return_doc_details is False).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[str]: A list of detokenized text samples (if return_detokenized is True and return_doc_details is False).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[Tuple[List[np.ndarray], Dict]]: A list of tuples containing token sequences and document details (if return_detokenized is False and return_doc_details is True).

Union[List[List[ndarray]], List[str], List[Tuple[List[ndarray], Dict]], List[Tuple[str, Dict]]]

List[Tuple[str, Dict]]: A list of tuples containing detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/sample/handler.py
def get_samples_by_policy(
    self,
    policy_fn: callable,
    *policy_args,
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
    **policy_kwargs
) -> Union[List[List[np.ndarray]], List[str], List[Tuple[List[np.ndarray], Dict]], List[Tuple[str, Dict]]]:
    """
    Asks a sampling policy for sample indices and returns the samples at those indices.

    The policy is invoked as ``policy_fn(*policy_args, **policy_kwargs)``; its result is
    handed to ``get_samples_by_indices`` together with the output options.

    Parameters:
        policy_fn (callable): A function that returns a list of sample indices.
        *policy_args: Positional arguments forwarded to the policy function.
        return_doc_details (bool): If True, includes associated document details.
        return_detokenized (bool): If True, returns detokenized text instead of token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        **policy_kwargs: Keyword arguments forwarded to the policy function.

    Raises:
        ValueError: If policy_fn is not callable or doesn't return a list of integers.

    Returns:
        List[List[np.ndarray]]: Token-array samples (return_detokenized False, return_doc_details False).
        List[str]: Detokenized text samples (return_detokenized True, return_doc_details False).
        List[Tuple[List[np.ndarray], Dict]]: (token sequence, document details) pairs (return_detokenized False, return_doc_details True).
        List[Tuple[str, Dict]]: (detokenized text, document details) pairs (return_detokenized True, return_doc_details True).
    """
    if not callable(policy_fn):
        raise ValueError("policy_fn must be callable.")

    chosen = policy_fn(*policy_args, **policy_kwargs)

    if isinstance(chosen, list):
        # Element-level checks (integers, non-negative) happen in get_samples_by_indices.
        output_options = {
            "return_doc_details": return_doc_details,
            "return_detokenized": return_detokenized,
            "tokenizer": tokenizer,
        }
        return self.get_samples_by_indices(indices=chosen, **output_options)

    raise ValueError("policy_fn must return a list of integers.")

get_batches_by_policy

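get_batches_by_policy(policy_fn, batch_size, *policy_args, return_doc_details=False, return_detokenized=False, tokenizer=None, **policy_kwargs)

Returns batches of samples based on a sampling policy function that generates batch IDs.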

Parameters:

Name Type Description Default

policy_fn

callable

A function that returns a list of batch IDs.

required

batch_size

int

The size of each batch.

required

*policy_args

Positional arguments to pass to the policy function.

()

return_doc_details

bool

If True, includes associated document details.

False

return_detokenized

bool

If True, returns detokenized text instead of token arrays.

False

tokenizer

Optional[Any]

The tokenizer to use for detokenization (required if return_detokenized is True).

None

**policy_kwargs

Keyword arguments to pass to the policy function.

{}

Raises:

Type Description
ValueError

If policy_fn is not callable or doesn't return a list of integers.

Returns:

Type Description
Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[List[np.ndarray]]]: A list of batches, where each batch is a list of samples (if return_detokenized is False and return_doc_details is False).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[str]]: A list of batches, where each batch is a list of detokenized text samples (if return_detokenized is True and return_doc_details is False).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[Tuple[List[np.ndarray], Dict]]]: A list of batches with token sequences and document details (if return_detokenized is False and return_doc_details is True).

Union[List[List[List[ndarray]]], List[List[str]], List[List[Tuple[List[ndarray], Dict]]], List[List[Tuple[str, Dict]]]]

List[List[Tuple[str, Dict]]]: A list of batches with detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/sample/handler.py
def get_batches_by_policy(
    self,
    policy_fn: callable,
    batch_size: int,
    *policy_args,
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
    **policy_kwargs
) -> Union[List[List[List[np.ndarray]]], List[List[str]], List[List[Tuple[List[np.ndarray], Dict]]], List[List[Tuple[str, Dict]]]]:
    """
    Returns batches of samples based on a sampling policy function that generates batch IDs.

    The policy is invoked as ``policy_fn(*policy_args, **policy_kwargs)``. Batch ``b`` covers the
    sample indices ``b * batch_size`` up to, but not including, ``(b + 1) * batch_size``.

    Parameters:
        policy_fn (callable): A function that returns a list of batch IDs.
        batch_size (int): The size of each batch; must be a positive integer.
        *policy_args: Positional arguments to pass to the policy function.
        return_doc_details (bool): If True, includes associated document details.
        return_detokenized (bool): If True, returns detokenized text instead of token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).
        **policy_kwargs: Keyword arguments to pass to the policy function.

    Raises:
        ValueError: If policy_fn is not callable or doesn't return a list of non-negative integers,
            if batch_size is not a positive integer, or if tokenizer is None when return_detokenized is True.

    Returns:
        List[List[List[np.ndarray]]]: A list of batches, where each batch is a list of samples (if return_detokenized is False and return_doc_details is False).
        List[List[str]]: A list of batches, where each batch is a list of detokenized text samples (if return_detokenized is True and return_doc_details is False).
        List[List[Tuple[List[np.ndarray], Dict]]]: A list of batches with token sequences and document details (if return_detokenized is False and return_doc_details is True).
        List[List[Tuple[str, Dict]]]: A list of batches with detokenized text and document details (if return_detokenized is True and return_doc_details is True).
    """
    if not callable(policy_fn):
        raise ValueError("policy_fn must be callable.")

    # Reject bad arguments before running the policy, so an invalid call never
    # triggers whatever work the policy function does.
    if not isinstance(batch_size, int) or batch_size <= 0:
        raise ValueError("batch_size must be a positive integer.")
    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True.")

    batch_ids = policy_fn(*policy_args, **policy_kwargs)

    if not isinstance(batch_ids, list):
        raise ValueError("policy_fn must return a list of integers.")
    # Checked here because a non-integer ID would otherwise escape as a TypeError
    # from range() rather than the documented ValueError.
    if not all(isinstance(batch_id, int) for batch_id in batch_ids):
        raise ValueError("policy_fn must return a list of integers.")
    if not all(batch_id >= 0 for batch_id in batch_ids):
        raise ValueError("policy_fn must return a list of non-negative integers.")

    batches = []
    for batch_id in batch_ids:
        # Get indices for this single batch
        first_index = batch_id * batch_size
        indices = list(range(first_index, first_index + batch_size))
        batch = self.get_samples_by_indices(
            indices=indices,
            return_doc_details=return_doc_details,
            return_detokenized=return_detokenized,
            tokenizer=tokenizer
        )
        batches.append(batch)

    return batches