
Inspect Handler

Source code in tokensmith/inspect/handler.py
def __init__(self, manager: 'DatasetManager'):
    self.manager = manager

manager (instance attribute)

    The DatasetManager passed to __init__; sample lookups are delegated to its WriteableMMapIndexedDataset.
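
A minimal construction sketch (not part of the source above): the class name InspectHandler is assumed from the module path tokensmith/inspect/handler.py, and my_dataset_manager stands in for an already-configured DatasetManager.

from tokensmith.inspect.handler import InspectHandler  # class name assumed

# Hypothetical, already-configured DatasetManager instance.
my_dataset_manager = ...

# The handler only stores a reference to the manager; all dataset access
# goes through that manager.
handler = InspectHandler(manager=my_dataset_manager)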

inspect_sample_by_id

inspect_sample_by_id(sample_id, return_doc_details=False, return_detokenized=False, tokenizer=None)

Returns a sample by its ID, optionally with document details and/or detokenized.

Parameters:

    sample_id (int): The index of the sample to retrieve. Required.
    return_doc_details (bool): If True, includes associated document details. Default: False.
    return_detokenized (bool): If True, returns detokenized text instead of token arrays. Default: False.
    tokenizer (Optional[Any]): The tokenizer to use for detokenization (required if return_detokenized is True). Default: None.

Raises:

    ValueError: If sample_id is not a non-negative integer, or if tokenizer is None when return_detokenized is True.

Returns:

    Union[List[np.ndarray], str, Tuple[List[np.ndarray], Dict], Tuple[str, Dict]]:
        List[np.ndarray]: A list of numpy arrays representing the token sequence (if return_detokenized is False and return_doc_details is False).
        str: Detokenized text (if return_detokenized is True and return_doc_details is False).
        Tuple[List[np.ndarray], Dict]: Token sequence and document details (if return_detokenized is False and return_doc_details is True).
        Tuple[str, Dict]: Detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/inspect/handler.py
def inspect_sample_by_id(
    self,
    sample_id: int,
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
) -> Union[List[np.ndarray], str, Tuple[List[np.ndarray], Dict], Tuple[str, Dict]]:
    """
    Returns a sample by its ID, optionally with document details and/or detokenized.

    Parameters:
        sample_id (int): The index of the sample to retrieve.
        return_doc_details (bool): If True, includes associated document details.
        return_detokenized (bool): If True, returns detokenized text instead of token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).

    Raises:
        ValueError: If sample_id is not a non-negative integer or if tokenizer is None when return_detokenized is True.

    Returns:
        List[np.ndarray]: A list of numpy arrays representing the token sequence (if return_detokenized is False and return_doc_details is False).
        str: Detokenized text (if return_detokenized is True and return_doc_details is False).
        Tuple[List[np.ndarray], Dict]: Token sequence and document details (if return_detokenized is False and return_doc_details is True).
        Tuple[str, Dict]: Detokenized text and document details (if return_detokenized is True and return_doc_details is True).
    """
    if sample_id < 0:
        raise ValueError("sample_id must be a non-negative integer.")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True.")

    response = self.manager.WriteableMMapIndexedDataset.get_example_by_id(
        example_loc=sample_id,
        return_doc_details=return_doc_details
    )

    if return_doc_details:
        output_seq, doc_details = response
    else:
        output_seq = response
        doc_details = None

    if return_detokenized:
        output_seq = generate_training_sample(output_seq, tokenizer)

    if return_doc_details:
        return output_seq, doc_details
    else:
        return output_seq
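
Example usage (illustrative only; handler is an Inspect Handler obtained as above, and tokenizer is whatever tokenizer the detokenization step expects; both are assumptions rather than part of this page's source):

# Raw token arrays for the first sample.
tokens = handler.inspect_sample_by_id(sample_id=0)

# Detokenized text together with document details; with both flags set the
# method returns a (text, details) tuple, matching the return types above.
text, details = handler.inspect_sample_by_id(
    sample_id=0,
    return_doc_details=True,
    return_detokenized=True,
    tokenizer=tokenizer,  # required whenever return_detokenized is True
)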

inspect_sample_by_batch

inspect_sample_by_batch(batch_id, batch_size, return_doc_details=False, return_detokenized=False, tokenizer=None)

Returns a batch of samples by batch ID, optionally with document details and/or detokenized.

Parameters:

    batch_id (int): The index of the batch to retrieve. Required.
    batch_size (int): The number of samples per batch. Required.
    return_doc_details (bool): If True, includes associated document details. Default: False.
    return_detokenized (bool): If True, returns detokenized text instead of token arrays. Default: False.
    tokenizer (Optional[Any]): The tokenizer to use for detokenization (required if return_detokenized is True). Default: None.

Raises:

    ValueError: If batch_id is not a non-negative integer, or if tokenizer is None when return_detokenized is True.

Returns:

    Union[List[List[np.ndarray]], List[str], List[Tuple[List[np.ndarray], Dict]], List[Tuple[str, Dict]]]:
        List[List[np.ndarray]]: A list of samples, where each sample is a list of token arrays (if return_detokenized is False and return_doc_details is False).
        List[str]: A list of detokenized text samples (if return_detokenized is True and return_doc_details is False).
        List[Tuple[List[np.ndarray], Dict]]: A list of tuples containing token sequences and document details (if return_detokenized is False and return_doc_details is True).
        List[Tuple[str, Dict]]: A list of tuples containing detokenized text and document details (if return_detokenized is True and return_doc_details is True).

Source code in tokensmith/inspect/handler.py
def inspect_sample_by_batch(
    self,
    batch_id: int,
    batch_size: int,
    return_doc_details: bool = False,
    return_detokenized: bool = False,
    tokenizer: Optional[Any] = None,
) -> Union[List[List[np.ndarray]], List[str], List[Tuple[List[np.ndarray], Dict]], List[Tuple[str, Dict]]]:
    """
    Returns a batch of samples by batch ID, optionally with document details and/or detokenized.

    Parameters:
        batch_id (int): The index of the batch to retrieve.
        batch_size (int): The size of the batch.
        return_doc_details (bool): If True, includes associated document details.
        return_detokenized (bool): If True, returns detokenized text instead of token arrays.
        tokenizer: The tokenizer to use for detokenization (required if return_detokenized is True).

    Raises:
        ValueError: If batch_id is not a non-negative integer or if tokenizer is None when return_detokenized is True.

    Returns:
        List[List[np.ndarray]]: A list of samples, where each sample is a list of token arrays (if return_detokenized is False and return_doc_details is False).
        List[str]: A list of detokenized text samples (if return_detokenized is True and return_doc_details is False).
        List[Tuple[List[np.ndarray], Dict]]: A list of tuples containing token sequences and document details (if return_detokenized is False and return_doc_details is True).
        List[Tuple[str, Dict]]: A list of tuples containing detokenized text and document details (if return_detokenized is True and return_doc_details is True).
    """
    if batch_id < 0:
        raise ValueError("batch_id must be a non-negative integer.")

    if return_detokenized and tokenizer is None:
        raise ValueError("tokenizer must be provided if return_detokenized is True.")

    indices = [i for i in range(batch_id * batch_size, (batch_id + 1) * batch_size)]

    batch_data = []

    for sample_id in indices:
        sample = self.inspect_sample_by_id(
            sample_id=sample_id,
            return_doc_details=return_doc_details,
            return_detokenized=return_detokenized,
            tokenizer=tokenizer
        )
        batch_data.append(sample)

    return batch_data
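
Example usage (illustrative only, under the same assumptions as the earlier example). As the source shows, a batch simply covers the consecutive sample IDs batch_id * batch_size through (batch_id + 1) * batch_size - 1.

# Third batch of 8 samples (sample IDs 16 through 23) as raw token arrays.
batch = handler.inspect_sample_by_batch(batch_id=2, batch_size=8)
assert len(batch) == 8

# With return_doc_details=True each element is a (tokens, details) tuple.
for tokens, details in handler.inspect_sample_by_batch(
    batch_id=2, batch_size=8, return_doc_details=True
):
    print(details)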