import argparse


def parse_args():
    """Parse command-line arguments for the TokenSmith Streamlit UI."""
    parser = argparse.ArgumentParser(description="TokenSmith Streamlit UI")

    # Search arguments
    parser.add_argument("--bin-file-path", type=str, help="Path to the binary file containing the dataset")
    parser.add_argument("--search-index-path", type=str, help="Path to save/load the search index")
    parser.add_argument("--vocab", type=int, choices=[2**16, 2**32], help="Vocabulary size (65536 or 4294967296)")
    parser.add_argument("--search-verbose", action="store_true", help="Enable verbose output for search index building")
    parser.add_argument("--reuse-index", action="store_true", help="Reuse existing search index if available")

    # Dataset arguments
    parser.add_argument("--dataset-prefix", type=str, help="Prefix for the dataset files")
    parser.add_argument("--batch-info-prefix", type=str, help="Prefix for the batch information files")
    parser.add_argument("--train-iters", type=int, default=1000, help="Number of training iterations")
    parser.add_argument("--train-batch-size", type=int, default=16, help="Training batch size")
    parser.add_argument("--train-seq-len", type=int, default=1024, help="Training sequence length")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    parser.add_argument("--splits", type=str, default="969,30,1", help="Train/val/test splits")
    parser.add_argument("--packing-impl", type=str, default="packed", choices=["packed", "pack_until_overflow", "unpacked"], help="Packing implementation")
    parser.add_argument("--allow-chopped", action="store_true", help="Allow chopped samples")
    parser.add_argument("--extra-tokens", type=int, default=1, help="Extra tokens to add to the sequence")

    # Tokenizer arguments
    parser.add_argument("--tokenizer-path", type=str, help="Path to tokenizer for detokenization")

    # Mode argument
parser.add_argument("--mode", type=str, choices=["search", "inspect", "both"], default="both",
help="UI mode: 'search' for search only, 'inspect' for inspect and view documents, 'both' for all features")
return parser.parse_args()
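

# Usage sketch (an illustrative addition, not part of the original module):
# Streamlit forwards everything after `--` to the script itself, so the flags
# defined above can be supplied like this (the entry-point name `app.py` and
# the data path are hypothetical examples):
#   streamlit run app.py -- --mode search --bin-file-path data/train.bin
if __name__ == "__main__":
    args = parse_args()
    print(f"TokenSmith UI mode: {args.mode}")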