Coverage for maze_dataset/tokenization/modular/fst_load.py: 44%
36 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-27 23:43 -0600
« prev ^ index » next coverage.py v7.6.12, created at 2025-03-27 23:43 -0600
"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst`

this module (`maze_dataset.tokenization.modular.fst_load`) handles loading the fst file, which we ship to the user,
and checking tokenizer names against it

creating the fst file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`.
as such, the loading code used for validating a tokenizer is kept separate in this module,
since we need to be able to import it from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and
we cannot circularly import

thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before

"""
14import warnings
15from functools import cache
16from pathlib import Path
# assume the import failed until the guarded import further down succeeds
_RUST_FST_LOADED: bool = False
"""whether the `rust_fst` module was imported successfully"""

# shared text for every warning about `rust_fst` being unavailable;
# it ends with a newline so callers can append further details directly
_RUST_FST_ERR_MSG: str = (
    "you need the `rust_fst` package to use `maze_dataset.tokenization.modular` properly. installing `maze-dataset[tokenization]` will install it\n"
    "Note that rust-fst doesn't work on mac, see https://github.com/understanding-search/maze-dataset/issues/57\n"
    "and this makes modular tokenizers not checkable on mac. Things should still work, but you will have no guarantee that a tokenizer is tested.\n"
    "If you can find away around this, please let us know!\n"
)
class RustFstNotLoadedWarning(UserWarning):
    """emitted when the `rust_fst` package could not be loaded"""
# `rust_fst` is an optional dependency: a failed import only produces a warning,
# and `_RUST_FST_LOADED` records the outcome for the rest of the module
try:
    from rust_fst import Set as FstSet  # type: ignore[import-untyped]
except ImportError as e:
    warnings.warn(_RUST_FST_ERR_MSG + str(e), RustFstNotLoadedWarning)
    _RUST_FST_LOADED = False
else:
    _RUST_FST_LOADED = True

# the fst file sits in the same directory as this module
MMT_FST_PATH: Path = Path(__file__).with_name("MazeTokenizerModular_tested.fst")
@cache
def get_tokenizers_fst() -> "FstSet":
    """build the fst set of tested tokenizers from the file at `MMT_FST_PATH`

    the result is cached, so the `FstSet` is only constructed on the first call
    """
    fst_path: str = MMT_FST_PATH.as_posix()
    return FstSet(fst_path)
def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
    """report whether `tokenizer_name` is in the fst set of tested tokenizers

    returns `True` when the fst contains exactly `tokenizer_name`. on a miss, returns `False`
    unless `do_except` is truthy, in which case a `ValueError` is raised whose message lists
    the matches found at edit distance 0, 1 and 2 (the latter two only if they could be computed)
    """
    search_0: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
    found: bool = search_0 == [tokenizer_name]

    # nothing more to do unless the caller asked for an exception on a miss
    if found or not do_except:
        return found

    # best-effort lookup of near misses, used only to make the error message more helpful
    search_1: list[str] | None = None
    search_2: list[str] | None = None
    try:
        search_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
        search_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
    except Exception:  # noqa: BLE001, S110
        # a failure here only costs us the suggestions, so it is deliberately ignored
        pass

    msg_parts: list[str] = [
        f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:",
        f"\nedit dist 0 (should be empty?): {search_0}",
    ]
    if search_1 is not None:
        msg_parts.append(f"\nedit dist 1: {search_1}")
    if search_2 is not None:
        msg_parts.append(f"\nedit dist 2: {search_2}")

    raise ValueError("".join(msg_parts))
def _check_tokenizer_in_fst_mock(tokenizer_name: str, do_except: bool = False) -> bool:  # noqa: ARG001
    """mock function for `check_tokenizer_in_fst`

    runs when we cant import `rust_fst` which sets `_RUST_FST_LOADED` to `False`

    both arguments are accepted only for signature compatibility and are ignored:
    the function emits a `RustFstNotLoadedWarning` and always returns `True`
    """
    # `_RUST_FST_ERR_MSG` already ends with a newline; the pieces below carry their own
    # separators so the sentences do not run into each other in the final message
    warnings.warn(
        _RUST_FST_ERR_MSG
        + "you are seeing this warning probably because you tried to run "
        "`MazeTokenizerModular(...).is_tested_tokenizer()` on a mac or without `rust_fst` installed.\n"
        "this is fine, but note that the tokenizer will be checked for validity, but is not part of the tested set",
        # same category as the import-time warning, so both can be filtered together;
        # it subclasses `UserWarning`, so existing `UserWarning` filters still match
        RustFstNotLoadedWarning,
    )
    return True
# without `rust_fst` there is no fst set to query, so the public name
# `check_tokenizer_in_fst` is rebound to the mock defined in this module
if not _RUST_FST_LOADED:
    check_tokenizer_in_fst = _check_tokenizer_in_fst_mock