docs for maze-dataset v1.3.2
View Source on GitHub

maze_dataset.tokenization.modular.fst_load

to check if a tokenizer is one of our "approved" ones, look in an fst set we made with rust_fst

this file handles the creation of this fst file, which we ship to the user

this file relies on importing get_all_tokenizers and thus MazeTokenizerModular. as such, the code that loads the fst file to validate a tokenizer lives in the separate maze_dataset.tokenization.modular.fst_load module, since we need to be able to import it from maze_dataset.tokenization.modular.maze_tokenizer_modular without creating a circular import

thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before


 1"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst`
 2
 3this file handles the creation of this fst file, which we ship to the user
 4
 5this file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`.
 6as such, loading this file for validating a tokenizer is the separate `maze_dataset.tokenization.modular.fst_load`
 7module, since we need to be able to import that from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and
 8we cannot circularly import
 9
10thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before
11
12"""
13
14import warnings
15from functools import cache
16from pathlib import Path
17
18_RUST_FST_LOADED: bool = False
19"""if the rust_fst module was loaded successfully"""
20
21_RUST_FST_ERR_MSG: str = (
22	"you need the `rust_fst` package to use `maze_dataset.tokenization.modular` properly. installing `maze-dataset[tokenization]` will install it\n"
23	"Note that rust-fst doesn't work on mac, see https://github.com/understanding-search/maze-dataset/issues/57\n"
24	"and this makes modular tokenizers not checkable on mac. Things should still work, but you will have no guarantee that a tokenizer is tested.\n"
25	"If you can find away around this, please let us know!\n"
26)
27
28
class RustFstNotLoadedWarning(UserWarning):
	"""warning category used to signal that `rust_fst` is not loaded"""
32
# `rust_fst` is optional: a failed import is downgraded to a warning,
# and `_RUST_FST_LOADED` records the outcome
try:
	from rust_fst import Set as FstSet  # type: ignore[import-untyped]
except ImportError as import_err:
	warnings.warn(_RUST_FST_ERR_MSG + str(import_err), RustFstNotLoadedWarning)
	_RUST_FST_LOADED = False
else:
	_RUST_FST_LOADED = True

# the fst file is looked up in the directory containing this module
MMT_FST_PATH: Path = Path(__file__).parent.joinpath("MazeTokenizerModular_tested.fst")
42
43
@cache
def get_tokenizers_fst() -> "FstSet":
	"""build the tokenizers fst set from the file at `MMT_FST_PATH`

	wrapped in `functools.cache`, so the set is only constructed on the first call
	"""
	fst_file: str = MMT_FST_PATH.as_posix()
	return FstSet(fst_file)
48
49
def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
	"""tell whether `tokenizer_name` is an exact member of the tokenizers fst set

	returns `True` when an edit-distance-0 search yields exactly `tokenizer_name`.
	when the name is missing and `do_except` is `False`, returns `False`.
	when the name is missing and `do_except` is `True`, raises a `ValueError`
	whose message lists the matches found at edit distance 0, 1 and 2
	"""
	exact_hits: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
	found: bool = exact_hits == [tokenizer_name]

	# nothing more to do unless the caller asked for an exception on a miss
	if found or not do_except:
		return found

	near_1: list[str] | None = None
	near_2: list[str] | None = None
	try:
		near_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
		near_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
	except Exception:  # noqa: BLE001, S110
		# suggestions are best-effort: a failure here only means fewer hints in the error message
		pass

	msg_parts: list[str] = [
		f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:",
		f"\nedit dist 0 (should be empty?): {exact_hits}",
	]
	# only mention the distances whose search actually completed
	if near_1 is not None:
		msg_parts.append(f"\nedit dist 1: {near_1}")
	if near_2 is not None:
		msg_parts.append(f"\nedit dist 2: {near_2}")

	raise ValueError("".join(msg_parts))
77
78
def _check_tokenizer_in_fst_mock(tokenizer_name: str, do_except: bool = False) -> bool:  # noqa: ARG001
	"""mock function for `check_tokenizer_in_fst`

	runs when we cant import `rust_fst` which sets `_RUST_FST_LOADED` to `False`

	both arguments are accepted only to mirror the signature of `check_tokenizer_in_fst` and are ignored.
	always emits a `RustFstNotLoadedWarning` and returns `True`, since membership cannot be checked
	"""
	# the trailing spaces keep the adjacent string pieces from running together in the final message
	warnings.warn(
		_RUST_FST_ERR_MSG
		+ "you are seeing this warning probably because you tried to run "
		"`MazeTokenizerModular(...).is_tested_tokenizer()` on a mac or without `rust_fst` installed. "
		"this is fine, but note that the tokenizer will be checked for validity, but is not part of the tested set",
		# same category as the import-time warning, so both can be filtered together
		RustFstNotLoadedWarning,
	)
	return True
91
92
# override the function if we can't load rust_fst
# rebinding the module-level name makes `check_tokenizer_in_fst` refer to the mock,
# which only warns and returns `True` instead of querying the fst set
if not _RUST_FST_LOADED:
	check_tokenizer_in_fst = _check_tokenizer_in_fst_mock

class RustFstNotLoadedWarning(builtins.UserWarning):
30class RustFstNotLoadedWarning(UserWarning):
31	"""warning for when `rust_fst` is not loaded"""

warning for when rust_fst is not loaded

Inherited Members
builtins.UserWarning
UserWarning
builtins.BaseException
with_traceback
add_note
args
MMT_FST_PATH: pathlib.Path = PosixPath('/home/miv/projects/mazes/maze-dataset/maze_dataset/tokenization/modular/MazeTokenizerModular_tested.fst')
@cache
def get_tokenizers_fst() -> rust_fst.set.Set:
45@cache
46def get_tokenizers_fst() -> "FstSet":
47	"""(cached) load the tokenizers fst set from `MMT_FST_PATH`"""
48	return FstSet(MMT_FST_PATH.as_posix())

(cached) load the tokenizers fst set from MMT_FST_PATH

def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
51def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
52	"""check if a tokenizer is in the fst set
53
54	prints nearest matches if `do_except` is `True` and the tokenizer is not found
55	"""
56	search_0: list[str] = list(get_tokenizers_fst().search(tokenizer_name, 0))
57	in_fst: bool = len(search_0) == 1 and search_0[0] == tokenizer_name
58
59	if do_except and not in_fst:
60		search_1: list[str] | None = None
61		search_2: list[str] | None = None
62		try:
63			search_1 = list(get_tokenizers_fst().search(tokenizer_name, 1))
64			search_2 = list(get_tokenizers_fst().search(tokenizer_name, 2))
65		except Exception:  # noqa: BLE001, S110
66			# the only thing failing here is getting possible match tokenizers, so it's fine to just ignore the errors
67			pass
68
69		err_msg: str = (
70			f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:"
71			f"\nedit dist 0 (should be empty?): {search_0}"
72			+ (f"\nedit dist 1: {search_1}" if search_1 is not None else "")
73			+ (f"\nedit dist 2: {search_2}" if search_2 is not None else "")
74		)
75		raise ValueError(err_msg)
76
77	return in_fst

check if a tokenizer is in the fst set

prints nearest matches if do_except is True and the tokenizer is not found