Coverage for maze_dataset/tokenization/modular/fst_load.py: 44%

36 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-03-27 23:43 -0600

1"""to check if a tokenizer is one of our "approved" ones, look in an fst set we made with `rust_fst` 

2 

3this file handles loading that fst file, which we ship to the user, and checking tokenizer names against it 

4 

5creating the fst file relies on importing `get_all_tokenizers` and thus `MazeTokenizerModular`. 

6as such, loading the fst file for validating a tokenizer lives in this separate `maze_dataset.tokenization.modular.fst_load` 

7module, since we need to be able to import it from `maze_dataset.tokenization.modular.maze_tokenizer_modular` and 

8we cannot circularly import 

9 

10thanks to https://github.com/rozbb for suggesting doing this instead of storing a whole bunch of hashes like we were doing before 

11 

12""" 

13 

14import warnings 

15from functools import cache 

16from pathlib import Path 

17 

18_RUST_FST_LOADED: bool = False 

19"""if the rust_fst module was loaded successfully""" 

20 

21_RUST_FST_ERR_MSG: str = ( 

22 "you need the `rust_fst` package to use `maze_dataset.tokenization.modular` properly. installing `maze-dataset[tokenization]` will install it\n" 

23 "Note that rust-fst doesn't work on mac, see https://github.com/understanding-search/maze-dataset/issues/57\n" 

24 "and this makes modular tokenizers not checkable on mac. Things should still work, but you will have no guarantee that a tokenizer is tested.\n" 

25 "If you can find away around this, please let us know!\n" 

26) 

27 

28 

class RustFstNotLoadedWarning(UserWarning):
    """warning category used when the `rust_fst` package could not be imported

    subclasses `UserWarning`, so any filter matching `UserWarning` also matches it
    """

31 

32 

# `rust_fst` may be missing, so guard the import and record the outcome
# in `_RUST_FST_LOADED` instead of failing when this module is imported
try:
    from rust_fst import Set as FstSet  # type: ignore[import-untyped]
except ImportError as import_err:
    # tell the user what is missing, with the underlying import error appended
    warnings.warn(_RUST_FST_ERR_MSG + str(import_err), RustFstNotLoadedWarning)
    _RUST_FST_LOADED = False
else:
    _RUST_FST_LOADED = True

# the fst file sits next to this module, in the same directory
MMT_FST_PATH: Path = Path(__file__).with_name("MazeTokenizerModular_tested.fst")

42 

43 

@cache
def get_tokenizers_fst() -> "FstSet":
    """(cached) load the tokenizers fst set from `MMT_FST_PATH`

    the set is built once from the file at `MMT_FST_PATH` and the same object
    is returned on every later call.

    raises `ImportError` (carrying `_RUST_FST_ERR_MSG`) if `rust_fst` could not be imported
    """
    if not _RUST_FST_LOADED:
        # `FstSet` was never bound in this case; without this guard the call
        # would fail with an unhelpful `NameError` instead of install guidance
        raise ImportError(_RUST_FST_ERR_MSG)
    return FstSet(MMT_FST_PATH.as_posix())

48 

49 

def check_tokenizer_in_fst(tokenizer_name: str, do_except: bool = False) -> bool:
    """check whether `tokenizer_name` is in the fst set of tested tokenizers

    returns `True` when an edit-distance-0 search yields exactly `tokenizer_name`, `False` otherwise.

    if `do_except` is `True` and the tokenizer is not found, raises a `ValueError`
    whose message lists the matches found at edit distance 0, 1 and 2
    """
    # `get_tokenizers_fst` is cached, so one handle serves every search below
    fst_set = get_tokenizers_fst()
    exact_hits: list[str] = list(fst_set.search(tokenizer_name, 0))
    found: bool = exact_hits == [tokenizer_name]

    if found or not do_except:
        return found

    # gather near misses for the error message; this is best-effort only
    near_1: list[str] | None = None
    near_2: list[str] | None = None
    try:
        near_1 = list(fst_set.search(tokenizer_name, 1))
        near_2 = list(fst_set.search(tokenizer_name, 2))
    except Exception:  # noqa: BLE001, S110
        # the only thing failing here is the suggestion lookup, so ignoring errors is fine
        pass

    msg_parts: list[str] = [
        f"Tokenizer `{tokenizer_name}` not found in the list of tested tokenizers, and {do_except = }. We found the following matches based on edit distance:",
        f"\nedit dist 0 (should be empty?): {exact_hits}",
    ]
    if near_1 is not None:
        msg_parts.append(f"\nedit dist 1: {near_1}")
    if near_2 is not None:
        msg_parts.append(f"\nedit dist 2: {near_2}")

    raise ValueError("".join(msg_parts))

77 

78 

def _check_tokenizer_in_fst_mock(tokenizer_name: str, do_except: bool = False) -> bool:  # noqa: ARG001
    """mock function for `check_tokenizer_in_fst`

    runs when we cant import `rust_fst` which sets `_RUST_FST_LOADED` to `False`

    both arguments are accepted only to mirror the real signature and are ignored.
    emits a `RustFstNotLoadedWarning` and always returns `True`
    """
    warnings.warn(
        _RUST_FST_ERR_MSG
        # separators between the pieces are explicit: adjacent literals are
        # concatenated verbatim, which previously fused words together
        + "you are seeing this warning probably because you tried to run "
        "`MazeTokenizerModular(...).is_tested_tokenizer()` on a mac or without `rust_fst` installed\n"
        "this is fine, but note that the tokenizer will be checked for validity, but is not part of the tested set",
        # same category as the import-time warning, so one filter covers both
        RustFstNotLoadedWarning,
    )
    return True

91 

92 

# override the function if we can't load rust_fst:
# rebinding the module-level name means anything that looks up `check_tokenizer_in_fst`
# on this module gets the mock, which warns and returns `True` instead of querying the fst
if not _RUST_FST_LOADED:
    check_tokenizer_in_fst = _check_tokenizer_in_fst_mock