Coverage for tests/unit/tokenization/test_vocab.py: 100%

21 statements  

coverage.py v7.6.12, created at 2025-03-24 00:33 -0600

import pytest

from maze_dataset.constants import (
    SPECIAL_TOKENS,
    VOCAB,
    VOCAB_LIST,
    VOCAB_TOKEN_TO_INDEX,
)


def test_special_tokens_base():
    # Test the getitem method
    assert SPECIAL_TOKENS["ADJLIST_START"] == "<ADJLIST_START>"

    with pytest.raises(KeyError):
        SPECIAL_TOKENS["NON_EXISTENT_KEY"]

    # Test the len method
    assert len(SPECIAL_TOKENS) == 11

    # Test the contains method
    assert "ADJLIST_START" in SPECIAL_TOKENS
    assert "NON_EXISTENT_KEY" not in SPECIAL_TOKENS

    # Test the values method
    assert "<ADJLIST_START>" in SPECIAL_TOKENS.values()

    # Test the items method
    assert ("ADJLIST_START", "<ADJLIST_START>") in SPECIAL_TOKENS.items()

    # Test the keys method
    assert "ADJLIST_START" in SPECIAL_TOKENS


def test_vocab():
    assert len(VOCAB) == 4096
    # due to typing issue with VOCAB being instance of a dynamic dataclass
    assert VOCAB.CTT_10 == "10"  # type: ignore[attr-defined]
    assert VOCAB_LIST[0] == "<ADJLIST_START>"
    assert VOCAB_LIST[706] == "&"
    assert VOCAB_TOKEN_TO_INDEX["<UNK>"] == 19
    assert VOCAB_TOKEN_TO_INDEX["0"] == 320
    assert VOCAB_TOKEN_TO_INDEX["-1"] == 703
    assert VOCAB_TOKEN_TO_INDEX["(0,0)"] == 1596
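The assertions in test_vocab only spot-check a few tokens and indices. A minimal sketch of a broader consistency check, assuming VOCAB_TOKEN_TO_INDEX is built as the inverse of VOCAB_LIST (each unique token mapped to its position); test_vocab_roundtrip is a hypothetical test name, not part of the existing suite:

from maze_dataset.constants import VOCAB_LIST, VOCAB_TOKEN_TO_INDEX


def test_vocab_roundtrip():
    # assumed invariant: the two structures describe the same mapping,
    # so they have the same size and every token maps back to its position
    assert len(VOCAB_LIST) == len(VOCAB_TOKEN_TO_INDEX)
    for index, token in enumerate(VOCAB_LIST):
        assert VOCAB_TOKEN_TO_INDEX[token] == index

If that assumption holds, this covers every entry of the vocabulary rather than the handful of indices asserted above.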