Exploring How Tokenizers Work
Post

Exploring How Tokenizers Work

Requirements:

1
2
3
4
5
6
7
8
9
10
11
# Install pip, create the project directory, and pin dependency versions
# so the tutorial is reproducible.
apt update && apt install python3-pip -y
mkdir -p llm-chatbot   # -p: do not fail if the directory already exists
cd llm-chatbot
cat >requirements.txt <<EOF
transformers==4.48.3
torch==2.6.0
accelerate==1.4.0
einops==0.8.0
jinja2==3.1.6
EOF
pip install -r requirements.txt

TinyLlama Tokenizer

To use it, first execute python3 and run one command at a time:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Explore how the TinyLlama tokenizer maps text to token ids (and back),
# and compare a token id's meaning across different models' vocabularies.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pin an exact revision of TinyLlama/TinyLlama-1.1B-Chat-v1.0 for reproducibility.
revision_id = "fe8a4ea1ffedaf415f4da2f062534de366a451e6"
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    revision=revision_id,
    device_map="auto",      # place weights on the available device automatically
    torch_dtype="auto",     # use the checkpoint's native dtype
    trust_remote_code=True,
)

# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", revision=revision_id)

# Create a prompt for the model.
user_input = "How much is a gazillion?"

# Convert the prompt to a tensor of token ids on the same device as the model.
user_input_as_tokens = tokenizer(user_input, return_tensors="pt").input_ids.to(model.device)

# Generate up to 50 new tokens continuing the prompt.
model_output = model.generate(input_ids=user_input_as_tokens, max_new_tokens=50)

print(tokenizer.decode(model_output[0]))

# Append the chat template's assistant marker so the model answers
# instead of merely continuing the question.
user_input = "How much is a gazillion?<|assistant|>"

user_input_as_tokens = tokenizer(user_input, return_tensors="pt").input_ids.to(model.device)

model_output = model.generate(input_ids=user_input_as_tokens, max_new_tokens=50)

print(tokenizer.decode(model_output[0]))

# Inspect the raw token ids of the prompt.
print(user_input_as_tokens)

# Decode the prompt one token at a time.
# (Named token_id rather than id to avoid shadowing the builtin.)
for token_id in user_input_as_tokens[0]:
    print(tokenizer.decode(token_id))

# Decode the generated sequence one token at a time.
for token_id in model_output[0]:
    print(tokenizer.decode(token_id))

print(model_output[0])

# Decode two individual token ids to see which text fragments they map to.
print(tokenizer.decode(29900))
print(tokenizer.decode(29892))

# Tokenize prompts that share the "gaz" prefix to compare their id sequences.
user_input = "What is a gazebo? <|assistant|>"
user_input_as_tokens = tokenizer(user_input, return_tensors="pt").input_ids.to(model.device)
print(user_input_as_tokens)

user_input = "Which country is indigenous to gazelles? <|assistant|>"
user_input_as_tokens = tokenizer(user_input, return_tensors="pt").input_ids.to(model.device)
print(user_input_as_tokens)

print(tokenizer.decode(12642))

# Switch to Phi-3 and decode the SAME token id: different vocabulary,
# so the same id maps to different text.
revision_id = "0a67737cc96d2554230f90338b163bc6380a2a85"
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    revision=revision_id,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", revision=revision_id)
print(tokenizer.decode(12642))

# Back to TinyLlama to confirm id 12642 decodes as before.
revision_id = "fe8a4ea1ffedaf415f4da2f062534de366a451e6"
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    revision=revision_id,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", revision=revision_id)
print(tokenizer.decode(12642))

# Phi-3 again, now decoding a second id as well.
revision_id = "0a67737cc96d2554230f90338b163bc6380a2a85"

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    revision=revision_id,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", revision=revision_id)
print(tokenizer.decode(12642))
print(tokenizer.decode(12652))

# And TinyLlama once more for the same pair of ids.
revision_id = "fe8a4ea1ffedaf415f4da2f062534de366a451e6"
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    revision=revision_id,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", revision=revision_id)
print(tokenizer.decode(12642))
print(tokenizer.decode(12652))

TinySwallow Tokenizer

1
2
3
4
5
6
7
8
9
10
11
12
# Decode two token ids with the TinySwallow tokenizer to compare its
# vocabulary against the other models in this post.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pin an exact revision of SakanaAI/TinySwallow-1.5B-Instruct for reproducibility.
revision_id = "91e9fcc30f56d224aea84356c4d850cc4c5a3260"
model = AutoModelForCausalLM.from_pretrained(
    "SakanaAI/TinySwallow-1.5B-Instruct",
    revision=revision_id,
    device_map="auto",      # place weights on the available device automatically
    torch_dtype="auto",     # use the checkpoint's native dtype
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("SakanaAI/TinySwallow-1.5B-Instruct", revision=revision_id)
print(tokenizer.decode(12652))
print(tokenizer.decode(12452))
print(tokenizer.decode(12452));

DeepSeek Tokenizer

1
2
3
4
5
6
7
8
9
10
11
12
# Decode the same two token ids with the DeepSeek-VL2-tiny tokenizer.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pin an exact revision of deepseek-ai/deepseek-vl2-tiny for reproducibility.
revision_id = "66c54660eae7e90c9ba259bfdf92d07d6e3ce8aa"
model = AutoModelForCausalLM.from_pretrained(
    "deepseek-ai/deepseek-vl2-tiny",
    revision=revision_id,
    device_map="auto",      # place weights on the available device automatically
    torch_dtype="auto",     # use the checkpoint's native dtype
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-vl2-tiny", revision=revision_id)
print(tokenizer.decode(12652))
print(tokenizer.decode(12452))

TinyMistral

1
2
3
4
5
6
7
8
9
10
11
12
# Decode the same two token ids with the TinyMistral tokenizer.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pin an exact revision of Felladrin/TinyMistral-248M-Chat-v4 for reproducibility.
revision_id = "dcd3a7c4d80c2f8c338eea58d8067460c06a027f"
model = AutoModelForCausalLM.from_pretrained(
    "Felladrin/TinyMistral-248M-Chat-v4",
    revision=revision_id,
    device_map="auto",      # place weights on the available device automatically
    torch_dtype="auto",     # use the checkpoint's native dtype
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("Felladrin/TinyMistral-248M-Chat-v4", revision=revision_id)
print(tokenizer.decode(12652))
print(tokenizer.decode(12452))

microsoft/Phi

1
2
3
4
5
6
7
8
9
10
11
# Decode a token id with the Phi-3 tokenizer for comparison with the others.
from transformers import AutoModelForCausalLM, AutoTokenizer

# Pin an exact revision of microsoft/Phi-3-mini-4k-instruct for reproducibility.
revision_id = "0a67737cc96d2554230f90338b163bc6380a2a85"
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    revision=revision_id,
    device_map="auto",      # place weights on the available device automatically
    torch_dtype="auto",     # use the checkpoint's native dtype
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", revision=revision_id)
print(tokenizer.decode(12642))