Commit 746aaae: remove tiktoken pin (#1759)

jongwook committed Nov 6, 2023
1 parent b9f17e1 commit 746aaae

Showing 2 changed files with 13 additions and 3 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,4 +3,4 @@ numpy
 torch
 tqdm
 more-itertools
-tiktoken==0.3.3
+tiktoken
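With the pin removed, pip is free to resolve the newest tiktoken release instead of forcing 0.3.3. A quick, hypothetical check of which version actually got installed (not part of this commit):

```python
# Hypothetical check, not part of this commit: report which tiktoken
# release pip resolved now that requirements.txt no longer pins 0.3.3.
from importlib.metadata import version

print(version("tiktoken"))  # prints whatever release is installed
```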
14 changes: 12 additions & 2 deletions tests/test_tokenizer.py
@@ -1,7 +1,17 @@
+import pytest
+
 from whisper.tokenizer import get_tokenizer
 
 
-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=multilingual)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)
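With @pytest.mark.parametrize, the rewritten test_tokenizer runs once with multilingual=True and once with multilingual=False, so both the GPT-2 and multilingual vocabularies get the same assertions; the old body moves to test_multilingual_tokenizer. A minimal sketch of the attributes those assertions touch (assumes whisper is installed; it uses only names that appear in the diff itself):

```python
# Minimal sketch (assumes whisper is installed): the tokenizer attributes
# exercised by the new assertions in test_tokenizer.
from whisper.tokenizer import get_tokenizer

tok = get_tokenizer(multilingual=True)
print(tok.sot in tok.sot_sequence)   # True: start-of-transcript token opens the prompt sequence
print(len(tok.all_language_codes))   # one language code per language token...
print(len(tok.all_language_tokens))  # ...so these two lengths match
print(all(t < tok.timestamp_begin for t in tok.all_language_tokens))  # language ids precede timestamp ids
```

Running `pytest tests/test_tokenizer.py` now reports the two cases separately, as test_tokenizer[True] and test_tokenizer[False].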

@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
 
-    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
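The only change in this hunk is spelling the expected replacement character as the escape \ufffd instead of a literal "�", which keeps the source ASCII-clean. The character itself shows up because token 246 on its own does not decode to a complete UTF-8 sequence, and decoding incomplete bytes with errors="replace" yields U+FFFD. A standalone sketch of that effect (plain Python, independent of whisper):

```python
# Decoding an incomplete UTF-8 sequence with errors="replace" yields the
# Unicode replacement character U+FFFD, i.e. "\ufffd".
partial = "é".encode("utf-8")[:1]  # b'\xc3': first byte of the two-byte sequence
decoded = partial.decode("utf-8", errors="replace")
print(decoded == "\ufffd")  # True
```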
