Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
python: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand Down Expand Up @@ -52,6 +52,7 @@ jobs:
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python: ["3.13", "3.13t"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
Expand All @@ -72,7 +73,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: 3.13
python-version: ${{ matrix.python }}
architecture: "x64"


Expand Down
38 changes: 38 additions & 0 deletions bindings/python/tests/bindings/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,44 @@ def test_multiprocessing_with_parallelism(self):
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)

def test_multithreaded_concurrency(self):
    """Stress-test concurrent tokenization from multiple Python threads.

    Submits 30 encode jobs (mixing encode_batch and encode_batch_fast) to a
    small thread pool and checks they all complete and return full batches.
    Each worker builds its own Tokenizer, so the test exercises concurrent
    native-side encoding rather than shared-object access.
    """

    # Thread worker functions — one per API under test.
    def encode_batch(batch):
        tokenizer = Tokenizer(BPE())
        return tokenizer.encode_batch(batch)

    def encode_batch_fast(batch):
        tokenizer = Tokenizer(BPE())
        return tokenizer.encode_batch_fast(batch)

    # Create some significant workload: three batches of 20 long texts each.
    batches = [
        ["my name is john " * 50] * 20,
        ["my name is paul " * 50] * 20,
        ["my name is ringo " * 50] * 20,
    ]

    # Many encoding operations to run concurrently (30 tasks total).
    tasks = [
        (encode_batch, batches[0]),
        (encode_batch_fast, batches[1]),
        (encode_batch, batches[2]),
    ] * 10

    # Fix: use the executor as a context manager so worker threads are
    # joined and pool resources released even if submit() or result()
    # raises; the original leaked the pool on any failure.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(fn, batch) for fn, batch in tasks]

        # All tasks should complete successfully (result() re-raises any
        # exception from a worker, failing the test).
        results = [f.result() for f in futures]

    # Verify results: 30 tasks, each returning one encoding per input text.
    assert len(results) == 30
    assert all(len(result) == 20 for result in results)

def test_from_pretrained(self):
tokenizer = Tokenizer.from_pretrained("bert-base-cased")
output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
Expand Down