From bf150e442eaed9af6e6f559cb51ba0cd96dded81 Mon Sep 17 00:00:00 2001
From: Barabazs <31799121+Barabazs@users.noreply.github.com>
Date: Thu, 2 Oct 2025 16:06:38 +0000
Subject: [PATCH 1/2] feat: update Punkt tokenizer to use pre-trained model and
 handle missing data

---
 whisperx/alignment.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 3e19292ab..ae9f997c5 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -22,9 +22,8 @@
     SingleWordSegment,
     SegmentData,
 )
-from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
-
-PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof', 'jr', 'sr', 'ph.d']
+import nltk
+from nltk.data import load as nltk_load
 
 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
 
@@ -188,9 +187,11 @@ def align(
                 clean_wdx.append(wdx)
 
 
-        punkt_param = PunktParameters()
-        punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
-        sentence_splitter = PunktSentenceTokenizer(punkt_param)
+        try:
+            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
+        except LookupError:
+            nltk.download('punkt_tab', quiet=True)
+            sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
         sentence_spans = list(sentence_splitter.span_tokenize(text))
 
         segment_data[sdx] = {

From b1c8ac7de62c5969343352f2d10a31ebe5a107fd Mon Sep 17 00:00:00 2001
From: Nguyen Binh <nguyenvulebinh@gmail.com>
Date: Thu, 11 Apr 2024 16:01:20 +0200
Subject: [PATCH 2/2] Change alignment model for Vietnamese language

The current model is only a pre-trained wav2vec2 model for Vietnamese audio, so it does not work for alignment tasks. To make alignment work as expected, I recommend changing to a fine-tuned ASR version.
---
 whisperx/alignment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index ae9f997c5..765cb82ad 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -52,7 +52,7 @@
     "tr": "mpoyraz/wav2vec2-xls-r-300m-cv7-turkish",
     "da": "saattrupdan/wav2vec2-xls-r-300m-ftspeech",
     "he": "imvladikon/wav2vec2-xls-r-300m-hebrew",
-    "vi": 'nguyenvulebinh/wav2vec2-base-vi',
+    "vi": 'nguyenvulebinh/wav2vec2-base-vi-vlsp2020',
     "ko": "kresnik/wav2vec2-large-xlsr-korean",
     "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
     "te": "anuragshas/wav2vec2-large-xlsr-53-telugu",