Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions whisperx/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

from whisperx.audio import SAMPLE_RATE, load_audio
from whisperx.utils import interpolate_nans
from whisperx.utils import interpolate_nans, PUNKT_LANGUAGES
from whisperx.schema import (
AlignedTranscriptionResult,
SingleSegment,
Expand Down Expand Up @@ -192,11 +192,13 @@ def align(
clean_wdx.append(wdx)


# Use the language-specific Punkt model if one is available; otherwise fall back to English.
punkt_lang = PUNKT_LANGUAGES.get(model_lang, 'english')
try:
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
except LookupError:
nltk.download('punkt_tab', quiet=True)
sentence_splitter = nltk_load('tokenizers/punkt/english.pickle')
sentence_splitter = nltk_load(f'tokenizers/punkt_tab/{punkt_lang}.pickle')
sentence_spans = list(sentence_splitter.span_tokenize(text))

segment_data[sdx] = {
Expand Down
23 changes: 23 additions & 0 deletions whisperx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,29 @@

# Language codes flagged as "without spaces" -- presumably Japanese ("ja") and
# Chinese ("zh"), whose text has no word-separating spaces. NOTE(review): the
# code that consumes this list is outside this view; confirm at the use sites.
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]

# NLTK Punkt tokenizer model name for each supported language code.
# Keys are two-letter language codes; values are the Punkt model names.
# Codes that are not listed here simply have no entry in this table.
PUNKT_LANGUAGES = {
    "cs": "czech",
    "da": "danish",
    "de": "german",
    "el": "greek",
    "en": "english",
    "es": "spanish",
    "et": "estonian",
    "fi": "finnish",
    "fr": "french",
    "it": "italian",
    "nl": "dutch",
    "no": "norwegian",
    "pl": "polish",
    "pt": "portuguese",
    "sl": "slovene",
    "sv": "swedish",
    "tr": "turkish",
    "ml": "malayalam",
    "ru": "russian",
}

system_encoding = sys.getdefaultencoding()

if system_encoding != "utf-8":
Expand Down