From 6be7b7c8adcfd5d4ca5a6f3802eebe0f5875dff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Mon, 9 Dec 2024 10:23:53 +0100 Subject: [PATCH 1/9] add returnn ppl job --- returnn/perplexity.py | 93 +++++++++++++++++++++++++++++++++++++++++++ returnn/training.py | 10 ++--- 2 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 returnn/perplexity.py diff --git a/returnn/perplexity.py b/returnn/perplexity.py new file mode 100644 index 00000000..67987a76 --- /dev/null +++ b/returnn/perplexity.py @@ -0,0 +1,93 @@ +__all__ = ["ReturnnCalculatePerplexityJob"] + +import shutil +import subprocess as sp +from typing import Union + +from sisyphus import Job, Task, setup_path, tk + +import i6_core.util as util + +from .config import ReturnnConfig +from .training import PtCheckpoint, Checkpoint + +Path = setup_path(__package__) + + +class ReturnnCalculatePerplexityJob(Job): + """ + Calculates the perplexity of a language model trained in RETURNN + on an evaluation data set + """ + + def __init__( + self, + returnn_config: ReturnnConfig, + returnn_model: Union[PtCheckpoint, Checkpoint], + eval_dataset: tk.Path, + *, + log_verbosity: int = 3, + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + ): + returnn_config.config.pop("train") + returnn_config.config.pop("dev") + returnn_config.config["eval_datasets"] = {"eval": eval_dataset} + + # TODO verify paths + if isinstance(returnn_model, PtCheckpoint): + model_path = returnn_model.path + self.add_input(returnn_model.path) + elif isinstance(returnn_model, Checkpoint): + model_path = returnn_model.index_path + self.add_input(returnn_model.index_path) + else: + raise NotImplementedError(f"returnn model has unknown type: {type(returnn_model)}") + + returnn_config.config["model"] = model_path + + returnn_config.post_config["log_verbosity"] = log_verbosity + + self.returnn_config = returnn_config + + self.returnn_python_exe = returnn_python_exe + self.returnn_root = returnn_root + + self.out_returnn_config_file = self.output_path("returnn.config") + self.out_returnn_log = self.output_path("returnn.log") + self.out_perplexities = self.output_var("ppl_score") + + self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", resume="run", rqmt=self.rqmt) + yield Task("gather", mini_task=True) + + def _get_run_cmd(self): + run_cmd = [ + self.returnn_python_exe.get_path(), + self.returnn_root.join_right("rnn.py").get_path(), + self.out_returnn_config_file.get_path(), + "++task eval", + ] + return run_cmd + + def create_files(self): + self.returnn_config.write(self.out_returnn_config_file.get_path()) + + util.create_executable("rnn.sh", self._get_run_cmd()) + + def run(self): + sp.check_call(self._get_run_cmd()) + + shutil.move("returnn_log", self.out_returnn_log.get_path()) + + def gather(self): + for data_key in self.out_perplexities.keys(): + print(data_key) + + @classmethod + def hash(cls, parsed_args): + del parsed_args["log_verbosity"] + return super().hash(parsed_args) diff --git a/returnn/training.py b/returnn/training.py index 03e17127..6e6f565f 100644 --- a/returnn/training.py +++ b/returnn/training.py @@ -35,12 +35,12 @@ class ReturnnModel: This is deprecated, use :class:`Checkpoint` instead. 
""" - def __init__(self, returnn_config_file, model, epoch): + def __init__(self, returnn_config_file: Path, model: Path, epoch: int): """ - :param Path returnn_config_file: Path to a returnn config file - :param Path model: Path to a RETURNN checkpoint (only the .meta for Tensorflow) - :param int epoch: + :param returnn_config_file: Path to a returnn config file + :param model: Path to a RETURNN checkpoint (only the .meta for Tensorflow) + :param epoch: """ self.returnn_config_file = returnn_config_file self.model = model @@ -52,7 +52,7 @@ class Checkpoint: Checkpoint object which holds the (Tensorflow) index file path as tk.Path, and will return the checkpoint path as common prefix of the .index/.meta/.data[...] - A checkpoint object should directly assigned to a RasrConfig entry (do not call `.ckpt_path`) + A checkpoint object should directly be assigned to a RasrConfig entry (do not call `.ckpt_path`) so that the hash will resolve correctly """ From 9e9393958aaf52f4a381c28944242ab03546df7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Tue, 10 Dec 2024 08:55:02 +0100 Subject: [PATCH 2/9] doc fix Co-authored-by: michelwi --- returnn/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/returnn/training.py b/returnn/training.py index 6e6f565f..749b25f5 100644 --- a/returnn/training.py +++ b/returnn/training.py @@ -52,7 +52,7 @@ class Checkpoint: Checkpoint object which holds the (Tensorflow) index file path as tk.Path, and will return the checkpoint path as common prefix of the .index/.meta/.data[...] - A checkpoint object should directly be assigned to a RasrConfig entry (do not call `.ckpt_path`) + A checkpoint object should be directly assigned to a RasrConfig entry (do not call `.ckpt_path`) so that the hash will resolve correctly """ From 44e834c3fc4c2fc3290a02b5ddccca8ed6e27d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:09:55 +0100 Subject: [PATCH 3/9] Add left/right context orth to lib.corpus (#564) Co-authored-by: Albert Zeyer Co-authored-by: Eugen Beck --- lib/corpus.py | 52 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 076cb048..11c9b861 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): def endElement(self, name: str): e = self.elements[-1] - if name == "orth": + if name in {"orth", "left-context-orth", "right-context-orth"}: assert isinstance(e, Segment) # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during @@ -124,7 +124,7 @@ def endElement(self, name: str): text = self.chars.strip() text = re.sub(" +", " ", text) text = re.sub("\n", "", text) - e.orth = text + setattr(e, name.replace("-", "_"), text) elif isinstance(e, Speaker) and name != "speaker-description": # we allow all sorts of elements within a speaker description e.attribs[name] = self.chars.strip() @@ -356,15 +356,39 @@ def get_segment_mapping(self) -> Dict[str, Segment]: class Segment(NamedEntity): - def __init__(self): + def __init__( + self, + *, + start: float = 0.0, + end: float = 0.0, + track: Optional[int] = None, + orth: Optional[str] = None, + left_context_orth: Optional[str] = None, + 
right_context_orth: Optional[str] = None,
+        speaker_name: Optional[str] = None,
+        recording: Optional[Recording] = None,
+    ):
+        """
+        :param start: Segment start.
+        :param end: Segment end.
+        :param track: Segment track/channel.
+        :param orth: Segment text.
+        :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
+        :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
+        :param speaker_name: Speaker name.
+        :param recording: Recording in which the segment is embedded.
+        """
         super().__init__()
-        self.start = 0.0
-        self.end = 0.0
-        self.track: Optional[int] = None
-        self.orth: Optional[str] = None
-        self.speaker_name: Optional[str] = None
-        self.recording: Optional[Recording] = None
+        self.start = start
+        self.end = end
+        self.track = track
+        self.orth = orth
+        self.left_context_orth = left_context_orth
+        self.right_context_orth = right_context_orth
+        self.speaker_name = speaker_name
+
+        self.recording = recording

     def fullname(self) -> str:
         return self.recording.fullname() + "/" + self.name
@@ -384,6 +408,16 @@ def dump(self, out: TextIO, indentation: str = ""):
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
         if self.orth is not None:
             out.write("%s  <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
+        if self.left_context_orth is not None:
+            out.write(
+                "%s  <left-context-orth> %s </left-context-orth>\n"
+                % (indentation, saxutils.escape(self.left_context_orth))
+            )
+        if self.right_context_orth is not None:
+            out.write(
+                "%s  <right-context-orth> %s </right-context-orth>\n"
+                % (indentation, saxutils.escape(self.right_context_orth))
+            )
         if has_child_element:
             out.write("%s</segment>\n" % indentation)
         else:

From fefe736b9ff19a503874968ab94d128df92f92ad Mon Sep 17 00:00:00 2001
From: Moritz Gunz
Date: Mon, 9 Dec 2024 07:12:02 -0800
Subject: [PATCH 4/9] Update docs of DenseLabelInfo (#561)

---
 mm/context_label.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/context_label.py b/mm/context_label.py
index 050fee5d..4cfaca08 100644
--- a/mm/context_label.py
+++ b/mm/context_label.py
@@ -13,7 +13,8 @@ class DenseLabelInfo:
     """
     Attributes:
-        n_contexts: number of phonemes in lexicon ( usually need to + 1 for non-context # in rasr)
+        n_contexts: number of phonemes in lexicon + number of non-word phonemes (like [SILENCE], [NOISE] or [MUSIC])
+        and + 1 for non-context # in rasr.
use_word_end_classes: if word end class is used for no tying dense label use_boundary_classes: if bounary class is used for no tying dense label num_hmm_states_per_phon: the number of hmm states per phoneme From 3771fd3605e36ff4dc1cd83c4faeccffb94911fa Mon Sep 17 00:00:00 2001 From: DanEnergetics Date: Mon, 9 Dec 2024 16:34:59 +0100 Subject: [PATCH 5/9] Add support for DelayedBase in CreateDummyMixturesJob (#562) --- mm/mixtures.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mixtures.py b/mm/mixtures.py index f22fbc4e..66a71b26 100644 --- a/mm/mixtures.py +++ b/mm/mixtures.py @@ -14,6 +14,7 @@ from typing import Dict, Optional, Union from sisyphus import * +from sisyphus.delayed_ops import DelayedBase Path = setup_path(__package__) @@ -514,8 +515,8 @@ def tasks(self): yield Task("run", mini_task=True) def run(self): - num_mixtures = int(self.num_mixtures.get() if isinstance(self.num_mixtures, tk.Variable) else self.num_mixtures) - num_features = int(self.num_features.get() if isinstance(self.num_features, tk.Variable) else self.num_features) + num_mixtures = int(self.num_mixtures.get() if isinstance(self.num_mixtures, DelayedBase) else self.num_mixtures) + num_features = int(self.num_features.get() if isinstance(self.num_features, DelayedBase) else self.num_features) with open(tk.uncached_path(self.out_mixtures), "wb") as f: f.write(b"MIXSET\0\0") From 6453ed95db2669015014f4a85a5f091bb5826f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:01:48 +0100 Subject: [PATCH 6/9] pre-merge --- lib/corpus.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index 11c9b861..e2cb4afa 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -390,6 +390,12 @@ def __init__( self.recording = recording + def full_orth(self) -> str: + """ + :return: Left context orth (if any) + orth + right context orth (if any). 
+ """ + return " ".join([s for s in [self.left_context_orth, self.orth, self.right_context_orth] if s]) + def fullname(self) -> str: return self.recording.fullname() + "/" + self.name From c66b1ffbb5ed7bb79c4e86573358c848deb0e94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:13:06 +0100 Subject: [PATCH 7/9] updates --- returnn/perplexity.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index 67987a76..00d2966a 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -23,8 +23,8 @@ class ReturnnCalculatePerplexityJob(Job): def __init__( self, returnn_config: ReturnnConfig, - returnn_model: Union[PtCheckpoint, Checkpoint], - eval_dataset: tk.Path, + returnn_model: PtCheckpoint, + eval_dataset: ReturnnConfig, *, log_verbosity: int = 3, returnn_root: tk.Path, @@ -32,20 +32,10 @@ def __init__( ): returnn_config.config.pop("train") returnn_config.config.pop("dev") - returnn_config.config["eval_datasets"] = {"eval": eval_dataset} - - # TODO verify paths - if isinstance(returnn_model, PtCheckpoint): - model_path = returnn_model.path - self.add_input(returnn_model.path) - elif isinstance(returnn_model, Checkpoint): - model_path = returnn_model.index_path - self.add_input(returnn_model.index_path) - else: - raise NotImplementedError(f"returnn model has unknown type: {type(returnn_model)}") + returnn_config.update(eval_dataset) + model_path = returnn_model.path returnn_config.config["model"] = model_path - returnn_config.post_config["log_verbosity"] = log_verbosity self.returnn_config = returnn_config @@ -55,7 +45,7 @@ def __init__( self.out_returnn_config_file = self.output_path("returnn.config") self.out_returnn_log = self.output_path("returnn.log") - self.out_perplexities = self.output_var("ppl_score") + self.out_perplexity = self.output_var("ppl_score") self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} @@ -75,17 +65,15 @@ def _get_run_cmd(self): def create_files(self): self.returnn_config.write(self.out_returnn_config_file.get_path()) - util.create_executable("rnn.sh", self._get_run_cmd()) def run(self): sp.check_call(self._get_run_cmd()) - shutil.move("returnn_log", self.out_returnn_log.get_path()) - def gather(self): - for data_key in self.out_perplexities.keys(): - print(data_key) + # TODO get ppl + ppl = None + self.out_perplexity.set(ppl) @classmethod def hash(cls, parsed_args): From b1dee3cc51e2bc8e2a924d3496ee3e6fa578cf0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:55:59 +0100 Subject: [PATCH 8/9] refactor --- returnn/perplexity.py | 88 ++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 60 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index 00d2966a..f36a34cc 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -1,81 +1,49 @@ -__all__ = ["ReturnnCalculatePerplexityJob"] +__all__ = ["ExtractPerplexityFromLearningRatesFileJob"] -import shutil -import subprocess as sp -from typing import Union +import ast +from typing import List from sisyphus import Job, Task, setup_path, tk -import i6_core.util as util -from .config import ReturnnConfig -from .training import PtCheckpoint, Checkpoint Path = setup_path(__package__) -class ReturnnCalculatePerplexityJob(Job): +class ExtractPerplexityFromLearningRatesFileJob(Job): """ - Calculates the perplexity of a language model trained in RETURNN - on an evaluation data set + 
Extracts the perplexity from the RETURNN learning rates files. """ def __init__( self, - returnn_config: ReturnnConfig, - returnn_model: PtCheckpoint, - eval_dataset: ReturnnConfig, - *, - log_verbosity: int = 3, - returnn_root: tk.Path, - returnn_python_exe: tk.Path, + returnn_learning_rates: tk.Path, + eval_datasets: List[str], + loss_names: List[str], ): - returnn_config.config.pop("train") - returnn_config.config.pop("dev") - returnn_config.update(eval_dataset) + self.returnn_learning_rates = returnn_learning_rates + self.eval_datasets = sorted(eval_datasets) + self.loss_names = sorted(loss_names) - model_path = returnn_model.path - returnn_config.config["model"] = model_path - returnn_config.post_config["log_verbosity"] = log_verbosity + self.out_perplexities = self.output_path("ppl.txt") - self.returnn_config = returnn_config - - self.returnn_python_exe = returnn_python_exe - self.returnn_root = returnn_root - - self.out_returnn_config_file = self.output_path("returnn.config") - self.out_returnn_log = self.output_path("returnn.log") - self.out_perplexity = self.output_var("ppl_score") - - self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} + self.rqmt = {"gpu": 0, "cpu": 1, "mem": 1, "time": 1} def tasks(self): - yield Task("create_files", mini_task=True) - yield Task("run", resume="run", rqmt=self.rqmt) - yield Task("gather", mini_task=True) - - def _get_run_cmd(self): - run_cmd = [ - self.returnn_python_exe.get_path(), - self.returnn_root.join_right("rnn.py").get_path(), - self.out_returnn_config_file.get_path(), - "++task eval", - ] - return run_cmd - - def create_files(self): - self.returnn_config.write(self.out_returnn_config_file.get_path()) - util.create_executable("rnn.sh", self._get_run_cmd()) + yield Task("run", resume="run", mini_task=True) def run(self): - sp.check_call(self._get_run_cmd()) - shutil.move("returnn_log", self.out_returnn_log.get_path()) - - # TODO get ppl - ppl = None - self.out_perplexity.set(ppl) - - @classmethod - def hash(cls, parsed_args): - del parsed_args["log_verbosity"] - return super().hash(parsed_args) + with open(self.returnn_learning_rates.get_path(), "rt", encoding="utf-8") as f_in: + data = f_in.read() + lr_dict = ast.literal_eval(data) + lr_dict = sorted(lr_dict.items(), reverse=True) + last_entry = lr_dict[0] + + res = [] + for data_set in self.eval_datasets: + for loss in self.loss_names: + full_name = f"{data_set}_loss_{loss}" + res.append(f"{data_set} - {loss}: {last_entry[full_name]} \n") + + with open(self.out_perplexities.get_path(), "wt", encoding="utf-8") as f_out: + f_out.writelines(res) From 1a6543174337345b5790fec5c70f0553551509b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Wed, 29 Jan 2025 08:19:18 +0100 Subject: [PATCH 9/9] changes --- returnn/perplexity.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index f36a34cc..6ec07ec2 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -3,13 +3,10 @@ import ast from typing import List -from sisyphus import Job, Task, setup_path, tk +from sisyphus import Job, Task, tk -Path = setup_path(__package__) - - class ExtractPerplexityFromLearningRatesFileJob(Job): """ Extracts the perplexity from the RETURNN learning rates files. 
@@ -19,13 +16,13 @@ def __init__( self, returnn_learning_rates: tk.Path, eval_datasets: List[str], - loss_names: List[str], ): self.returnn_learning_rates = returnn_learning_rates self.eval_datasets = sorted(eval_datasets) - self.loss_names = sorted(loss_names) - self.out_perplexities = self.output_path("ppl.txt") + self.out_ppl_file = self.output_path("ppl.txt") + + self.out_perplexities = {f"ppl_{d}": self.output_var(f"ppl_{d}") for d in eval_datasets} self.rqmt = {"gpu": 0, "cpu": 1, "mem": 1, "time": 1} @@ -41,9 +38,8 @@ def run(self): res = [] for data_set in self.eval_datasets: - for loss in self.loss_names: - full_name = f"{data_set}_loss_{loss}" - res.append(f"{data_set} - {loss}: {last_entry[full_name]} \n") + full_name = f"{data_set}_loss_ppl" # TODO actually check which name fits + res.append(f"{data_set} - ppl: {last_entry[full_name]} \n") - with open(self.out_perplexities.get_path(), "wt", encoding="utf-8") as f_out: + with open(self.out_ppl_file.get_path(), "wt", encoding="utf-8") as f_out: f_out.writelines(res)
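
A minimal usage sketch of the job in its final form after this series (illustration only, not part of the patches; the learning-rates path, the dataset keys, and the registered output name are assumptions — typically the file would come from a ReturnnTrainingJob's out_learning_rates output):

    from sisyphus import tk
    from i6_core.returnn.perplexity import ExtractPerplexityFromLearningRatesFileJob

    # Placeholder path to a learning-rates file written by a RETURNN training run;
    # it is expected to contain "{dataset}_loss_ppl" entries for the eval datasets
    # (the exact key name is still marked as TODO in the patch above).
    learning_rates = tk.Path("/path/to/returnn/training/learning_rates")

    ppl_job = ExtractPerplexityFromLearningRatesFileJob(
        returnn_learning_rates=learning_rates,
        eval_datasets=["dev", "test"],
    )
    tk.register_output("lm/ppl.txt", ppl_job.out_ppl_file)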