From 6be7b7c8adcfd5d4ca5a6f3802eebe0f5875dff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Mon, 9 Dec 2024 10:23:53 +0100 Subject: [PATCH 1/9] add returnn ppl job --- returnn/perplexity.py | 93 +++++++++++++++++++++++++++++++++++++++++++ returnn/training.py | 10 ++--- 2 files changed, 98 insertions(+), 5 deletions(-) create mode 100644 returnn/perplexity.py diff --git a/returnn/perplexity.py b/returnn/perplexity.py new file mode 100644 index 00000000..67987a76 --- /dev/null +++ b/returnn/perplexity.py @@ -0,0 +1,93 @@ +__all__ = ["ReturnnCalculatePerplexityJob"] + +import shutil +import subprocess as sp +from typing import Union + +from sisyphus import Job, Task, setup_path, tk + +import i6_core.util as util + +from .config import ReturnnConfig +from .training import PtCheckpoint, Checkpoint + +Path = setup_path(__package__) + + +class ReturnnCalculatePerplexityJob(Job): + """ + Calculates the perplexity of a language model trained in RETURNN + on an evaluation data set + """ + + def __init__( + self, + returnn_config: ReturnnConfig, + returnn_model: Union[PtCheckpoint, Checkpoint], + eval_dataset: tk.Path, + *, + log_verbosity: int = 3, + returnn_root: tk.Path, + returnn_python_exe: tk.Path, + ): + returnn_config.config.pop("train") + returnn_config.config.pop("dev") + returnn_config.config["eval_datasets"] = {"eval": eval_dataset} + + # TODO verify paths + if isinstance(returnn_model, PtCheckpoint): + model_path = returnn_model.path + self.add_input(returnn_model.path) + elif isinstance(returnn_model, Checkpoint): + model_path = returnn_model.index_path + self.add_input(returnn_model.index_path) + else: + raise NotImplementedError(f"returnn model has unknown type: {type(returnn_model)}") + + returnn_config.config["model"] = model_path + + returnn_config.post_config["log_verbosity"] = log_verbosity + + self.returnn_config = returnn_config + + self.returnn_python_exe = returnn_python_exe + self.returnn_root = returnn_root + + self.out_returnn_config_file = self.output_path("returnn.config") + self.out_returnn_log = self.output_path("returnn.log") + self.out_perplexities = self.output_var("ppl_score") + + self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} + + def tasks(self): + yield Task("create_files", mini_task=True) + yield Task("run", resume="run", rqmt=self.rqmt) + yield Task("gather", mini_task=True) + + def _get_run_cmd(self): + run_cmd = [ + self.returnn_python_exe.get_path(), + self.returnn_root.join_right("rnn.py").get_path(), + self.out_returnn_config_file.get_path(), + "++task eval", + ] + return run_cmd + + def create_files(self): + self.returnn_config.write(self.out_returnn_config_file.get_path()) + + util.create_executable("rnn.sh", self._get_run_cmd()) + + def run(self): + sp.check_call(self._get_run_cmd()) + + shutil.move("returnn_log", self.out_returnn_log.get_path()) + + def gather(self): + for data_key in self.out_perplexities.keys(): + print(data_key) + + @classmethod + def hash(cls, parsed_args): + del parsed_args["log_verbosity"] + return super().hash(parsed_args) diff --git a/returnn/training.py b/returnn/training.py index 03e17127..6e6f565f 100644 --- a/returnn/training.py +++ b/returnn/training.py @@ -35,12 +35,12 @@ class ReturnnModel: This is deprecated, use :class:`Checkpoint` instead. 
""" - def __init__(self, returnn_config_file, model, epoch): + def __init__(self, returnn_config_file: Path, model: Path, epoch: int): """ - :param Path returnn_config_file: Path to a returnn config file - :param Path model: Path to a RETURNN checkpoint (only the .meta for Tensorflow) - :param int epoch: + :param returnn_config_file: Path to a returnn config file + :param model: Path to a RETURNN checkpoint (only the .meta for Tensorflow) + :param epoch: """ self.returnn_config_file = returnn_config_file self.model = model @@ -52,7 +52,7 @@ class Checkpoint: Checkpoint object which holds the (Tensorflow) index file path as tk.Path, and will return the checkpoint path as common prefix of the .index/.meta/.data[...] - A checkpoint object should directly assigned to a RasrConfig entry (do not call `.ckpt_path`) + A checkpoint object should directly be assigned to a RasrConfig entry (do not call `.ckpt_path`) so that the hash will resolve correctly """ From 9e9393958aaf52f4a381c28944242ab03546df7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Tue, 10 Dec 2024 08:55:02 +0100 Subject: [PATCH 2/9] doc fix Co-authored-by: michelwi --- returnn/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/returnn/training.py b/returnn/training.py index 6e6f565f..749b25f5 100644 --- a/returnn/training.py +++ b/returnn/training.py @@ -52,7 +52,7 @@ class Checkpoint: Checkpoint object which holds the (Tensorflow) index file path as tk.Path, and will return the checkpoint path as common prefix of the .index/.meta/.data[...] - A checkpoint object should directly be assigned to a RasrConfig entry (do not call `.ckpt_path`) + A checkpoint object should be directly assigned to a RasrConfig entry (do not call `.ckpt_path`) so that the hash will resolve correctly """ From 44e834c3fc4c2fc3290a02b5ddccca8ed6e27d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nahuel=20Unai=20Rosell=C3=B3=20Beneitez?= <31628502+Icemole@users.noreply.github.com> Date: Mon, 9 Dec 2024 16:09:55 +0100 Subject: [PATCH 3/9] Add left/right context orth to lib.corpus (#564) Co-authored-by: Albert Zeyer Co-authored-by: Eugen Beck --- lib/corpus.py | 52 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/lib/corpus.py b/lib/corpus.py index 076cb048..11c9b861 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -116,7 +116,7 @@ def startElement(self, name: str, attrs: Dict[str, str]): def endElement(self, name: str): e = self.elements[-1] - if name == "orth": + if name in {"orth", "left-context-orth", "right-context-orth"}: assert isinstance(e, Segment) # we do some processing of the text that goes into the orth tag to get a nicer formating, some corpora may have # multiline content in the orth tag, but to keep it that way might not be consistent with the indentation during @@ -124,7 +124,7 @@ def endElement(self, name: str): text = self.chars.strip() text = re.sub(" +", " ", text) text = re.sub("\n", "", text) - e.orth = text + setattr(e, name.replace("-", "_"), text) elif isinstance(e, Speaker) and name != "speaker-description": # we allow all sorts of elements within a speaker description e.attribs[name] = self.chars.strip() @@ -356,15 +356,39 @@ def get_segment_mapping(self) -> Dict[str, Segment]: class Segment(NamedEntity): - def __init__(self): + def __init__( + self, + *, + start: float = 0.0, + end: float = 0.0, + track: Optional[int] = None, + orth: Optional[str] = None, + left_context_orth: Optional[str] = None, + 
right_context_orth: Optional[str] = None,
+        speaker_name: Optional[str] = None,
+        recording: Optional[Recording] = None,
+    ):
+        """
+        :param start: Segment start.
+        :param end: Segment end.
+        :param track: Segment track/channel.
+        :param orth: Segment text.
+        :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
+        :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
+        :param speaker_name: Speaker name.
+        :param recording: Recording in which the segment is embedded.
+        """
         super().__init__()
-        self.start = 0.0
-        self.end = 0.0
-        self.track: Optional[int] = None
-        self.orth: Optional[str] = None
-        self.speaker_name: Optional[str] = None
-        self.recording: Optional[Recording] = None
+        self.start = start
+        self.end = end
+        self.track = track
+        self.orth = orth
+        self.left_context_orth = left_context_orth
+        self.right_context_orth = right_context_orth
+        self.speaker_name = speaker_name
+
+        self.recording = recording

     def fullname(self) -> str:
         return self.recording.fullname() + "/" + self.name
@@ -384,6 +408,16 @@ def dump(self, out: TextIO, indentation: str = ""):
             out.write('%s  <speaker name="%s"/>\n' % (indentation, self.speaker_name))
         if self.orth is not None:
             out.write("%s  <orth> %s </orth>\n" % (indentation, saxutils.escape(self.orth)))
+        if self.left_context_orth is not None:
+            out.write(
+                "%s  <left-context-orth> %s </left-context-orth>\n"
+                % (indentation, saxutils.escape(self.left_context_orth))
+            )
+        if self.right_context_orth is not None:
+            out.write(
+                "%s  <right-context-orth> %s </right-context-orth>\n"
+                % (indentation, saxutils.escape(self.right_context_orth))
+            )
         if has_child_element:
             out.write("%s</segment>\n" % indentation)
         else:

From fefe736b9ff19a503874968ab94d128df92f92ad Mon Sep 17 00:00:00 2001
From: Moritz Gunz
Date: Mon, 9 Dec 2024 07:12:02 -0800
Subject: [PATCH 4/9] Update docs of DenseLabelInfo (#561)

---
 mm/context_label.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/context_label.py b/mm/context_label.py
index 050fee5d..4cfaca08 100644
--- a/mm/context_label.py
+++ b/mm/context_label.py
@@ -13,7 +13,8 @@ class DenseLabelInfo:
     """
     Attributes:
-        n_contexts: number of phonemes in lexicon ( usually need to + 1 for non-context # in rasr)
+        n_contexts: number of phonemes in lexicon + number of non-word phonemes (like [SILENCE], [NOISE] or [MUSIC])
+        and + 1 for non-context # in rasr.
use_word_end_classes: if word end class is used for no tying dense label use_boundary_classes: if bounary class is used for no tying dense label num_hmm_states_per_phon: the number of hmm states per phoneme From 3771fd3605e36ff4dc1cd83c4faeccffb94911fa Mon Sep 17 00:00:00 2001 From: DanEnergetics Date: Mon, 9 Dec 2024 16:34:59 +0100 Subject: [PATCH 5/9] Add support for DelayedBase in CreateDummyMixturesJob (#562) --- mm/mixtures.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mixtures.py b/mm/mixtures.py index f22fbc4e..66a71b26 100644 --- a/mm/mixtures.py +++ b/mm/mixtures.py @@ -14,6 +14,7 @@ from typing import Dict, Optional, Union from sisyphus import * +from sisyphus.delayed_ops import DelayedBase Path = setup_path(__package__) @@ -514,8 +515,8 @@ def tasks(self): yield Task("run", mini_task=True) def run(self): - num_mixtures = int(self.num_mixtures.get() if isinstance(self.num_mixtures, tk.Variable) else self.num_mixtures) - num_features = int(self.num_features.get() if isinstance(self.num_features, tk.Variable) else self.num_features) + num_mixtures = int(self.num_mixtures.get() if isinstance(self.num_mixtures, DelayedBase) else self.num_mixtures) + num_features = int(self.num_features.get() if isinstance(self.num_features, DelayedBase) else self.num_features) with open(tk.uncached_path(self.out_mixtures), "wb") as f: f.write(b"MIXSET\0\0") From 6453ed95db2669015014f4a85a5f091bb5826f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:01:48 +0100 Subject: [PATCH 6/9] pre-merge --- lib/corpus.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/corpus.py b/lib/corpus.py index 11c9b861..e2cb4afa 100644 --- a/lib/corpus.py +++ b/lib/corpus.py @@ -390,6 +390,12 @@ def __init__( self.recording = recording + def full_orth(self) -> str: + """ + :return: Left context orth (if any) + orth + right context orth (if any). 
+ """ + return " ".join([s for s in [self.left_context_orth, self.orth, self.right_context_orth] if s]) + def fullname(self) -> str: return self.recording.fullname() + "/" + self.name From c66b1ffbb5ed7bb79c4e86573358c848deb0e94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:13:06 +0100 Subject: [PATCH 7/9] updates --- returnn/perplexity.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index 67987a76..00d2966a 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -23,8 +23,8 @@ class ReturnnCalculatePerplexityJob(Job): def __init__( self, returnn_config: ReturnnConfig, - returnn_model: Union[PtCheckpoint, Checkpoint], - eval_dataset: tk.Path, + returnn_model: PtCheckpoint, + eval_dataset: ReturnnConfig, *, log_verbosity: int = 3, returnn_root: tk.Path, @@ -32,20 +32,10 @@ def __init__( ): returnn_config.config.pop("train") returnn_config.config.pop("dev") - returnn_config.config["eval_datasets"] = {"eval": eval_dataset} - - # TODO verify paths - if isinstance(returnn_model, PtCheckpoint): - model_path = returnn_model.path - self.add_input(returnn_model.path) - elif isinstance(returnn_model, Checkpoint): - model_path = returnn_model.index_path - self.add_input(returnn_model.index_path) - else: - raise NotImplementedError(f"returnn model has unknown type: {type(returnn_model)}") + returnn_config.update(eval_dataset) + model_path = returnn_model.path returnn_config.config["model"] = model_path - returnn_config.post_config["log_verbosity"] = log_verbosity self.returnn_config = returnn_config @@ -55,7 +45,7 @@ def __init__( self.out_returnn_config_file = self.output_path("returnn.config") self.out_returnn_log = self.output_path("returnn.log") - self.out_perplexities = self.output_var("ppl_score") + self.out_perplexity = self.output_var("ppl_score") self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} @@ -75,17 +65,15 @@ def _get_run_cmd(self): def create_files(self): self.returnn_config.write(self.out_returnn_config_file.get_path()) - util.create_executable("rnn.sh", self._get_run_cmd()) def run(self): sp.check_call(self._get_run_cmd()) - shutil.move("returnn_log", self.out_returnn_log.get_path()) - def gather(self): - for data_key in self.out_perplexities.keys(): - print(data_key) + # TODO get ppl + ppl = None + self.out_perplexity.set(ppl) @classmethod def hash(cls, parsed_args): From b1dee3cc51e2bc8e2a924d3496ee3e6fa578cf0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Thu, 23 Jan 2025 17:55:59 +0100 Subject: [PATCH 8/9] refactor --- returnn/perplexity.py | 88 ++++++++++++++----------------------------- 1 file changed, 28 insertions(+), 60 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index 00d2966a..f36a34cc 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -1,81 +1,49 @@ -__all__ = ["ReturnnCalculatePerplexityJob"] +__all__ = ["ExtractPerplexityFromLearningRatesFileJob"] -import shutil -import subprocess as sp -from typing import Union +import ast +from typing import List from sisyphus import Job, Task, setup_path, tk -import i6_core.util as util -from .config import ReturnnConfig -from .training import PtCheckpoint, Checkpoint Path = setup_path(__package__) -class ReturnnCalculatePerplexityJob(Job): +class ExtractPerplexityFromLearningRatesFileJob(Job): """ - Calculates the perplexity of a language model trained in RETURNN - on an evaluation data set + 
Extracts the perplexity from the RETURNN learning rates files. """ def __init__( self, - returnn_config: ReturnnConfig, - returnn_model: PtCheckpoint, - eval_dataset: ReturnnConfig, - *, - log_verbosity: int = 3, - returnn_root: tk.Path, - returnn_python_exe: tk.Path, + returnn_learning_rates: tk.Path, + eval_datasets: List[str], + loss_names: List[str], ): - returnn_config.config.pop("train") - returnn_config.config.pop("dev") - returnn_config.update(eval_dataset) + self.returnn_learning_rates = returnn_learning_rates + self.eval_datasets = sorted(eval_datasets) + self.loss_names = sorted(loss_names) - model_path = returnn_model.path - returnn_config.config["model"] = model_path - returnn_config.post_config["log_verbosity"] = log_verbosity + self.out_perplexities = self.output_path("ppl.txt") - self.returnn_config = returnn_config - - self.returnn_python_exe = returnn_python_exe - self.returnn_root = returnn_root - - self.out_returnn_config_file = self.output_path("returnn.config") - self.out_returnn_log = self.output_path("returnn.log") - self.out_perplexity = self.output_var("ppl_score") - - self.rqmt = {"gpu": 0, "cpu": 2, "mem": 4, "time": 4} + self.rqmt = {"gpu": 0, "cpu": 1, "mem": 1, "time": 1} def tasks(self): - yield Task("create_files", mini_task=True) - yield Task("run", resume="run", rqmt=self.rqmt) - yield Task("gather", mini_task=True) - - def _get_run_cmd(self): - run_cmd = [ - self.returnn_python_exe.get_path(), - self.returnn_root.join_right("rnn.py").get_path(), - self.out_returnn_config_file.get_path(), - "++task eval", - ] - return run_cmd - - def create_files(self): - self.returnn_config.write(self.out_returnn_config_file.get_path()) - util.create_executable("rnn.sh", self._get_run_cmd()) + yield Task("run", resume="run", mini_task=True) def run(self): - sp.check_call(self._get_run_cmd()) - shutil.move("returnn_log", self.out_returnn_log.get_path()) - - # TODO get ppl - ppl = None - self.out_perplexity.set(ppl) - - @classmethod - def hash(cls, parsed_args): - del parsed_args["log_verbosity"] - return super().hash(parsed_args) + with open(self.returnn_learning_rates.get_path(), "rt", encoding="utf-8") as f_in: + data = f_in.read() + lr_dict = ast.literal_eval(data) + lr_dict = sorted(lr_dict.items(), reverse=True) + last_entry = lr_dict[0] + + res = [] + for data_set in self.eval_datasets: + for loss in self.loss_names: + full_name = f"{data_set}_loss_{loss}" + res.append(f"{data_set} - {loss}: {last_entry[full_name]} \n") + + with open(self.out_perplexities.get_path(), "wt", encoding="utf-8") as f_out: + f_out.writelines(res) From 1a6543174337345b5790fec5c70f0553551509b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20M=2E=20L=C3=BCscher?= Date: Wed, 29 Jan 2025 08:19:18 +0100 Subject: [PATCH 9/9] changes --- returnn/perplexity.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/returnn/perplexity.py b/returnn/perplexity.py index f36a34cc..6ec07ec2 100644 --- a/returnn/perplexity.py +++ b/returnn/perplexity.py @@ -3,13 +3,10 @@ import ast from typing import List -from sisyphus import Job, Task, setup_path, tk +from sisyphus import Job, Task, tk -Path = setup_path(__package__) - - class ExtractPerplexityFromLearningRatesFileJob(Job): """ Extracts the perplexity from the RETURNN learning rates files. 
@@ -19,13 +16,13 @@ def __init__( self, returnn_learning_rates: tk.Path, eval_datasets: List[str], - loss_names: List[str], ): self.returnn_learning_rates = returnn_learning_rates self.eval_datasets = sorted(eval_datasets) - self.loss_names = sorted(loss_names) - self.out_perplexities = self.output_path("ppl.txt") + self.out_ppl_file = self.output_path("ppl.txt") + + self.out_perplexities = {f"ppl_{d}": self.output_var(f"ppl_{d}") for d in eval_datasets} self.rqmt = {"gpu": 0, "cpu": 1, "mem": 1, "time": 1} @@ -41,9 +38,8 @@ def run(self): res = [] for data_set in self.eval_datasets: - for loss in self.loss_names: - full_name = f"{data_set}_loss_{loss}" - res.append(f"{data_set} - {loss}: {last_entry[full_name]} \n") + full_name = f"{data_set}_loss_ppl" # TODO actually check which name fits + res.append(f"{data_set} - ppl: {last_entry[full_name]} \n") - with open(self.out_perplexities.get_path(), "wt", encoding="utf-8") as f_out: + with open(self.out_ppl_file.get_path(), "wt", encoding="utf-8") as f_out: f_out.writelines(res)
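
A minimal usage sketch of the job in its final form after this series (illustration only, not part of the patches; the learning-rates path, the dataset keys, and the registered output name are assumptions — typically the file would come from a ReturnnTrainingJob's out_learning_rates output):

    from sisyphus import tk
    from i6_core.returnn.perplexity import ExtractPerplexityFromLearningRatesFileJob

    # Placeholder path to a learning-rates file written by a RETURNN training run;
    # it is expected to contain "{dataset}_loss_ppl" entries for the eval datasets
    # (the exact key name is still marked as TODO in the patch above).
    learning_rates = tk.Path("/path/to/returnn/training/learning_rates")

    ppl_job = ExtractPerplexityFromLearningRatesFileJob(
        returnn_learning_rates=learning_rates,
        eval_datasets=["dev", "test"],
    )
    tk.register_output("lm/ppl.txt", ppl_job.out_ppl_file)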