Ensembling over layers #259
base: main
@@ -1,15 +1,22 @@
 from dataclasses import asdict, dataclass
-from typing import Literal

 import torch
 from einops import repeat
 from torch import Tensor

+from ..utils.types import PromptEnsembling
 from .accuracy import AccuracyResult, accuracy_ci
 from .calibration import CalibrationError, CalibrationEstimate
 from .roc_auc import RocAucResult, roc_auc_ci


+@dataclass
+class LayerOutput:
+    val_gt: Tensor
+    val_credences: Tensor
+    meta: dict
+
+
 @dataclass(frozen=True)
 class EvalResult:
     """The result of evaluating a classifier."""
@@ -26,7 +33,7 @@ class EvalResult:
     cal_thresh: float | None
     """The threshold used to compute the calibrated accuracy."""

-    def to_dict(self, prefix: str = "") -> dict[str, float]:
+    def to_dict(self, prefix: str = "") -> dict[str, float | None]:
         """Convert the result to a dictionary."""
         acc_dict = {f"{prefix}acc_{k}": v for k, v in asdict(self.accuracy).items()}
         cal_acc_dict = (
@@ -49,67 +56,164 @@ def to_dict(self, prefix: str = "") -> dict[str, float]:
         }


+def calc_auroc(
+    y_logits: Tensor,
+    y_true: Tensor,
+    ensembling: PromptEnsembling,
+    num_classes: int,
+) -> RocAucResult:
+    """
+    Calculate the AUROC
+
+    Args:
+        y_true: Ground truth tensor of shape (n,).
+        y_logits: Predicted class tensor of shape (n, num_variants, num_classes).
+        prompt_ensembling: The prompt_ensembling mode.
+        num_classes: The number of classes.
+
+    Returns:
+        RocAucResult: A dictionary containing the AUROC and confidence interval.
+    """
+    if ensembling == PromptEnsembling.NONE:
+        auroc = roc_auc_ci(
+            to_one_hot(y_true, num_classes).long().flatten(1), y_logits.flatten(1)
+        )
+    elif ensembling in (PromptEnsembling.PARTIAL, PromptEnsembling.FULL):
+        # Pool together the negative and positive class logits
+        if num_classes == 2:
+            auroc = roc_auc_ci(y_true, y_logits[..., 1] - y_logits[..., 0])
+        else:
+            auroc = roc_auc_ci(to_one_hot(y_true, num_classes).long(), y_logits)
+    else:
+        raise ValueError(f"Unknown mode: {ensembling}")
+
+    return auroc
+
+
+def calc_calibrated_accuracies(y_true, pos_probs) -> AccuracyResult:
+    """
+    Calculate the calibrated accuracies
+
+    Args:
+        y_true: Ground truth tensor of shape (n,).
+        pos_probs: Predicted class tensor of shape (n, num_variants, num_classes).
+
+    Returns:
+        AccuracyResult: A dictionary containing the accuracy and confidence interval.
+    """
+    cal_thresh = pos_probs.float().quantile(y_true.float().mean()).item()
+    cal_preds = pos_probs.gt(cal_thresh).to(torch.int)
+    cal_acc = accuracy_ci(y_true, cal_preds, cal_thresh)
+    return cal_acc
+
+
+def calc_calibrated_errors(y_true, pos_probs) -> CalibrationEstimate:
+    """
+    Calculate the expected calibration error.
+
+    Args:
+        y_true: Ground truth tensor of shape (n,).
+        y_logits: Predicted class tensor of shape (n, num_variants, num_classes).
+
+    Returns:
+        CalibrationEstimate:
+    """
+    cal = CalibrationError().update(y_true.flatten(), pos_probs.flatten())
+    cal_err = cal.compute()
+    return cal_err
+
+
+def calc_accuracies(y_logits, y_true) -> AccuracyResult:
+    """
+    Calculate the accuracy
+
+    Args:
+        y_true: Ground truth tensor of shape (n,).
+        y_logits: Predicted class tensor of shape (n, num_variants, num_classes).
+
+    Returns:
+        AccuracyResult: A dictionary containing the accuracy and confidence interval.
+    """
+    y_pred = y_logits.argmax(dim=-1)
+    return accuracy_ci(y_true, y_pred)
+
+
 def evaluate_preds(
     y_true: Tensor,
     y_logits: Tensor,
-    ensembling: Literal["none", "partial", "full"] = "none",
+    prompt_ensembling: PromptEnsembling = PromptEnsembling.NONE,
 ) -> EvalResult:
     """
     Evaluate the performance of a classification model.

     Args:
-        y_true: Ground truth tensor of shape (N,).
-        y_logits: Predicted class tensor of shape (N, variants, n_classes).
+        y_true: Ground truth tensor of shape (n,).
+        y_logits: Predicted class tensor of shape (n, num_variants, num_classes).
+        prompt_ensembling: The prompt_ensembling mode.

     Returns:
         dict: A dictionary containing the accuracy, AUROC, and ECE.
     """
-    (n, v, c) = y_logits.shape
-    assert y_true.shape == (n,)
+    y_logits, y_true, num_classes = prepare(y_logits, y_true, prompt_ensembling)
+    return calc_eval_results(y_true, y_logits, prompt_ensembling, num_classes)
+
+
+def prepare(y_logits: Tensor, y_true: Tensor, prompt_ensembling: PromptEnsembling):
+    """
+    Prepare the logits and ground truth for evaluation
+    """
+    (n, num_variants, num_classes) = y_logits.shape
+    assert y_true.shape == (n,), f"y_true.shape: {y_true.shape} is not equal to n: {n}"

-    if ensembling == "full":
+    if prompt_ensembling == PromptEnsembling.FULL:
         y_logits = y_logits.mean(dim=1)
     else:
-        y_true = repeat(y_true, "n -> n v", v=v)
+        y_true = repeat(y_true, "n -> n v", v=num_variants)

-    THRESHOLD = 0.5
-    if ensembling == "none":
-        y_pred = y_logits[..., 1].gt(THRESHOLD).to(torch.int)
-    else:
-        y_pred = y_logits.argmax(dim=-1)
-
-    acc = accuracy_ci(y_true, y_pred)
-
-    if ensembling == "none":
-        auroc = roc_auc_ci(to_one_hot(y_true, c).long().flatten(1), y_logits.flatten(1))
-    elif ensembling in ("partial", "full"):
-        # Pool together the negative and positive class logits
-        if c == 2:
-            auroc = roc_auc_ci(y_true, y_logits[..., 1] - y_logits[..., 0])
-        else:
-            auroc = roc_auc_ci(to_one_hot(y_true, c).long(), y_logits)
-    else:
-        raise ValueError(f"Unknown mode: {ensembling}")
-
-    cal_acc = None
-    cal_err = None
-    cal_thresh = None
-
-    if c == 2:
-        pooled_logits = (
-            y_logits[..., 1]
-            if ensembling == "none"
-            else y_logits[..., 1] - y_logits[..., 0]
-        )
-        pos_probs = torch.sigmoid(pooled_logits)
-
-        # Calibrated accuracy
-        cal_thresh = pos_probs.float().quantile(y_true.float().mean()).item()
-        cal_preds = pos_probs.gt(cal_thresh).to(torch.int)
-        cal_acc = accuracy_ci(y_true, cal_preds)
-
-        cal = CalibrationError().update(y_true.flatten(), pos_probs.flatten())
-        cal_err = cal.compute()
+    return y_logits, y_true, num_classes
+
+
+def calc_eval_results(
+    y_true: Tensor,
+    y_logits: Tensor,
+    prompt_ensembling: PromptEnsembling,
+    num_classes: int,
+) -> EvalResult:
+    """
+    Calculate the evaluation results
+
+    Args:
+        y_true: Ground truth tensor of shape (n,).
+        y_logits: Predicted class tensor of shape (n, num_variants, num_classes).
+        prompt_ensembling: The prompt_ensembling mode.
+
+    Returns:
+        EvalResult: The result of evaluating a classifier containing the accuracy,
+        calibrated accuracies, calibrated errors, and AUROC.
+    """
+    acc = calc_accuracies(y_logits=y_logits, y_true=y_true)
+
+    pos_probs = torch.sigmoid(y_logits[..., 1] - y_logits[..., 0])
+    cal_acc, cal_thresh = (
+        calc_calibrated_accuracies(y_true=y_true, pos_probs=pos_probs)
+        if num_classes == 2
+        else None,
+        None,
+    )
+    cal_err = (
+        calc_calibrated_errors(y_true=y_true, pos_probs=pos_probs)
+        if num_classes == 2
+        else None
+    )
+
+    auroc = calc_auroc(
+        y_logits=y_logits,
+        y_true=y_true,
+        ensembling=prompt_ensembling,
+        num_classes=num_classes,
+    )

     return EvalResult(acc, cal_acc, cal_err, auroc, cal_thresh)
@@ -127,3 +231,49 @@ def to_one_hot(labels: Tensor, n_classes: int) -> Tensor:
     """
     one_hot_labels = labels.new_zeros(*labels.shape, n_classes)
     return one_hot_labels.scatter_(-1, labels.unsqueeze(-1).long(), 1)
+
+
+def layer_ensembling(
+    layer_outputs: list[LayerOutput], prompt_ensembling: PromptEnsembling
+) -> EvalResult:
+    """
+    Return EvalResult after prompt_ensembling
+    the probe output of the middle to last layers
+
+    Args:
+        layer_outputs: A list of LayerOutput containing the ground truth and
+        predicted class tensor of shape (n, num_variants, num_classes).
+        prompt_ensembling: The prompt_ensembling mode.
+
+    Returns:
+        EvalResult: The result of evaluating a classifier containing the accuracy,
+        calibrated accuracies, calibrated errors, and AUROC.
+    """
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    y_logits_collection = []
+
+    num_classes = 2
+    y_true = layer_outputs[0].val_gt.to(device)
+
+    for layer_output in layer_outputs:
+        # all y_trues are identical, so just get the first
+        y_logits = layer_output.val_credences.to(device)
+        y_logits, y_true, num_classes = prepare(
+            y_logits=y_logits,
+            y_true=layer_outputs[0].val_gt.to(device),
+            prompt_ensembling=prompt_ensembling,
+        )
+        y_logits_collection.append(y_logits)
+
+    # get logits and ground_truth from middle to last layer
+    middle_index = len(layer_outputs) // 2
Collaborator: In some ways I think we should allow the layers over which we ensemble to be configurable. E.g. sometimes the last layers perform worse.

Author: Yeah, it makes sense to make it configurable. However, I'm curious: how would you decide which layers to pick?
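A minimal sketch of what a configurable layer range could look like (hypothetical; the `select_layer_logits` helper and its `layer_indices` parameter are not part of this PR):

```python
from typing import Sequence

import torch


def select_layer_logits(
    y_logits_collection: list[torch.Tensor],
    layer_indices: Sequence[int] | None = None,
) -> torch.Tensor:
    """Stack the logits of the chosen layers; default to middle-to-last."""
    if layer_indices is None:
        # current PR behaviour: ensemble the middle to last layers
        layer_indices = range(len(y_logits_collection) // 2, len(y_logits_collection))
    return torch.stack([y_logits_collection[i] for i in layer_indices])
```

Callers could then pass something like `layer_indices=range(10, 20)` from a config option, while the default keeps the current middle-to-last behaviour.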
+    y_logits_stacked = torch.stack(y_logits_collection[middle_index:])
+    # layer prompt_ensembling of the stacked logits
+    y_logits_stacked_mean = torch.mean(y_logits_stacked, dim=0)
Collaborator: It seems like the ensembling is done by taking the mean over layers, rather than concatenating. This isn't super clear from comments/docstrings, and hard to tell from reading the code because the shapes aren't commented.
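For reference, the reduction here is a mean over the layer axis. A small sketch with the shapes written out (the concrete sizes are made up; with `PromptEnsembling.FULL`, `prepare` would already have averaged away the variant axis, so each per-layer tensor would be `(n, num_classes)` instead):

```python
import torch

num_layers_used, n, num_variants, num_classes = 12, 100, 5, 2

# one logits tensor per ensembled layer, each of shape (n, num_variants, num_classes)
y_logits_collection = [
    torch.randn(n, num_variants, num_classes) for _ in range(num_layers_used)
]

# stack over layers -> (num_layers_used, n, num_variants, num_classes)
y_logits_stacked = torch.stack(y_logits_collection)

# mean over dim=0, the layer axis -> (n, num_variants, num_classes);
# the layers are averaged, not concatenated
y_logits_stacked_mean = torch.mean(y_logits_stacked, dim=0)
assert y_logits_stacked_mean.shape == (n, num_variants, num_classes)
```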
+
+    return calc_eval_results(
+        y_true=y_true,
+        y_logits=y_logits_stacked_mean,
+        prompt_ensembling=prompt_ensembling,
+        num_classes=num_classes,
+    )
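A rough usage sketch for the new entry point (the import paths, sizes, and random credences are assumptions for illustration, not taken from this PR):

```python
import torch

# import paths assumed from the relative imports in the diff
from elk.metrics.eval import LayerOutput, layer_ensembling
from elk.utils.types import PromptEnsembling

num_layers, n, num_variants, num_classes = 24, 100, 5, 2

# fake per-layer probe outputs; the ground truth is shared across layers
val_gt = torch.randint(0, num_classes, (n,))
layer_outputs = [
    LayerOutput(
        val_gt=val_gt,
        val_credences=torch.randn(n, num_variants, num_classes),
        meta={"layer": i},
    )
    for i in range(num_layers)
]

result = layer_ensembling(layer_outputs, PromptEnsembling.PARTIAL)
print(result.to_dict())
```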