Changes from all commits (161 commits)
da52c7c
Add Pegasus wrapper requirement
Jul 8, 2020
d7ca092
Add parameter files
Jul 8, 2020
b2e862b
Add vistautils requirement
Jul 8, 2020
e1adb5a
Initial Pegasus skeleton
Jul 8, 2020
b01a1db
Update root parameters file
Jul 9, 2020
57fe8b2
Pass real job parameters to Pegasus
Jul 9, 2020
f9cd332
Clean up some
Jul 9, 2020
bae796e
Store more job info
Jul 9, 2020
6fcd5d1
Fix _includes in runner.params
Jul 9, 2020
feb5167
More pegasus stuff
Jul 9, 2020
ed3b77c
Use simple loop to read in job parameters for Pegasus runner
Jul 9, 2020
57d8c85
Create parameters files mirroring Hydra configs
Jul 9, 2020
1963323
Clean up SAGA code
Jul 9, 2020
0a23da0
Modify train.py to take parameters files as input
Jul 9, 2020
fc03fb3
Don't overwrite actual parameters with combinations parameters.
Jul 9, 2020
917c595
Fix style issues
Jul 9, 2020
57750a8
Move Slurm configuration into configuration file
Jul 9, 2020
5beadd0
Remove old TODO
Jul 9, 2020
c5bdf99
Fix usage of parameters_only_entry_point()
Jul 10, 2020
55ef547
Fix runner params.
Jul 10, 2020
b9e4cc4
Fix resource request creation.
Jul 10, 2020
40510fd
Fix parameters file.
Jul 10, 2020
d4aeda1
Fix params->mapping conversion.
Jul 10, 2020
5617c4a
Fix parameter name.
Jul 10, 2020
8da14a2
Update resource request creation.
Jul 10, 2020
f77bdab
Fix partition in runner.params
Jul 10, 2020
7709ff5
No task2 goes in _default directory.
Jul 10, 2020
6022168
Start converting ensemble.py
Jul 10, 2020
63379d1
Fix typo
Jul 10, 2020
74adcb6
Fix another typo
Jul 10, 2020
f6a26da
Fix another typo
Jul 10, 2020
2e7fca8
Set up dependencies properly
Jul 10, 2020
e7ec4a0
Remove old comment
Jul 10, 2020
b61eca8
Delete unnecessary parameter
Jul 10, 2020
aa046dc
Fix ensembling parameters setup
Jul 10, 2020
2d84ed8
Re-add depends_on argument
Jul 10, 2020
c15eb85
Eliminate some redundant informatoin in ensembling parameters
Jul 10, 2020
8bb859e
Convert ensembling script to use parameters
Jul 10, 2020
cc1b973
Style fix
spigo900 Jul 10, 2020
ba1ff93
Move backend parameter to another parameter file
spigo900 Jul 10, 2020
5b7937f
Add missing EOL at ends of files
spigo900 Jul 10, 2020
7bb7f15
Temporarily disable use of ephemeral
spigo900 Jul 13, 2020
da07237
Fix invalid reference problem with Pegasus
spigo900 Jul 13, 2020
5758962
Fix another invalid reference in Pegasus script
spigo900 Jul 13, 2020
0d65f18
Specify name in setup.py
spigo900 Jul 13, 2020
4580ffe
Move name back into setup.py from setup.cfg.
spigo900 Jul 13, 2020
29fc1c7
Add missing comma
spigo900 Jul 13, 2020
6a1e9ec
Fix typo
spigo900 Jul 13, 2020
c788a7f
Fix Python references again
spigo900 Jul 13, 2020
11d2fdf
Fix crash when running train.py script
spigo900 Jul 13, 2020
e6c7ef0
Re-fix references in pegasus.py
spigo900 Jul 13, 2020
e4ab577
Try including model parameters in Pegasus train job parameters
spigo900 Jul 13, 2020
eaca966
Fix parameter _includes reference
spigo900 Jul 13, 2020
32d8cda
Fix incorrect memory parameter name
spigo900 Jul 13, 2020
08e3148
Specify time limit
spigo900 Jul 14, 2020
727785d
Fix typo
spigo900 Jul 14, 2020
87dae91
Specify ensembling output correctly in Pegasus
spigo900 Jul 14, 2020
dd72e90
Fix typo
spigo900 Jul 14, 2020
5ee4d2e
Fix issue with "do nothing" options
spigo900 Jul 14, 2020
1159d87
Specify in Pegasus not to use pretrained model when training
spigo900 Jul 14, 2020
7ab0820
Correctly set time limit in 'not alphanli' branch
spigo900 Jul 14, 2020
39cae0d
Don't print parameter options (handled by vistautils)
spigo900 Jul 14, 2020
94a93da
Increase memory request size
spigo900 Jul 14, 2020
c11ea14
Fix how task data paths are defined in parameters
spigo900 Jul 15, 2020
fe37980
Fix typo (_include vs. _includes)
spigo900 Jul 15, 2020
c0b587d
train_data_slice is no longer an int
spigo900 Jul 15, 2020
b09a36b
Treat train_data_slice as an int
spigo900 Jul 15, 2020
7dbff58
Fix data params again
spigo900 Jul 15, 2020
af35c9f
Fix typo
spigo900 Jul 15, 2020
3370ab7
Fix another typo
spigo900 Jul 15, 2020
9676232
Fix yet another typo
spigo900 Jul 15, 2020
2f566c1
Add quotes to includes for consistency
spigo900 Jul 15, 2020
9208b67
Add radically smaller-scale Pegasus script parameters for development
spigo900 Jul 15, 2020
e71a4ef
Rename Pegasus parameter file for consistency
spigo900 Jul 15, 2020
4c78bf8
Clean up Pegasus parameter files
spigo900 Jul 15, 2020
21a6cb3
Specify missing configuration key
spigo900 Jul 15, 2020
e5004cd
Fix typo
spigo900 Jul 15, 2020
cc982d3
Provide task data to ensemble script properly
spigo900 Jul 15, 2020
6f204a4
Fix deprecation warning
spigo900 Jul 15, 2020
170567b
Fix another typo
spigo900 Jul 15, 2020
0100bd1
Fix yet another typo
spigo900 Jul 15, 2020
dc26fe8
Explicitly convert to string when making model name
spigo900 Jul 15, 2020
f156d68
More string conversions
spigo900 Jul 15, 2020
cf5f753
Access parameters as key, not property
spigo900 Jul 15, 2020
921f922
Fix: Store model name, not model map in best_model_per_seed_group
spigo900 Jul 15, 2020
3dbd882
Fix typo
spigo900 Jul 15, 2020
24c8d52
Fix how ensemble output file name is resolved
spigo900 Jul 15, 2020
4a55d71
Refactor AlphaNLI special case slightly
spigo900 Jul 15, 2020
46d37fe
Don't log config (vistautils does this already)
spigo900 Jul 15, 2020
679fd42
Refactor how "without {factor}" ensembling is done
spigo900 Jul 15, 2020
3605367
Delete config directory
spigo900 Jul 15, 2020
293c3c2
Comment Pegasus script more
spigo900 Jul 24, 2020
fd0aa9f
Better typing for parameter_combinations list
spigo900 Jul 24, 2020
db1bc9a
Fix eval.py to use parameters/eval.params
spigo900 Jul 28, 2020
caeea41
PIQA: Switch to internal dev set to match base
spigo900 Jul 30, 2020
4b4e308
Fix invalid reference problem with Pegasus
spigo900 Jul 13, 2020
7b3c794
Delete config directory
spigo900 Jul 15, 2020
7972358
Use parameters better in train.py
spigo900 Jul 28, 2020
3fae60b
Save to experiment root by default instead of project root
spigo900 Jul 28, 2020
3088bd3
Emulate Hydra dated-saving behavior
spigo900 Jul 28, 2020
a737204
Remove Hydra-related training parameters
spigo900 Jul 28, 2020
2e126e8
Convert eval.py to use parameters
spigo900 Jul 28, 2020
d16a13b
eval.py: Fix checkpoint path parameter
spigo900 Jul 29, 2020
294ddc2
eval.py: Fix random seeding
spigo900 Jul 29, 2020
5be58a2
Revert "eval.py: Fix random seeding"
spigo900 Jul 29, 2020
25680b3
eval.py: Match how train.py gets random seed
spigo900 Jul 29, 2020
244f934
train.py: Rework dated-saving functionality
spigo900 Jul 29, 2020
e56bcd5
train.py: Fix how output directory is logged
spigo900 Jul 29, 2020
c3186e1
train.py: Fix call to evaluate using wrong parameters
spigo900 Jul 29, 2020
7464af2
train.py: Fix build_on_pretrained_model parameter
spigo900 Jul 29, 2020
39d2d68
Don't pass build_on_pretrained_model in pegasus.py
spigo900 Jul 29, 2020
50ac2c4
Pass save_by_date_and_parameters=False to Pegasus training jobs
spigo900 Jul 29, 2020
ef764e9
Don't pass build_on_pretrained_model in training parameters file
spigo900 Jul 29, 2020
f4dfbd2
Try to fix SLURM scripts (except cross_task_eval)
spigo900 Jul 28, 2020
1c10d3f
Grab all needed parameters at beginning of Pegasus script
spigo900 Jul 28, 2020
3ca623f
Implement combination-specific overrides
spigo900 Jul 28, 2020
abbbe4c
Pollute training parameters less in Pegasus workflow
spigo900 Jul 28, 2020
172a66c
dict.get() can't take default as a keyword argument
spigo900 Jul 29, 2020
f8592e3
Fix: Multiply by number of possible values
spigo900 Jul 29, 2020
f849ab3
Fix override matching function
spigo900 Jul 29, 2020
28364d6
Fix pegasus-dev overrides so that they make sense
spigo900 Jul 29, 2020
e75f752
Fix override_matches some more
spigo900 Jul 29, 2020
c3c60d2
Fix override_matches even more
spigo900 Jul 29, 2020
e4801f8
Fix override complexity and matching
spigo900 Jul 29, 2020
df1b7ce
Finally fix override code
spigo900 Jul 29, 2020
4adff05
Clean up override code slightly
spigo900 Jul 29, 2020
aaa7ece
Change pegasus-dev experiment root
spigo900 Jul 29, 2020
f171767
Pegasus: Fix how model parameters are passed to training script
spigo900 Jul 30, 2020
412844f
Train: Pass other parameters to model
spigo900 Jul 30, 2020
1acb61e
Add .items()
spigo900 Jul 30, 2020
9fd6047
Fix training parameter files
spigo900 Jul 30, 2020
4a4f162
training: Fix parameter name in files
spigo900 Jul 30, 2020
69ce3b1
Ensemble: Fix try_without
spigo900 Jul 30, 2020
1d1814d
Fix dev ensembling params
spigo900 Jul 30, 2020
6b77a09
Fix how eval.py gets model name
spigo900 Jul 30, 2020
cbe7e6e
Fix how eval.py loads the model configuration
spigo900 Jul 30, 2020
b869efc
eval.py: Include other parameters in model configuration
spigo900 Jul 30, 2020
c1814e0
Fix deprecation warning in eval.py
spigo900 Jul 30, 2020
0482a81
Override testing: Change nonsense parameter instead of real one
spigo900 Jul 30, 2020
4e35777
Rename nonsense parameter
spigo900 Jul 30, 2020
3d9128d
Fix typo
spigo900 Jul 30, 2020
c04cf32
Use ephemeral in pegasus-dev configuration
spigo900 Jul 31, 2020
56a39b9
Fix partition setup in Pegasus workflow
spigo900 Aug 3, 2020
3d7c7b4
Add full-workflow development parameters
spigo900 Aug 3, 2020
976009e
Shorter time limit for AlphaNLI in pegasus-dev-full.params
spigo900 Aug 3, 2020
9183aef
Fix how gold labels are found and frontload parameter-getting
spigo900 Aug 4, 2020
d1e9e4a
Catch only FileNotFoundError in "couldn't find preds" exception block
spigo900 Aug 4, 2020
4b8f34a
Fix how model_without_seed is constructed
spigo900 Aug 4, 2020
7e20de8
Shorten line
spigo900 Aug 4, 2020
da14165
Pegasus: Pass gold labels to ensemble script in new way
spigo900 Aug 4, 2020
e7e8f39
Ensemble: Rename variable to match parameter task_to_threshold
spigo900 Aug 4, 2020
24983ad
Pegasus: Remove unnecessary line
spigo900 Aug 4, 2020
5a7582c
Ensemble: Include parameters in the dict of successful models
spigo900 Aug 4, 2020
1c69984
Pegasus: Fix how iteration over ensembling tasks is done
spigo900 Aug 4, 2020
9b192bd
Pegasus: Actually add task_to_gold parameters to ensembling parameters
spigo900 Aug 4, 2020
0ed048e
Pegasus: Do job throttling
spigo900 Aug 5, 2020
bf5665f
Pegasus: Pass imported modules instead of strings to run
spigo900 Aug 5, 2020
95510e2
Check the list of tasks against
spigo900 Aug 10, 2020
0ab6443
Rearrange comment
spigo900 Aug 10, 2020
8f36dc3
Document how to run the ensembling workflow
spigo900 Aug 14, 2020
0eb9806
Change AlphaNLI, HellaSwag, and SIQA to use internal dev sets
spigo900 Aug 18, 2020
27 changes: 27 additions & 0 deletions README.md
@@ -66,6 +66,33 @@ python eval.py \
--output pred.lst
```

## Ensembling using Pegasus

### Setup

The ensembling workflow is defined and run using the Pegasus workflow management system. To run the
workflow, you'll need to install the [Pegasus wrapper][pegasus_wrapper].

Note that before running you'll need to set up your user-specific parameters file,
`parameters/root.params`. See `parameters/root.sample.params` for an example.
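
As a rough illustration only (the keys below are placeholders, not the definitive schema; `parameters/root.sample.params` is the authoritative reference), a user-specific `root.params` might look like:

```yaml
# Hypothetical user-specific values -- copy root.sample.params and adjust.
experiment_root: /nas/home/your_username/experiments/ai2
partition: ephemeral
```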

### Running the workflow

Once the wrapper is installed, generate the workflow:

```bash
python ai2/pegasus.py parameters/pegasus.params
```

Then submit the workflow:

```bash
cd path/to/experiment_root/ensemble
sh submit.sh
```

[pegasus_wrapper]: https://github.com/isi-vista/vista-pegasus-wrapper/

## Results

### PIQA
File renamed without changes.
193 changes: 193 additions & 0 deletions ai2/ensemble.py
@@ -0,0 +1,193 @@
import csv
import itertools
import os
import numpy as np
from collections import Counter, defaultdict
from typing import Mapping, Any
import heapq

from more_itertools import powerset
from sklearn.metrics import accuracy_score
import pandas as pd
from scipy.stats.stats import pearsonr

from vistautils.parameters_only_entrypoint import parameters_only_entry_point
from vistautils.parameters import Parameters


def get_model_name(model: Mapping[str, Any]) -> str:
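# Join the option values of the model's (parameter, option) pairs into one name,
# e.g. (illustrative) parameters [("task", "alphanli"), ("random_seed", 42)] -> "alphanli_42".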
return "_".join(str(option) for parameter, option in model['parameters'])


def main(params: Parameters):
def run_ensemble(predictions_df, confidences_df, subset):
# confidences_df[confidences_df < 0.2] = 0 # Set low confidence values to 0.
# confidences_df = confidences_df.eq(confidences_df.where(confidences_df != 0).max(1), axis=0).astype(int) # Get the most confident

relevant_confidences = confidences_df[subset]
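# Weighted vote: sum each model's per-example confidence vector across the subset,
# then pick the argmax choice; 1-indexed tasks (socialiqa, alphanli) are shifted below.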
weighted_votes = relevant_confidences.sum(axis=1).apply(np.argmax).to_numpy()
if task in ['socialiqa', 'alphanli']: weighted_votes += 1
final_predictions = weighted_votes.tolist()
stats = []
for _ in range(accuracy_bootstrapping_samples):
indices = [i for i in np.random.randint(0, len(final_predictions), size=len(final_predictions))]
stats.append(accuracy_score([labels[j] for j in indices], [final_predictions[j] for j in indices]))

# Calculate the confidence interval and log it to console
alpha = 0.95
p = ((1.0 - alpha) / 2.0) * 100
lower = max(0.0, np.percentile(stats, p))
p = (alpha + ((1.0 - alpha) / 2.0)) * 100
upper = min(1.0, np.percentile(stats, p))
accuracy = accuracy_score(labels, final_predictions)
print(f'Accuracy: {accuracy}, {alpha * 100:.1f} confidence interval {lower * 100:.1f} and {upper * 100:.1f}, '
f'average: {np.mean(stats) * 100:.1f}')

# print(f'{accuracy},{[int(i in subset) for i in model_to_path.keys()]}'.replace(' ','').replace('[','').replace(']','')) # CSV
# unweighted_votes = predictions_df[subset].mode(axis=1).tolist()
return round(accuracy*100,2)

all_results = {}

task_to_threshold = params.namespace('task_to_threshold').as_nested_dicts()
task_to_gold = params.namespace('task_to_gold')

# Check that all the necessary namespaces and files exist before we go training on them.
gold_labels_paths = {}
for task in task_to_threshold.keys():
gold_labels_paths[task] = task_to_gold.namespace(task).existing_file('val_y')

# Check that all the necessary per-task model lists are present in the parameters.
task_to_models = {}
for task in task_to_threshold.keys():
models_for_task = params.namespace('models').arbitrary_list(task)
task_to_models[task] = models_for_task

data_sizes = params.arbitrary_list('data_sizes')
try_without = params.arbitrary_list('try_without')

accuracy_bootstrapping_samples = params.integer('accuracy_bootstrapping_samples')
output_file = params.creatable_file('output_file')

for task in task_to_threshold.keys():
task_models = task_to_models[task]
labels = pd.read_csv(gold_labels_paths[task], sep='\t', header=None).values.squeeze().tolist()
for data_size in data_sizes:
results = {}
print(f'\nRunning ensemble for {task.upper()}, {data_size}')
relevant_models = [model for model in task_models if model['train_data_slice'] == data_size]

best_score_per_seed_group = defaultdict(float)
best_model_per_seed_group = defaultdict(str)
successful_models = {}
model_to_predictions = {}
model_to_confidences = {}
# Get Accuracies
print('Accuracy of each model:')
for model in relevant_models:
try:
preds = pd.read_csv(model['predictions'], sep='\t', header=None).values.squeeze().tolist()
confs = pd.read_csv(model['confidence'], sep='\t', header=None).values.squeeze().tolist()
accuracy = accuracy_score(labels, preds)

model_name = get_model_name(model)
successful_models[model_name] = {'accuracy': accuracy, 'parameters': dict(model['parameters'])}
model_to_predictions[model_name] = preds
model_to_confidences[model_name] = confs
print(f'{model_name},{round(accuracy*100,2)}')
model_without_task_data_size = '_'.join(
str(option) for parameter, option in model['parameters']
if parameter not in {'task', 'train_data_slice'}
)
results[model_without_task_data_size] = round(accuracy*100,2)

# model_without_seed = model.strip('_'+model.split('_')[-1])
model_without_seed = '_'.join(
str(option) for parameter, option in model['parameters']
if parameter != 'random_seed'
)
if accuracy > best_score_per_seed_group[model_without_seed]:
best_score_per_seed_group[model_without_seed] = accuracy
best_model_per_seed_group[model_without_seed] = model_name
except FileNotFoundError:
print(f'Couldn\'t find preds for {model}')
continue

# Compare Models
# print('Compare pairs of predictions of each model')
# print('ID1,ID22,Pred Sim,Pred Cor,Correctness Cor,Confidence Cor,ConfCor Both Correct,ConfCor One Correct,ConfCor Both Wrong')
# for id1, id2 in itertools.combinations(relevant_models, 2):
# model1, rs1 = tuple(id1.split('_'))
# model2, rs2 = tuple(id2.split('_'))
# if model1 != model2 and rs1 != rs2: continue # skip if both the model and rs are different
# preds1, conf1 = model_to_predictions[id1], model_to_confidences[id1]
# correctness1 = [int(p == labels[i]) for i, p in enumerate(preds1)]
# preds2, conf2 = model_to_predictions[id2], model_to_confidences[id2]
# correctness2 = [int(p == labels[i]) for i, p in enumerate(preds2)]
# # ConfCor Both Correct
# ccbc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] and correctness2[i]]))[0]
# # ConfCor Only One Correct
# ccoc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] != correctness2[i]]))[0]
# # ConfCor Both Wrong
# ccbw = \
# pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] == correctness2[i] == 0]))[
# 0]
# print(
# f'{id1},{id2},{accuracy_score(preds1, preds2)},{pearsonr(preds1, preds2)[0]},{pearsonr(correctness1, correctness2)[0]},{pearsonr(conf1, conf2)[0]},{ccbc},{ccoc},{ccbw}')
# print('\n')

predictions_df = pd.DataFrame.from_dict(model_to_predictions)
confidences_df = pd.DataFrame.from_dict(model_to_confidences).applymap(np.asarray)
# print(f'accuracy,{list(model_to_path.keys())}'.replace(' ','').replace('\'','').replace('[','').replace(']','')) # print for csv
# Grid search for ensembling
# ensemble_results = {}
# for subset in powerset(successful_models):
# if len(subset) <= 1: continue
# subset = list(subset)
# ensemble_results[tuple(subset)]=run_ensemble(predictions_df, confidences_df, subset)
# best = heapq.nlargest(10, ensemble_results, key=ensemble_results.get)
# print(ensemble_results[best[0]])
# best_performers = [m for ms in best for m in ms]
# counts = Counter(best_performers)
# print(counts.most_common())

print(best_model_per_seed_group)
print(best_score_per_seed_group)
print('Ensemble of all models:')
all_accuracy = run_ensemble(predictions_df, confidences_df, [
m for m, d in successful_models.items() if d['accuracy'] > task_to_threshold[d['parameters']['task']]
])
results['Ensemble - All'] = all_accuracy

print('Ensemble of best-per-architecture:')
best_per_seed_accuracy = run_ensemble(predictions_df, confidences_df, [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()])
# if task != 'physicaliqa' and task != 'alphanli':
# confidences_df[[best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()]].to_csv(f'{task}_conf_ensemble.csv')

results['Ensemble - best-per-architecture'] = best_per_seed_accuracy
results['Ensemble Improvement best-per-architecture vs all'] = round(best_per_seed_accuracy-all_accuracy,2)
print('Ensemble Improvement best per arc vs all:', results['Ensemble Improvement best-per-architecture vs all'])

for factor in try_without:
without_factor = [m for m in successful_models if factor not in m]
print(f'Without {factor}:')
# print(without_factor)
wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor)
results[f'Ensemble - Without {factor}'] = wf_accuracy

without_factor_per_arc = [m for m in [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()] if factor not in m]
print(f'Best-per-arc without {factor}:')
# print(without_factor_per_arc)
bpa_wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor_per_arc)
results[f'Best-per-arc without {factor}'] = bpa_wf_accuracy
# if factor == 'embed_all_sep_mean' and (task == 'physicaliqa' or task == 'alphanli'):
# confidences_df[without_factor_per_arc].to_csv(f'{task}_conf_ensemble.csv')

all_results[task + '_' + str(data_size)] = results

df = pd.DataFrame.from_dict(all_results)
df.to_csv(output_file, na_rep='-')


if __name__ == '__main__':
parameters_only_entry_point(main)
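
The parameter structure `ensemble.py` expects can be read off the accessors above. A minimal sketch (all paths and values are placeholders, and the exact `.params` syntax follows vistautils conventions; the real files live under `parameters/`):

```yaml
task_to_threshold:
  alphanli: 0.5            # placeholder accuracy threshold for inclusion in the "all" ensemble
  physicaliqa: 0.5
task_to_gold:
  alphanli:
    val_y: /path/to/alphanli/dev-labels.lst
  physicaliqa:
    val_y: /path/to/physicaliqa/dev-labels.lst
models:
  alphanli:
    - predictions: /path/to/run1/predictions.lst
      confidence: /path/to/run1/confidence.lst
      train_data_slice: 100
      parameters:            # ordered (parameter, option) pairs used to build the model name
        - ["task", "alphanli"]
        - ["random_seed", 42]
data_sizes: [100]
try_without: ["roberta"]
accuracy_bootstrapping_samples: 10000
output_file: /path/to/ensemble_results.csv
```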
74 changes: 46 additions & 28 deletions eval.py → ai2/eval.py
@@ -1,58 +1,76 @@
from pathlib import Path
from typing import List, Union
from typing import List, Union, Any

import hydra
from loguru import logger
import numpy as np
import omegaconf
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

from model import Classifier

# Save root path as hydra will create copies of this code in a folder
ROOT_PATH = Path(__file__).parent.absolute()


# If script is executed by itself, load in the configuration yaml file and desired checkpoint model
@hydra.main(config_path="config/eval.yaml")
def main(config: omegaconf.Config):
config = omegaconf.OmegaConf.to_container(config)
logger.info(config)
from vistautils.parameters import Parameters
from vistautils.parameters_only_entrypoint import parameters_only_entry_point

from ai2.model import Classifier


def main(params: Parameters):
checkpoint_path = params.existing_file('checkpoint_path')
results_path = params.creatable_file('results_path')
val_x_file = params.existing_file('val_x')
val_y_file = params.optional_existing_file('val_y')
with_true_label = params.boolean('with_true_label')
if with_true_label and val_y_file is None:
raise RuntimeError(
f'with_true_label set to true but no true labels (val_y) provided! '
)
elif not with_true_label and val_y_file is not None:
raise RuntimeError(
f'with_true_label set to false but got true labels val_y!'
)

model_name = params.string('model.model_name')
task_name = params.string('task_name')
maybe_random_seed = params.get('random_seed', object)

# If the evaluation is deterministic for debugging purposes, we set the random seed
if not isinstance(config['random_seed'], bool):
logger.info(f"Running deterministic model with seed {config['random_seed']}")
np.random.seed(config['random_seed'])
torch.manual_seed(config['random_seed'])
if not isinstance(maybe_random_seed, bool):
if not isinstance(maybe_random_seed, int):
raise RuntimeError(
"Random seed must be either false (i.e. no random seed) "
"or an integer seed!"
)
logger.info(f"Running deterministic model with seed {maybe_random_seed}")
np.random.seed(maybe_random_seed)
torch.manual_seed(maybe_random_seed)
if torch.cuda.is_available():
torch.backends.cuda.deterministic = True
torch.backends.cuda.benchmark = False

# Load in the check pointed model
config = params.namespace('model').as_nested_dicts()
config.update((k, v) for k, v in params.as_nested_dicts().items() if k != 'model')
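# Classifier expects a flat hparams-style dict, so merge the 'model' namespace with
# every other top-level parameter (excluding the 'model' block itself).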
model = Classifier(config)
device = 'cpu' if not torch.cuda.is_available() else "cuda"
checkpoint = torch.load(ROOT_PATH / config['checkpoint_path'], map_location=device)
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['state_dict'])

save_path = Path(f"{config['model']}-{config['task_name']}-s{config['random_seed']}")
save_path = Path(f"{model_name}-{task_name}-s{maybe_random_seed}")
save_path.mkdir(parents=True, exist_ok=True)

# Call the main function with appropriate parameters
evaluate(a_classifier=model,
output_path=save_path,
results_path=results_path,
compute_device=device,
val_x=ROOT_PATH / config['val_x'],
val_y=(ROOT_PATH / config['val_y'] if config['with_true_label'] else None))
val_x=val_x_file,
val_y=val_y_file)


# Function to perform the evaluation (This was separated out to be called in train script)
def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_device: str,
val_x: Union[str, Path], val_y: Union[str, Path] = None):
def evaluate(a_classifier: Classifier, output_path: Union[str, Path], results_path: Union[str, Path],
compute_device: str, val_x: Union[str, Path], val_y: Union[str, Path] = None):
# Move model to device and set to evaluation mode
a_classifier.to(compute_device)
a_classifier.eval()
Expand Down Expand Up @@ -88,7 +106,7 @@ def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_de

stats = []
for _ in range(10000):
indices = [i for i in np.random.random_integers(0, len(predictions) - 1, size=len(predictions))]
indices = [i for i in np.random.randint(0, len(predictions), size=len(predictions))]
stats.append(accuracy_score([labels[j] for j in indices], [predictions[j] for j in indices]))

# Calculate the confidence interval and log it to console
@@ -101,10 +119,10 @@ def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_de
f'average: {np.mean(stats) * 100:.1f}')

# Log eval result
with open(ROOT_PATH / f"results.txt", "a+") as resultf:
with open(results_path, "a+") as resultf:
resultf.write(f'{output_path},Accuracy-lower-upper-average,{accuracy_score(labels, predictions):.3f},'
f'{lower * 100:.1f},{upper * 100:.1f},{np.mean(stats) * 100:.1f}\n')


if __name__ == "__main__":
main()
parameters_only_entry_point(main)
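
Read off the accessors in `main` above, an `eval.params` file would roughly take this shape (paths and values are placeholders; `parameters/eval.params` in the repository is the authoritative version):

```yaml
checkpoint_path: /path/to/checkpoints/best.ckpt
results_path: /path/to/results.txt
val_x: /path/to/task/dev.jsonl
val_y: /path/to/task/dev-labels.lst
with_true_label: true
task_name: alphanli
random_seed: 42              # or false for non-deterministic evaluation
model:
  model_name: roberta-large
  # ... any other hyperparameters the Classifier consumes
```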
File renamed without changes.
10 changes: 5 additions & 5 deletions model.py → ai2/model.py
@@ -50,9 +50,9 @@ def __init__(self, hparams):
self.label_offset = 0

# Load Transformer model from cache files (encoder and tokenizer)
self.embedder = AutoModel.from_pretrained(hparams["model"], cache_dir=self.root_path / "model_cache")
self.embedder = AutoModel.from_pretrained(hparams["model_name"], cache_dir=self.root_path / "model_cache")
self.tokenizer = \
AutoTokenizer.from_pretrained(hparams["model"], cache_dir=self.root_path / "model_cache", use_fast=False)
AutoTokenizer.from_pretrained(hparams["model_name"], cache_dir=self.root_path / "model_cache", use_fast=False)
self.embedder.train()
self.dropout = nn.Dropout(hparams["dropout"])

@@ -76,13 +76,13 @@ def forward(self, batch):
assert len(batch["attention_mask"].shape) == 2, "LM only take two-dimensional input"
assert len(batch["token_type_ids"].shape) == 2, "LM only take two-dimensional input"

batch["token_type_ids"] = None if "roberta" in self.hparams["model"] or "lm_finetuned" \
in self.hparams["model"] else batch["token_type_ids"]
batch["token_type_ids"] = None if "roberta" in self.hparams["model_name"] or "lm_finetuned" \
in self.hparams["model_name"] else batch["token_type_ids"]
results = self.embedder(input_ids=batch["input_ids"],
attention_mask=batch["attention_mask"],
token_type_ids=batch["token_type_ids"])

if 't5' in self.hparams["model"]:
if 't5' in self.hparams["model_name"]:
results = self.embedder(input_ids=batch["input_ids"],
decoder_input_ids=batch["input_ids"], )
