diff --git a/README.md b/README.md index 9b7b052b..12b210fc 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,33 @@ python eval.py \ --output pred.lst ``` +## Ensembling using Pegasus + +### Setup + +The ensembling workflow is defined and run using the Pegasus workflow management system. To run the +workflow, you'll need to install the [Pegasus wrapper][pegasus_wrapper]. + +Note that before running you'll need to set up your user-specific parameters file, +`parameters/root.params`. See `parameters/root.sample.params` for an example. + +### Running the workflow + +Once the wrapper is installed, generate the workflow: + +```bash +python ai2/pegasus.py parameters/pegasus.params +``` + +Then submit the workflow: + +```bash +cd path/to/experiment_root/ensemble +sh submit.sh +``` + +[pegasus_wrapper]: https://github.com/isi-vista/vista-pegasus-wrapper/ + ## Results ### PIQA diff --git a/accuracy.py b/ai2/accuracy.py similarity index 100% rename from accuracy.py rename to ai2/accuracy.py diff --git a/ai2/ensemble.py b/ai2/ensemble.py new file mode 100644 index 00000000..a4c86d8c --- /dev/null +++ b/ai2/ensemble.py @@ -0,0 +1,193 @@ +import csv +import itertools +import os +import numpy as np +from collections import Counter, defaultdict +from typing import Mapping, Any +import heapq + +from more_itertools import powerset +from sklearn.metrics import accuracy_score +import pandas as pd +from scipy.stats.stats import pearsonr + +from vistautils.parameters_only_entrypoint import parameters_only_entry_point +from vistautils.parameters import Parameters + + +def get_model_name(model: Mapping[str, Any]) -> str: + return "_".join(str(option) for parameter, option in model['parameters']) + + +def main(params: Parameters): + def run_ensemble(predictions_df, confidences_df, subset): + # confidences_df[confidences_df < 0.2] = 0 # Set low confidence values to 0. + # confidences_df = confidences_df.eq(confidences_df.where(confidences_df != 0).max(1), axis=0).astype(int) # Get the most confident + + relevant_confidences = confidences_df[subset] + weighted_votes = relevant_confidences.sum(axis=1).apply(np.argmax).to_numpy() + if task in ['socialiqa', 'alphanli']: weighted_votes += 1 + final_predictions = weighted_votes.tolist() + stats = [] + for _ in range(accuracy_bootstrapping_samples): + indices = [i for i in np.random.randint(0, len(final_predictions), size=len(final_predictions))] + stats.append(accuracy_score([labels[j] for j in indices], [final_predictions[j] for j in indices])) + + # Calculate the confidence interval and log it to console + alpha = 0.95 + p = ((1.0 - alpha) / 2.0) * 100 + lower = max(0.0, np.percentile(stats, p)) + p = (alpha + ((1.0 - alpha) / 2.0)) * 100 + upper = min(1.0, np.percentile(stats, p)) + accuracy = accuracy_score(labels, final_predictions) + print(f'Accuracy: {accuracy}, {alpha * 100:.1f} confidence interval {lower * 100:.1f} and {upper * 100:.1f}, ' + f'average: {np.mean(stats) * 100:.1f}') + + # print(f'{accuracy},{[int(i in subset) for i in model_to_path.keys()]}'.replace(' ','').replace('[','').replace(']','')) # CSV + # unweighted_votes = predictions_df[subset].mode(axis=1).too_nutolist() + return round(accuracy*100,2) + + all_results = {} + + task_to_threshold = params.namespace('task_to_threshold').as_nested_dicts() + task_to_gold = params.namespace('task_to_gold') + + # Check that all the necessary namespaces and files exist before we go training on them. 
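+    # Illustrative shape of the parameters this script consumes (they are assembled by ai2/pegasus.py): + #   task_to_threshold: {alphanli: 0.6, physicaliqa: 0.6, ...} + #   task_to_gold: {alphanli: {val_y: <gold labels file>}, ...} + #   models: {alphanli: [{train_data_slice, parameters, predictions, confidence}, ...], ...}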
+ gold_labels_paths = {} + for task in task_to_threshold.keys(): + gold_labels_paths[task] = task_to_gold.namespace(task).existing_file('val_y') + + # Check that all the necessary models are defined for every task as well. + task_to_models = {} + for task in task_to_threshold.keys(): + models_for_task = params.namespace('models').arbitrary_list(task) + task_to_models[task] = models_for_task + + data_sizes = params.arbitrary_list('data_sizes') + try_without = params.arbitrary_list('try_without') + + accuracy_bootstrapping_samples = params.integer('accuracy_bootstrapping_samples') + output_file = params.creatable_file('output_file') + + for task in task_to_threshold.keys(): + task_models = task_to_models[task] + labels = pd.read_csv(gold_labels_paths[task], sep='\t', header=None).values.squeeze().tolist() + for data_size in data_sizes: + results = {} + print(f'\nRunning ensemble for {task.upper()}, {data_size}') + relevant_models = [model for model in task_models if model['train_data_slice'] == data_size] + + best_score_per_seed_group = defaultdict(float) + best_model_per_seed_group = defaultdict(str) + successful_models = {} + model_to_predictions = {} + model_to_confidences = {} + # Get Accuracies + print('Accuracy of each model:') + for model in relevant_models: + try: + preds = pd.read_csv(model['predictions'], sep='\t', header=None).values.squeeze().tolist() + confs = pd.read_csv(model['confidence'], sep='\t', header=None).values.squeeze().tolist() + accuracy = accuracy_score(labels, preds) + + model_name = get_model_name(model) + successful_models[model_name] = {'accuracy': accuracy, 'parameters': dict(model['parameters'])} + model_to_predictions[model_name] = preds + model_to_confidences[model_name] = confs + print(f'{model_name},{round(accuracy*100,2)}') + model_without_task_data_size = '_'.join( + str(option) for parameter, option in model['parameters'] + if parameter not in {'task', 'train_data_slice'} + ) + results[model_without_task_data_size] = round(accuracy*100,2) + + # model_without_seed = model.strip('_'+model.split('_')[-1]) + model_without_seed = '_'.join( + str(option) for parameter, option in model['parameters'] + if parameter != 'random_seed' + ) + if accuracy > best_score_per_seed_group[model_without_seed]: + best_score_per_seed_group[model_without_seed] = accuracy + best_model_per_seed_group[model_without_seed] = model_name + except FileNotFoundError: + print(f'Couldn\'t find preds for {model}') + continue + + # Compare Models + # print('Compare pairs of predictions of each model') + # print('ID1,ID2,Pred Sim,Pred Cor,Correctness Cor,Confidence Cor,ConfCor Both Correct,ConfCor One Correct,ConfCor Both Wrong') + # for id1, id2 in itertools.combinations(relevant_models, 2): + # model1, rs1 = tuple(id1.split('_')) + # model2, rs2 = tuple(id2.split('_')) + # if model1 != model2 and rs1 != rs2: continue # skip if both the model and rs are different + # preds1, conf1 = model_to_predictions[id1], model_to_confidences[id1] + # correctness1 = [int(p == labels[i]) for i, p in enumerate(preds1)] + # preds2, conf2 = model_to_predictions[id2], model_to_confidences[id2] + # correctness2 = [int(p == labels[i]) for i, p in enumerate(preds2)] + # # ConfCor Both Correct + # ccbc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] and correctness2[i]]))[0] + # # ConfCor Only One Correct + # ccoc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] != correctness2[i]]))[0] + # # ConfCor Both Wrong + # ccbw = \ + # pearsonr(*zip(*[(conf1[i], conf2[i]) for i in
range(len(preds1)) if correctness1[i] == correctness2[i] == 0]))[ + # 0] + # print( + # f'{id1},{id2},{accuracy_score(preds1, preds2)},{pearsonr(preds1, preds2)[0]},{pearsonr(correctness1, correctness2)[0]},{pearsonr(conf1, conf2)[0]},{ccbc},{ccoc},{ccbw}') + # print('\n') + + predictions_df = pd.DataFrame.from_dict(model_to_predictions) + confidences_df = pd.DataFrame.from_dict(model_to_confidences).applymap(np.asarray) + # print(f'accuracy,{list(model_to_path.keys())}'.replace(' ','').replace('\'','').replace('[','').replace(']','')) # print for csv + # Grid search for ensembling + # ensemble_results = {} + # for subset in powerset(successful_models): + # if len(subset) <= 1: continue + # subset = list(subset) + # ensemble_results[tuple(subset)]=run_ensemble(predictions_df, confidences_df, subset) + # best = heapq.nlargest(10, ensemble_results, key=ensemble_results.get) + # print(ensemble_results[best[0]]) + # best_performers = [m for ms in best for m in ms] + # counts = Counter(best_performers) + # print(counts.most_common()) + + print(best_model_per_seed_group) + print(best_score_per_seed_group) + print('Ensemble of all models:') + all_accuracy = run_ensemble(predictions_df, confidences_df, [ + m for m, d in successful_models.items() if d['accuracy'] > task_to_threshold[d['parameters']['task']] + ]) + results['Ensemble - All'] = all_accuracy + + print('Ensemble of best-per-architecture:', ) + best_per_seed_accuracy = run_ensemble(predictions_df, confidences_df, [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()]) + # if task != 'physicaliqa' and task != 'alphanli': + # confidences_df[[best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()]].to_csv(f'{task}_conf_ensemble.csv') + + results['Ensemble - best-per-architecture'] = best_per_seed_accuracy + results['Ensemble Improvement best-per-architecture vs all'] = round(best_per_seed_accuracy-all_accuracy,2) + print('Ensemble Improvement best per arc vs all:', results['Ensemble Improvement best-per-architecture vs all']) + + for factor in try_without: + without_factor = [m for m in successful_models if factor not in m] + print(f'Without {factor}:') + # print(without_factor) + wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor) + results[f'Ensemble - Without {factor}'] = wf_accuracy + + without_factor_per_arc = [m for m in [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()] if factor not in m] + print(f'Best-per-arc without {factor}:') + # print(without_factor_per_arc) + bpa_wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor_per_arc) + results[f'Best-per-arc without {factor}'] = bpa_wf_accuracy + # if factor == 'embed_all_sep_mean' and (task == 'physicaliqa' or task == 'alphanli'): + # confidences_df[without_factor_per_arc].to_csv(f'{task}_conf_ensemble.csv') + + all_results[task + '_' + str(data_size)] = results + + df = pd.DataFrame.from_dict(all_results) + df.to_csv(output_file, na_rep='-') + + +if __name__ == '__main__': + parameters_only_entry_point(main) \ No newline at end of file diff --git a/eval.py b/ai2/eval.py similarity index 62% rename from eval.py rename to ai2/eval.py index 37031627..dc1952b3 100644 --- a/eval.py +++ b/ai2/eval.py @@ -1,58 +1,76 @@ from pathlib import Path -from typing import List, Union +from typing import List, Union, Any -import hydra from loguru import logger import numpy as np -import omegaconf import pandas as pd from sklearn.metrics import accuracy_score import torch import torch.nn.functional as F 
from torch.utils.data import DataLoader from tqdm import tqdm - -from model import Classifier - -# Save root path as hydra will create copies of this code in a folder -ROOT_PATH = Path(__file__).parent.absolute() - - -# If script is executed by itself, load in the configuration yaml file and desired checkpoint model -@hydra.main(config_path="config/eval.yaml") -def main(config: omegaconf.Config): - config = omegaconf.OmegaConf.to_container(config) - logger.info(config) +from vistautils.parameters import Parameters +from vistautils.parameters_only_entrypoint import parameters_only_entry_point + +from ai2.model import Classifier + + +def main(params: Parameters): + checkpoint_path = params.existing_file('checkpoint_path') + results_path = params.creatable_file('results_path') + val_x_file = params.existing_file('val_x') + val_y_file = params.optional_existing_file('val_y') + with_true_label = params.boolean('with_true_label') + if with_true_label and val_y_file is None: + raise RuntimeError( + 'with_true_label set to true but no true labels (val_y) provided!' + ) + elif not with_true_label and val_y_file is not None: + raise RuntimeError( + 'with_true_label set to false but got true labels val_y!' + ) + + model_name = params.string('model.model_name') + task_name = params.string('task_name') + maybe_random_seed = params.get('random_seed', object) # If the evaluation is deterministic for debugging purposes, we set the random seed - if not isinstance(config['random_seed'], bool): - logger.info(f"Running deterministic model with seed {config['random_seed']}") - np.random.seed(config['random_seed']) - torch.manual_seed(config['random_seed']) + if not isinstance(maybe_random_seed, bool): + if not isinstance(maybe_random_seed, int): + raise RuntimeError( + "Random seed must be either false (i.e. no random seed) " + "or an integer seed!"
+ ) + logger.info(f"Running deterministic model with seed {maybe_random_seed}") + np.random.seed(maybe_random_seed) + torch.manual_seed(maybe_random_seed) if torch.cuda.is_available(): torch.backends.cuda.deterministic = True torch.backends.cuda.benchmark = False # Load in the check pointed model + config = params.namespace('model').as_nested_dicts() + config.update((k, v) for k, v in params.as_nested_dicts().items() if k != 'model') model = Classifier(config) device = 'cpu' if not torch.cuda.is_available() else "cuda" - checkpoint = torch.load(ROOT_PATH / config['checkpoint_path'], map_location=device) + checkpoint = torch.load(checkpoint_path, map_location=device) model.load_state_dict(checkpoint['state_dict']) - save_path = Path(f"{config['model']}-{config['task_name']}-s{config['random_seed']}") + save_path = Path(f"{model_name}-{task_name}-s{maybe_random_seed}") save_path.mkdir(parents=True, exist_ok=True) # Call the main function with appropriate parameters evaluate(a_classifier=model, output_path=save_path, + results_path=results_path, compute_device=device, - val_x=ROOT_PATH / config['val_x'], - val_y=(ROOT_PATH / config['val_y'] if config['with_true_label'] else None)) + val_x=val_x_file, + val_y=val_y_file) # Function to perform the evaluation (This was separated out to be called in train script) -def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_device: str, - val_x: Union[str, Path], val_y: Union[str, Path] = None): +def evaluate(a_classifier: Classifier, output_path: Union[str, Path], results_path: Union[str, Path], + compute_device: str, val_x: Union[str, Path], val_y: Union[str, Path] = None): # Move model to device and set to evaluation mode a_classifier.to(compute_device) a_classifier.eval() @@ -88,7 +106,7 @@ def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_de stats = [] for _ in range(10000): - indices = [i for i in np.random.random_integers(0, len(predictions) - 1, size=len(predictions))] + indices = [i for i in np.random.randint(0, len(predictions), size=len(predictions))] stats.append(accuracy_score([labels[j] for j in indices], [predictions[j] for j in indices])) # Calculate the confidence interval and log it to console @@ -101,10 +119,10 @@ def evaluate(a_classifier: Classifier, output_path: Union[str, Path], compute_de f'average: {np.mean(stats) * 100:.1f}') # Log eval result - with open(ROOT_PATH / f"results.txt", "a+") as resultf: + with open(results_path, "a+") as resultf: resultf.write(f'{output_path},Accuracy-lower-upper-average,{accuracy_score(labels, predictions):.3f},' f'{lower * 100:.1f},{upper * 100:.1f},{np.mean(stats) * 100:.1f}\n') if __name__ == "__main__": - main() + parameters_only_entry_point(main) diff --git a/filter_submission_model.py b/ai2/filter_submission_model.py similarity index 100% rename from filter_submission_model.py rename to ai2/filter_submission_model.py diff --git a/model.py b/ai2/model.py similarity index 98% rename from model.py rename to ai2/model.py index 7972ce9f..7d7b6784 100644 --- a/model.py +++ b/ai2/model.py @@ -50,9 +50,9 @@ def __init__(self, hparams): self.label_offset = 0 # Load Transformer model from cache files (encoder and tokenizer) - self.embedder = AutoModel.from_pretrained(hparams["model"], cache_dir=self.root_path / "model_cache") + self.embedder = AutoModel.from_pretrained(hparams["model_name"], cache_dir=self.root_path / "model_cache") self.tokenizer = \ - AutoTokenizer.from_pretrained(hparams["model"], cache_dir=self.root_path / "model_cache",
use_fast=False) + AutoTokenizer.from_pretrained(hparams["model_name"], cache_dir=self.root_path / "model_cache", use_fast=False) self.embedder.train() self.dropout = nn.Dropout(hparams["dropout"]) @@ -76,13 +76,13 @@ def forward(self, batch): assert len(batch["attention_mask"].shape) == 2, "LM only take two-dimensional input" assert len(batch["token_type_ids"].shape) == 2, "LM only take two-dimensional input" - batch["token_type_ids"] = None if "roberta" in self.hparams["model"] or "lm_finetuned" \ - in self.hparams["model"] else batch["token_type_ids"] + batch["token_type_ids"] = None if "roberta" in self.hparams["model_name"] or "lm_finetuned" \ + in self.hparams["model_name"] else batch["token_type_ids"] results = self.embedder(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], token_type_ids=batch["token_type_ids"]) - if 't5' in self.hparams["model"]: + if 't5' in self.hparams["model_name"]: results = self.embedder(input_ids=batch["input_ids"], decoder_input_ids=batch["input_ids"], ) diff --git a/ai2/pegasus.py b/ai2/pegasus.py new file mode 100644 index 00000000..fdac0f33 --- /dev/null +++ b/ai2/pegasus.py @@ -0,0 +1,215 @@ +from typing import Mapping, List, Tuple, Any, cast +from pathlib import Path + +from vistautils.iter_utils import only +from vistautils.parameters import Parameters, YAMLParametersLoader +from vistautils.parameters_only_entrypoint import parameters_only_entry_point +from pegasus_wrapper import ( + initialize_vista_pegasus_wrapper, + directory_for, + # experiment_directory, + run_python_on_parameters, + limit_jobs_for_category, + write_workflow_description, +) +from pegasus_wrapper.resource_request import ResourceRequest +from pegasus_wrapper.locator import Locator +from pegasus_wrapper.artifact import ValueArtifact + +import ai2.train +import ai2.ensemble + +TIME_LIMIT_HOURS_NOT_ALPHANLI = 12 # Time limit in hours for tasks other than AlphaNLI +MINUTES_PER_HOUR = 60 + +# Default limit on the number of jobs that will run on MICS at once +DEFAULT_MAX_JOBS_ON_MICS = 2 + +# Represents a parameter combination as a list of (parameter_name, value) tuples. +ParameterCombination = List[Tuple[str, Any]] + + +def main(params: Parameters): + initialize_vista_pegasus_wrapper(params) + + params_root = params.existing_directory('project_root') / 'parameters' + parameter_options = params.namespace('parameter_options').as_nested_dicts() + + max_jobs_on_mics = params.integer('max_jobs_on_mics', default=DEFAULT_MAX_JOBS_ON_MICS) + + ensemble_params = params.namespace('ensemble') + data_sizes = params.arbitrary_list('parameter_options.train_data_slice') + ensemble_output_file_name = ensemble_params.string('output_file_name') + + # Compute all possible combinations of the parameters + parameter_combinations: List[ParameterCombination] = [[]] + for parameter_name, options in parameter_options.items(): + new_combinations = [] + for combination in parameter_combinations: + for option in options: + new_combination = combination + [(parameter_name, option)] + new_combinations.append(new_combination) + parameter_combinations = new_combinations + + # Process combination-specific overrides + training_overrides = sorted( + list(params.namespace_or_empty('training_overrides') + .as_nested_dicts() + .values()), + key=lambda override: override_generality(override, parameter_options), + ) + + # Training phase. + # Schedule training jobs for each parameter combination. Their outputs will be under "{experiment_root}/models". 
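+    # For example, the combination (task='alphanli', train_data_slice=100, task2='', architecture='standard', + # random_seed=0) yields the option strings ('alphanli', '100', '_default', 'standard', '0'), + # which determine the job's locator (and hence its output directory) under "models".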
+ model_outputs_locator = Locator(('models',)) + task_to_jobs_info = {} + for i, combination in enumerate(parameter_combinations): + task: str = only(option for parameter, option in combination if parameter == 'task') + train_data_slice: int = only(option for parameter, option in combination if parameter == 'train_data_slice') + options: Tuple[str] = tuple(str(option) if option != '' else '_default' for _, option in combination) + locator = model_outputs_locator / Locator(options) + + # Read in combination-specific parameters + job_params = params.from_key_value_pairs([('model', params.namespace('model'))]) + job_params = job_params.unify(Parameters.from_key_value_pairs(combination, namespace_separator=None)) + params_root = params.existing_directory('project_root') / 'parameters' + for parameter, option in combination: + if option != '': + parameter_directory = params_root / parameter + if parameter_directory.exists(): + option_params: Parameters = YAMLParametersLoader().load( + parameter_directory / f'{option}.params' + ) + job_params = job_params.unify(option_params) + + # Because the job parameters tend to indirectly include root.params, which includes a + # default partition, we need to override the partition setting to reflect our input + # parameters. + job_params = job_params.unify({'partition': params.string('partition')}) + + # Process overrides + for override in training_overrides: + if override_matches(override, dict(combination)): + job_params = job_params.unify({ + parameter_option: value for parameter_option, value in override.items() + if parameter_option != 'parameter_options' + }) + + # Messy parameters input. This shouldn't matter to ResourceRequest, though. Maybe clean up + # later. + resource_request = ResourceRequest.from_parameters( + params.unify(job_params) + ) + + # Set common parameters and schedule the job. + save_path = directory_for(locator) + job_params = job_params.unify({ + 'save_path': save_path, + 'save_best_only': False, + 'save_by_date_and_parameters': False, + 'eval_after_training': True, + }) + job = run_python_on_parameters( + locator, + ai2.train, + job_params, + depends_on=[], + resource_request=resource_request + ) + + # We track job info so that it can be fed to the ensembling script. + jobs_info = task_to_jobs_info.get(task, []) + jobs_info.append({ + 'job': job, + 'train_data_slice': train_data_slice, + 'parameters': combination, + 'predictions': ValueArtifact(locator=locator, value=Path('predictions.lst')), + 'confidence': ValueArtifact(locator=locator, value=Path('confidence.lst')), + }) + task_to_jobs_info[task] = jobs_info + + # Ensembling phase. + ensemble_locator = Locator(('ensembled',)) + + # Load the gold labels for every task covered by ensembling. + for task in ensemble_params.namespace('task_to_threshold').as_nested_dicts().keys(): + task_params = YAMLParametersLoader().load( + params_root / 'task' / f'{task}.params' + ) + ensemble_params = ensemble_params.unify({'task_to_gold': { + task: {'val_y': task_params.existing_file('val_y')} + }}) + if not task_to_jobs_info.get(task): + raise RuntimeError( + "Each task with a threshold must appear in the list of task parameters." + ) + + ensemble_params = ensemble_params.unify({ + 'data_sizes': data_sizes, + 'output_file': directory_for(ensemble_locator) / ensemble_output_file_name, + }) + + # Make a list of models and the relevant job info for the ensembling script to use. It needs to + # know, for example, where to find their predictions and their confidences. 
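+    # Each entry records one training job's data slice, its parameter combination, and the paths + # where that job will write its predictions.lst and confidence.lst.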
+ for task, jobs_info in task_to_jobs_info.items(): + models_list = [] + for job_info in jobs_info: + predictions_path = directory_for(job_info['predictions'].locator) / job_info['predictions'].value + confidence_path = directory_for(job_info['confidence'].locator) / job_info['confidence'].value + + models_list.append({ + 'train_data_slice': job_info['train_data_slice'], + 'parameters': job_info['parameters'], + 'predictions': predictions_path, + 'confidence': confidence_path, + }) + + ensemble_params = ensemble_params.unify({ + 'models': {task: models_list} + }) + + run_python_on_parameters( + ensemble_locator, + ai2.ensemble, + ensemble_params, + depends_on=[ + job_info['job'] + for jobs_info in task_to_jobs_info.values() + for job_info in jobs_info + ] + ) + + # Limit number of jobs that will run at once on MICS account/partition + limit_jobs_for_category(category='mics', max_jobs=max_jobs_on_mics) + + write_workflow_description() + + +OVERRIDE_FILTER_KEY = 'parameter_options' + + +def override_generality(override: Mapping[str, Any], parameter_combinations: Mapping[str, List[Any]]) -> int: + """ + Returns the generality of an override with respect to the given mapping of possible parameter + combinations. + + The generality of an override is the number of configurations it applies to. A bigger number + indicates a more general override. The generality is computed as the total number of parameter + combinations the options dict applies to. + """ + override_options = cast(Mapping[str, List[Any]], override[OVERRIDE_FILTER_KEY]) + complexity = 1 + for parameter_name, all_possible_values in parameter_combinations.items(): + allowed_values = override_options.get(parameter_name, all_possible_values) + complexity *= len(allowed_values) + return complexity + + +def override_matches(override: Mapping[str, Any], parameter_combination: Mapping[str, Any]) -> bool: + override_options = cast(Mapping[str, List[Any]], override[OVERRIDE_FILTER_KEY]) + return all(parameter_combination.get(parameter_name) in allowed_values + for parameter_name, allowed_values in override_options.items()) + + +if __name__ == '__main__': + parameters_only_entry_point(main) diff --git a/predict.py b/ai2/predict.py similarity index 98% rename from predict.py rename to ai2/predict.py index 292cf736..99d85a8f 100644 --- a/predict.py +++ b/ai2/predict.py @@ -11,7 +11,7 @@ from tqdm import tqdm import numpy as np -from model import Classifier +from ai2.model import Classifier # Parse the input file from JSONL to a list of dictionaries. 
@@ -40,7 +40,7 @@ def main(input_file, output_file): 'random_seed': ckpt.strip('.ckpt').split('_')[-1], 'architecture': 'standard', 'with_true_label': True, - 'model': "roberta-large", + 'model_name': "roberta-large", 'accumulate_grad_batches': 8, 'use_amp': False, # Half precision only works best with Volta architectures such as V100 'max_epochs': 4, diff --git a/results_to_csv.py b/ai2/results_to_csv.py similarity index 100% rename from results_to_csv.py rename to ai2/results_to_csv.py diff --git a/ai2/train.py b/ai2/train.py new file mode 100644 index 00000000..45d247d3 --- /dev/null +++ b/ai2/train.py @@ -0,0 +1,125 @@ +from datetime import datetime +import random + +from vistautils.parameters_only_entrypoint import parameters_only_entry_point +from vistautils.parameters import Parameters +from loguru import logger +import numpy as np +from pytorch_lightning import Trainer +from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.loggers import TestTubeLogger +import torch + +from ai2.eval import evaluate +from ai2.model import Classifier + + +# Date and time formats for saving dated/timed outputs +DATE_FORMAT = '%Y-%m-%d' +TIME_FORMAT = '%H-%M-%S' + + +def train(params: Parameters): + # Load all expected parameters at the start so that if we're missing one, we crash immediately + # instead of wasting time finishing half the script. + save_path = params.optional_creatable_directory('save_path') + save_by_date_and_parameters = params.boolean('save_by_date_and_parameters') + save_best_only = params.boolean('save_best_only') + build_on_pretrained_model = params.optional_existing_file('build_on_pretrained_model') + + model_name = params.string('model.model_name') + task_name = params.string('task_name') + task_name2 = params.optional_string('task_name2') + architecture = params.string('architecture') + train_data_slice = params.integer('train_data_slice') + maybe_random_seed = params.get('random_seed', object) + + eval_after_training = params.boolean('eval_after_training') + val_x_file = params.existing_file('val_x') + val_y_file = params.existing_file('val_y') + + # If the training is deterministic for debugging purposes, we set the random seed + if not isinstance(maybe_random_seed, bool): + if not isinstance(maybe_random_seed, int): + raise RuntimeError( + "Random seed must be either false (i.e. no random seed) " + "or an integer seed!"
+ ) + logger.info(f"Running deterministic model with seed {maybe_random_seed}") + torch.manual_seed(maybe_random_seed) + np.random.seed(maybe_random_seed) + random.seed(maybe_random_seed) + if torch.cuda.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + # Initialize the classifier by arguments specified in config file + config = params.namespace('model').as_nested_dicts() + config.update((k, v) for k, v in params.as_nested_dicts().items() if k != 'model') + model = Classifier(config) + logger.info('Initialized classifier.') + + if save_by_date_and_parameters: + now = datetime.now() + date = now.strftime(DATE_FORMAT) + time = now.strftime(TIME_FORMAT) + save_path = save_path / date / time / f"{model_name}_{task_name}-{train_data_slice}_{architecture}_s{maybe_random_seed}" + if task_name2: + save_path = save_path.with_name(save_path.name + f"_{task_name2}") + + if build_on_pretrained_model: + logger.info('Loading pretrained checkpoint...') + device = 'cpu' if not torch.cuda.is_available() else "cuda" + checkpoint = torch.load(build_on_pretrained_model, map_location=device) + model.load_state_dict(checkpoint['state_dict']) + save_path = save_path.with_name(save_path.name + f"_pretrained_{str(build_on_pretrained_model).split('/')[-1].split('.')[0]}") + logger.info(f'Output directory: {save_path}') + + # Define the trainer along with its checkpoint and experiment instance + checkpoint = ModelCheckpoint( + filepath=str(save_path / 'checkpoints' / 'foo'), # Last part needed due to parsing logic + verbose=True, + save_top_k=1 if save_best_only else -1, + ) + tt_logger = TestTubeLogger( + save_dir=str(save_path), + name=task_name, + version=0, + ) + tt_logger.experiment.autosave = True + # We pass the trainer parameters using the values from config (rather than params) to + # better reflect the parameters we passed the model.
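+    # The Trainer reads accumulate_grad_batches, max_epochs, and use_amp (16- vs 32-bit precision) + # from the merged config assembled above.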
+ trainer = Trainer( + logger=tt_logger, + checkpoint_callback=checkpoint, + gradient_clip_val=0, + gpus=list(range(torch.cuda.device_count())) if torch.cuda.is_available() else None, + log_gpu_memory="all", + progress_bar_refresh_rate=1, + check_val_every_n_epoch=1, + accumulate_grad_batches=config["accumulate_grad_batches"], + max_epochs=config["max_epochs"], + min_epochs=1, + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0, + log_save_interval=25, + row_log_interval=25, + distributed_backend="dp", + precision=16 if config["use_amp"] else 32, + weights_summary='top', + num_sanity_val_steps=5, + ) + trainer.fit(model) + logger.success('Training Completed') + + if eval_after_training: + logger.info('Start model evaluation') + # Evaluate the model with evaluate function from eval.py + evaluate(a_classifier=model, output_path=save_path, results_path=save_path / "results.txt", + compute_device=('cpu' if not torch.cuda.is_available() else "cuda"), + val_x=val_x_file, val_y=val_y_file) + + +if __name__ == "__main__": + parameters_only_entry_point(train) diff --git a/config/eval.yaml b/config/eval.yaml deleted file mode 100644 index 182915de..00000000 --- a/config/eval.yaml +++ /dev/null @@ -1,10 +0,0 @@ -defaults: - - model: roberta-large - - task: physicaliqa -checkpoint_path: "outputs/2020-03-20/16-37-40/outputs/checkpoints/_ckpt_epoch_4.ckpt" -random_seed: 42 -architecture: standard -with_true_label: True -hydra: - run: - dir: outputs \ No newline at end of file diff --git a/config/model/roberta-large.yaml b/config/model/roberta-large.yaml deleted file mode 100644 index bf93bbbc..00000000 --- a/config/model/roberta-large.yaml +++ /dev/null @@ -1,10 +0,0 @@ -model: "roberta-large" -accumulate_grad_batches: 8 -use_amp: false # Half precision only works best with Volta architectures such as V100 -max_epochs: 4 -learning_rate: 5e-6 -adam_epsilon: 1e-8 -warmup_steps: 300 -batch_size: 3 -dropout: 0.3 -max_length: 128 diff --git a/config/task/alphanli.yaml b/config/task/alphanli.yaml deleted file mode 100644 index 1cb47751..00000000 --- a/config/task/alphanli.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name: alphanli -train_x: "task_data/alphanli-train-dev/train.jsonl" -train_y: "task_data/alphanli-train-dev/train-labels.lst" -val_x: "task_data/alphanli-train-dev/internal-dev.jsonl" -val_y: "task_data/alphanli-train-dev/internal-dev-labels.lst" -formula: "obs1 + obs2 -> hyp1|hyp2" diff --git a/config/task/hellaswag.yaml b/config/task/hellaswag.yaml deleted file mode 100644 index 419d84dc..00000000 --- a/config/task/hellaswag.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name: hellaswag -train_x: "task_data/hellaswag-train-dev/train.jsonl" -train_y: "task_data/hellaswag-train-dev/train-labels.lst" -val_x: "task_data/hellaswag-train-dev/internal-dev.jsonl" -val_y: "task_data/hellaswag-train-dev/internal-dev-labels.lst" -formula: "ctx_a + ctx_b -> ending_options" diff --git a/config/task/physicaliqa.yaml b/config/task/physicaliqa.yaml deleted file mode 100644 index fa04bfe3..00000000 --- a/config/task/physicaliqa.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name: physicaliqa -train_x: "task_data/physicaliqa-train-dev/train.jsonl" -train_y: "task_data/physicaliqa-train-dev/train-labels.lst" -val_x: "task_data/physicaliqa-train-dev/internal-dev.jsonl" -val_y: "task_data/physicaliqa-train-dev/internal-dev-labels.lst" -formula: "goal -> sol1|sol2" diff --git a/config/task/socialiqa.yaml b/config/task/socialiqa.yaml deleted file mode 100644 index 0b43b61e..00000000 --- 
a/config/task/socialiqa.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name: socialiqa -train_x: "task_data/socialiqa-train-dev/train.jsonl" -train_y: "task_data/socialiqa-train-dev/train-labels.lst" -val_x: "task_data/socialiqa-train-dev/internal-dev.jsonl" -val_y: "task_data/socialiqa-train-dev/internal-dev-labels.lst" -formula: "context + question -> answerA|answerB|answerC" diff --git a/config/task2/cn_10k.yaml b/config/task2/cn_10k.yaml deleted file mode 100644 index 2f67ac23..00000000 --- a/config/task2/cn_10k.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name2: cn_all_cs_10k -train2_x: "task_data/cn_all_cs_10k-train-dev/train.jsonl" -train2_y: "task_data/cn_all_cs_10k-train-dev/train-labels.lst" -val2_x: "task_data/cn_all_cs_10k-train-dev/dev.jsonl" -val2_y: "task_data/cn_all_cs_10k-train-dev/dev-labels.lst" -formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" \ No newline at end of file diff --git a/config/task2/cn_20k.yaml b/config/task2/cn_20k.yaml deleted file mode 100644 index d70249fb..00000000 --- a/config/task2/cn_20k.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name2: cn_all_cs_20k -train2_x: "task_data/cn_all_cs_20k-train-dev/train.jsonl" -train2_y: "task_data/cn_all_cs_20k-train-dev/train-labels.lst" -val2_x: "task_data/cn_all_cs_20k-train-dev/dev.jsonl" -val2_y: "task_data/cn_all_cs_20k-train-dev/dev-labels.lst" -formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" \ No newline at end of file diff --git a/config/task2/cn_40k.yaml b/config/task2/cn_40k.yaml deleted file mode 100644 index c99d1034..00000000 --- a/config/task2/cn_40k.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name2: cn_all_cs_40k -train2_x: "task_data/cn_all_cs_40k-train-dev/train.jsonl" -train2_y: "task_data/cn_all_cs_40k-train-dev/train-labels.lst" -val2_x: "task_data/cn_all_cs_40k-train-dev/dev.jsonl" -val2_y: "task_data/cn_all_cs_40k-train-dev/dev-labels.lst" -formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" \ No newline at end of file diff --git a/config/task2/cn_physical_10k.yaml b/config/task2/cn_physical_10k.yaml deleted file mode 100644 index e0af7c67..00000000 --- a/config/task2/cn_physical_10k.yaml +++ /dev/null @@ -1,6 +0,0 @@ -task_name2: cn_physical_10k -train2_x: "task_data/cn_physical_10k-train-dev/train.jsonl" -train2_y: "task_data/cn_physical_10k-train-dev/train-labels.lst" -val2_x: "task_data/cn_physical_10k-train-dev/dev.jsonl" -val2_y: "task_data/cn_physical_10k-train-dev/dev-labels.lst" -formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" \ No newline at end of file diff --git a/config/train.yaml b/config/train.yaml deleted file mode 100644 index 57bc0616..00000000 --- a/config/train.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - - model: roberta-large - - task: physicaliqa -random_seed: 10061880 -build_on_pretrained_model: False -save_best_only: True -eval_after_training: True -architecture: standard -train_data_slice: 90 -hydra: - run: - dir: outputs \ No newline at end of file diff --git a/ensemble.py b/ensemble.py deleted file mode 100644 index 1f61497a..00000000 --- a/ensemble.py +++ /dev/null @@ -1,159 +0,0 @@ -import csv -import itertools -import os -import numpy as np -from collections import Counter, defaultdict -import heapq - -from more_itertools import powerset -from sklearn.metrics import accuracy_score -import pandas as pd -from scipy.stats.stats import pearsonr - -tasks_to_threshold = { - 'alphanli':0.6, - 'physicaliqa':0.6, - 'socialiqa':0.6, - 'hellaswag':0.6, -} -models = [name for name in os.listdir("outputs/.") if name != 'slurm'] - -def 
run_ensemble(predictions_df, confidences_df, subset): - # confidences_df[confidences_df < 0.2] = 0 # Set low confidence values to 0. - # confidences_df = confidences_df.eq(confidences_df.where(confidences_df != 0).max(1), axis=0).astype(int) # Get the most confident - - relevant_confidences = confidences_df[subset] - weighted_votes = relevant_confidences.sum(axis=1).apply(np.argmax).to_numpy() - if task in ['socialiqa', 'alphanli']: weighted_votes += 1 - final_predictions = weighted_votes.tolist() - stats = [] - for _ in range(100): # TODO: Use 10K for official reports. 100 is used for quick dev runs. - indices = [i for i in np.random.random_integers(0, len(final_predictions) - 1, size=len(final_predictions))] - stats.append(accuracy_score([labels[j] for j in indices], [final_predictions[j] for j in indices])) - - # Calculate the confidence interval and log it to console - alpha = 0.95 - p = ((1.0 - alpha) / 2.0) * 100 - lower = max(0.0, np.percentile(stats, p)) - p = (alpha + ((1.0 - alpha) / 2.0)) * 100 - upper = min(1.0, np.percentile(stats, p)) - accuracy = accuracy_score(labels, final_predictions) - print(f'Accuracy: {accuracy}, {alpha * 100:.1f} confidence interval {lower * 100:.1f} and {upper * 100:.1f}, ' - f'average: {np.mean(stats) * 100:.1f}') - - # print(f'{accuracy},{[int(i in subset) for i in model_to_path.keys()]}'.replace(' ','').replace('[','').replace(']','')) # CSV - # unweighted_votes = predictions_df[subset].mode(axis=1).too_nutolist() - return round(accuracy*100,2) - -all_results = {} - -for task in tasks_to_threshold.keys(): - # for data_size in ['10','25','90']: - for data_size in ['100']: - results = {} - print(f'\nRunning ensemble for {task.upper()}, {data_size}') - relevant_models = [model for model in models if task in model and data_size == model.split('_')[1]] - # gold_labels_path = f'task_data/{task}-train-dev/internal-dev-labels.lst' - gold_labels_path = f'task_data/{task}-train-dev/dev-labels.lst' - labels = pd.read_csv(gold_labels_path, sep='\t', header=None).values.squeeze().tolist() - - best_score_per_seed_group = defaultdict(float) - best_model_per_seed_group = defaultdict(str) - successful_models = {} - model_to_predictions = {} - model_to_confidences = {} - # Get Accuracies - print('Accuracy of each model:') - for model in relevant_models: - path = 'outputs/'+model - try: - preds = pd.read_csv(path + '/predictions.lst', sep='\t', header=None).values.squeeze().tolist() - confs = pd.read_csv(path + '/confidence.lst', sep='\t', header=None).values.squeeze().tolist() - accuracy = accuracy_score(labels, preds) - - successful_models[model] = accuracy - model_to_predictions[model] = preds - model_to_confidences[model] = confs - print(f'{model},{round(accuracy*100,2)}') - results[model.replace(task+'_'+data_size+'_','')] = round(accuracy*100,2) - - model_without_seed = model.strip('_'+model.split('_')[-1]) - if accuracy > best_score_per_seed_group[model_without_seed]: - best_score_per_seed_group[model_without_seed] = accuracy - best_model_per_seed_group[model_without_seed] = model - except: - print(f'Couldn\'t find preds for {model}') - continue - - # Compare Models - # print('Compare pairs of predictions of each model') - # print('ID1,ID22,Pred Sim,Pred Cor,Correctness Cor,Confidence Cor,ConfCor Both Correct,ConfCor One Correct,ConfCor Both Wrong') - # for id1, id2 in itertools.combinations(relevant_models, 2): - # model1, rs1 = tuple(id1.split('_')) - # model2, rs2 = tuple(id2.split('_')) - # if model1 != model2 and rs1 != rs2: continue # skip if both the 
model and rs are different - # preds1, conf1 = model_to_predictions[id1], model_to_confidences[id1] - # correctness1 = [int(p == labels[i]) for i, p in enumerate(preds1)] - # preds2, conf2 = model_to_predictions[id2], model_to_confidences[id2] - # correctness2 = [int(p == labels[i]) for i, p in enumerate(preds2)] - # # ConfCor Both Correct - # ccbc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] and correctness2[i]]))[0] - # # ConfCor Only One Correct - # ccoc = pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] != correctness2[i]]))[0] - # # ConfCor Both Wrong - # ccbw = \ - # pearsonr(*zip(*[(conf1[i], conf2[i]) for i in range(len(preds1)) if correctness1[i] == correctness2[i] == 0]))[ - # 0] - # print( - # f'{id1},{id2},{accuracy_score(preds1, preds2)},{pearsonr(preds1, preds2)[0]},{pearsonr(correctness1, correctness2)[0]},{pearsonr(conf1, conf2)[0]},{ccbc},{ccoc},{ccbw}') - # print('\n') - - predictions_df = pd.DataFrame.from_dict(model_to_predictions) - confidences_df = pd.DataFrame.from_dict(model_to_confidences).applymap(np.asarray) - # print(f'accuracy,{list(model_to_path.keys())}'.replace(' ','').replace('\'','').replace('[','').replace(']','')) # print for csv - # Grid search for ensembling - # ensemble_results = {} - # for subset in powerset(successful_models): - # if len(subset) <= 1: continue - # subset = list(subset) - # ensemble_results[tuple(subset)]=run_ensemble(predictions_df, confidences_df, subset) - # best = heapq.nlargest(10, ensemble_results, key=ensemble_results.get) - # print(ensemble_results[best[0]]) - # best_performers = [m for ms in best for m in ms] - # counts = Counter(best_performers) - # print(counts.most_common()) - - print(best_model_per_seed_group) - print(best_score_per_seed_group) - print('Ensemble of all models:') - all_accuracy = run_ensemble(predictions_df, confidences_df, [m for m,a in successful_models.items() if a > tasks_to_threshold[m.split('_')[0]]]) - results['Ensemble - All'] = all_accuracy - - print('Ensemble of best-per-architecture:', ) - best_per_seed_accuracy = run_ensemble(predictions_df, confidences_df, [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()]) - # if task != 'physicaliqa' and task != 'alphanli': - # confidences_df[[best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()]].to_csv(f'{task}_conf_ensemble.csv') - - results['Ensemble - best-per-architecture'] = best_per_seed_accuracy - results['Ensemble Improvement best-per-architecture vs all'] = round(best_per_seed_accuracy-all_accuracy,2) - print('Ensemble Improvement best per arc vs all:', results['Ensemble Improvement best-per-architecture vs all']) - - for factor in ['cn_10k', 'standard', 'include_answers_in_context', 'embed_all_sep_mean']: - without_factor = [m for m in successful_models if factor not in m] - print(f'Without {factor}:') - # print(without_factor) - wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor) - results[f'Ensemble - Without {factor}'] = wf_accuracy - - without_factor_per_arc = [m for m in [best_model_per_seed_group[k] for k in best_score_per_seed_group.keys()] if factor not in m] - print(f'Best-per-arc without {factor}:') - # print(without_factor_per_arc) - bpa_wf_accuracy = run_ensemble(predictions_df, confidences_df, without_factor_per_arc) - results[f'Best-per-arc without {factor}'] = bpa_wf_accuracy - # if factor == 'embed_all_sep_mean' and (task == 'physicaliqa' or task == 'alphanli'): - # 
confidences_df[without_factor_per_arc].to_csv(f'{task}_conf_ensemble.csv') - - all_results[task + '_' + data_size] = results - -df = pd.DataFrame.from_dict(all_results) -df.to_csv('ensemble_results_100.csv',na_rep= '-') \ No newline at end of file diff --git a/parameters/.gitignore b/parameters/.gitignore new file mode 100644 index 00000000..b318337a --- /dev/null +++ b/parameters/.gitignore @@ -0,0 +1 @@ +root.params diff --git a/parameters/eval-roberta-large-alphanli.params b/parameters/eval-roberta-large-alphanli.params new file mode 100644 index 00000000..09803f51 --- /dev/null +++ b/parameters/eval-roberta-large-alphanli.params @@ -0,0 +1,6 @@ +_includes: + - "model/roberta-large.params" + - "task/alphanli.params" + - "eval.params" + +checkpoint_path: "%experiments_root%/placeholder/alphanli.ckpt" diff --git a/parameters/eval-roberta-large-hellaswag.params b/parameters/eval-roberta-large-hellaswag.params new file mode 100644 index 00000000..1acc3689 --- /dev/null +++ b/parameters/eval-roberta-large-hellaswag.params @@ -0,0 +1,6 @@ +_includes: + - "model/roberta-large.params" + - "task/hellaswag.params" + - "eval.params" + +checkpoint_path: "%experiments_root%/placeholder/hellaswag.ckpt" diff --git a/parameters/eval-roberta-large-physicaliqa.params b/parameters/eval-roberta-large-physicaliqa.params new file mode 100644 index 00000000..54dc0ae5 --- /dev/null +++ b/parameters/eval-roberta-large-physicaliqa.params @@ -0,0 +1,6 @@ +_includes: + - "model/roberta-large.params" + - "task/physicaliqa.params" + - "eval.params" + +checkpoint_path: "%experiments_root%/placeholder/physicaliqa.ckpt" diff --git a/parameters/eval-roberta-large-socialiqa.params b/parameters/eval-roberta-large-socialiqa.params new file mode 100644 index 00000000..6981b7b0 --- /dev/null +++ b/parameters/eval-roberta-large-socialiqa.params @@ -0,0 +1,6 @@ +_includes: + - "model/roberta-large.params" + - "task/socialiqa.params" + - "eval.params" + +checkpoint_path: "%experiments_root%/placeholder/socialiqa.ckpt" diff --git a/parameters/eval.params b/parameters/eval.params new file mode 100644 index 00000000..2bf0e779 --- /dev/null +++ b/parameters/eval.params @@ -0,0 +1,12 @@ +_includes: + - "root.params" + - "model/roberta-large.params" + - "task/physicaliqa.params" + +experiment_name: "eval" +experiment_root: "%experiments_root%/%experiment_name%" +results_path: "%experiment_root%/results.txt" +checkpoint_path: "%experiments_root%/placeholder/_ckpt_epoch_4.ckpt" +random_seed: 42 +architecture: standard +with_true_label: True diff --git a/parameters/model/roberta-large.params b/parameters/model/roberta-large.params new file mode 100644 index 00000000..1ca07d13 --- /dev/null +++ b/parameters/model/roberta-large.params @@ -0,0 +1,11 @@ +model: + model_name: 'roberta-large' + accumulate_grad_batches: 8 + use_amp: false # Half precision only works best with Volta architectures such as V100 + max_epochs: 4 + learning_rate: 5e-6 + adam_epsilon: 1e-8 + warmup_steps: 300 + batch_size: 3 + dropout: 0.3 + max_length: 128 diff --git a/parameters/pegasus-dev-full.params b/parameters/pegasus-dev-full.params new file mode 100644 index 00000000..7e00e73f --- /dev/null +++ b/parameters/pegasus-dev-full.params @@ -0,0 +1,10 @@ +_includes: + - "./pegasus.params" + +training_overrides: + alphanli: + partition: ephemeral + job_time_in_minutes: 720 # 12 hours -- can't run 20 hours on ephemeral + +experiment_root: '%experiments_root%/ensemble-dev-full' +workflow_directory: '%experiment_root%' diff --git a/parameters/pegasus-dev.params
b/parameters/pegasus-dev.params new file mode 100644 index 00000000..a59a2f39 --- /dev/null +++ b/parameters/pegasus-dev.params @@ -0,0 +1,42 @@ +_includes: + - "root.params" + - "model/roberta-large.params" + +parameter_options: + task: ['physicaliqa'] + train_data_slice: [100] + task2: [''] + architecture: ['standard'] + random_seed: [0, 42] + +training_overrides: + seed0: + parameter_options: + random_seed: [0] + # Nonsense parameter. Used so we can check if the override code is working without actually + # changing the training parameters for seed 0. + use_widget: true + +ensemble: + accuracy_bootstrapping_samples: 100 + output_file_name: 'dev_ensemble_results.csv' + try_without: [] + task_to_threshold: + physicaliqa: 0.6 + # Fill in data_sizes from train_data_slice + +# Based on https://github.com/isi-vista/gaia-event-extraction/blob/d1235671952dff13b7851a96088ef2ecef4996c9/sample_params/neural_trigger_models/pegasus/experiments/tbnamm.baseline.ace.debug.params +workflow_name: george +experiment_root: '%experiments_root%/ensemble-dev' +workflow_directory: '%experiment_root%' +backend: slurm +site: 'saga' +parallelism: 10 +namespace: saga +partition: ephemeral + +num_cpus: 4 +num_gpus: 1 +memory: '16g' +# sbatch --ntasks=1 # already covered, resource_request.py line 133 +job_time_in_minutes: 720 # 12 hours diff --git a/parameters/pegasus.params b/parameters/pegasus.params new file mode 100644 index 00000000..a86834f5 --- /dev/null +++ b/parameters/pegasus.params @@ -0,0 +1,49 @@ +_includes: + - "root.params" + - "model/roberta-large.params" + +parameter_options: + task: ['alphanli', 'hellaswag', 'physicaliqa', 'socialiqa'] + train_data_slice: [100] + task2: ['', 'cn_10k'] + architecture: ['standard', 'include_answers_in_context'] + random_seed: [0, 42, 10061880] + +training_overrides: + hellaswag: + parameter_options: + task: ['hellaswag'] + batch_size: 2 + alphanli: + parameter_options: + task: ['alphanli'] + partition: mics + job_time_in_minutes: 1200 # 20 hours + +ensemble: + accuracy_bootstrapping_samples: 100 # Use 10K for official reports. 100 is used for quick dev runs. + output_file_name: 'ensemble_results_100.csv' + try_without: ['cn_10k', 'standard', 'include_answers_in_context', 'embed_all_sep_mean'] + task_to_threshold: + alphanli: 0.6 + physicaliqa: 0.6 + socialiqa: 0.6 + hellaswag: 0.6 + +# Based on https://github.com/isi-vista/gaia-event-extraction/blob/d1235671952dff13b7851a96088ef2ecef4996c9/sample_params/neural_trigger_models/pegasus/experiments/tbnamm.baseline.ace.debug.params +workflow_name: george +experiment_root: '%experiments_root%/ensemble' +workflow_directory: '%experiment_root%' +backend: slurm +site: 'saga' +parallelism: 10 +namespace: saga +partition: ephemeral + +max_jobs_on_mics: 4 + +num_cpus: 4 +num_gpus: 1 +memory: '16g' +# sbatch --ntasks=1 # already covered, resource_request.py line 133 +job_time_in_minutes: 720 # 12 hours diff --git a/parameters/root.sample.params b/parameters/root.sample.params new file mode 100644 index 00000000..5f469f3a --- /dev/null +++ b/parameters/root.sample.params @@ -0,0 +1,11 @@ +# Based on https://github.com/isi-vista/gaia-event-extraction/blob/master/sample_params/root.sample.params +# This should be the path to the project root directory. +project_root: /nas/minlp/users/mics/jcecil/ai2 +# This should be the directory under which Pegasus experiment directories should be written.
+experiments_root: "%project_root%/experiments" + +# SAGA settings +partition: mics # SLURM partition to run on +# spack_root: /opt/spack/share/spack/setup-env.sh # path to your spack clone +conda_base_path: /nas/home/jcecil/miniconda3 # path to your anaconda install +conda_environment: ai2 diff --git a/parameters/task/alphanli.params b/parameters/task/alphanli.params new file mode 100644 index 00000000..0e5fc2f6 --- /dev/null +++ b/parameters/task/alphanli.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name: alphanli +train_x: "%project_root%/task_data/alphanli-train-dev/train.jsonl" +train_y: "%project_root%/task_data/alphanli-train-dev/train-labels.lst" +val_x: "%project_root%/task_data/alphanli-train-dev/internal-dev.jsonl" +val_y: "%project_root%/task_data/alphanli-train-dev/internal-dev-labels.lst" +formula: "obs1 + obs2 -> hyp1|hyp2" diff --git a/parameters/task/hellaswag.params b/parameters/task/hellaswag.params new file mode 100644 index 00000000..c6cd71dd --- /dev/null +++ b/parameters/task/hellaswag.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name: hellaswag +train_x: "%project_root%/task_data/hellaswag-train-dev/train.jsonl" +train_y: "%project_root%/task_data/hellaswag-train-dev/train-labels.lst" +val_x: "%project_root%/task_data/hellaswag-train-dev/internal-dev.jsonl" +val_y: "%project_root%/task_data/hellaswag-train-dev/internal-dev-labels.lst" +formula: "ctx_a + ctx_b -> ending_options" diff --git a/parameters/task/physicaliqa.params b/parameters/task/physicaliqa.params new file mode 100644 index 00000000..c23087f6 --- /dev/null +++ b/parameters/task/physicaliqa.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name: physicaliqa +train_x: "%project_root%/task_data/physicaliqa-train-dev/train.jsonl" +train_y: "%project_root%/task_data/physicaliqa-train-dev/train-labels.lst" +val_x: "%project_root%/task_data/physicaliqa-train-dev/internal-dev.jsonl" +val_y: "%project_root%/task_data/physicaliqa-train-dev/internal-dev-labels.lst" +formula: "goal -> sol1|sol2" diff --git a/parameters/task/socialiqa.params b/parameters/task/socialiqa.params new file mode 100644 index 00000000..ff1aeef6 --- /dev/null +++ b/parameters/task/socialiqa.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name: socialiqa +train_x: "%project_root%/task_data/socialiqa-train-dev/train.jsonl" +train_y: "%project_root%/task_data/socialiqa-train-dev/train-labels.lst" +val_x: "%project_root%/task_data/socialiqa-train-dev/internal-dev.jsonl" +val_y: "%project_root%/task_data/socialiqa-train-dev/internal-dev-labels.lst" +formula: "context + question -> answerA|answerB|answerC" diff --git a/parameters/task2/cn_10k.params b/parameters/task2/cn_10k.params new file mode 100644 index 00000000..ab795233 --- /dev/null +++ b/parameters/task2/cn_10k.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name2: cn_all_cs_10k +train2_x: "%project_root%/task_data/cn_all_cs_10k-train-dev/train.jsonl" +train2_y: "%project_root%/task_data/cn_all_cs_10k-train-dev/train-labels.lst" +val2_x: "%project_root%/task_data/cn_all_cs_10k-train-dev/dev.jsonl" +val2_y: "%project_root%/task_data/cn_all_cs_10k-train-dev/dev-labels.lst" +formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" diff --git a/parameters/task2/cn_20k.params b/parameters/task2/cn_20k.params new file mode 100644 index 00000000..8cb1b0ff --- /dev/null +++ b/parameters/task2/cn_20k.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name2: cn_all_cs_20k +train2_x: 
"%project_root%/task_data/cn_all_cs_20k-train-dev/train.jsonl" +train2_y: "%project_root%/task_data/cn_all_cs_20k-train-dev/train-labels.lst" +val2_x: "%project_root%/task_data/cn_all_cs_20k-train-dev/dev.jsonl" +val2_y: "%project_root%/task_data/cn_all_cs_20k-train-dev/dev-labels.lst" +formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" diff --git a/parameters/task2/cn_40k.params b/parameters/task2/cn_40k.params new file mode 100644 index 00000000..beff5aa0 --- /dev/null +++ b/parameters/task2/cn_40k.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name2: cn_all_cs_40k +train2_x: "%project_root%/task_data/cn_all_cs_40k-train-dev/train.jsonl" +train2_y: "%project_root%/task_data/cn_all_cs_40k-train-dev/train-labels.lst" +val2_x: "%project_root%/task_data/cn_all_cs_40k-train-dev/dev.jsonl" +val2_y: "%project_root%/task_data/cn_all_cs_40k-train-dev/dev-labels.lst" +formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" diff --git a/parameters/task2/cn_physical_10k.params b/parameters/task2/cn_physical_10k.params new file mode 100644 index 00000000..85b35858 --- /dev/null +++ b/parameters/task2/cn_physical_10k.params @@ -0,0 +1,9 @@ +_includes: + - "../root.params" + +task_name2: cn_physical_10k +train2_x: "%project_root%/task_data/cn_physical_10k-train-dev/train.jsonl" +train2_y: "%project_root%/task_data/cn_physical_10k-train-dev/train-labels.lst" +val2_x: "%project_root%/task_data/cn_physical_10k-train-dev/dev.jsonl" +val2_y: "%project_root%/task_data/cn_physical_10k-train-dev/dev-labels.lst" +formula2: "e1 + e2 -> sol1|sol2|sol3|sol4|sol5|sol6|sol7" diff --git a/parameters/train-pretrained-roberta-large-alphanli.params b/parameters/train-pretrained-roberta-large-alphanli.params new file mode 100644 index 00000000..494cc9b3 --- /dev/null +++ b/parameters/train-pretrained-roberta-large-alphanli.params @@ -0,0 +1,7 @@ +_includes: + - "root.params" + - "train-pretrained-roberta-large.common.params" + - "task/hellaswag.params" + +experiment_name: 'train_pretrained_roberta-large_alphanli' +experiment_root: "%experiments_root%/%experiment_name%" diff --git a/parameters/train-pretrained-roberta-large-hellaswag.params b/parameters/train-pretrained-roberta-large-hellaswag.params new file mode 100644 index 00000000..ea22b1ba --- /dev/null +++ b/parameters/train-pretrained-roberta-large-hellaswag.params @@ -0,0 +1,7 @@ +_includes: + - "root.params" + - "train-pretrained-roberta-large.common.params" + - "task/hellaswag.params" + +experiment_name: 'train_pretrained_roberta-large_hellaswag' +experiment_root: "%experiments_root%/%experiment_name%" diff --git a/parameters/train-pretrained-roberta-large-physicaliqa.params b/parameters/train-pretrained-roberta-large-physicaliqa.params new file mode 100644 index 00000000..6cdace05 --- /dev/null +++ b/parameters/train-pretrained-roberta-large-physicaliqa.params @@ -0,0 +1,7 @@ +_includes: + - "root.params" + - "train-pretrained-roberta-large.common.params" + - "task/physicaliqa.params" + +experiment_name: 'train_pretrained_roberta-large_physicaliqa' +experiment_root: "%experiments_root%/%experiment_name%" diff --git a/parameters/train-pretrained-roberta-large-socialiqa.params b/parameters/train-pretrained-roberta-large-socialiqa.params new file mode 100644 index 00000000..5343e59d --- /dev/null +++ b/parameters/train-pretrained-roberta-large-socialiqa.params @@ -0,0 +1,7 @@ +_includes: + - "root.params" + - "train-pretrained-roberta-large.common.params" + - "task/socialiqa.params" + +experiment_name: 
diff --git a/parameters/train-pretrained-roberta-large.common.params b/parameters/train-pretrained-roberta-large.common.params
new file mode 100644
index 00000000..58274a7e
--- /dev/null
+++ b/parameters/train-pretrained-roberta-large.common.params
@@ -0,0 +1,11 @@
+_includes:
+    - "model/roberta-large.params"
+
+experiment_name: 'train_pretrained'
+random_seed: 42
+save_path: "%experiments_root%/%experiment_name%"
+save_best_only: True
+save_by_date_and_parameters: True
+eval_after_training: True
+architecture: standard
+train_data_slice: 90
diff --git a/parameters/train-roberta-large-alphanli.params b/parameters/train-roberta-large-alphanli.params
new file mode 100644
index 00000000..e07025c1
--- /dev/null
+++ b/parameters/train-roberta-large-alphanli.params
@@ -0,0 +1,7 @@
+_includes:
+    - "root.params"
+    - "train-roberta-large.common.params"
+    - "task/alphanli.params"
+
+experiment_name: 'train_roberta-large_alphanli'
+experiment_root: "%experiments_root%/%experiment_name%"
diff --git a/parameters/train-roberta-large-hellaswag.params b/parameters/train-roberta-large-hellaswag.params
new file mode 100644
index 00000000..0d10ded7
--- /dev/null
+++ b/parameters/train-roberta-large-hellaswag.params
@@ -0,0 +1,7 @@
+_includes:
+    - "root.params"
+    - "train-roberta-large.common.params"
+    - "task/hellaswag.params"
+
+experiment_name: 'train_roberta-large_hellaswag'
+experiment_root: "%experiments_root%/%experiment_name%"
diff --git a/parameters/train-roberta-large-physicaliqa.params b/parameters/train-roberta-large-physicaliqa.params
new file mode 100644
index 00000000..37b75ceb
--- /dev/null
+++ b/parameters/train-roberta-large-physicaliqa.params
@@ -0,0 +1,7 @@
+_includes:
+    - "root.params"
+    - "train-roberta-large.common.params"
+    - "task/physicaliqa.params"
+
+experiment_name: 'train_roberta-large_physicaliqa'
+experiment_root: "%experiments_root%/%experiment_name%"
diff --git a/parameters/train-roberta-large-socialiqa.params b/parameters/train-roberta-large-socialiqa.params
new file mode 100644
index 00000000..8152f456
--- /dev/null
+++ b/parameters/train-roberta-large-socialiqa.params
@@ -0,0 +1,7 @@
+_includes:
+    - "root.params"
+    - "train-roberta-large.common.params"
+    - "task/socialiqa.params"
+
+experiment_name: 'train_roberta-large_socialiqa'
+experiment_root: "%experiments_root%/%experiment_name%"
diff --git a/parameters/train-roberta-large.common.params b/parameters/train-roberta-large.common.params
new file mode 100644
index 00000000..62696361
--- /dev/null
+++ b/parameters/train-roberta-large.common.params
@@ -0,0 +1,11 @@
+_includes:
+    - "model/roberta-large.params"
+
+experiment_name: 'train'
+random_seed: 10061880
+save_path: "%experiments_root%/%experiment_name%"
+save_best_only: True
+save_by_date_and_parameters: True
+eval_after_training: True
+architecture: standard
+train_data_slice: 90
diff --git a/parameters/train.params b/parameters/train.params
new file mode 100644
index 00000000..045ecc5f
--- /dev/null
+++ b/parameters/train.params
@@ -0,0 +1,2 @@
+_includes:
+    - "train-roberta-large-physicaliqa.params"
diff --git a/requirements.txt b/requirements.txt
index 0eba59ac..86689e12 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,6 +22,7 @@ numpy==1.18.1
 oauthlib==3.1.0
 omegaconf==1.4.1
 pandas==1.0.1
+pegasus-wrapper==0.1.0
 Pillow==7.0.0
 protobuf==3.11.3
 pyasn1==0.4.8
@@ -47,4 +48,5 @@ torch==1.2.0
 tqdm==4.42.1
 transformers==2.4.0
 urllib3==1.25.8
+vistautils==0.23.0
 Werkzeug==1.0.0
diff --git a/runner.py b/runner.py
deleted file mode 100644
index bf537d7b..00000000
--- a/runner.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os
-
-parameter_options = {
-    'task': ['alphanli', 'hellaswag', 'physicaliqa', 'socialiqa'],
-    # 'task': ['socialiqa'],
-    'train_data_slice': ['10','25', '90'],
-    # 'train_data_slice': ['10'],
-    # 'task2': ['cn_10k', 'cn_20k', 'cn_40k', 'cn_physical_10k'],
-    # 'task2': ['','cn_10k'],
-    # 'task2': [''],
-    # 'architecture': ['standard', 'include_answers_in_context', 'embed_all_sep_mean'],
-    'architecture': ['standard'],
-    # 'random_seed': ['0', '42', '10061880'],
-    # 'random_seed': ['541401', '283219', '566944', '605430', '47299', '115719', '169760', '112068', '789504', '926273'],
-    # 'random_seed': ['541401', '283219', '566944'],
-    'random_seed': ['541401', '566944'],
-    'learning_rate': ['5e-7','5e-6','5e-5'],
-    'batch_size': ['2','3','6'],
-    # 'batch_size': ['2'],
-    'dropout': ['0','0.2','0.3'],
-    # 'dropout': ['0.2'],
-    }
-
-# Create all possible combinations of parameters
-parameter_combinations = [[]]
-for parameter_name, options in parameter_options.items():
-    new_combinations = []
-    for combination in parameter_combinations:
-        for option in options:
-            new_combination = combination + [(parameter_name,option)]
-            new_combinations.append(new_combination)
-    parameter_combinations = new_combinations
-
-for i, combination in enumerate(parameter_combinations):
-    experiment_id = '_'.join(option for _, option in combination if option != '')
-    os.system(f"sbatch "
-              # Additional SLURM specifications
-              f"-J {experiment_id} "
-              f"-o outputs/slurm/{experiment_id}.out "
-              # Ephemeral specifications - sudo sacctmgr modify user beser set MaxJobs=25
-              f"{'' if 'alphanli' in experiment_id else '--partition=ephemeral --qos=ephemeral --time=12:00:00 '}"
-              f"slurm/run_saga.sh "
-              # Python script commands
-              f"\""
-              f"{' '.join([f'{name}={option}' for name,option in combination if option != ''])}"
-              f" save_path={experiment_id}"
-              # f" save_best_only=False"
-              f"{' batch_size=2' if 'hellaswag' in experiment_id else ''}"
-              f"\"")
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..6ff09a00
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,5 @@
+from setuptools import setup
+
+setup(
+    name='ai2',
+)
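The deleted `runner.py` built every combination of the parameter options with hand-rolled nested loops before `sbatch`-ing each one. The same enumeration can be expressed with `itertools.product`; a sketch, using a trimmed subset of the original options dict:

```python
import itertools

# Trimmed subset of runner.py's parameter_options, for illustration only.
parameter_options = {
    "task": ["alphanli", "hellaswag"],
    "random_seed": ["541401", "566944"],
}

# Equivalent to runner.py's nested-loop construction of parameter_combinations:
# each combination is a list of (parameter_name, option) pairs.
parameter_combinations = [
    list(zip(parameter_options.keys(), values))
    for values in itertools.product(*parameter_options.values())
]
assert len(parameter_combinations) == 4  # 2 tasks x 2 seeds
```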
diff --git a/slurm/cross_task_eval.sh b/slurm/cross_task_eval.sh
index 2a389cdf..14245d4e 100644
--- a/slurm/cross_task_eval.sh
+++ b/slurm/cross_task_eval.sh
@@ -8,7 +8,7 @@
 #SBATCH --gpus-per-task=1 # GPU Allocated
 #SBATCH --job-name=CROSS_TASK_EVAL # The name of this job. If removed the job will have name of your shell script.
 #SBATCH --output=outputs/%x-%j.out # The name of the file output. %x-%j means JOB_NAME-JOB_ID. If removed output will be in file slurm-JOB_ID.
-#SBATCH --mail-user=dwangli@isi.edu # Email address for email notifications to be sent to.
+#SBATCH --mail-user=dwangli@isi.ai2 # Email address for email notifications to be sent to.
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --export=NONE # Ensure job gets a fresh login environment
 #SBATCH --array=0-3 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m. Add %1 if you only want one jobs running at one time.
@@ -37,7 +37,7 @@ allTask=(alphanli hellaswag physicaliqa socialiqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo ""
 echo "This is using Roberta Large trained on alphanli to evaluate on task: $task"
-python eval.py task="$task"
+python ai2/eval.py "parameters/eval-roberta-large-${task}.params"
 echo ""
 
 ### Finishing up the job and copy the output off of staging
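The scripts in this diff all switch from Hydra-style `key=value` overrides to passing a single `.params` file, matching the `parameters_only_entry_point` pattern used in `ai2/ensemble.py`. A minimal sketch of such an entry point (the `task_name` lookup is illustrative, not the actual contents of `ai2/eval.py`):

```python
from vistautils.parameters import Parameters
from vistautils.parameters_only_entrypoint import parameters_only_entry_point


def main(params: Parameters):
    # Read a value from the supplied .params file; task_name is defined in the
    # parameters/task/*.params files added above.
    task_name = params.string("task_name")
    print(f"Would evaluate task: {task_name}")


if __name__ == "__main__":
    # Invoked as: python this_script.py path/to/some.params
    parameters_only_entry_point(main)
```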
diff --git a/slurm/pretrained_anli.sh b/slurm/pretrained_anli.sh
index 114fed56..ae673d71 100644
--- a/slurm/pretrained_anli.sh
+++ b/slurm/pretrained_anli.sh
@@ -8,7 +8,7 @@
 #SBATCH --gpus-per-task=2 # GPU Allocated
 #SBATCH --job-name=ANLI_BASED # The name of this job. If removed the job will have name of your shell script.
 #SBATCH --output=outputs/%x-%j.out # The name of the file output. %x-%j means JOB_NAME-JOB_ID. If removed output will be in file slurm-JOB_ID.
-#SBATCH --mail-user=dwangli@isi.edu # Email address for email notifications to be sent to.
+#SBATCH --mail-user=dwangli@isi.ai2 # Email address for email notifications to be sent to.
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --export=NONE # Ensure job gets a fresh login environment
 #SBATCH --array=0-2 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m. Add %1 if you only want one jobs running at one time.
@@ -34,7 +34,7 @@ echo ""
 allTask=(physicaliqa hellaswag socialiqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo ""
-python train.py task="$task" build_on_pretrained_model=outputs/roberta-large-baselines/alphanli-s42.ckpt
+python ai2/train.py "parameters/train-pretrained-roberta-large-${task}.params" -p build_on_pretrained_model outputs/roberta-large-baselines/alphanli-s42.ckpt
 echo ""
 
 ### Finishing up the job and copy the output off of staging
diff --git a/slurm/pretrained_hellaswag.sh b/slurm/pretrained_hellaswag.sh
index 3b0b4bbc..a6e32bad 100644
--- a/slurm/pretrained_hellaswag.sh
+++ b/slurm/pretrained_hellaswag.sh
@@ -8,7 +8,7 @@
 #SBATCH --gpus-per-task=2 # GPU Allocated
 #SBATCH --job-name=HELLASWAG_BASED # The name of this job. If removed the job will have name of your shell script.
 #SBATCH --output=outputs/%x-%j.out # The name of the file output. %x-%j means JOB_NAME-JOB_ID. If removed output will be in file slurm-JOB_ID.
-#SBATCH --mail-user=dwangli@isi.edu # Email address for email notifications to be sent to.
+#SBATCH --mail-user=dwangli@isi.ai2 # Email address for email notifications to be sent to.
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --export=NONE # Ensure job gets a fresh login environment
 #SBATCH --array=0-2 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m. Add %1 if you only want one jobs running at one time.
@@ -34,7 +34,7 @@ echo ""
 allTask=(alphanli physicaliqa socialiqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo ""
-python train.py task="$task" build_on_pretrained_model=outputs/roberta-large-baselines/hellaswag-s42.ckpt
+python ai2/train.py "parameters/train-pretrained-roberta-large-${task}.params" -p build_on_pretrained_model outputs/roberta-large-baselines/hellaswag-s42.ckpt
 echo ""
 
 ### Finishing up the job and copy the output off of staging
diff --git a/slurm/pretrained_piqa.sh b/slurm/pretrained_piqa.sh
index fef37e44..a0fd3477 100644
--- a/slurm/pretrained_piqa.sh
+++ b/slurm/pretrained_piqa.sh
@@ -8,7 +8,7 @@
 #SBATCH --gpus-per-task=2 # GPU Allocated
 #SBATCH --job-name=PIQA_BASED # The name of this job. If removed the job will have name of your shell script.
 #SBATCH --output=outputs/%x-%j.out # The name of the file output. %x-%j means JOB_NAME-JOB_ID. If removed output will be in file slurm-JOB_ID.
-#SBATCH --mail-user=dwangli@isi.edu # Email address for email notifications to be sent to.
+#SBATCH --mail-user=dwangli@isi.ai2 # Email address for email notifications to be sent to.
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --export=NONE # Ensure job gets a fresh login environment
 #SBATCH --array=0-2 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m. Add %1 if you only want one jobs running at one time.
@@ -34,7 +34,7 @@ echo ""
 allTask=(alphanli hellaswag socialiqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo ""
-python train.py task="$task" build_on_pretrained_model=outputs/roberta-large-baselines/physicaliqa-s42.ckpt
+python ai2/train.py "parameters/train-pretrained-roberta-large-${task}.params" -p build_on_pretrained_model outputs/roberta-large-baselines/physicaliqa-s42.ckpt
 echo ""
 
 ### Finishing up the job and copy the output off of staging
diff --git a/slurm/pretrained_siqa.sh b/slurm/pretrained_siqa.sh
index 6f08d777..8cb25182 100644
--- a/slurm/pretrained_siqa.sh
+++ b/slurm/pretrained_siqa.sh
@@ -8,7 +8,7 @@
 #SBATCH --gpus-per-task=2 # GPU Allocated
 #SBATCH --job-name=SIQA_BASED # The name of this job. If removed the job will have name of your shell script.
 #SBATCH --output=outputs/%x-%j.out # The name of the file output. %x-%j means JOB_NAME-JOB_ID. If removed output will be in file slurm-JOB_ID.
-#SBATCH --mail-user=dwangli@isi.edu # Email address for email notifications to be sent to.
+#SBATCH --mail-user=dwangli@isi.ai2 # Email address for email notifications to be sent to.
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --export=NONE # Ensure job gets a fresh login environment
 #SBATCH --array=0-2 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m. Add %1 if you only want one jobs running at one time.
@@ -34,7 +34,7 @@ echo ""
 allTask=(alphanli hellaswag physicaliqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo ""
-python train.py task="$task" build_on_pretrained_model=outputs/roberta-large-baselines/socialiqa-s42.ckpt
+python ai2/train.py "parameters/train-pretrained-roberta-large-${task}.params" -p build_on_pretrained_model outputs/roberta-large-baselines/socialiqa-s42.ckpt
 echo ""
 
 ### Finishing up the job and copy the output off of staging
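All four `pretrained_*.sh` scripts follow the same pattern: warm-start from a single-task checkpoint (`<task>-s42.ckpt`) and fine-tune on each remaining task. The warm-start itself is the `torch.load`/`load_state_dict` step visible in the deleted `train.py` further below; in isolation, under the assumption that `model` is an already-constructed classifier:

```python
import torch


def warm_start(model: torch.nn.Module, ckpt_path: str) -> None:
    # Mirrors the build_on_pretrained_model branch of the deleted train.py:
    # load the checkpoint onto whichever device is available, then copy its
    # weights into the freshly initialized model.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    checkpoint = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
```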
diff --git a/slurm/run_saga.sh b/slurm/run_saga.sh
index 4ce3d489..510b8062 100644
--- a/slurm/run_saga.sh
+++ b/slurm/run_saga.sh
@@ -19,4 +19,4 @@ conda activate ai2
 spack load cuda@9.0.176
 spack load cudnn@7.6.5.32-9.0-linux-x64
 
-python train.py $1
+python ai2/train.py $1
diff --git a/slurm/train_ai2.sh b/slurm/train_ai2.sh
index df84b6d7..775a59ca 100644
--- a/slurm/train_ai2.sh
+++ b/slurm/train_ai2.sh
@@ -9,7 +9,7 @@
 #SBATCH --gpus-per-task=1
 #SBATCH --job-name=TRAIN_AI2
 #SBATCH --output=outputs/slurm/%x-%j.out # %x-%j means JOB_NAME-JOB_ID.
-#SBATCH --mail-user=ahedges@isi.edu
+#SBATCH --mail-user=ahedges@isi.ai2
 #SBATCH --mail-type=ALL # Type of notifications to receive. Other options includes BEGIN, END, FAIL, REQUEUE and more.
 #SBATCH --array=0-3 # Submitting an array of (n-m+1) jobs, with $SLURM_ARRAY_TASK_ID ranging from n to m.
@@ -29,7 +29,7 @@ echo "This is job $((SLURM_ARRAY_TASK_ID + 1)) out of $SLURM_ARRAY_TASK_COUNT jo
 allTask=(alphanli hellaswag physicaliqa socialiqa)
 task=${allTask[${SLURM_ARRAY_TASK_ID}]}
 echo
-time python -u train.py task="$task"
+time python -u ai2/train.py "parameters/train-roberta-large-${task}.params"
 echo
 
 # Finishing up the job and copy the output off of staging
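Each of these SLURM scripts maps `SLURM_ARRAY_TASK_ID` onto a bash array of task names, so one submission fans out into one job per task. For reference, the Python equivalent of that indexing idiom:

```python
import os

# Same lookup the shell scripts perform with ${allTask[${SLURM_ARRAY_TASK_ID}]};
# SLURM sets SLURM_ARRAY_TASK_ID for each element of an --array submission.
all_tasks = ["alphanli", "hellaswag", "physicaliqa", "socialiqa"]
task = all_tasks[int(os.environ["SLURM_ARRAY_TASK_ID"])]
print(task)
```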
diff --git a/train.py b/train.py
deleted file mode 100644
index aaf1ce82..00000000
--- a/train.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import os
-from pathlib import Path
-import random
-
-import hydra
-from loguru import logger
-import numpy as np
-import omegaconf
-from pytorch_lightning import Trainer
-from pytorch_lightning.callbacks import ModelCheckpoint
-from pytorch_lightning.loggers import TestTubeLogger
-import torch
-
-from eval import evaluate
-from model import Classifier
-
-# Save root path as hydra will create copies of this code in date specific folder
-ROOT_PATH = Path(__file__).parent.absolute()
-
-
-@hydra.main(config_path="config/train.yaml", strict=False)
-def train(config: omegaconf.Config):
-    config = omegaconf.OmegaConf.to_container(config)
-    logger.info(config)
-
-    # If the training is deterministic for debugging purposes, we set the random seed
-    if not isinstance(config['random_seed'], bool):
-        logger.info(f"Running deterministic model with seed {config['random_seed']}")
-        torch.manual_seed(config['random_seed'])
-        np.random.seed(config['random_seed'])
-        random.seed(config['random_seed'])
-        if torch.cuda.is_available():
-            torch.backends.cuda.deterministic = True
-            torch.backends.cuda.benchmark = False
-
-    # Initialize the classifier by arguments specified in config file
-    model = Classifier(config)
-    logger.info('Initialized classifier.')
-    if 'save_path' in config:
-        save_path = config['save_path']
-    else:
-        save_path = f"{config['model']}_{config['task_name']}-{config['train_data_slice']}_{config['architecture']}_s{config['random_seed']}"
-        if 'task_name2' in config:
-            save_path = save_path + f"_{config['task_name2']}"
-
-    if config['build_on_pretrained_model']:
-        logger.info('Loading pretrained checkpoint...')
-        device = 'cpu' if not torch.cuda.is_available() else "cuda"
-        checkpoint = torch.load(ROOT_PATH / config['build_on_pretrained_model'], map_location=device)
-        model.load_state_dict(checkpoint['state_dict'])
-        save_path += f"_pretrained_{config['build_on_pretrained_model'].split('/')[-1].split('.')[0]}"
-    logger.info('Output directory: ' + save_path)
-
-    # Define the trainer along with its checkpoint and experiment instance
-    checkpoint = ModelCheckpoint(
-        filepath=os.path.join(save_path, 'checkpoints', 'foo'),  # Last part needed due to parsing logic
-        verbose=True,
-        save_top_k=1 if config['save_best_only'] else -1,
-    )
-    tt_logger = TestTubeLogger(
-        save_dir=save_path,
-        name=config['task_name'],
-        version=0,
-    )
-    tt_logger.experiment.autosave = True
-    trainer = Trainer(
-        logger=tt_logger,
-        checkpoint_callback=checkpoint,
-        gradient_clip_val=0,
-        gpus=list(range(torch.cuda.device_count())) if torch.cuda.is_available() else None,
-        log_gpu_memory="all",
-        progress_bar_refresh_rate=1,
-        check_val_every_n_epoch=1,
-        accumulate_grad_batches=config["accumulate_grad_batches"],
-        max_epochs=config["max_epochs"],
-        min_epochs=1,
-        train_percent_check=1.0,
-        val_percent_check=1.0,
-        test_percent_check=1.0,
-        log_save_interval=25,
-        row_log_interval=25,
-        distributed_backend="dp",
-        precision=16 if config["use_amp"] else 32,
-        weights_summary='top',
-        num_sanity_val_steps=5,
-    )
-    trainer.fit(model)
-    logger.success('Training Completed')
-
-    if config['eval_after_training']:
-        logger.info('Start model evaluation')
-        # Evaluate the model with evaluate function from eval.py
-        evaluate(a_classifier=model, output_path=save_path,
-                 compute_device=('cpu' if not torch.cuda.is_available() else "cuda"),
-                 val_x=ROOT_PATH / config["val_x"], val_y=ROOT_PATH / config["val_y"])
-
-
-if __name__ == "__main__":
-    train()
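One detail worth noting in the deleted `train.py`: its determinism block assigns to `torch.backends.cuda.deterministic` and `torch.backends.cuda.benchmark`, but the flags PyTorch actually consults live on `torch.backends.cudnn`, so those two assignments were silent no-ops. A standalone version of that seeding logic with the flags corrected:

```python
import random

import numpy as np
import torch


def set_deterministic(seed: int) -> None:
    """Seed all RNGs the way train.py intended, with the cudnn flags fixed."""
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    if torch.cuda.is_available():
        # train.py wrote these to torch.backends.cuda, which has no such
        # attributes; cudnn is where PyTorch reads them.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


set_deterministic(42)
```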