From adfd4140a3f888af8f8bf073229c8c869e1cba46 Mon Sep 17 00:00:00 2001 From: Patrick Menninger Date: Thu, 1 Oct 2020 20:15:51 -0700 Subject: [PATCH 1/2] Fixed two bugs with evaluating 3rd party results 1. NAB benchmark contained duplicate timestamps which caused length to not match up with imported results, throwing errors 2. Threshold values were read in as strings instead of floats, throwing errors Signed-off-by: Patrick Menninger --- nab/labeler.py | 3 +++ nab/sweeper.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/nab/labeler.py b/nab/labeler.py index 90fac1a94..a17aee38a 100644 --- a/nab/labeler.py +++ b/nab/labeler.py @@ -213,6 +213,9 @@ def getLabels(self): indices = betweenT1AndT2.loc[:,"label"].index labels["label"].values[indices.values] = 1 + # remove duplicate rows (somehow they snuck in for certain datasets) + labels = labels.drop_duplicates(subset=['timestamp']) + self.labels[relativePath] = labels else: diff --git a/nab/sweeper.py b/nab/sweeper.py index 39f7b15cd..d301a9483 100644 --- a/nab/sweeper.py +++ b/nab/sweeper.py @@ -301,6 +301,7 @@ def scoreDataSet( scores (list) List of per-row scores, to be saved in score file matchingRow (ThresholdScore) """ + threshold = float(threshold) anomalyList = self.calcSweepScore( timestamps, anomalyScores, windowLimits, dataSetName) scoresByThreshold = self.calcScoreByThreshold(anomalyList) @@ -308,10 +309,10 @@ def scoreDataSet( matchingRow = None prevRow = None for thresholdScore in scoresByThreshold: - if thresholdScore.threshold == threshold: + if float(thresholdScore.threshold) == threshold: matchingRow = thresholdScore break - elif thresholdScore.threshold < threshold: + elif float(thresholdScore.threshold) < threshold: matchingRow = prevRow break From 28ac3c03120ddc0b5160c6e021d711564d239ab2 Mon Sep 17 00:00:00 2001 From: Patrick Menninger Date: Fri, 2 Oct 2020 10:56:58 -0700 Subject: [PATCH 2/2] Added optional --removeDuplicateLabels flag Duplicate fixes from previous commit broke the default detectors, so I made them optional Signed-off-by: Patrick Menninger --- nab/labeler.py | 10 ++++++---- nab/runner.py | 4 ++-- run.py | 7 ++++++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/nab/labeler.py b/nab/labeler.py index a17aee38a..8554762b8 100644 --- a/nab/labeler.py +++ b/nab/labeler.py @@ -106,7 +106,7 @@ class CorpusLabel(object): benchmark corpus. """ - def __init__(self, path, corpus): + def __init__(self, path, corpus, remove_duplicates=False): """ Initializes a CorpusLabel object by getting the anomaly windows and labels. When this is done for combining raw user labels, we skip getLabels() @@ -114,6 +114,7 @@ def __init__(self, path, corpus): @param path (string) Name of file containing the set of labels. @param corpus (nab.Corpus) Corpus object. + @param remove_duplicates (bool) Whether to remove duplicate rows in the label data """ self.path = path @@ -125,7 +126,7 @@ def __init__(self, path, corpus): if "raw" not in self.path: # Do not get labels from files in the path nab/labels/raw - self.getLabels() + self.getLabels(remove_duplicates) def getWindows(self): @@ -192,7 +193,7 @@ def validateLabels(self): raise ValueError("In the label file %s, windows overlap." % self.path) - def getLabels(self): + def getLabels(self, remove_duplicates=False): """ Get Labels as a dictionary of key-value pairs of a relative path and its corresponding binary vector of anomaly labels. Labels are simply a more @@ -214,7 +215,8 @@ def getLabels(self): labels["label"].values[indices.values] = 1 # remove duplicate rows (somehow they snuck in for certain datasets) - labels = labels.drop_duplicates(subset=['timestamp']) + if remove_duplicates: + labels = labels.drop_duplicates(subset=['timestamp']) self.labels[relativePath] = labels diff --git a/nab/runner.py b/nab/runner.py index 176c31760..b0704c67e 100644 --- a/nab/runner.py +++ b/nab/runner.py @@ -87,10 +87,10 @@ def __init__(self, self.profiles = None - def initialize(self): + def initialize(self, remove_duplicate_labels=False): """Initialize all the relevant objects for the run.""" self.corpus = Corpus(self.dataDir) - self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus) + self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus, remove_duplicates=remove_duplicate_labels) with open(self.profilesPath) as p: self.profiles = json.load(p) diff --git a/run.py b/run.py index 8c71ec9e2..b8083c36c 100755 --- a/run.py +++ b/run.py @@ -89,7 +89,7 @@ def main(args): thresholdPath=thresholdsFile, numCPUs=numCPUs) - runner.initialize() + runner.initialize(args.removeDuplicateLabels) if args.detect: detectorConstructors = getDetectorClassConstructors(args.detectors) @@ -142,6 +142,11 @@ def main(args): default=False, action="store_true") + parser.add_argument("--removeDuplicateLabels", + help="If specified will remove any duplicate rows from the labeled NAB data", + default=False, + action="store_true") + parser.add_argument("--dataDir", default="data", help="This holds all the label windows for the corpus.")