From adfd4140a3f888af8f8bf073229c8c869e1cba46 Mon Sep 17 00:00:00 2001
From: Patrick Menninger <patrick.menninger@viasat.com>
Date: Thu, 1 Oct 2020 20:15:51 -0700
Subject: [PATCH 1/2] Fixed two bugs with evaluating 3rd party results

1. The NAB benchmark labels contained duplicate timestamps, so the number of label rows did not match the number of rows in the imported results, which raised errors
2. Threshold values were read in as strings instead of floats, which raised errors when they were compared against numeric thresholds

Signed-off-by: Patrick Menninger <patrick.menninger@viasat.com>
---
 nab/labeler.py | 3 +++
 nab/sweeper.py | 5 +++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/nab/labeler.py b/nab/labeler.py
index 90fac1a94..a17aee38a 100644
--- a/nab/labeler.py
+++ b/nab/labeler.py
@@ -213,6 +213,9 @@ def getLabels(self):
           indices = betweenT1AndT2.loc[:,"label"].index
           labels["label"].values[indices.values] = 1
 
+        # remove duplicate rows (somehow they snuck in for certain datasets)
+        labels = labels.drop_duplicates(subset=['timestamp'])
+
         self.labels[relativePath] = labels
 
       else:
diff --git a/nab/sweeper.py b/nab/sweeper.py
index 39f7b15cd..d301a9483 100644
--- a/nab/sweeper.py
+++ b/nab/sweeper.py
@@ -301,6 +301,7 @@ def scoreDataSet(
       scores      (list) List of per-row scores, to be saved in score file
       matchingRow (ThresholdScore)
     """
+    threshold = float(threshold)
     anomalyList = self.calcSweepScore(
       timestamps, anomalyScores, windowLimits, dataSetName)
     scoresByThreshold = self.calcScoreByThreshold(anomalyList)
@@ -308,10 +309,10 @@ def scoreDataSet(
     matchingRow = None
     prevRow = None
     for thresholdScore in scoresByThreshold:
-      if thresholdScore.threshold == threshold:
+      if float(thresholdScore.threshold) == threshold:
         matchingRow = thresholdScore
         break
-      elif thresholdScore.threshold < threshold:
+      elif float(thresholdScore.threshold) < threshold:
         matchingRow = prevRow
         break
 

From 28ac3c03120ddc0b5160c6e021d711564d239ab2 Mon Sep 17 00:00:00 2001
From: Patrick Menninger <patrick.menninger@viasat.com>
Date: Fri, 2 Oct 2020 10:56:58 -0700
Subject: [PATCH 2/2] Added optional --removeDuplicateLabels flag

The duplicate-row removal added in the previous commit broke the default detectors, so it is now opt-in via the new --removeDuplicateLabels flag (off by default)

Signed-off-by: Patrick Menninger <patrick.menninger@viasat.com>
---
 nab/labeler.py | 10 ++++++----
 nab/runner.py  |  4 ++--
 run.py         |  7 ++++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/nab/labeler.py b/nab/labeler.py
index a17aee38a..8554762b8 100644
--- a/nab/labeler.py
+++ b/nab/labeler.py
@@ -106,7 +106,7 @@ class CorpusLabel(object):
   benchmark corpus.
   """
 
-  def __init__(self, path, corpus):
+  def __init__(self, path, corpus, remove_duplicates=False):
     """
     Initializes a CorpusLabel object by getting the anomaly windows and labels.
     When this is done for combining raw user labels, we skip getLabels()
@@ -114,6 +114,7 @@ def __init__(self, path, corpus):
 
     @param path    (string)      Name of file containing the set of labels.
     @param corpus  (nab.Corpus)  Corpus object.
+    @param remove_duplicates (bool) Whether to remove duplicate rows in the label data
     """
     self.path = path
 
@@ -125,7 +126,7 @@ def __init__(self, path, corpus):
 
     if "raw" not in self.path:
       # Do not get labels from files in the path nab/labels/raw
-      self.getLabels()
+      self.getLabels(remove_duplicates)
 
 
   def getWindows(self):
@@ -192,7 +193,7 @@ def validateLabels(self):
           raise ValueError("In the label file %s, windows overlap." % self.path)
 
 
-  def getLabels(self):
+  def getLabels(self, remove_duplicates=False):
     """
     Get Labels as a dictionary of key-value pairs of a relative path and its
     corresponding binary vector of anomaly labels. Labels are simply a more
@@ -214,7 +215,8 @@ def getLabels(self):
           labels["label"].values[indices.values] = 1
 
         # remove duplicate rows (somehow they snuck in for certain datasets)
-        labels = labels.drop_duplicates(subset=['timestamp'])
+        if remove_duplicates:
+            labels = labels.drop_duplicates(subset=['timestamp'])
 
         self.labels[relativePath] = labels
 
diff --git a/nab/runner.py b/nab/runner.py
index 176c31760..b0704c67e 100644
--- a/nab/runner.py
+++ b/nab/runner.py
@@ -87,10 +87,10 @@ def __init__(self,
     self.profiles = None
 
 
-  def initialize(self):
+  def initialize(self, remove_duplicate_labels=False):
     """Initialize all the relevant objects for the run."""
     self.corpus = Corpus(self.dataDir)
-    self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus)
+    self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus, remove_duplicates=remove_duplicate_labels)
 
     with open(self.profilesPath) as p:
       self.profiles = json.load(p)
diff --git a/run.py b/run.py
index 8c71ec9e2..b8083c36c 100755
--- a/run.py
+++ b/run.py
@@ -89,7 +89,7 @@ def main(args):
                   thresholdPath=thresholdsFile,
                   numCPUs=numCPUs)
 
-  runner.initialize()
+  runner.initialize(args.removeDuplicateLabels)
 
   if args.detect:
     detectorConstructors = getDetectorClassConstructors(args.detectors)
@@ -142,6 +142,11 @@ def main(args):
                     default=False,
                     action="store_true")
 
+  parser.add_argument("--removeDuplicateLabels",
+                    help="If specified will remove any duplicate rows from the labeled NAB data",
+                    default=False,
+                    action="store_true")
+
   parser.add_argument("--dataDir",
                     default="data",
                     help="This holds all the label windows for the corpus.")