diff --git a/nab/labeler.py b/nab/labeler.py index 90fac1a94..8554762b8 100644 --- a/nab/labeler.py +++ b/nab/labeler.py @@ -106,7 +106,7 @@ class CorpusLabel(object): benchmark corpus. """ - def __init__(self, path, corpus): + def __init__(self, path, corpus, remove_duplicates=False): """ Initializes a CorpusLabel object by getting the anomaly windows and labels. When this is done for combining raw user labels, we skip getLabels() @@ -114,6 +114,7 @@ def __init__(self, path, corpus): @param path (string) Name of file containing the set of labels. @param corpus (nab.Corpus) Corpus object. + @param remove_duplicates (bool) Whether to remove duplicate rows in the label data """ self.path = path @@ -125,7 +126,7 @@ def __init__(self, path, corpus): if "raw" not in self.path: # Do not get labels from files in the path nab/labels/raw - self.getLabels() + self.getLabels(remove_duplicates) def getWindows(self): @@ -192,7 +193,7 @@ def validateLabels(self): raise ValueError("In the label file %s, windows overlap." % self.path) - def getLabels(self): + def getLabels(self, remove_duplicates=False): """ Get Labels as a dictionary of key-value pairs of a relative path and its corresponding binary vector of anomaly labels. Labels are simply a more @@ -213,6 +214,10 @@ def getLabels(self): indices = betweenT1AndT2.loc[:,"label"].index labels["label"].values[indices.values] = 1 + # remove duplicate rows (somehow they snuck in for certain datasets) + if remove_duplicates: + labels = labels.drop_duplicates(subset=['timestamp']) + self.labels[relativePath] = labels else: diff --git a/nab/runner.py b/nab/runner.py index 176c31760..b0704c67e 100644 --- a/nab/runner.py +++ b/nab/runner.py @@ -87,10 +87,10 @@ def __init__(self, self.profiles = None - def initialize(self): + def initialize(self, remove_duplicate_labels=False): """Initialize all the relevant objects for the run.""" self.corpus = Corpus(self.dataDir) - self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus) + self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus, remove_duplicates=remove_duplicate_labels) with open(self.profilesPath) as p: self.profiles = json.load(p) diff --git a/nab/sweeper.py b/nab/sweeper.py index 39f7b15cd..d301a9483 100644 --- a/nab/sweeper.py +++ b/nab/sweeper.py @@ -301,6 +301,7 @@ def scoreDataSet( scores (list) List of per-row scores, to be saved in score file matchingRow (ThresholdScore) """ + threshold = float(threshold) anomalyList = self.calcSweepScore( timestamps, anomalyScores, windowLimits, dataSetName) scoresByThreshold = self.calcScoreByThreshold(anomalyList) @@ -308,10 +309,10 @@ def scoreDataSet( matchingRow = None prevRow = None for thresholdScore in scoresByThreshold: - if thresholdScore.threshold == threshold: + if float(thresholdScore.threshold) == threshold: matchingRow = thresholdScore break - elif thresholdScore.threshold < threshold: + elif float(thresholdScore.threshold) < threshold: matchingRow = prevRow break diff --git a/run.py b/run.py index 8c71ec9e2..b8083c36c 100755 --- a/run.py +++ b/run.py @@ -89,7 +89,7 @@ def main(args): thresholdPath=thresholdsFile, numCPUs=numCPUs) - runner.initialize() + runner.initialize(args.removeDuplicateLabels) if args.detect: detectorConstructors = getDetectorClassConstructors(args.detectors) @@ -142,6 +142,11 @@ def main(args): default=False, action="store_true") + parser.add_argument("--removeDuplicateLabels", + help="If specified will remove any duplicate rows from the labeled NAB data", + default=False, + action="store_true") + parser.add_argument("--dataDir", default="data", help="This holds all the label windows for the corpus.")