diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..007a7cf02 --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +**/.* diff --git a/.gitignore b/.gitignore index fa9622e15..1d632af8f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,11 @@ - +*.bak .DS_Store *.pyc data_test/ deprecated/ +build/ +dist/ nab.egg-info/ .idea/ .project @@ -13,5 +15,5 @@ nab/detectors/htmjava/.pydevproject scripts/.ipynb_checkpoints/ # Generated files - +plot_* *resultsSummary* diff --git a/.zenodo.json b/.zenodo.json index e2e9b3464..45af762d1 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -1,3 +1,4 @@ +{ "description": "The Numenta Anomaly Benchmark", "access_right": "open", "license": { @@ -81,5 +82,5 @@ { "name": "breznak" } - ], + ] } diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..4029018e9 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Changelog +All notable changes to this project will be documented in this file. + +## [v1.1] - 2019-09-12 +### Updated runtime to Python 3 +- Moved python 2 runtimes into independent detectors. +- Updated documentation and examples. + +## [v1.0] - 2017-04-26 +### Initial release +- Established proper python program setup. + +## [v0.8] - 2015-09-04 +### Initial tag for scoreboard \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..994504356 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ +NAB is intended for the research community and we encourage your contributions and feedback! + +Before your [pull requests](https://help.github.com/articles/using-pull-requests) can be reviewed by our team, you'll need to sign our [Contributor License](https://numenta.com/contributor-license). + + +#### Data +We welcome data you're willing to contribute. 
Specifically we're looking for data meeting the following criteria: +* real-world time-series data +* \>1000 records +* labeled anomalies + +#### Anomaly detection algorithms +For us to consider adding your algorithm to the NAB repo it must meet the following criteria: +* open-source +* work with streaming data (i.e. process data in real-time) +* we must be able to fully-replicate your results + +For an algorithm to be used in practice it must run online as data is streaming in, and not in batch. It is necessary the algorithms are computationally efficient to process streaming data, i.e O(N). The following algorithms have been tested on NAB and do not meet this criteria: +- [Lytics Anomalyzer](https://github.com/lytics/anomalyzer) + - Runs in O(N^2) because for each subsequent record the model retrains over all previous records. + - The author recommended using the detector within a moving window (250 records) to speed up the algorithm, yielding the following results: 4.42 on the standard profile, 2.39 for rewarding low FP, and 8.58 for rewarding low FN. However this still ran quite slow; e.g. running Anomalyzer on "realKnownCause/machine_temperature_system_failure.csv" took 52m0s, but only 4m39s for the HTM detector. + +We investigated some popular open-source algorithms to add to NAB, and have found the following unsuitable for streaming/online anomaly detection: +- [Yahoo EGADS](https://github.com/yahoo/egads) separates time series modeling from anomaly detection. To detect anomalies EGADS compares the prediction error to a threshold, and it determines this threshold by scanning the whole data file. It may be possible to use a small part of EGADS to output a set of anomaly scores by simply outputting the prediction error, but this calls for a hardcoded threshold and is a significant departure from the algorithm. 
+- [Netflix's "Robust Anomaly Detection" (RAD)](https://github.com/Netflix/Surus) uses Robust Principal Component Analysis (RPCA), which is not inherently aware of time. RAD applies RPCA to time series by chunking the data according to a seasonality that you specify, thus creating "time dimensions". The algorithm scans an entire time series, and then decides where the anomalies occurred. +- [LinkedIn's luminol](https://github.com/linkedin/luminol) is a general time-series analysis toolkit, with several algorithms for anomaly detection. However, these algorithms run in batch, not streaming; they process an entire time-series and return the anomalous time windows after the fact. + +#### Comments/suggestions +Want to suggest some changes to the NAB codebase? Submit an [issue](https://github.com/numenta/NAB/issues/new) and/or pull request and we'll take a look. diff --git a/Dockerfile.py27 b/Dockerfile.py27 new file mode 100644 index 000000000..986b0593d --- /dev/null +++ b/Dockerfile.py27 @@ -0,0 +1,28 @@ +FROM numenta/nupic:1.0.5 + +# Plus Java so we can run HTM.Java as well +RUN wget https://d3pxv6yz143wms.cloudfront.net/8.212.04.2/java-1.8.0-amazon-corretto-jdk_8.212.04-2_amd64.deb && \ + apt-get update && apt-get install java-common && apt-get install -y --no-install-recommends apt-utils && \ + dpkg --install java-1.8.0-amazon-corretto-jdk_8.212.04-2_amd64.deb + +ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-amazon-corretto +ENV PATH $JAVA_HOME/bin:$PATH + +ENV NAB /usr/local/src/nab + +ADD . $NAB +WORKDIR $NAB +RUN python -m pip install -e . + +# Run Numenta detectors +RUN echo "Running numenta detectors in Python 2.7..." +WORKDIR $NAB/nab/detectors/numenta +RUN python -m pip install -r requirements.txt +RUN python run.py --skipConfirmation + +# Run HTM.Java detector +RUN echo "Running HTM.Java detector in Java 8 / Python 2.7..." 
+WORKDIR $NAB/nab/detectors/htmjava/nab/detectors/htmjava +RUN ./gradlew clean build +WORKDIR $NAB/nab/detectors/htmjava +RUN python run.py --skipConfirmation diff --git a/README.md b/README.md index dfef4b996..b28de30a9 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,31 @@ -The Numenta Anomaly Benchmark [![Build Status](https://travis-ci.org/numenta/NAB.svg?branch=master)](https://travis-ci.org/numenta/NAB) +The Numenta Anomaly Benchmark (NAB) [![Build Status](https://travis-ci.org/numenta/NAB.svg?branch=master)](https://travis-ci.org/numenta/NAB) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1040335.svg)](https://doi.org/10.5281/zenodo.1040335) ----------------------------- -Welcome. This repository contains the data and scripts comprising the Numenta -Anomaly Benchmark (NAB). NAB is a novel benchmark for evaluating +Welcome. This repository contains the data and scripts which comprise the +Numenta Anomaly Benchmark (NAB) v1.1. NAB is a novel benchmark for evaluating algorithms for anomaly detection in streaming, real-time applications. It is -comprised of over 50 labeled real-world and artificial timeseries data files plus a -novel scoring mechanism designed for real-time applications. - -Included are the tools to allow you to easily run NAB on your -own anomaly detection algorithms; see the [NAB entry points -info](https://github.com/numenta/NAB/wiki#nab-entry-points). Competitive results -tied to open source code will be posted in the wiki on the -[Scoreboard](https://github.com/numenta/NAB/wiki/NAB%20Scoreboard). Let us know -about your work by emailing us at [nab@numenta.org](mailto:nab@numenta.org) or +composed of over 50 labeled real-world and artificial timeseries data files +plus a novel scoring mechanism designed for real-time applications. + +Included are the tools to allow you to run NAB on your own anomaly detection +algorithms; see the [NAB entry points +info](https://github.com/numenta/NAB/wiki/NAB-Entry-Points). 
Competitive +results tied to open source code will be posted on the +[Scoreboard](https://github.com/numenta/NAB#scoreboard). Let us know about +your work by emailing us at [nab@numenta.org](mailto:nab@numenta.org) or submitting a pull request. -This readme is a brief overview and contains details for setting up NAB. Please -refer to the following for more details about NAB scoring, data, and motivation: +This readme is a brief overview and contains details for setting up NAB. +Please refer to the following for more details about NAB scoring, data, and +motivation: - [Unsupervised real-time anomaly detection for streaming data](http://www.sciencedirect.com/science/article/pii/S0925231217309864) - The main paper, covering NAB and Numenta's HTM-based anomaly detection algorithm - [NAB Whitepaper](https://github.com/numenta/NAB/wiki#nab-whitepaper) - [Evaluating Real-time Anomaly Detection Algorithms](http://arxiv.org/abs/1510.03336) - Original publication of NAB -We encourage you to publish your results on running NAB, and share them with us at [nab@numenta.org](nab@numenta.org). Please cite the following publication when referring to NAB: +We encourage you to publish your results on running NAB, and share them with +us at [nab@numenta.org](nab@numenta.org). Please cite the following +publication when referring to NAB: Ahmad, S., Lavin, A., Purdy, S., & Agha, Z. (2017). Unsupervised real-time anomaly detection for streaming data. Neurocomputing, Available online 2 June @@ -59,27 +62,29 @@ The NAB scores are normalized such that the maximum possible is 100.0 (i.e. the \**** We have included the results for RCF using an [AWS proprietary implementation](https://docs.aws.amazon.com/kinesisanalytics/latest/sqlref/sqlrf-random-cut-forest.html); even though the algorithm code is not open source, the [algorithm description](http://proceedings.mlr.press/v48/guha16.pdf) is public and the code we used to run [NAB on RCF](nab/detectors/random_cut_forest) is open source. 
- † Algorithm was an entry to the [2016 NAB Competition](http://numenta.com/blog/2016/08/10/numenta-anomaly-benchmark-nab-competition-2016-winners/). -Please see [the wiki section on contributing algorithms](https://github.com/numenta/NAB/wiki/NAB-Contributions-Criteria#anomaly-detection-algorithms) for discussion on posting algorithms to the scoreboard. +Please see [the wiki section on contributing +algorithms](https://github.com/numenta/NAB/wiki/NAB-Contributions-Criteria#anomaly-detection-algorithms) +for discussion on posting algorithms to the scoreboard. #### Corpus -The NAB corpus of 58 timeseries data files is designed to provide data for research -in streaming anomaly detection. It is comprised of both -real-world and artifical timeseries data containing labeled anomalous periods of behavior. +The NAB corpus of 58 timeseries data files is designed to provide data for +research in streaming anomaly detection. It is comprised of both real-world +and artifical timeseries data containing labeled anomalous periods of +behavior. The majority of the data is real-world from a variety of sources such as AWS server metrics, Twitter volume, advertisement clicking metrics, traffic data, -and more. All data is included in the repository, with more details in the [data -readme](https://github.com/numenta/NAB/tree/master/data). We are in the process -of adding more data, and actively searching for more data. Please contact us at -[nab@numenta.org](mailto:nab@numenta.org) if you have similar data (ideally with -known anomalies) that you would like to see incorporated into NAB. +and more. All data is included in the repository, with more details in the +[data readme](https://github.com/numenta/NAB/tree/master/data). Please +contact us at [nab@numenta.org](mailto:nab@numenta.org) if you have similar +data (ideally with known anomalies) that you would like to see incorporated +into NAB. 
-The NAB version will be updated whenever new data (and corresponding labels) is -added to the corpus; NAB is currently in v1.0. +The NAB version will be updated whenever new data (and corresponding labels) +is added to the corpus or other significant changes are made. #### Additional Scores @@ -96,8 +101,6 @@ run without likelihood, set the variable `self.useLikelihood` in to `False`. - - | Detector |Standard Profile | Reward Low FP | Reward Low FN | |---------------|---------|------------------|---------------| | Numenta HTMusing NuPIC v0.5.6* | 70.1 | 63.1 | 74.3 | @@ -110,66 +113,57 @@ to `False`. † Algorithm was an entry to the [2016 NAB Competition](http://numenta.com/blog/2016/08/10/numenta-anomaly-benchmark-nab-competition-2016-winners/). -Installing NAB 1.0 ------------------- +Installing NAB +-------------- ### Supported Platforms - OSX 10.9 and higher - Amazon Linux (via AMI) -Other platforms may work but have not been tested. - +Other platforms may work. NAB has been tested on Windows 10 but is not +officially supported. ### Initial requirements You need to manually install the following: -- [Python 2.7](https://www.python.org/download/) +- [Python 3.6](https://www.python.org/download/) - [pip](https://pip.pypa.io/en/latest/installing.html) - [NumPy](http://www.numpy.org/) -- [NuPIC](http://www.github.com/numenta/nupic) (only required if running the Numenta detector) -##### Download this repository +#### Download this repository Use the Github links provided in the right sidebar. -##### Install the Python requirements - - cd NAB - (sudo) pip install -r requirements.txt - -This will install the required modules. - -##### Install NAB - -Recommended: +#### Install NAB - pip install . --user +##### Pip: +From inside the checkout directory: -> Note: If NuPIC is not already installed, the version specified in -`NAB/requirements.txt` will be installed. If NuPIC is already installed, it - will not be re-installed. 
- + pip install -r requirements.txt + pip install . --user If you want to manage dependency versions yourself, you can skip dependencies with: pip install . --user --no-deps - If you are actively working on the code and are familiar with manual PYTHONPATH setup: - pip install -e . --install-option="--prefix=/some/other/path/" + pip install -e . --install-option="--prefix=/some/other/path/" +##### Anaconda: + + conda env create ### Usage There are several different use cases for NAB: -1. If you just want to look at all the results we reported in the paper, there +1. If you want to look at all the results we reported in the paper, there is no need to run anything. All the data files are in the data subdirectory and all individual detections for reported algorithms are checked in to the results subdirectory. Please see the README files in those locations. @@ -178,31 +172,28 @@ subdirectory. Please see the README files in those locations. `scripts` directory for `scripts/plot.py` 1. If you have your own algorithm and want to run the NAB benchmark, please see -the [NAB Entry Points](https://github.com/numenta/NAB/wiki#nab-entry-diagram) +the [NAB Entry Points](https://github.com/numenta/NAB/wiki/NAB-Entry-Points) section in the wiki. (The easiest option is often to simply run your algorithm on the data and output results in the CSV format we specify. Then run the NAB scoring algorithm to compute the final scores. This is how we scored the Twitter algorithm, which is written in R.) -1. If you are a NuPIC user and just want to run the Numenta HTM detector follow +1. If you are a NuPIC user and want to run the Numenta HTM detector follow the directions below to "Run HTM with NAB". 1. If you want to run everything including the bundled Skyline detector follow the directions below to "Run full NAB". Note that this will take hours as the Skyline code is quite slow. -1. If you just want to run NAB on one or more data files (e.g. for debugging) +1. 
If you want to run NAB on one or more data files (e.g. for debugging) follow the directions below to "Run a subset of NAB". - -##### Run HTM with NAB - -First make sure NuPIC is installed and working properly. Then: +##### Run a detector on NAB cd /path/to/nab - python run.py -d numenta --detect --optimize --score --normalize + python run.py -d expose --detect --optimize --score --normalize -This will run the Numenta detector only and produce normalized scores. Note that +This will run the EXPoSE detector only and produce normalized scores. Note that by default it tries to use all the cores on your machine. The above command should take about 20-30 minutes on a current powerful laptop with 4-8 cores. For debugging you can run subsets of the data files by modifying and specifying @@ -212,27 +203,27 @@ specific label files (see section below). Please type: to see all the options. -Note that to replicate results exactly as in the paper you may need to checkout -the specific version of NuPIC (and associated nupic.core) that is noted in the -[Scoreboard](https://github.com/numenta/NAB/wiki/NAB%20Scoreboard): +##### Running non-Python 3 detectors + +NAB is a Python 3 framework, and can only integrate Python 3 detectors. The following detectors must be run outside the NAB runtime and integrated for scoring in a later step. These detectors include: - cd /path/to/nupic/ - git checkout -b nab {TAG NAME} - cd /path/to/nupic.core/ - git checkout -b nab {TAG NAME} + numenta (Python 2) + numentaTM (Python 2) + htmjava (Python 2 / Java) + twitterADVec (R) + random_cut_forest (AWS Kinesis Analytics) + +Instructions on how to run the each detector in their native environment can be found in the `nab/detectors/${name}` directory. The Python 2 HTM detectors are also provided within a docker image, available with `docker pull numenta/nab:py2.7`. ##### Run full NAB cd /path/to/nab python run.py -This will run everything and produce results files for all anomaly detection -methods. 
Several algorithms are included in the repo, such as the Numenta -HTM anomaly detection method, as well as methods from the [Etsy -Skyline](https://github.com/etsy/skyline) anomaly detection library, a sliding -window detector, Bayes Changepoint, and so on. This will also pass those results -files to the scoring script to generate final NAB scores. **Note**: this option -will take many many hours to run. +This will run all detectors available in this repository and produce results +files. To run non-Python3 detectors see "Running non-Python3 detectors" above. + +**Note**: this option may take many many hours to run. ##### Run subset of NAB data files @@ -248,7 +239,7 @@ are interested in. NAB on a subset of labels: cd /path/to/nab - python run.py -d numenta --detect --windowsFile labels/combined_windows_tiny.json + python run.py -d expose --detect --windowsFile labels/combined_windows_tiny.json This will run the `detect` phase of NAB on the data files specified in the above JSON file. Note that scoring and normalization are not supported with this diff --git a/environment.yml b/environment.yml new file mode 100644 index 000000000..4bbc311d5 --- /dev/null +++ b/environment.yml @@ -0,0 +1,20 @@ +name: NAB +channels: + - defaults + - conda-forge + +dependencies: + - python=3.6 + - pip + + # See requirements.txt + - pandas==0.20.3 + - simplejson==3.11.1 + - boto3==1.9.134 + - scikit-learn==0.21.1 + + - pip: + - boto3 + - botocore + # Install NAB in development mode + - -e . 
diff --git a/labels/combined_windows.json b/labels/combined_windows.json index 2f1be3557..7777f6ff5 100644 --- a/labels/combined_windows.json +++ b/labels/combined_windows.json @@ -165,8 +165,8 @@ ], "realAWSCloudwatch/iio_us-east-1_i-a2eb1cd9_NetworkIn.csv": [ [ - "2013-10-10 10:35:00.000000", - "2013-10-10 15:45:00.000000" + "2013-10-10 07:00:00.000000", + "2013-10-10 12:10:00.000000" ], [ "2013-10-10 18:05:00.000000", diff --git a/nab/corpus.py b/nab/corpus.py index 55d45b3aa..8eba88a65 100644 --- a/nab/corpus.py +++ b/nab/corpus.py @@ -157,7 +157,7 @@ def addColumn(self, columnName, data, write=False): modificiations or not. """ - for relativePath in self.dataFiles.keys(): + for relativePath in list(self.dataFiles.keys()): self.dataFiles[relativePath].modifyData( columnName, data[relativePath], write=write) @@ -172,7 +172,7 @@ def removeColumn(self, columnName, write=False): @param write (boolean) Flag to decide whether to write corpus modificiations or not. """ - for relativePath in self.dataFiles.keys(): + for relativePath in list(self.dataFiles.keys()): self.dataFiles[relativePath].modifyData(columnName, write=write) def copy(self, newRoot=None): @@ -184,13 +184,13 @@ def copy(self, newRoot=None): if newRoot[-1] != os.path.sep: newRoot += os.path.sep if os.path.isdir(newRoot): - print "directory already exists" + print("directory already exists") return None else: createPath(newRoot) newCorpus = Corpus(newRoot) - for relativePath in self.dataFiles.keys(): + for relativePath in list(self.dataFiles.keys()): newCorpus.addDataSet(relativePath, self.dataFiles[relativePath]) return newCorpus @@ -224,7 +224,7 @@ def getDataSubset(self, query): datafile. 
""" ans = {} - for relativePath in self.dataFiles.keys(): + for relativePath in list(self.dataFiles.keys()): if query in relativePath: ans[relativePath] = self.dataFiles[relativePath] return ans diff --git a/nab/detectors/base.py b/nab/detectors/base.py index 6ac4fdc15..4f637e85a 100644 --- a/nab/detectors/base.py +++ b/nab/detectors/base.py @@ -28,12 +28,11 @@ -class AnomalyDetector(object): +class AnomalyDetector(object, metaclass=abc.ABCMeta): """ Base class for all anomaly detectors. When inheriting from this class please take note of which methods MUST be overridden, as documented below. """ - __metaclass__ = abc.ABCMeta def __init__( self, dataSet, @@ -107,13 +106,20 @@ def run(self): detectorValues = self.handleRecord(inputData) + # Make sure anomalyScore is between 0 and 1 + if not 0 <= detectorValues[0] <= 1: + raise ValueError( + f"anomalyScore must be a number between 0 and 1. " + f"Please verify if '{self.handleRecord.__qualname__}' method is " + f"returning a value between 0 and 1") + outputRow = list(row) + list(detectorValues) rows.append(outputRow) # Progress report if (i % 1000) == 0: - print ".", + print(".", end=' ') sys.stdout.flush() ans = pandas.DataFrame(rows, columns=headers) @@ -134,8 +140,8 @@ def detectDataSet(args): outputPath = os.path.join(outputDir, detectorName, relativeDir, fileName) createPath(outputPath) - print "%s: Beginning detection with %s for %s" % \ - (i, detectorName, relativePath) + print("%s: Beginning detection with %s for %s" % \ + (i, detectorName, relativePath)) detectorInstance.initialize() results = detectorInstance.run() @@ -145,6 +151,6 @@ def detectDataSet(args): results.to_csv(outputPath, index=False) - print "%s: Completed processing %s records at %s" % \ - (i, len(results.index), datetime.now()) - print "%s: Results have been written to %s" % (i, outputPath) + print("%s: Completed processing %s records at %s" % \ + (i, len(results.index), datetime.now())) + print("%s: Results have been written to %s" % (i, 
outputPath)) diff --git a/nab/detectors/context_ose/cad_ose.py b/nab/detectors/context_ose/cad_ose.py index f14f21531..64bbb7aff 100644 --- a/nab/detectors/context_ose/cad_ose.py +++ b/nab/detectors/context_ose/cad_ose.py @@ -18,6 +18,7 @@ # http://numenta.org/licenses/ # ---------------------------------------------------------------------- +from functools import cmp_to_key from nab.detectors.context_ose.context_operator import ContextOperator class ContextualAnomalyDetectorOSE(object): @@ -90,7 +91,7 @@ def step(self, inpFacts): else : percentSelectedContextActive = 0.0 - srtAContexts = sorted(activeContexts, cmp=aContextsCMP) + srtAContexts = sorted(activeContexts, key=lambda x: (x[1], x[2], x[3])) activeNeurons = [ cInf[0] for cInf in srtAContexts[-self.maxActNeurons:] ] currNeurFacts = set([ 2 ** 31 + fact for fact in activeNeurons ]) @@ -136,12 +137,3 @@ def getAnomalyScore(self,inputData): self.aScoresHistory.append(currentAnomalyScore) return returnedAnomalyScore - - -def aContextsCMP(x, y): - if cmp(x[1], y[1]) !=0 : - return cmp(x[1], y[1]) - elif cmp(x[2], y[2]) !=0 : - return cmp(x[2], y[2]) - else : - return cmp(x[3], y[3]) diff --git a/nab/detectors/context_ose/context_operator.py b/nab/detectors/context_ose/context_operator.py index b33c5bb96..425cbddae 100644 --- a/nab/detectors/context_ose/context_operator.py +++ b/nab/detectors/context_ose/context_operator.py @@ -187,7 +187,7 @@ def updateContextsAndGetActive(self, newContextFlag): for leftSemiContVal in self.crossedSemiContextsLists[0]: - for rightSemiContextID, contextID in leftSemiContVal[3].iteritems(): + for rightSemiContextID, contextID in leftSemiContVal[3].items(): if self.newContextID != contextID : diff --git a/nab/detectors/earthgecko_skyline/earthgecko_skyline_detector.py b/nab/detectors/earthgecko_skyline/earthgecko_skyline_detector.py index 0577de496..a2e4ed2e1 100644 --- a/nab/detectors/earthgecko_skyline/earthgecko_skyline_detector.py +++ 
b/nab/detectors/earthgecko_skyline/earthgecko_skyline_detector.py @@ -58,7 +58,7 @@ import scipy scipy_version = scipy.version.version if scipy_version != '1.1.0': - print('To run grubbs and ks_test scipy==1.1.0 is required, scipy %s is installed' % scipy_version) + print(('To run grubbs and ks_test scipy==1.1.0 is required, scipy %s is installed' % scipy_version)) sys.exit(1) except: print('To run grubbs and ks_test scipy==1.1.0 is required') @@ -67,7 +67,7 @@ import statsmodels statsmodels_version = statsmodels.version.version if statsmodels_version != '0.8.0': - print('To run grubbs and ks_test statsmodels==0.8.0 is required, statsmodels %s is installed' % statsmodels_version) + print(('To run grubbs and ks_test statsmodels==0.8.0 is required, statsmodels %s is installed' % statsmodels_version)) sys.exit(1) except: print('To run grubbs and ks_test statsmodels==0.8.0 is required') diff --git a/nab/detectors/expose/expose_detector.py b/nab/detectors/expose/expose_detector.py index 9579af35b..1f0f32cfa 100644 --- a/nab/detectors/expose/expose_detector.py +++ b/nab/detectors/expose/expose_detector.py @@ -66,6 +66,7 @@ def handleRecord(self, inputData): # product, is the likelihood of data point being normal. Resulting # anomaly scores are in the range of -0.02 to 1.02. 
anomalyScore = numpy.asscalar(1 - numpy.inner(inputFeature, exposeModel)) + anomalyScore = (anomalyScore + 0.02) / 1.04 self.timestep += 1 return [anomalyScore] diff --git a/nab/detectors/htmjava/.gitignore b/nab/detectors/htmjava/.gitignore index aacbaa445..774ff6e4f 100644 --- a/nab/detectors/htmjava/.gitignore +++ b/nab/detectors/htmjava/.gitignore @@ -4,4 +4,5 @@ build .classpath .project .settings -.idea \ No newline at end of file +.idea +env/ \ No newline at end of file diff --git a/nab/detectors/htmjava/README.md b/nab/detectors/htmjava/README.md index 3f86ae0ae..fb68a6626 100644 --- a/nab/detectors/htmjava/README.md +++ b/nab/detectors/htmjava/README.md @@ -1,18 +1,145 @@ -## [HTM Java](https://github.com/numenta/htm.java) NAB detector +# [HTM Java](https://github.com/numenta/htm.java) NAB detector -### Run [htm.java](https://github.com/numenta/htm.java) with NAB on your local machine +This directory holds the code required to run the `htmjava` detector against +the NAB data. In addition to Java, some of this code requires Python 2 and +therefore extra setup. In 2019 the main body of the benchmark's code was +ported to Python 3 but this detector relies on NuPIC which supports Python 2 +only. -First make sure you have __java 8__ installed +This code can be used to replicate results listed on the scoreboard of +the main repository for the following detectors: - java -version + htmjava -Build __htm.java__ NAB detector: - - cd nab/detectors/htmjava - ./gradlew clean build +## Installation + +### Docker + +This detector is also provided within a docker image, available with `docker pull numenta/nab:py2.7`. + +### Java + +First make sure you have __java 8__ installed. You should see a version number matchin 1.8.XXXX. 
+ +``` +$ java -version +java version "1.8.0_211" +Java(TM) SE Runtime Environment (build 1.8.0_211-b12) +Java HotSpot(TM) 64-Bit Server VM (build 25.211-b12, mixed mode) +``` -Run __htm.java__ NAB detector: +Navigate to the *inner* `htmjava` directory and build __htm.java__ NAB detector: - cd /path/to/nab - python run.py -d htmjava --detect --optimize --score --normalize +``` +cd nab/detectors/htmjava +./gradlew clean build +``` + +Once this has built correctly navigate back to the *outer* `htmjava` directory +and continue with the Python installation and usage described below. + +`cd ../../../` + +### Python + +We assume you have a working version of Python 3 installed as your default Python. +If your default system Python is still Python 2 you can skip the virtual environment +creation below. + +#### Requirements to install + +- [Python 2.7](https://www.python.org/download/) +- [Virtualenv](https://pypi.org/project/virtualenv/) + +#### Install a virtual environment + +Create a new Python 2 virtual environment in this directory. + +`virtualenv -p path/to/python2 env` + +On Windows this might be: + +`virtualenv -p C:\Python27\python.exe env` + +Activate that virtual environment. + +`./env/Scripts/activate` + +or + +`env\Scripts\activate.bat` on Windows. + +Confirm you have a local Python 2 + +``` +$ python +Python 2.7.13 (v2.7.13:a06454b1afa1, Dec 17 2016, 20:53:40) [MSC v.1500 64 bit (AMD64)] on win32 +Type "help", "copyright", "credits" or "license" for more information. +>>> +``` + +#### Install NuPIC + +`pip install nupic` + +#### Install detectors + +`python setup.py develop` + +## Usage + +### Detection + +This directory contains a modified version of the `run.py` script which exists +in the main NAB directory. It can be used to run *detection* only using the +`htmjava` detector against NAB data. + +By default it will output results to the main NAB/results directory. + +`python run.py` + +Note: By default `run.py` tries to use all the cores on your machine. 
The above +command should take about 20-30 minutes on a current powerful laptop with 4-8 +cores. + +To see all options of this script type: + +`python run.py --help` + +### Optimizing, Scoring and Normalizing + +Once you have run detection against the NAB data you will need to exit the +Python 2 virtual environment and move into the main NAB directory. + +``` +(env) /NAB/nab/detectors/htmjava +$ deactivate +/NAB/nab/detectors/htmjava +$ cd ../../../ +/NAB +$ +``` + +Then follow the instructions in the main README to run optimization, scoring, and normalization, e.g.: + +`python run.py -d htmjava --optimize --score --normalize` + +### Run a subset of NAB data files + +For debugging it is sometimes useful to be able to run your algorithm on a +subset of the NAB data files or on your own set of data files. You can do that +by creating a custom `combined_windows.json` file that only contains labels for +the files you want to run. This new file should be in exactly the same format as +`combined_windows.json` except it would only contain windows for the files you +are interested in. + +**Example**: an example file containing two files is in +`labels/combined_windows_tiny.json`. (Under of the main NAB directory) The +following command shows you how to run NAB on a subset of labels: + + python run.py --detect --windowsFile labels/combined_windows_tiny.json +This will run the `detect` phase of NAB on the data files specified in the above +JSON file. Note that scoring and normalization are not supported with this +option. Note also that you may see warning messages regarding the lack of labels +for other files. You can ignore these warnings. 
\ No newline at end of file diff --git a/nab/detectors/htmjava/nab/__init__.py b/nab/detectors/htmjava/nab/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/htmjava/nab/corpus.py b/nab/detectors/htmjava/nab/corpus.py new file mode 100644 index 000000000..7373c3fac --- /dev/null +++ b/nab/detectors/htmjava/nab/corpus.py @@ -0,0 +1,234 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +""" +This contains the objects to store and manipulate a database of csv files. +""" + +import copy +import os +import pandas + +from nab.util import (absoluteFilePaths, + createPath) + + + +class DataFile(object): + """ + Class for storing and manipulating a single datafile. + Data is stored in pandas.DataFrame + """ + + def __init__(self, srcPath): + """ + @param srcPath (string) Filename of datafile to read. + """ + self.srcPath = srcPath + + self.fileName = os.path.split(srcPath)[1] + + self.data = pandas.io.parsers.read_csv(self.srcPath, + header=0, parse_dates=[0]) + + + def write(self, newPath=None): + """Write datafile to self.srcPath or newPath if given. 
+ + @param newPath (string) Path to write datafile to. If path is not given, + write to source path + """ + + path = newPath if newPath else self.srcPath + self.data.to_csv(path, index=False) + + + def modifyData(self, columnName, data=None, write=False): + """Add columnName to datafile if data is given otherwise remove + columnName. + + @param columnName (string) Name of the column in the datafile to + either add or remove. + + @param data (pandas.Series) Column data to be added to datafile. + Data length should be as long as the + length of other columns. + + @param write (boolean) Flag to choose whether to write modifications to + source path. + """ + if isinstance(data, pandas.Series): + self.data[columnName] = data + else: + if columnName in self.data: + del self.data[columnName] + + if write: + self.write() + + + def getTimestampRange(self, t1, t2): + """Given timestamp range, get all records that are within that range. + + @param t1 (int) Starting timestamp. + + @param t2 (int) Ending timestamp. + + @return (list) Timestamp and value for each time stamp within the + timestamp range. + """ + tmp = self.data[self.data["timestamp"] >= t1] + ans = tmp[tmp["timestamp"] <= t2]["timestamp"].tolist() + return ans + + + def __str__(self): + ans = "" + ans += "path: %s\n" % self.srcPath + ans += "file name: %s\n"% self.fileName + ans += "data size: ", self.data.shape() + ans += "sample line: %s\n" % ", ".join(self.data[0]) + return ans + + + +class Corpus(object): + """ + Class for storing and manipulating a corpus of data where each datafile is + stored as a DataFile object. + """ + + def __init__(self, srcRoot): + """ + @param srcRoot (string) Source directory of corpus. + """ + self.srcRoot = srcRoot + self.dataFiles = self.getDataFiles() + self.numDataFiles = len(self.dataFiles) + + + def getDataFiles(self): + """ + Collect all CSV data files from self.srcRoot directory. 
+ + @return (dict) Keys are relative paths (from self.srcRoot) and values are + the corresponding data files. + """ + filePaths = absoluteFilePaths(self.srcRoot) + dataSets = [DataFile(path) for path in filePaths if ".csv" in path] + + def getRelativePath(srcRoot, srcPath): + # Handle case where srcRoot is already relative + srcRoot = os.path.abspath(srcRoot) + ind = srcPath.index(srcRoot) + root_len = len(srcRoot) + return srcPath[ind+root_len:]\ + .strip(os.path.sep).replace(os.path.sep, "/") + + return {getRelativePath(self.srcRoot, d.srcPath) : d for d in dataSets} + + + def addColumn(self, columnName, data, write=False): + """ + Add column to entire corpus given columnName and dictionary of data for each + file in the corpus. If newRoot is given then corpus is copied and then + modified. + + @param columnName (string) Name of the column in the datafile to add. + + @param data (dict) Dictionary containing key value pairs of a + relative path and its corresponding + datafile (as a pandas.Series). + + @param write (boolean) Flag to decide whether to write corpus + modificiations or not. + """ + + for relativePath in self.dataFiles.keys(): + self.dataFiles[relativePath].modifyData( + columnName, data[relativePath], write=write) + + + def removeColumn(self, columnName, write=False): + """ + Remove column from entire corpus given columnName. If newRoot if given then + corpus is copied and then modified. + + @param columnName (string) Name of the column in the datafile to add. + + @param write (boolean) Flag to decide whether to write corpus + modificiations or not. + """ + for relativePath in self.dataFiles.keys(): + self.dataFiles[relativePath].modifyData(columnName, write=write) + + def copy(self, newRoot=None): + """Copy corpus to a newRoot which cannot already exist. + + @param newRoot (string) Location of new directory to copy corpus + to. 
+ """ + if newRoot[-1] != os.path.sep: + newRoot += os.path.sep + if os.path.isdir(newRoot): + print "directory already exists" + return None + else: + createPath(newRoot) + + newCorpus = Corpus(newRoot) + for relativePath in self.dataFiles.keys(): + newCorpus.addDataSet(relativePath, self.dataFiles[relativePath]) + return newCorpus + + + def addDataSet(self, relativePath, dataSet): + """Add datafile to corpus given its realtivePath within the corpus. + + @param relativePath (string) Path of the new datafile relative to + the corpus directory. + + @param datafile (datafile) Data set to be added to corpus. + """ + self.dataFiles[relativePath] = copy.deepcopy(dataSet) + newPath = self.srcRoot + relativePath + createPath(newPath) + self.dataFiles[relativePath].srcPath = newPath + self.dataFiles[relativePath].write() + self.numDataFiles = len(self.dataFiles) + + + def getDataSubset(self, query): + """ + Get subset of the corpus given a query to match the datafile filename or + relative path. + + @param query (string) Search query for obtainin the subset of + the corpus. + + @return (dict) Dictionary containing key value pairs of a + relative path and its corresponding + datafile. + """ + ans = {} + for relativePath in self.dataFiles.keys(): + if query in relativePath: + ans[relativePath] = self.dataFiles[relativePath] + return ans diff --git a/nab/detectors/htmjava/nab/detectors/__init__.py b/nab/detectors/htmjava/nab/detectors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/htmjava/nab/detectors/base.py b/nab/detectors/htmjava/nab/detectors/base.py new file mode 100644 index 000000000..6ac4fdc15 --- /dev/null +++ b/nab/detectors/htmjava/nab/detectors/base.py @@ -0,0 +1,150 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. 
Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import abc +import os +import pandas +import sys + +from datetime import datetime +from nab.util import createPath, getProbationPeriod + + + +class AnomalyDetector(object): + """ + Base class for all anomaly detectors. When inheriting from this class please + take note of which methods MUST be overridden, as documented below. + """ + __metaclass__ = abc.ABCMeta + + def __init__( self, + dataSet, + probationaryPercent): + + self.dataSet = dataSet + self.probationaryPeriod = getProbationPeriod( + probationaryPercent, dataSet.data.shape[0]) + + self.inputMin = self.dataSet.data["value"].min() + self.inputMax = self.dataSet.data["value"].max() + + + def initialize(self): + """Do anything to initialize your detector in before calling run. + + Pooling across cores forces a pickling operation when moving objects from + the main core to the pool and this may not always be possible. This function + allows you to create objects within the pool itself to avoid this issue. + """ + pass + + def getAdditionalHeaders(self): + """ + Returns a list of strings. Subclasses can add in additional columns per + record. 
+ + This method MAY be overridden to provide the names for those + columns. + """ + return [] + + + @abc.abstractmethod + def handleRecord(self, inputData): + """ + Returns a list [anomalyScore, *]. It is required that the first + element of the list is the anomalyScore. The other elements may + be anything, but should correspond to the names returned by + getAdditionalHeaders(). + + This method MUST be overridden by subclasses + """ + raise NotImplementedError + + + def getHeader(self): + """ + Gets the outputPath and all the headers needed to write the results files. + """ + headers = ["timestamp", + "value", + "anomaly_score"] + + headers.extend(self.getAdditionalHeaders()) + + return headers + + + def run(self): + """ + Main function that is called to collect anomaly scores for a given file. + """ + + headers = self.getHeader() + + rows = [] + for i, row in self.dataSet.data.iterrows(): + + inputData = row.to_dict() + + detectorValues = self.handleRecord(inputData) + + outputRow = list(row) + list(detectorValues) + + rows.append(outputRow) + + # Progress report + if (i % 1000) == 0: + print ".", + sys.stdout.flush() + + ans = pandas.DataFrame(rows, columns=headers) + return ans + + +def detectDataSet(args): + """ + Function called in each detector process that run the detector that it is + given. 
+ + @param args (tuple) Arguments to run a detector on a file and then + """ + (i, detectorInstance, detectorName, labels, outputDir, relativePath) = args + + relativeDir, fileName = os.path.split(relativePath) + fileName = detectorName + "_" + fileName + outputPath = os.path.join(outputDir, detectorName, relativeDir, fileName) + createPath(outputPath) + + print "%s: Beginning detection with %s for %s" % \ + (i, detectorName, relativePath) + detectorInstance.initialize() + + results = detectorInstance.run() + + # label=1 for relaxed windows, 0 otherwise + results["label"] = labels + + results.to_csv(outputPath, index=False) + + print "%s: Completed processing %s records at %s" % \ + (i, len(results.index), datetime.now()) + print "%s: Results have been written to %s" % (i, outputPath) diff --git a/nab/detectors/htmjava/nab/detectors/htmjava/__init__.py b/nab/detectors/htmjava/nab/detectors/htmjava/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/htmjava/build.gradle b/nab/detectors/htmjava/nab/detectors/htmjava/build.gradle similarity index 100% rename from nab/detectors/htmjava/build.gradle rename to nab/detectors/htmjava/nab/detectors/htmjava/build.gradle diff --git a/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.jar b/nab/detectors/htmjava/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.jar similarity index 100% rename from nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.jar rename to nab/detectors/htmjava/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.jar diff --git a/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.properties b/nab/detectors/htmjava/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.properties similarity index 100% rename from nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.properties rename to nab/detectors/htmjava/nab/detectors/htmjava/gradle/wrapper/gradle-wrapper.properties diff --git a/nab/detectors/htmjava/gradlew b/nab/detectors/htmjava/nab/detectors/htmjava/gradlew 
similarity index 100% rename from nab/detectors/htmjava/gradlew rename to nab/detectors/htmjava/nab/detectors/htmjava/gradlew diff --git a/nab/detectors/htmjava/gradlew.bat b/nab/detectors/htmjava/nab/detectors/htmjava/gradlew.bat similarity index 96% rename from nab/detectors/htmjava/gradlew.bat rename to nab/detectors/htmjava/nab/detectors/htmjava/gradlew.bat index aec99730b..8a0b282aa 100644 --- a/nab/detectors/htmjava/gradlew.bat +++ b/nab/detectors/htmjava/nab/detectors/htmjava/gradlew.bat @@ -1,90 +1,90 @@ -@if "%DEBUG%" == "" @echo off -@rem ########################################################################## -@rem -@rem Gradle startup script for Windows -@rem -@rem ########################################################################## - -@rem Set local scope for the variables with windows NT shell -if "%OS%"=="Windows_NT" setlocal - -@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -set DEFAULT_JVM_OPTS= - -set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. -set APP_BASE_NAME=%~n0 -set APP_HOME=%DIRNAME% - -@rem Find java.exe -if defined JAVA_HOME goto findJavaFromJavaHome - -set JAVA_EXE=java.exe -%JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init - -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. - -goto fail - -:findJavaFromJavaHome -set JAVA_HOME=%JAVA_HOME:"=% -set JAVA_EXE=%JAVA_HOME%/bin/java.exe - -if exist "%JAVA_EXE%" goto init - -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. 
- -goto fail - -:init -@rem Get command-line arguments, handling Windowz variants - -if not "%OS%" == "Windows_NT" goto win9xME_args -if "%@eval[2+2]" == "4" goto 4NT_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* -goto execute - -:4NT_args -@rem Get arguments from the 4NT Shell from JP Software -set CMD_LINE_ARGS=%$ - -:execute -@rem Setup the command line - -set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar - -@rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% - -:end -@rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd - -:fail -rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of -rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 - -:mainEnd -if "%OS%"=="Windows_NT" endlocal - -:omega +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windowz variants + +if not "%OS%" == "Windows_NT" goto win9xME_args +if "%@eval[2+2]" == "4" goto 4NT_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* +goto execute + +:4NT_args +@rem Get arguments from the 4NT Shell from JP Software +set CMD_LINE_ARGS=%$ + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/nab/detectors/htmjava/htmjava_detector.py b/nab/detectors/htmjava/nab/detectors/htmjava/htmjava_detector.py similarity index 100% rename from nab/detectors/htmjava/htmjava_detector.py rename to nab/detectors/htmjava/nab/detectors/htmjava/htmjava_detector.py diff --git a/nab/detectors/htmjava/src/main/java/nab/detectors/htmjava/HTMModel.java b/nab/detectors/htmjava/nab/detectors/htmjava/src/main/java/nab/detectors/htmjava/HTMModel.java similarity index 100% rename from nab/detectors/htmjava/src/main/java/nab/detectors/htmjava/HTMModel.java rename to nab/detectors/htmjava/nab/detectors/htmjava/src/main/java/nab/detectors/htmjava/HTMModel.java diff --git a/nab/detectors/htmjava/src/main/resources/log4j.properties b/nab/detectors/htmjava/nab/detectors/htmjava/src/main/resources/log4j.properties similarity index 100% rename from nab/detectors/htmjava/src/main/resources/log4j.properties rename to nab/detectors/htmjava/nab/detectors/htmjava/src/main/resources/log4j.properties diff --git a/nab/detectors/htmjava/nab/labeler.py b/nab/detectors/htmjava/nab/labeler.py new file mode 100644 index 000000000..da6734f6c --- /dev/null +++ b/nab/detectors/htmjava/nab/labeler.py @@ -0,0 +1,467 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import datetime +import itertools +import numpy +import os +import pandas +try: + import simplejson as json +except ImportError: + import json + +from nab.util import (absoluteFilePaths, + getProbationPeriod, + strf, + strp, + deepmap, + createPath, + writeJSON) + + + +def bucket(rawTimes, buffer): + """ + Buckets (groups) timestamps that are within the amount of time specified by + buffer. + """ + bucket = [] + rawBuckets = [] + + current = None + for t in rawTimes: + if current is None: + current = t + bucket = [current] + continue + if (t - current) <= buffer: + bucket.append(t) + else: + rawBuckets.append(bucket) + current = t + bucket = [current] + if bucket: + rawBuckets.append(bucket) + + return rawBuckets + + +def merge(rawBuckets, threshold): + """ + Merges bucketed timestamps into one timestamp (most frequent, or earliest). + """ + truths = [] + passed = [] + + for bucket in rawBuckets: + if len(bucket) >= threshold: + truths.append(max(bucket, key=bucket.count)) + else: + passed.append(bucket) + + return truths, passed + + +def checkForOverlap(labels, buffer, labelsFileName, dataFileName): + """ + Raise a ValueError if the difference between any consecutive labels is smaller + than the buffer. + """ + for i in xrange(len(labels)-1): + if labels[i+1] - labels[i] <= buffer: + # import pdb; pdb.set_trace() + raise ValueError("The labels {} and {} in \'{}\' labels for data file " + "\'{}\' are too close to each other to be considered distinct " + "anomalies. Please relabel." 
+ .format(labels[i], labels[i+1], labelsFileName, dataFileName)) + + + +class CorpusLabel(object): + """ + Class to store and manipulate a single set of labels for the whole + benchmark corpus. + """ + + def __init__(self, path, corpus): + """ + Initializes a CorpusLabel object by getting the anomaly windows and labels. + When this is done for combining raw user labels, we skip getLabels() + because labels are not yet created. + + @param path (string) Name of file containing the set of labels. + @param corpus (nab.Corpus) Corpus object. + """ + self.path = path + + self.windows = None + self.labels = None + + self.corpus = corpus + self.getWindows() + + if "raw" not in self.path: + # Do not get labels from files in the path nab/labels/raw + self.getLabels() + + + def getWindows(self): + """ + Read JSON label file. Get timestamps as dictionaries with key:value pairs of + a relative path and its corresponding list of windows. + """ + def found(t, data): + f = data["timestamp"][data["timestamp"] == pandas.Timestamp(t)] + exists = (len(f) == 1) + + return exists + + with open(os.path.join(self.path)) as windowFile: + windows = json.load(windowFile) + + self.windows = {} + + for relativePath in windows.keys(): + + self.windows[relativePath] = deepmap(strp, windows[relativePath]) + + if len(self.windows[relativePath]) == 0: + continue + + data = self.corpus.dataFiles[relativePath].data + if "raw" in self.path: + timestamps = windows[relativePath] + else: + timestamps = list(itertools.chain.from_iterable(windows[relativePath])) + + # Check that timestamps are present in dataset + if not all([found(t,data) for t in timestamps]): + raise ValueError("In the label file %s, one of the timestamps used for " + "the datafile %s doesn't match; it does not exist in " + "the file. Timestamps in json label files have to " + "exactly match timestamps in corresponding datafiles." 
+ % (self.path, relativePath)) + + + def validateLabels(self): + """ + This is run at the end of the label combining process (see + scripts/combine_labels.py) to validate the resulting ground truth windows, + specifically that they are distinct (unique, non-overlapping). + """ + with open(os.path.join(self.path)) as windowFile: + windows = json.load(windowFile) + + self.windows = {} + + for relativePath in windows.keys(): + + self.windows[relativePath] = deepmap(strp, windows[relativePath]) + + if len(self.windows[relativePath]) == 0: + continue + + num_windows = len(self.windows[relativePath]) + if num_windows > 1: + if not all([(self.windows[relativePath][i+1][0] + - self.windows[relativePath][i][1]).total_seconds() >= 0 + for i in xrange(num_windows-1)]): + raise ValueError("In the label file %s, windows overlap." % self.path) + + + def getLabels(self): + """ + Get Labels as a dictionary of key-value pairs of a relative path and its + corresponding binary vector of anomaly labels. Labels are simply a more + verbose version of the windows. + """ + self.labels = {} + + for relativePath, dataSet in self.corpus.dataFiles.iteritems(): + if self.windows.has_key(relativePath): + windows = self.windows[relativePath] + + labels = pandas.DataFrame({"timestamp": dataSet.data["timestamp"]}) + labels['label'] = 0 + + for t1, t2 in windows: + moreThanT1 = labels[labels["timestamp"] >= t1] + betweenT1AndT2 = moreThanT1[moreThanT1["timestamp"] <= t2] + indices = betweenT1AndT2.loc[:,"label"].index + labels["label"].values[indices.values] = 1 + + self.labels[relativePath] = labels + + else: + print "Warning: no label for datafile",relativePath + + +class LabelCombiner(object): + """ + This class is used to combine labels from multiple human labelers, and the set + of manual labels (known anomalies). + The output is a single ground truth label file containing anomalies where + there is enough human agreement. The class also computes the window around + each anomaly. 
The exact logic is described elsewhere in the NAB + documentation. + """ + + def __init__(self, labelDir, corpus, + threshold, windowSize, + probationaryPercent, verbosity): + """ + @param labelDir (string) A directory name containing user label files. + This directory should contain one label file + per human labeler. + @param corpus (Corpus) Instance of Corpus class. + @param threshold (float) A percentage between 0 and 1, specifying the + agreement threshold. It describes the level + of agreement needed between individual + labelers before a particular point in a + data file is labeled as anomalous in the + combined file. + @param windowSize (float) Estimated size of an anomaly window, as a + ratio the dataset length. + @param verbosity (int) 0, 1, or 2 to print out select labeling + metrics; 0 is none, 2 is the most. + """ + self.labelDir = labelDir + self.corpus = corpus + self.threshold = threshold + self.windowSize = windowSize + self.probationaryPercent = probationaryPercent + self.verbosity = verbosity + + self.userLabels = None + self.nLabelers = None + self.knownLabels = None + + self.combinedWindows = None + + + def __str__(self): + ans = "" + ans += "labelDir: %s\n" % self.labelDir + ans += "corpus: %s\n" % self.corpus + ans += "number of labelers: %d\n" % self.nLabelers + ans += "agreement threshold: %d\n" % self.threshold + return ans + + + def write(self, labelsPath, windowsPath): + """Write the combined labels and windows to destination directories.""" + if not os.path.isdir(labelsPath): + createPath(labelsPath) + if not os.path.isdir(windowsPath): + createPath(windowsPath) + + writeJSON(labelsPath, self.labelTimestamps) + writeJSON(windowsPath, self.combinedWindows) + + + def combine(self): + """Combine raw and known labels in anomaly windows.""" + self.getRawLabels() + self.combineLabels() + self.editPoorLabels() + self.applyWindows() + self.checkWindows() + + + def getRawLabels(self): + """Collect the raw user labels from specified directory.""" 
+ labelPaths = absoluteFilePaths(self.labelDir) + self.userLabels = [] + self.knownLabels = [] + for path in labelPaths: + if "known" in path: + self.knownLabels.append(CorpusLabel(path, self.corpus)) + else: + self.userLabels.append(CorpusLabel(path, self.corpus)) + + self.nLabelers = len(self.userLabels) + if self.nLabelers == 0: + raise ValueError("No users labels found") + + + def combineLabels(self): + """ + Combines raw user labels to create set of true anomaly labels. + A buffer is used to bucket labels that identify the same anomaly. The buffer + is half the estimated window size of an anomaly -- approximates an average + of two anomalies per dataset, and no window can have > 1 anomaly. + After bucketing, a label becomes a true anomaly if it was labeled by a + proportion of the users greater than the defined threshold. Then the bucket + is merged into one timestamp -- the ground truth label. + The set of known anomaly labels are added as well. These have been manually + labeled because we know the direct causes of the anomalies. They are added + as if they are the result of the bucket-merge process. + + If verbosity > 0, the dictionary passedLabels -- the raw labels that did not + pass the threshold qualification -- is printed to the console. 
+ """ + def setTruthLabels(dataSet, trueAnomalies): + """Returns the indices of the ground truth anomalies for a data file.""" + timestamps = dataSet.data["timestamp"] + labels = numpy.array(timestamps.isin(trueAnomalies), dtype=int) + return [i for i in range(len(labels)) if labels[i]==1] + + self.labelTimestamps = {} + self.labelIndices = {} + for relativePath, dataSet in self.corpus.dataFiles.iteritems(): + if ("Known" in relativePath) or ("artificial" in relativePath): + knownAnomalies = self.knownLabels[0].windows[relativePath] + self.labelTimestamps[relativePath] = [str(t) for t in knownAnomalies] + self.labelIndices[relativePath] = setTruthLabels(dataSet, knownAnomalies) + continue + + # Calculate the window buffer -- used for bucketing labels identifying + # the same anomaly. + granularity = dataSet.data["timestamp"][1] - dataSet.data["timestamp"][0] + buffer = datetime.timedelta(minutes= + granularity.total_seconds()/60 * len(dataSet.data) * self.windowSize/10) + + rawTimesLists = [] + userCount = 0 + for user in self.userLabels: + if relativePath in user.windows: + # the user has labels for this file + checkForOverlap( + user.windows[relativePath], buffer, user.path, relativePath) + rawTimesLists.append(user.windows[relativePath]) + userCount += 1 + if not rawTimesLists: + # no labeled anomalies for this data file + self.labelTimestamps[relativePath] = [] + self.labelIndices[relativePath] = setTruthLabels(dataSet, []) + continue + else: + rawTimes = list(itertools.chain.from_iterable(rawTimesLists)) + rawTimes.sort() + + # Bucket and merge the anomaly timestamps. 
+ threshold = userCount * self.threshold + trueAnomalies, passedAnomalies = merge( + bucket(rawTimes, buffer), threshold) + + self.labelTimestamps[relativePath] = [str(t) for t in trueAnomalies] + self.labelIndices[relativePath] = setTruthLabels(dataSet, trueAnomalies) + + if self.verbosity>0: + print "----" + print "For %s the passed raw labels and qualified true labels are,"\ + " respectively:" % relativePath + print passedAnomalies + print trueAnomalies + + return self.labelTimestamps, self.labelIndices + + + def editPoorLabels(self): + """ + This edits labels that have been flagged for manual revision. From + inspecting the data and anomaly windows, we have determined some combined + labels should be revised, or not included in the ground truth labels. + """ + count = 0 + for relativePath, indices in self.labelIndices.iteritems(): + + if "iio_us-east-1_i-a2eb1cd9_NetworkIn" in relativePath: + self.labelIndices[relativePath] = [249, 339] + + count += len(indices) + + if self.verbosity > 0: + print "=============================================================" + print "Total ground truth anomalies in benchmark dataset =", count + + + def applyWindows(self): + """ + This takes all the true anomalies, as calculated by combineLabels(), and + adds a standard window. The window length is the class variable windowSize, + and the location is centered on the anomaly timestamp. + + If verbosity = 2, the window metrics are printed to the console. 
+ """ + allWindows = {} + for relativePath, anomalies in self.labelIndices.iteritems(): + data = self.corpus.dataFiles[relativePath].data + length = len(data) + num = len(anomalies) + if num: + windowLength = int(self.windowSize * length / len(anomalies)) + else: + windowLength = int(self.windowSize * length) + + if self.verbosity==2: + print "----" + print "Window metrics for file", relativePath + print "file length =", length, ";" \ + "number of windows =", num, ";" \ + "window length =", windowLength + + windows = [] + for a in anomalies: + front = max(a - windowLength/2, 0) + back = min(a + windowLength/2, length-1) + + windowLimit = [strf(data["timestamp"][front]), + strf(data["timestamp"][back])] + + windows.append(windowLimit) + + allWindows[relativePath] = windows + + self.combinedWindows = allWindows + + + def checkWindows(self): + """ + This takes the anomaly windows and checks for overlap with both each other + and with the probationary period. Overlapping windows are merged into a + single window. Windows overlapping with the probationary period are deleted. 
+ """ + for relativePath, windows in self.combinedWindows.iteritems(): + numWindows = len(windows) + if numWindows > 0: + + fileLength = self.corpus.dataFiles[relativePath].data.shape[0] + probationIndex = getProbationPeriod( + self.probationaryPercent, fileLength) + + probationTimestamp = self.corpus.dataFiles[relativePath].data[ + "timestamp"][probationIndex] + + if (pandas.to_datetime(windows[0][0]) + -probationTimestamp).total_seconds() < 0: + del windows[0] + print ("The first window in {} overlaps with the probationary period " + ", so we're deleting it.".format(relativePath)) + + i = 0 + while len(windows)-1 > i: + if (pandas.to_datetime(windows[i+1][0]) + - pandas.to_datetime(windows[i][1])).total_seconds() <= 0: + # merge windows + windows[i] = [windows[i][0], windows[i+1][1]] + del windows[i+1] + i += 1 diff --git a/nab/detectors/htmjava/nab/runner.py b/nab/detectors/htmjava/nab/runner.py new file mode 100644 index 000000000..7cb4adc68 --- /dev/null +++ b/nab/detectors/htmjava/nab/runner.py @@ -0,0 +1,120 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. 
+# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import multiprocessing +import os +import pandas +try: + import simplejson as json +except ImportError: + import json + +from nab.corpus import Corpus +from nab.detectors.base import detectDataSet +from nab.labeler import CorpusLabel + + +class Runner(object): + """ +Class to run detection on the NAB benchmark using the specified set of +profiles and/or detectors. +""" + + def __init__(self, + dataDir, + resultsDir, + labelPath, + profilesPath, + numCPUs=None): + """ + @param dataDir (string) Directory where all the raw datasets exist. + + @param resultsDir (string) Directory where the detector anomaly scores + will be scored. + + @param labelPath (string) Path where the labels of the datasets + exist. + + @param profilesPath (string) Path to JSON file containing application + profiles and associated cost matrices. + + @param numCPUs (int) Number of CPUs to be used for calls to + multiprocessing.pool.map + """ + self.dataDir = dataDir + self.resultsDir = resultsDir + + self.labelPath = labelPath + self.profilesPath = profilesPath + self.pool = multiprocessing.Pool(numCPUs) + + self.probationaryPercent = 0.15 + self.windowSize = 0.10 + + self.corpus = None + self.corpusLabel = None + self.profiles = None + + + def initialize(self): + """Initialize all the relevant objects for the run.""" + self.corpus = Corpus(self.dataDir) + self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus) + + with open(self.profilesPath) as p: + self.profiles = json.load(p) + + + def detect(self, detectors): + """Generate results file given a dictionary of detector classes + + Function that takes a set of detectors and a corpus of data and creates a + set of files storing the alerts and anomaly scores given by the detectors + + @param detectors (dict) Dictionary with key value pairs of a + detector name and its corresponding + class constructor. 
+ """ + print "\nRunning detection step" + + count = 0 + args = [] + for detectorName, detectorConstructor in detectors.iteritems(): + for relativePath, dataSet in self.corpus.dataFiles.iteritems(): + + if self.corpusLabel.labels.has_key(relativePath): + args.append( + ( + count, + detectorConstructor( + dataSet=dataSet, + probationaryPercent=self.probationaryPercent), + detectorName, + self.corpusLabel.labels[relativePath]["label"], + self.resultsDir, + relativePath + ) + ) + + count += 1 + + # Using `map_async` instead of `map` so interrupts are properly handled. + # See: http://stackoverflow.com/a/1408476 + self.pool.map_async(detectDataSet, args).get(999999) diff --git a/nab/detectors/htmjava/nab/util.py b/nab/detectors/htmjava/nab/util.py new file mode 100644 index 000000000..b02bf044d --- /dev/null +++ b/nab/detectors/htmjava/nab/util.py @@ -0,0 +1,338 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. 
+# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import datetime +import dateutil +import math +import os +import pandas +import pprint +import sys + +try: + import simplejson as json +except ImportError: + import json + + + +def getProbationPeriod(probationPercent, fileLength): + """Return the probationary period index.""" + return min( + math.floor(probationPercent * fileLength), + probationPercent * 5000) + + +def getOldDict(filePath): + """Loads the json given by filepath, returning the dictionary of data.""" + if os.path.exists(filePath): + with open(filePath) as inFile: + dataDict = json.load(inFile) + else: + dataDict = {} + + if not isinstance(dataDict, dict): + raise ValueError("Incorrect type; expected a dict.") + + return dataDict + + +def writeJSON(filePath, data): + """Dumps data to a nicely formatted json at filePath.""" + with open(filePath, "w") as outFile: + outFile.write(json.dumps(data, + sort_keys=True, + indent=4, + separators=(',', ': '))) + + +def updateFinalResults(newResults, resultsFilePath): + """ + Keep final results file updated with (most recent) score normalization. + + @param newResults (dict) Dictionary of normalized scores, from + most recent call to normalize(). + + @param resultsFilePath (str) File containing the best normalized scores + from the past runs of normalize(). + + @return oldResults (dict) Updated final results. + """ + results = getOldDict(resultsFilePath) + + for detector, score in newResults.iteritems(): + results[detector] = score + + writeJSON(resultsFilePath, results) + + return results + + +def updateThresholds(newThresholds, thresholdsFilePath): + """ + The thresholds file keeps a dictionary of thresholds and raw scores for + combinations of detector and scoring profiles. This function updates the file + with the new thresholds. + + @param newThresholds (dict) Optimized thresholds, as returned by + optimizeThreshold() in the optimizer. 
+ + @param thresholdsFilePath (str) JSON of thresholds and their corresponding + raw scores. + + @return oldThresholds (dict) Updated thresholds. + """ + oldThresholds = getOldDict(thresholdsFilePath) + + for detector, profileDictionary in newThresholds.iteritems(): + if detector not in oldThresholds: + # add an entry for a new detector + oldThresholds[detector] = newThresholds[detector] + continue + + for profileName, data in profileDictionary.iteritems(): + if profileName not in oldThresholds[detector]: + # add an entry for a new scoring profile under this detector + oldThresholds[detector][profileName] = data + continue + oldThresholds[detector][profileName] = data + + writeJSON(thresholdsFilePath, oldThresholds) + + return oldThresholds + + +def checkInputs(args): + """Function that displays a set of arguments and asks to proceed.""" + pprint.pprint(vars(args)) + inp = raw_input("Proceed? (y/n): ") + + if inp == 'y': + return True + + if inp == 'n': + return False + + print "Incorrect input given\n" + return checkInputs(args) + + +def convertAnomalyScoresToDetections(anomalyScores, threshold): + """ + Convert anomaly scores (values between 0 and 1) to detections (binary + values) given a threshold. + """ + length = len(anomalyScores) + detections = pandas.Series([0]*length) + + alerts = anomalyScores[anomalyScores >= threshold].index + + detections[alerts] = 1 + + return detections + + +def relativeFilePaths(directory): + """Given directory, get path of all files within relative to the directory. + + @param directory (string) Absolute directory name. + + @return (iterable) All filepaths within directory, relative to + that directory. + """ + for dirpath,_,filenames in os.walk(directory): + filenames = [f for f in filenames if not f[0] == "."] + for f in filenames: + yield os.path.join(dirpath, f) + + +def absoluteFilePaths(directory): + """Given directory, gets the absolute path of all files within. + + @param directory (string) Directory name. 
+ + @return (iterable) All absolute filepaths within directory. + """ + for dirpath,_,filenames in os.walk(directory): + filenames = [f for f in filenames if not f[0] == "."] + for f in filenames: + yield os.path.abspath(os.path.join(dirpath, f)) + + +def makeDirsExist(dirname): + """Makes sure a given directory exists. If not, it creates it. + + @param dirname (string) Absolute directory name. + """ + + if not os.path.exists(dirname): + # This is being run in parallel so watch out for race condition. + try: + os.makedirs(dirname) + except OSError: + pass + + +def createPath(path): + """Makes sure a given path exists. If not, it creates it. + + @param path (string) Absolute path name. + """ + dirname = os.path.dirname(path) + makeDirsExist(dirname) + + +def detectorClassToName(obj): + """Removes the 'detector' from the end of detector class's name. + + @param obj (subclass of AnomalyDetector) Detector class. + + @return (string) Name of detector. + """ + tailLength = len('detector') + name = obj.__name__[:-tailLength].lower() + return name + + +def detectorNameToClass(name): + name = name[0].upper() + name[1:] + className = name + "Detector" + + return className + + +def osPathSplit(path, debug=False): + """ + os_path_split_asunder + http://stackoverflow.com/questions/4579908/cross-platform-splitting-of-path-in-python + Path splitter that works on both unix-based and windows platforms. + + @param path (string) Path to be split. + + @return (list) Split path. + """ + parts = [] + while True: + newpath, tail = os.path.split(path) + if debug: + print repr(path), (newpath, tail) + if newpath == path: + assert not tail + if path: + parts.append(path) + break + parts.append(tail) + path = newpath + parts.reverse() + return parts + + +def convertResultsPathToDataPath(path): + """ + @param path (string) Path to dataset in the data directory. + + @return (string) Path to dataset result in the result directory. 
+ """ + path = path.split(os.path.sep) + detector = path[0] + path = path[1:] + + filename = path[-1] + toRemove = detector + "_" + + i = filename.index(toRemove) + filename = filename[:i] + filename[i+len(toRemove):] + + path[-1] = filename + path = "/".join(path) + + return path + + +def flattenDict(dictionary, files={}, head=""): + """ + @param dictionary (dict) Dictionary of dictionaries to be flattened. + + @param files (dict) Dictionary to build up + + @param head (string) Prefix to each key + """ + for key in dictionary.keys(): + concat = head + "/" + key if head != "" else key + if type(dictionary[key]) is dict: + flattenDict(dictionary[key], files, concat) + else: + files[concat] = dictionary[key] + + return files + + +def strf(t): + """ + @param t (datetime.Datetime) Datetime object. + + @return (string) Formatted string of datetime. + """ + return datetime.datetime.strftime(t, "%Y-%m-%d %H:%M:%S.%f") + + +def strp(t): + """ + @param t (datetime.datetime) String of datetime with format: + "YYYY-MM-DD HH:mm:SS.ss". + + @return (string) Datetime object. + """ + return dateutil.parser.parse(t) + + +def recur(function, value, n): + """ + @param function (function) Function to recurse. + + @param value (value) Value to recurse on. + + @param n (int) Number of times to recurse. + """ + if n < 0 or int(n) != n: + print "incorrect input" + sys.exit() + + elif n == 0: + return value + + elif n == 1: + return function(value) + + else: + return recur(function, function(value), n-1) + + +def deepmap(f, datum): + """Deeply applies f across the datum. + + @param f (function) Function to map with. + + @param datum (datum) Object to map over. 
+ """ + if type(datum) == list: + return [deepmap(f, x) for x in datum] + else: + return f(datum) diff --git a/nab/detectors/htmjava/requirements.txt b/nab/detectors/htmjava/requirements.txt new file mode 100644 index 000000000..d07652e31 --- /dev/null +++ b/nab/detectors/htmjava/requirements.txt @@ -0,0 +1,3 @@ +nupic==1.0.5 +pandas==0.20.3 +simplejson==3.11.1 diff --git a/nab/detectors/htmjava/run.py b/nab/detectors/htmjava/run.py new file mode 100644 index 000000000..75a278479 --- /dev/null +++ b/nab/detectors/htmjava/run.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- +import argparse +import os +try: + import simplejson as json +except ImportError: + import json + +from nab.runner import Runner +from nab.util import checkInputs +from nab.detectors.htmjava.htmjava_detector import HtmjavaDetector + + +def get_nth_parent_dir(n, path): + """ + Return the Nth parent of `path` where the 0th parent is the direct parent + directory. 
+ """ + parent = os.path.dirname(path) + if n == 0: + return parent + + return get_nth_parent_dir(n-1, parent) + +def main(args): + + filepath = os.path.realpath(__file__) + + # Find the main NAB folder + # Assuming `filepath` is ~ <...>/NAB/nab/detectors/htmjava/run.py + root = get_nth_parent_dir(3, filepath) + + numCPUs = int(args.numCPUs) if args.numCPUs is not None else None + + dataDir = os.path.join(root, args.dataDir) + windowsFile = os.path.join(root, args.windowsFile) + resultsDir = os.path.join(root, args.resultsDir) + profilesFile = os.path.join(root, args.profilesFile) + + runner = Runner(dataDir=dataDir, + labelPath=windowsFile, + resultsDir=resultsDir, + profilesPath=profilesFile, + numCPUs=numCPUs) + + runner.initialize() + + runner.detect({'htmjava': HtmjavaDetector}) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("--skipConfirmation", + help="If specified will skip the user confirmation step", + default=False, + action="store_true") + + parser.add_argument("--dataDir", + default="data", + help="This holds all the label windows for the corpus.") + + parser.add_argument("--resultsDir", + default="results", + help="This will hold the results after running detectors " + "on the data") + + parser.add_argument("--windowsFile", + default=os.path.join("labels", "combined_windows.json"), + help="JSON file containing ground truth labels for the " + "corpus.") + + parser.add_argument("-p", "--profilesFile", + default=os.path.join("config", "profiles.json"), + help="The configuration file to use while running the " + "benchmark.") + + parser.add_argument("-n", "--numCPUs", + default=None, + help="The number of CPUs to use to run the " + "benchmark. 
If not specified all CPUs will be used.") + + args = parser.parse_args() + + if args.skipConfirmation or checkInputs(args): + main(args) diff --git a/nab/detectors/htmjava/setup.py b/nab/detectors/htmjava/setup.py new file mode 100644 index 000000000..eac3d56e9 --- /dev/null +++ b/nab/detectors/htmjava/setup.py @@ -0,0 +1,115 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import os +import pkg_resources +import warnings +from setuptools import setup, find_packages + +REPO_DIR = os.path.dirname(os.path.realpath(__file__)) + + + +# Utility function to read the README file. +# Used for the long_description. It"s nice, because now 1) we have a top level +# README file and 2) it"s easier to type in the README file than to put a raw +# string in below ... +def read(fname): + with open(os.path.join(os.path.dirname(__file__), fname)) as f: + result = f.read() + return result + + + +def nupicInstalled(): + """ + Determine whether NuPIC is already installed. 
+ :return: boolean + """ + try: + _ = pkg_resources.get_distribution("nupic") + return True + except pkg_resources.DistributionNotFound: + pass # Silently ignore. NuPIC will be installed later. + + return False + + + +def parseFile(requirementFile): + """ + Parse requirement file. + :return: list of requirements. + """ + try: + return [ + line.strip() + for line in open(requirementFile).readlines() + if not line.startswith("#") + ] + except IOError: + return [] + + + +def findRequirements(): + """ + Read the requirements.txt file and parse into requirements for setup's + install_requirements option. + """ + requirementsPath = os.path.join(REPO_DIR, "requirements.txt") + requirements = parseFile(requirementsPath) + + if nupicInstalled(): + # The user already has a version of NuPIC installed. We'll remove the entry + # in requirements.txt to not conflate the two and will issue a user warning. + reqs = [] + for req in requirements: + if "nupic" != req.split("==")[0]: + reqs.append(req) + else: + warnings.warn("NuPIC is already installed so %s from requirements.txt " + "will be not be installed." 
% req) + else: + reqs = requirements + return reqs + + + +if __name__ == "__main__": + requirements = findRequirements() + + setup( + name="nab", + version="1.0", + author="Alexander Lavin", + author_email="nab@numenta.org", + description=( + "Numenta Anomaly Benchmark: A benchmark for streaming anomaly prediction"), + license="AGPL", + packages=find_packages(), + long_description=read("README.md"), + install_requires=requirements, + entry_points={ + "console_scripts": [ + "nab-plot = nab.plot:main", + ], + }, + ) diff --git a/nab/detectors/knncad/knncad_detector.py b/nab/detectors/knncad/knncad_detector.py index c2e1924bd..8bf9b14ea 100644 --- a/nab/detectors/knncad/knncad_detector.py +++ b/nab/detectors/knncad/knncad_detector.py @@ -23,7 +23,7 @@ def metric(self,a,b): return np.dot(np.dot(diff,self.sigma),diff.T) def ncm(self,item, item_in_array=False): - arr = map(lambda x:self.metric(x,item), self.training) + arr = [self.metric(x,item) for x in self.training] return np.sum(np.partition(arr, self.k+item_in_array)[:self.k+item_in_array]) def handleRecord(self, inputData): @@ -46,9 +46,9 @@ def handleRecord(self, inputData): try: self.sigma = np.linalg.inv(np.dot(np.array(self.training).T, self.training)) except np.linalg.linalg.LinAlgError: - print 'Singular Matrix at record', self.record_count + print('Singular Matrix at record', self.record_count) if len(self.scores) == 0: - self.scores = map(lambda v: self.ncm(v, True), self.training) + self.scores = [self.ncm(v, True) for v in self.training] new_score = self.ncm(new_item) result = 1.*len(np.where(np.array(self.scores) < new_score)[0])/len(self.scores) diff --git a/nab/detectors/numenta/.gitignore b/nab/detectors/numenta/.gitignore new file mode 100644 index 000000000..ae412d6a0 --- /dev/null +++ b/nab/detectors/numenta/.gitignore @@ -0,0 +1 @@ +env/ \ No newline at end of file diff --git a/nab/detectors/numenta/README.md b/nab/detectors/numenta/README.md new file mode 100644 index 000000000..70c1f62b6 --- 
/dev/null +++ b/nab/detectors/numenta/README.md @@ -0,0 +1,125 @@ +# Numenta and NumentaTM detectors + +This directory holds the Python 2 code required to run the `numenta` and +`numentaTM` detectors against the NAB data. In 2019 the main body of the +benchmark's code was ported to Python 3 however these detectors rely on NuPIC +which is Python 2 only. + +This code can be used to replicate results listed on the scoreboard of +the main repository for the following detectors: + + numenta + numentaTM + +## Installation + +### Docker + +Both these detectors are also provided within a docker image, available with `docker pull numenta/nab:py2.7`. + +### Assumptions + +We assume you have a working version of Python 3 installed as your default Python. +If your default system Python is still Python 2 you can skip the virtual environment +creation below. + +### Requirements to install + +- [Python 2.7](https://www.python.org/download/) +- [Virtualenv](https://pypi.org/project/virtualenv/) + +### Install a virtual environment + +Create a new Python 2 virtual environment in this directory. + +`virtualenv -p path/to/python2 env` + +On Windows this might be: + +`virtualenv -p C:\Python27\python.exe env` + +Activate that virtual environment. + +`./env/Scripts/activate` + +or + +`env\Scripts\activate.bat` on Windows. + +Confirm you have a local Python 2 + +``` +$ python +Python 2.7.13 (v2.7.13:a06454b1afa1, Dec 17 2016, 20:53:40) [MSC v.1500 64 bit (AMD64)] on win32 +Type "help", "copyright", "credits" or "license" for more information. +>>> +``` + +### Install detectors + +`cd /path/to/NAB/` +`pip install nupic` +`python nab/detectors/numenta/setup.py develop` + +## Usage + +### Detection + +This directory contains a modified version of the `run.py` script which exists +in the main NAB directory. It can be used to run *detection* only using the +`numenta` and `numentaTM` detectors against NAB data. 
+ +By default it will run both `numenta` and `numentaTM` detectors and output +results to the main NAB/results directory. + +`python2 run.py` + +Note: By default `run.py` tries to use all the cores on your machine. The above +command should take about 20-30 minutes on a current powerful laptop with 4-8 +cores. + +To run only one of the detectors use the `-d` option: + +`python2 run.py -d numenta` + +To see all options of this script type: + +`python2 run.py --help` + +### Optimizing, Scoring and Normalizing + +Once you have run either of the detectors herein against the NAB data you will need +to exit the Python 2 virtual environment and move into the main NAB directory. + +``` +(env) /NAB/nab/detectors/numenta +$ deactivate +/NAB/nab/detectors/numenta +$ cd ../../../ +/NAB +$ +``` + +Then follow the instructions in the main README to run optimization, scoring, and normalization, e.g.: + +`python run.py -d numenta,numentaTM --optimize --score --normalize` + +### Run a subset of NAB data files + +For debugging it is sometimes useful to be able to run your algorithm on a +subset of the NAB data files or on your own set of data files. You can do that +by creating a custom `combined_windows.json` file that only contains labels for +the files you want to run. This new file should be in exactly the same format as +`combined_windows.json` except it would only contain windows for the files you +are interested in. + +**Example**: an example file containing two files is in +`labels/combined_windows_tiny.json`. (Under of the main NAB directory) The +following command shows you how to run NAB on a subset of labels: + + python2 run.py -d numenta --detect --windowsFile labels/combined_windows_tiny.json + +This will run the `detect` phase of NAB on the data files specified in the above +JSON file. Note that scoring and normalization are not supported with this +option. Note also that you may see warning messages regarding the lack of labels +for other files. 
You can ignore these warnings. diff --git a/nab/detectors/numenta/nab/__init__.py b/nab/detectors/numenta/nab/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/numenta/nab/corpus.py b/nab/detectors/numenta/nab/corpus.py new file mode 100644 index 000000000..7373c3fac --- /dev/null +++ b/nab/detectors/numenta/nab/corpus.py @@ -0,0 +1,234 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +""" +This contains the objects to store and manipulate a database of csv files. +""" + +import copy +import os +import pandas + +from nab.util import (absoluteFilePaths, + createPath) + + + +class DataFile(object): + """ + Class for storing and manipulating a single datafile. + Data is stored in pandas.DataFrame + """ + + def __init__(self, srcPath): + """ + @param srcPath (string) Filename of datafile to read. + """ + self.srcPath = srcPath + + self.fileName = os.path.split(srcPath)[1] + + self.data = pandas.io.parsers.read_csv(self.srcPath, + header=0, parse_dates=[0]) + + + def write(self, newPath=None): + """Write datafile to self.srcPath or newPath if given. 
+ + @param newPath (string) Path to write datafile to. If path is not given, + write to source path + """ + + path = newPath if newPath else self.srcPath + self.data.to_csv(path, index=False) + + + def modifyData(self, columnName, data=None, write=False): + """Add columnName to datafile if data is given otherwise remove + columnName. + + @param columnName (string) Name of the column in the datafile to + either add or remove. + + @param data (pandas.Series) Column data to be added to datafile. + Data length should be as long as the + length of other columns. + + @param write (boolean) Flag to choose whether to write modifications to + source path. + """ + if isinstance(data, pandas.Series): + self.data[columnName] = data + else: + if columnName in self.data: + del self.data[columnName] + + if write: + self.write() + + + def getTimestampRange(self, t1, t2): + """Given timestamp range, get all records that are within that range. + + @param t1 (int) Starting timestamp. + + @param t2 (int) Ending timestamp. + + @return (list) Timestamp and value for each time stamp within the + timestamp range. + """ + tmp = self.data[self.data["timestamp"] >= t1] + ans = tmp[tmp["timestamp"] <= t2]["timestamp"].tolist() + return ans + + + def __str__(self): + ans = "" + ans += "path: %s\n" % self.srcPath + ans += "file name: %s\n"% self.fileName + ans += "data size: ", self.data.shape() + ans += "sample line: %s\n" % ", ".join(self.data[0]) + return ans + + + +class Corpus(object): + """ + Class for storing and manipulating a corpus of data where each datafile is + stored as a DataFile object. + """ + + def __init__(self, srcRoot): + """ + @param srcRoot (string) Source directory of corpus. + """ + self.srcRoot = srcRoot + self.dataFiles = self.getDataFiles() + self.numDataFiles = len(self.dataFiles) + + + def getDataFiles(self): + """ + Collect all CSV data files from self.srcRoot directory. 
+ + @return (dict) Keys are relative paths (from self.srcRoot) and values are + the corresponding data files. + """ + filePaths = absoluteFilePaths(self.srcRoot) + dataSets = [DataFile(path) for path in filePaths if ".csv" in path] + + def getRelativePath(srcRoot, srcPath): + # Handle case where srcRoot is already relative + srcRoot = os.path.abspath(srcRoot) + ind = srcPath.index(srcRoot) + root_len = len(srcRoot) + return srcPath[ind+root_len:]\ + .strip(os.path.sep).replace(os.path.sep, "/") + + return {getRelativePath(self.srcRoot, d.srcPath) : d for d in dataSets} + + + def addColumn(self, columnName, data, write=False): + """ + Add column to entire corpus given columnName and dictionary of data for each + file in the corpus. If newRoot is given then corpus is copied and then + modified. + + @param columnName (string) Name of the column in the datafile to add. + + @param data (dict) Dictionary containing key value pairs of a + relative path and its corresponding + datafile (as a pandas.Series). + + @param write (boolean) Flag to decide whether to write corpus + modificiations or not. + """ + + for relativePath in self.dataFiles.keys(): + self.dataFiles[relativePath].modifyData( + columnName, data[relativePath], write=write) + + + def removeColumn(self, columnName, write=False): + """ + Remove column from entire corpus given columnName. If newRoot if given then + corpus is copied and then modified. + + @param columnName (string) Name of the column in the datafile to add. + + @param write (boolean) Flag to decide whether to write corpus + modificiations or not. + """ + for relativePath in self.dataFiles.keys(): + self.dataFiles[relativePath].modifyData(columnName, write=write) + + def copy(self, newRoot=None): + """Copy corpus to a newRoot which cannot already exist. + + @param newRoot (string) Location of new directory to copy corpus + to. 
+ """ + if newRoot[-1] != os.path.sep: + newRoot += os.path.sep + if os.path.isdir(newRoot): + print "directory already exists" + return None + else: + createPath(newRoot) + + newCorpus = Corpus(newRoot) + for relativePath in self.dataFiles.keys(): + newCorpus.addDataSet(relativePath, self.dataFiles[relativePath]) + return newCorpus + + + def addDataSet(self, relativePath, dataSet): + """Add datafile to corpus given its realtivePath within the corpus. + + @param relativePath (string) Path of the new datafile relative to + the corpus directory. + + @param datafile (datafile) Data set to be added to corpus. + """ + self.dataFiles[relativePath] = copy.deepcopy(dataSet) + newPath = self.srcRoot + relativePath + createPath(newPath) + self.dataFiles[relativePath].srcPath = newPath + self.dataFiles[relativePath].write() + self.numDataFiles = len(self.dataFiles) + + + def getDataSubset(self, query): + """ + Get subset of the corpus given a query to match the datafile filename or + relative path. + + @param query (string) Search query for obtainin the subset of + the corpus. + + @return (dict) Dictionary containing key value pairs of a + relative path and its corresponding + datafile. + """ + ans = {} + for relativePath in self.dataFiles.keys(): + if query in relativePath: + ans[relativePath] = self.dataFiles[relativePath] + return ans diff --git a/nab/detectors/numenta/nab/detectors/__init__.py b/nab/detectors/numenta/nab/detectors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/numenta/nab/detectors/base.py b/nab/detectors/numenta/nab/detectors/base.py new file mode 100644 index 000000000..6ac4fdc15 --- /dev/null +++ b/nab/detectors/numenta/nab/detectors/base.py @@ -0,0 +1,150 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. 
Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import abc +import os +import pandas +import sys + +from datetime import datetime +from nab.util import createPath, getProbationPeriod + + + +class AnomalyDetector(object): + """ + Base class for all anomaly detectors. When inheriting from this class please + take note of which methods MUST be overridden, as documented below. + """ + __metaclass__ = abc.ABCMeta + + def __init__( self, + dataSet, + probationaryPercent): + + self.dataSet = dataSet + self.probationaryPeriod = getProbationPeriod( + probationaryPercent, dataSet.data.shape[0]) + + self.inputMin = self.dataSet.data["value"].min() + self.inputMax = self.dataSet.data["value"].max() + + + def initialize(self): + """Do anything to initialize your detector in before calling run. + + Pooling across cores forces a pickling operation when moving objects from + the main core to the pool and this may not always be possible. This function + allows you to create objects within the pool itself to avoid this issue. + """ + pass + + def getAdditionalHeaders(self): + """ + Returns a list of strings. Subclasses can add in additional columns per + record. 
+ + This method MAY be overridden to provide the names for those + columns. + """ + return [] + + + @abc.abstractmethod + def handleRecord(self, inputData): + """ + Returns a list [anomalyScore, *]. It is required that the first + element of the list is the anomalyScore. The other elements may + be anything, but should correspond to the names returned by + getAdditionalHeaders(). + + This method MUST be overridden by subclasses + """ + raise NotImplementedError + + + def getHeader(self): + """ + Gets the outputPath and all the headers needed to write the results files. + """ + headers = ["timestamp", + "value", + "anomaly_score"] + + headers.extend(self.getAdditionalHeaders()) + + return headers + + + def run(self): + """ + Main function that is called to collect anomaly scores for a given file. + """ + + headers = self.getHeader() + + rows = [] + for i, row in self.dataSet.data.iterrows(): + + inputData = row.to_dict() + + detectorValues = self.handleRecord(inputData) + + outputRow = list(row) + list(detectorValues) + + rows.append(outputRow) + + # Progress report + if (i % 1000) == 0: + print ".", + sys.stdout.flush() + + ans = pandas.DataFrame(rows, columns=headers) + return ans + + +def detectDataSet(args): + """ + Function called in each detector process that run the detector that it is + given. 
+ + @param args (tuple) Arguments to run a detector on a file and then + """ + (i, detectorInstance, detectorName, labels, outputDir, relativePath) = args + + relativeDir, fileName = os.path.split(relativePath) + fileName = detectorName + "_" + fileName + outputPath = os.path.join(outputDir, detectorName, relativeDir, fileName) + createPath(outputPath) + + print "%s: Beginning detection with %s for %s" % \ + (i, detectorName, relativePath) + detectorInstance.initialize() + + results = detectorInstance.run() + + # label=1 for relaxed windows, 0 otherwise + results["label"] = labels + + results.to_csv(outputPath, index=False) + + print "%s: Completed processing %s records at %s" % \ + (i, len(results.index), datetime.now()) + print "%s: Results have been written to %s" % (i, outputPath) diff --git a/nab/detectors/numenta/nab/detectors/numenta/__init__.py b/nab/detectors/numenta/nab/detectors/numenta/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nab/detectors/numenta/nab/detectors/numenta/numentaTM_detector.py b/nab/detectors/numenta/nab/detectors/numenta/numentaTM_detector.py new file mode 100644 index 000000000..0ba71b197 --- /dev/null +++ b/nab/detectors/numenta/nab/detectors/numenta/numentaTM_detector.py @@ -0,0 +1,75 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2016, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. 
class NumentaTMDetector(NumentaDetector):
  """
  This detector uses the implementation of temporal memory in
  https://github.com/numenta/nupic.core/blob/master/src/nupic/algorithms/TemporalMemory.hpp.
  It differs from its parent detector only in the temporal memory
  implementation it selects ("tm_cpp") and the model parameters that
  selection produces.

  NOTE(review): the former no-op __init__ override (which only forwarded its
  arguments to super().__init__) was removed; the inherited NumentaDetector
  constructor runs unchanged.
  """

  def initialize(self):
    """Build the HTM model and the anomaly likelihood estimator."""
    # Get config params, setting the RDSE resolution. The 20% padding leaves
    # encoder headroom for values outside the range observed so far.
    rangePadding = abs(self.inputMax - self.inputMin) * 0.2

    modelParams = getScalarMetricWithTimeOfDayAnomalyParams(
      metricData=[0],
      minVal=self.inputMin - rangePadding,
      maxVal=self.inputMax + rangePadding,
      minResolution=0.001,
      tmImplementation="tm_cpp"
    )["modelConfig"]

    self._setupEncoderParams(
      modelParams["modelParams"]["sensorParams"]["encoders"])

    self.model = ModelFactory.create(modelParams)

    self.model.enableInference({"predictedField": "value"})

    # Initialize the anomaly likelihood object. Unlike the parent class,
    # this override does not consult self.useLikelihood here.
    numentaLearningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
    self.anomalyLikelihood = anomaly_likelihood.AnomalyLikelihood(
      learningPeriod=numentaLearningPeriod,
      estimationSamples=self.probationaryPeriod - numentaLearningPeriod,
      reestimationPeriod=100
    )
# Fraction outside of the range of values seen so far that will be considered
# a spatial anomaly regardless of the anomaly likelihood calculation. This
# accounts for the human labelling bias for spatial values larger than what
# has been seen so far.
SPATIAL_TOLERANCE = 0.05



class NumentaDetector(AnomalyDetector):
  """
  This detector uses an HTM based anomaly detection technique.

  An OPF model (built in initialize()) predicts the "value" field; the raw
  anomaly score from each model.run() call is post-processed by an
  AnomalyLikelihood estimator, and a separate spatial check flags values far
  outside the range observed so far.
  """

  def __init__(self, *args, **kwargs):

    super(NumentaDetector, self).__init__(*args, **kwargs)

    # Created in initialize(); None until then.
    self.model = None
    self.sensorParams = None
    self.anomalyLikelihood = None
    # Keep track of value range for spatial anomaly detection
    self.minVal = None
    self.maxVal = None

    # Set this to False if you want to get results based on raw scores
    # without using AnomalyLikelihood. This will give worse results, but
    # useful for checking the efficacy of AnomalyLikelihood. You will need
    # to re-optimize the thresholds when running with this setting.
    self.useLikelihood = True


  def getAdditionalHeaders(self):
    """Returns a list of strings: the extra "raw_score" results column."""
    return ["raw_score"]


  def handleRecord(self, inputData):
    """Returns a tuple (anomalyScore, rawScore).

    Internally to NuPIC "anomalyScore" corresponds to "likelihood_score"
    and "rawScore" corresponds to "anomaly_score". Sorry about that.
    """
    # Send it to Numenta detector and get back the results
    result = self.model.run(inputData)

    # Get the value
    value = inputData["value"]

    # Retrieve the anomaly score and write it to a file
    rawScore = result.inferences["anomalyScore"]

    # Update min/max values and check if there is a spatial anomaly.
    # NOTE: the check deliberately uses the min/max from records seen BEFORE
    # this one; the range is only updated afterwards. The guard is False on
    # the very first record (both values are None, and None != None is
    # False) and while every value seen so far has been identical.
    spatialAnomaly = False
    if self.minVal != self.maxVal:
      tolerance = (self.maxVal - self.minVal) * SPATIAL_TOLERANCE
      maxExpected = self.maxVal + tolerance
      minExpected = self.minVal - tolerance
      if value > maxExpected or value < minExpected:
        spatialAnomaly = True
    if self.maxVal is None or value > self.maxVal:
      self.maxVal = value
    if self.minVal is None or value < self.minVal:
      self.minVal = value

    if self.useLikelihood:
      # Compute log(anomaly likelihood)
      anomalyScore = self.anomalyLikelihood.anomalyProbability(
        inputData["value"], rawScore, inputData["timestamp"])
      logScore = self.anomalyLikelihood.computeLogLikelihood(anomalyScore)
      finalScore = logScore
    else:
      finalScore = rawScore

    # A spatial anomaly overrides the likelihood result entirely.
    if spatialAnomaly:
      finalScore = 1.0

    return (finalScore, rawScore)


  def initialize(self):
    """Build the OPF model and, if enabled, the anomaly likelihood object."""
    # Get config params, setting the RDSE resolution. The 20% padding leaves
    # encoder headroom for values outside the observed range.
    rangePadding = abs(self.inputMax - self.inputMin) * 0.2
    modelParams = getScalarMetricWithTimeOfDayAnomalyParams(
      metricData=[0],
      minVal=self.inputMin-rangePadding,
      maxVal=self.inputMax+rangePadding,
      minResolution=0.001,
      tmImplementation = "cpp"
    )["modelConfig"]

    self._setupEncoderParams(
      modelParams["modelParams"]["sensorParams"]["encoders"])

    self.model = ModelFactory.create(modelParams)

    self.model.enableInference({"predictedField": "value"})

    if self.useLikelihood:
      # Initialize the anomaly likelihood object
      numentaLearningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
      self.anomalyLikelihood = anomaly_likelihood.AnomalyLikelihood(
        learningPeriod=numentaLearningPeriod,
        estimationSamples=self.probationaryPeriod-numentaLearningPeriod,
        reestimationPeriod=100
      )


  def _setupEncoderParams(self, encoderParams):
    # The encoder must expect the NAB-specific datafile headers: rename the
    # generated c0_*/c1 encoder entries to the "timestamp"/"value" columns.
    encoderParams["timestamp_dayOfWeek"] = encoderParams.pop("c0_dayOfWeek")
    encoderParams["timestamp_timeOfDay"] = encoderParams.pop("c0_timeOfDay")
    encoderParams["timestamp_timeOfDay"]["fieldname"] = "timestamp"
    encoderParams["timestamp_timeOfDay"]["name"] = "timestamp"
    encoderParams["timestamp_weekend"] = encoderParams.pop("c0_weekend")
    encoderParams["value"] = encoderParams.pop("c1")
    encoderParams["value"]["fieldname"] = "value"
    encoderParams["value"]["name"] = "value"

    self.sensorParams = encoderParams["value"]
def bucket(rawTimes, buffer):
  """
  Buckets (groups) timestamps that are within the amount of time specified by
  buffer.

  @param rawTimes (list)      Sorted timestamps.
  @param buffer   (timedelta) Maximum distance from the first timestamp of a
                              bucket for another timestamp to join it.

  @return         (list)      List of buckets, each a list of timestamps.
  """
  # Local renamed from `bucket` to avoid shadowing this function's own name.
  currentBucket = []
  rawBuckets = []

  current = None
  for t in rawTimes:
    if current is None:
      current = t
      currentBucket = [current]
      continue
    # Distance is measured from the FIRST timestamp of the bucket, not from
    # the previous timestamp.
    if (t - current) <= buffer:
      currentBucket.append(t)
    else:
      rawBuckets.append(currentBucket)
      current = t
      currentBucket = [current]
  if currentBucket:
    rawBuckets.append(currentBucket)

  return rawBuckets


def merge(rawBuckets, threshold):
  """
  Merges bucketed timestamps into one timestamp (most frequent, or earliest).

  @param rawBuckets (list)  Buckets as returned by bucket().
  @param threshold  (float) Minimum bucket size to qualify as a true anomaly.

  @return truths (list) One representative timestamp per qualifying bucket.
  @return passed (list) The buckets that did not qualify.
  """
  truths = []
  passed = []

  for bucket in rawBuckets:
    if len(bucket) >= threshold:
      # Most frequent timestamp; max() breaks ties with the earliest one.
      truths.append(max(bucket, key=bucket.count))
    else:
      passed.append(bucket)

  return truths, passed


def checkForOverlap(labels, buffer, labelsFileName, dataFileName):
  """
  Raise a ValueError if the difference between any consecutive labels is
  smaller than the buffer.
  """
  # range() replaces the Python 2-only xrange(); a leftover commented-out
  # pdb breakpoint was removed.
  for i in range(len(labels) - 1):
    if labels[i+1] - labels[i] <= buffer:
      raise ValueError("The labels {} and {} in \'{}\' labels for data file "
          "\'{}\' are too close to each other to be considered distinct "
          "anomalies. Please relabel."
          .format(labels[i], labels[i+1], labelsFileName, dataFileName))



class CorpusLabel(object):
  """
  Class to store and manipulate a single set of labels for the whole
  benchmark corpus.
  """

  def __init__(self, path, corpus):
    """
    Initializes a CorpusLabel object by getting the anomaly windows and
    labels. When this is done for combining raw user labels, we skip
    getLabels() because labels are not yet created.

    @param path   (string)     Name of file containing the set of labels.
    @param corpus (nab.Corpus) Corpus object.
    """
    self.path = path

    self.windows = None
    self.labels = None

    self.corpus = corpus
    self.getWindows()

    if "raw" not in self.path:
      # Do not get labels from files in the path nab/labels/raw
      self.getLabels()


  def getWindows(self):
    """
    Read JSON label file. Get timestamps as dictionaries with key:value pairs
    of a relative path and its corresponding list of windows.
    """
    def found(t, data):
      f = data["timestamp"][data["timestamp"] == pandas.Timestamp(t)]
      exists = (len(f) == 1)

      return exists

    with open(os.path.join(self.path)) as windowFile:
      windows = json.load(windowFile)

    self.windows = {}

    for relativePath in windows:

      self.windows[relativePath] = deepmap(strp, windows[relativePath])

      if len(self.windows[relativePath]) == 0:
        continue

      data = self.corpus.dataFiles[relativePath].data
      if "raw" in self.path:
        timestamps = windows[relativePath]
      else:
        timestamps = list(itertools.chain.from_iterable(windows[relativePath]))

      # Check that timestamps are present in dataset
      if not all([found(t, data) for t in timestamps]):
        raise ValueError("In the label file %s, one of the timestamps used for "
                         "the datafile %s doesn't match; it does not exist in "
                         "the file. Timestamps in json label files have to "
                         "exactly match timestamps in corresponding datafiles."
                         % (self.path, relativePath))


  def validateLabels(self):
    """
    This is run at the end of the label combining process (see
    scripts/combine_labels.py) to validate the resulting ground truth windows,
    specifically that they are distinct (unique, non-overlapping).
    """
    with open(os.path.join(self.path)) as windowFile:
      windows = json.load(windowFile)

    self.windows = {}

    for relativePath in windows:

      self.windows[relativePath] = deepmap(strp, windows[relativePath])

      if len(self.windows[relativePath]) == 0:
        continue

      numWindows = len(self.windows[relativePath])
      if numWindows > 1:
        # Consecutive windows must not overlap: each window must start at or
        # after the previous window's end.
        if not all([(self.windows[relativePath][i+1][0]
                    - self.windows[relativePath][i][1]).total_seconds() >= 0
                    for i in range(numWindows - 1)]):
          raise ValueError("In the label file %s, windows overlap." % self.path)


  def getLabels(self):
    """
    Get Labels as a dictionary of key-value pairs of a relative path and its
    corresponding binary vector of anomaly labels. Labels are simply a more
    verbose version of the windows.
    """
    self.labels = {}

    for relativePath, dataSet in self.corpus.dataFiles.items():
      if relativePath in self.windows:
        windows = self.windows[relativePath]

        labels = pandas.DataFrame({"timestamp": dataSet.data["timestamp"]})
        labels['label'] = 0

        for t1, t2 in windows:
          # Mark every record whose timestamp falls inside [t1, t2].
          moreThanT1 = labels[labels["timestamp"] >= t1]
          betweenT1AndT2 = moreThanT1[moreThanT1["timestamp"] <= t2]
          indices = betweenT1AndT2.loc[:, "label"].index
          labels["label"].values[indices.values] = 1

        self.labels[relativePath] = labels

      else:
        print("Warning: no label for datafile %s" % relativePath)


class LabelCombiner(object):
  """
  This class is used to combine labels from multiple human labelers, and the
  set of manual labels (known anomalies).
  The output is a single ground truth label file containing anomalies where
  there is enough human agreement. The class also computes the window around
  each anomaly. The exact logic is described elsewhere in the NAB
  documentation.
  """

  def __init__(self, labelDir, corpus,
               threshold, windowSize,
               probationaryPercent, verbosity):
    """
    @param labelDir     (string) A directory name containing user label files.
                                 This directory should contain one label file
                                 per human labeler.
    @param corpus       (Corpus) Instance of Corpus class.
    @param threshold    (float)  A percentage between 0 and 1, specifying the
                                 agreement threshold. It describes the level
                                 of agreement needed between individual
                                 labelers before a particular point in a
                                 data file is labeled as anomalous in the
                                 combined file.
    @param windowSize   (float)  Estimated size of an anomaly window, as a
                                 ratio the dataset length.
    @param probationaryPercent (float) Fraction of each data file treated as
                                 the probationary period; ground-truth windows
                                 overlapping it are discarded in
                                 checkWindows().
    @param verbosity    (int)    0, 1, or 2 to print out select labeling
                                 metrics; 0 is none, 2 is the most.
    """
    self.labelDir = labelDir
    self.corpus = corpus
    self.threshold = threshold
    self.windowSize = windowSize
    self.probationaryPercent = probationaryPercent
    self.verbosity = verbosity

    self.userLabels = None
    self.nLabelers = None
    self.knownLabels = None

    self.combinedWindows = None


  def __str__(self):
    ans = ""
    ans += "labelDir: %s\n" % self.labelDir
    ans += "corpus: %s\n" % self.corpus
    ans += "number of labelers: %d\n" % self.nLabelers
    # %s rather than %d: the threshold is a fraction between 0 and 1, which
    # %d would truncate to 0.
    ans += "agreement threshold: %s\n" % self.threshold
    return ans


  def write(self, labelsPath, windowsPath):
    """Write the combined labels and windows to destination directories."""
    if not os.path.isdir(labelsPath):
      createPath(labelsPath)
    if not os.path.isdir(windowsPath):
      createPath(windowsPath)

    writeJSON(labelsPath, self.labelTimestamps)
    writeJSON(windowsPath, self.combinedWindows)


  def combine(self):
    """Combine raw and known labels in anomaly windows."""
    self.getRawLabels()
    self.combineLabels()
    self.editPoorLabels()
    self.applyWindows()
    self.checkWindows()


  def getRawLabels(self):
    """Collect the raw user labels from specified directory."""
    labelPaths = absoluteFilePaths(self.labelDir)
    self.userLabels = []
    self.knownLabels = []
    for path in labelPaths:
      if "known" in path:
        self.knownLabels.append(CorpusLabel(path, self.corpus))
      else:
        self.userLabels.append(CorpusLabel(path, self.corpus))

    self.nLabelers = len(self.userLabels)
    if self.nLabelers == 0:
      raise ValueError("No user labels found")


  def combineLabels(self):
    """
    Combines raw user labels to create set of true anomaly labels.
    A buffer is used to bucket labels that identify the same anomaly. The
    buffer is half the estimated window size of an anomaly -- approximates an
    average of two anomalies per dataset, and no window can have > 1 anomaly.
    After bucketing, a label becomes a true anomaly if it was labeled by a
    proportion of the users greater than the defined threshold. Then the
    bucket is merged into one timestamp -- the ground truth label.
    The set of known anomaly labels are added as well. These have been
    manually labeled because we know the direct causes of the anomalies. They
    are added as if they are the result of the bucket-merge process.

    If verbosity > 0, the dictionary passedLabels -- the raw labels that did
    not pass the threshold qualification -- is printed to the console.
    """
    def setTruthLabels(dataSet, trueAnomalies):
      """Returns the indices of the ground truth anomalies for a data file."""
      timestamps = dataSet.data["timestamp"]
      labels = numpy.array(timestamps.isin(trueAnomalies), dtype=int)
      return [i for i in range(len(labels)) if labels[i] == 1]

    self.labelTimestamps = {}
    self.labelIndices = {}
    for relativePath, dataSet in self.corpus.dataFiles.items():
      if ("Known" in relativePath) or ("artificial" in relativePath):
        knownAnomalies = self.knownLabels[0].windows[relativePath]
        self.labelTimestamps[relativePath] = [str(t) for t in knownAnomalies]
        self.labelIndices[relativePath] = setTruthLabels(dataSet,
                                                         knownAnomalies)
        continue

      # Calculate the window buffer -- used for bucketing labels identifying
      # the same anomaly.
      granularity = dataSet.data["timestamp"][1] - dataSet.data["timestamp"][0]
      buffer = datetime.timedelta(minutes=
        granularity.total_seconds()/60 * len(dataSet.data) * self.windowSize/10)

      rawTimesLists = []
      userCount = 0
      for user in self.userLabels:
        if relativePath in user.windows:
          # the user has labels for this file
          checkForOverlap(
            user.windows[relativePath], buffer, user.path, relativePath)
          rawTimesLists.append(user.windows[relativePath])
          userCount += 1
      if not rawTimesLists:
        # no labeled anomalies for this data file
        self.labelTimestamps[relativePath] = []
        self.labelIndices[relativePath] = setTruthLabels(dataSet, [])
        continue
      else:
        rawTimes = list(itertools.chain.from_iterable(rawTimesLists))
        rawTimes.sort()

      # Bucket and merge the anomaly timestamps.
      threshold = userCount * self.threshold
      trueAnomalies, passedAnomalies = merge(
        bucket(rawTimes, buffer), threshold)

      self.labelTimestamps[relativePath] = [str(t) for t in trueAnomalies]
      self.labelIndices[relativePath] = setTruthLabels(dataSet, trueAnomalies)

      if self.verbosity > 0:
        print("----")
        print("For %s the passed raw labels and qualified true labels are,"
              " respectively:" % relativePath)
        print(passedAnomalies)
        print(trueAnomalies)

    return self.labelTimestamps, self.labelIndices


  def editPoorLabels(self):
    """
    This edits labels that have been flagged for manual revision. From
    inspecting the data and anomaly windows, we have determined some combined
    labels should be revised, or not included in the ground truth labels.
    """
    count = 0
    for relativePath, indices in self.labelIndices.items():

      if "iio_us-east-1_i-a2eb1cd9_NetworkIn" in relativePath:
        self.labelIndices[relativePath] = [249, 339]

      count += len(indices)

    if self.verbosity > 0:
      print("=============================================================")
      print("Total ground truth anomalies in benchmark dataset = %d" % count)


  def applyWindows(self):
    """
    This takes all the true anomalies, as calculated by combineLabels(), and
    adds a standard window. The window length is the class variable
    windowSize, and the location is centered on the anomaly timestamp.

    If verbosity = 2, the window metrics are printed to the console.
    """
    allWindows = {}
    for relativePath, anomalies in self.labelIndices.items():
      data = self.corpus.dataFiles[relativePath].data
      length = len(data)
      num = len(anomalies)
      if num:
        windowLength = int(self.windowSize * length / len(anomalies))
      else:
        windowLength = int(self.windowSize * length)

      if self.verbosity == 2:
        print("----")
        print("Window metrics for file %s" % relativePath)
        print("file length = %d; number of windows = %d; window length = %d"
              % (length, num, windowLength))

      windows = []
      for a in anomalies:
        # Floor division keeps the indices integral on Python 3 as well;
        # Python 2's `/` on ints behaved the same way.
        front = max(a - windowLength // 2, 0)
        back = min(a + windowLength // 2, length - 1)

        windowLimit = [strf(data["timestamp"][front]),
                       strf(data["timestamp"][back])]

        windows.append(windowLimit)

      allWindows[relativePath] = windows

    self.combinedWindows = allWindows


  def checkWindows(self):
    """
    This takes the anomaly windows and checks for overlap with both each
    other and with the probationary period. Overlapping windows are merged
    into a single window. Windows overlapping with the probationary period
    are deleted.
    """
    for relativePath, windows in self.combinedWindows.items():
      numWindows = len(windows)
      if numWindows > 0:

        fileLength = self.corpus.dataFiles[relativePath].data.shape[0]
        probationIndex = getProbationPeriod(
          self.probationaryPercent, fileLength)

        probationTimestamp = self.corpus.dataFiles[relativePath].data[
          "timestamp"][probationIndex]

        if (pandas.to_datetime(windows[0][0])
            - probationTimestamp).total_seconds() < 0:
          del windows[0]
          print("The first window in {} overlaps with the probationary period "
                ", so we're deleting it.".format(relativePath))

        i = 0
        while len(windows) - 1 > i:
          if (pandas.to_datetime(windows[i+1][0])
              - pandas.to_datetime(windows[i][1])).total_seconds() <= 0:
            # merge windows
            windows[i] = [windows[i][0], windows[i+1][1]]
            del windows[i+1]
          # NOTE(review): i advances even after a merge, so a window that
          # overlaps BOTH of its neighbours may leave one merge undone --
          # confirm whether this matches the intended labeling semantics
          # before changing it, as it affects the published ground truth.
          i += 1
class Runner(object):
  """
  Class to run detection on the NAB benchmark using the specified set of
  profiles and/or detectors.
  """

  def __init__(self,
               dataDir,
               resultsDir,
               labelPath,
               profilesPath,
               numCPUs=None):
    """
    @param dataDir      (string) Directory where all the raw datasets exist.

    @param resultsDir   (string) Directory where the detector anomaly scores
                                 will be scored.

    @param labelPath    (string) Path where the labels of the datasets
                                 exist.

    @param profilesPath (string) Path to JSON file containing application
                                 profiles and associated cost matrices.

    @param numCPUs      (int)    Number of CPUs to be used for calls to
                                 multiprocessing.pool.map; None lets the pool
                                 pick the machine's CPU count.
    """
    self.dataDir = dataDir
    self.resultsDir = resultsDir

    self.labelPath = labelPath
    self.profilesPath = profilesPath
    # NOTE(review): the pool is created eagerly here and never explicitly
    # closed; it lives for the lifetime of the Runner.
    self.pool = multiprocessing.Pool(numCPUs)

    self.probationaryPercent = 0.15
    self.windowSize = 0.10

    self.corpus = None
    self.corpusLabel = None
    self.profiles = None


  def initialize(self):
    """Initialize all the relevant objects for the run."""
    self.corpus = Corpus(self.dataDir)
    self.corpusLabel = CorpusLabel(path=self.labelPath, corpus=self.corpus)

    with open(self.profilesPath) as p:
      self.profiles = json.load(p)


  def detect(self, detectors):
    """Generate results file given a dictionary of detector classes

    Function that takes a set of detectors and a corpus of data and creates a
    set of files storing the alerts and anomaly scores given by the detectors

    @param detectors (dict) Dictionary with key value pairs of a
                            detector name and its corresponding
                            class constructor.
    """
    print("\nRunning detection step")

    count = 0
    args = []
    for detectorName, detectorConstructor in detectors.items():
      for relativePath, dataSet in self.corpus.dataFiles.items():

        # Only schedule files that have ground-truth labels.
        if relativePath in self.corpusLabel.labels:
          args.append(
            (
              count,
              detectorConstructor(
                dataSet=dataSet,
                probationaryPercent=self.probationaryPercent),
              detectorName,
              self.corpusLabel.labels[relativePath]["label"],
              self.resultsDir,
              relativePath
            )
          )

          count += 1

    # Using `map_async` instead of `map` so interrupts are properly handled.
    # See: http://stackoverflow.com/a/1408476
    self.pool.map_async(detectDataSet, args).get(999999)
def getProbationPeriod(probationPercent, fileLength):
  """Return the number of records in the probationary period.

  The period is probationPercent of the file length, capped at
  probationPercent * 5000 records for very long files.
  """
  # int() so the result can be used directly as a row index; under Python 2
  # math.floor() returned a float of the same value.
  return min(
    int(math.floor(probationPercent * fileLength)),
    int(probationPercent * 5000))


def getOldDict(filePath):
  """Loads the json given by filepath, returning the dictionary of data.

  Returns an empty dict if the file does not exist; raises ValueError if the
  file's top-level JSON value is not an object.
  """
  if os.path.exists(filePath):
    with open(filePath) as inFile:
      dataDict = json.load(inFile)
  else:
    dataDict = {}

  if not isinstance(dataDict, dict):
    raise ValueError("Incorrect type; expected a dict.")

  return dataDict


def writeJSON(filePath, data):
  """Dumps data to a nicely formatted json at filePath."""
  with open(filePath, "w") as outFile:
    outFile.write(json.dumps(data,
                             sort_keys=True,
                             indent=4,
                             separators=(',', ': ')))


def updateFinalResults(newResults, resultsFilePath):
  """
  Keep final results file updated with (most recent) score normalization.

  @param newResults      (dict) Dictionary of normalized scores, from
                                most recent call to normalize().

  @param resultsFilePath (str)  File containing the best normalized scores
                                from the past runs of normalize().

  @return                (dict) Updated final results.
  """
  results = getOldDict(resultsFilePath)

  for detector, score in newResults.items():
    results[detector] = score

  writeJSON(resultsFilePath, results)

  return results


def updateThresholds(newThresholds, thresholdsFilePath):
  """
  The thresholds file keeps a dictionary of thresholds and raw scores for
  combinations of detector and scoring profiles. This function updates the
  file with the new thresholds.

  @param newThresholds      (dict) Optimized thresholds, as returned by
                                   optimizeThreshold() in the optimizer.

  @param thresholdsFilePath (str)  JSON of thresholds and their corresponding
                                   raw scores.

  @return                   (dict) Updated thresholds.
  """
  oldThresholds = getOldDict(thresholdsFilePath)

  for detector, profileDictionary in newThresholds.items():
    if detector not in oldThresholds:
      # add an entry for a new detector
      oldThresholds[detector] = newThresholds[detector]
      continue

    # New and existing profiles alike receive the newly-optimized data; the
    # original code special-cased new profiles but then performed the same
    # assignment in both branches.
    for profileName, data in profileDictionary.items():
      oldThresholds[detector][profileName] = data

  writeJSON(thresholdsFilePath, oldThresholds)

  return oldThresholds


def checkInputs(args):
  """Display a set of arguments and ask the user whether to proceed."""
  pprint.pprint(vars(args))

  # Loop instead of the original recursive retry; behavior is identical.
  while True:
    # raw_input is the Python 2 builtin; this package targets the Python 2
    # NuPIC runtime.
    inp = raw_input("Proceed? (y/n): ")

    if inp == 'y':
      return True
    if inp == 'n':
      return False

    print("Incorrect input given\n")


def convertAnomalyScoresToDetections(anomalyScores, threshold):
  """
  Convert anomaly scores (values between 0 and 1) to detections (binary
  values) given a threshold.

  @param anomalyScores (pandas.Series) Anomaly scores.
  @param threshold     (float)         Scores at or above this value map to 1.

  @return              (pandas.Series) Binary detections of the same length.
  """
  length = len(anomalyScores)
  detections = pandas.Series([0] * length)

  alerts = anomalyScores[anomalyScores >= threshold].index

  detections[alerts] = 1

  return detections


def relativeFilePaths(directory):
  """Given directory, get path of all files within relative to the directory.

  NOTE(review): os.walk yields dirpath prefixed with `directory` as given, so
  the results are relative only when `directory` itself is a relative path --
  confirm against callers.

  @param directory (string) Absolute directory name.

  @return (iterable) All filepaths within directory, relative to
                     that directory.
  """
  for dirpath, _, filenames in os.walk(directory):
    # Skip hidden files such as .DS_Store.
    filenames = [f for f in filenames if not f[0] == "."]
    for f in filenames:
      yield os.path.join(dirpath, f)


def absoluteFilePaths(directory):
  """Given directory, gets the absolute path of all files within.

  @param directory (string) Directory name.

  @return (iterable) All absolute filepaths within directory.
  """
  for dirpath, _, filenames in os.walk(directory):
    filenames = [f for f in filenames if not f[0] == "."]
    for f in filenames:
      yield os.path.abspath(os.path.join(dirpath, f))


def makeDirsExist(dirname):
  """Makes sure a given directory exists. If not, it creates it.

  @param dirname (string) Absolute directory name.
  """
  if not os.path.exists(dirname):
    # This is being run in parallel so watch out for race condition.
    try:
      os.makedirs(dirname)
    except OSError:
      pass


def createPath(path):
  """Makes sure the directory containing a given file path exists.

  @param path (string) Absolute path name.
  """
  dirname = os.path.dirname(path)
  makeDirsExist(dirname)


def detectorClassToName(obj):
  """Removes the 'detector' from the end of detector class's name.

  @param obj (subclass of AnomalyDetector) Detector class.

  @return (string) Lowercased name of detector.
  """
  tailLength = len('detector')
  name = obj.__name__[:-tailLength].lower()
  return name


def detectorNameToClass(name):
  """Inverse of detectorClassToName: e.g. 'numenta' -> 'NumentaDetector'.

  @param name (string) Lowercase detector name.

  @return (string) Capitalized class name with 'Detector' appended.
  """
  name = name[0].upper() + name[1:]
  className = name + "Detector"

  return className


def osPathSplit(path, debug=False):
  """
  os_path_split_asunder
  http://stackoverflow.com/questions/4579908/cross-platform-splitting-of-path-in-python
  Path splitter that works on both unix-based and windows platforms.

  @param path (string) Path to be split.

  @return (list) Split path.
  """
  parts = []
  while True:
    newpath, tail = os.path.split(path)
    if debug:
      # %r formatting replaces the Python 2-only `print repr(path), ...`.
      print("%r %r" % (path, (newpath, tail)))
    if newpath == path:
      assert not tail
      if path:
        parts.append(path)
      break
    parts.append(tail)
    path = newpath
  parts.reverse()
  return parts
+ """ + path = path.split(os.path.sep) + detector = path[0] + path = path[1:] + + filename = path[-1] + toRemove = detector + "_" + + i = filename.index(toRemove) + filename = filename[:i] + filename[i+len(toRemove):] + + path[-1] = filename + path = "/".join(path) + + return path + + +def flattenDict(dictionary, files={}, head=""): + """ + @param dictionary (dict) Dictionary of dictionaries to be flattened. + + @param files (dict) Dictionary to build up + + @param head (string) Prefix to each key + """ + for key in dictionary.keys(): + concat = head + "/" + key if head != "" else key + if type(dictionary[key]) is dict: + flattenDict(dictionary[key], files, concat) + else: + files[concat] = dictionary[key] + + return files + + +def strf(t): + """ + @param t (datetime.Datetime) Datetime object. + + @return (string) Formatted string of datetime. + """ + return datetime.datetime.strftime(t, "%Y-%m-%d %H:%M:%S.%f") + + +def strp(t): + """ + @param t (datetime.datetime) String of datetime with format: + "YYYY-MM-DD HH:mm:SS.ss". + + @return (string) Datetime object. + """ + return dateutil.parser.parse(t) + + +def recur(function, value, n): + """ + @param function (function) Function to recurse. + + @param value (value) Value to recurse on. + + @param n (int) Number of times to recurse. + """ + if n < 0 or int(n) != n: + print "incorrect input" + sys.exit() + + elif n == 0: + return value + + elif n == 1: + return function(value) + + else: + return recur(function, function(value), n-1) + + +def deepmap(f, datum): + """Deeply applies f across the datum. + + @param f (function) Function to map with. + + @param datum (datum) Object to map over. 
+ """ + if type(datum) == list: + return [deepmap(f, x) for x in datum] + else: + return f(datum) diff --git a/nab/detectors/numenta/requirements.txt b/nab/detectors/numenta/requirements.txt new file mode 100644 index 000000000..0b10643b6 --- /dev/null +++ b/nab/detectors/numenta/requirements.txt @@ -0,0 +1,4 @@ +nupic==1.0.5 +numpy<=1.16 #latest to support py2 +pandas==0.20.3 +simplejson==3.11.1 diff --git a/nab/detectors/numenta/run.py b/nab/detectors/numenta/run.py new file mode 100644 index 000000000..8612f882a --- /dev/null +++ b/nab/detectors/numenta/run.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. 
+# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +""" +Entry point for the Python 2 based detectors `numenta` and `numenta_tm` +""" +import argparse +import os +try: + import simplejson as json +except ImportError: + import json + +from nab.runner import Runner +from nab.util import (detectorNameToClass, checkInputs) +from nab.detectors.numenta.numenta_detector import NumentaDetector +from nab.detectors.numenta.numentaTM_detector import NumentaTMDetector + + +def getDetectorClassConstructors(detectors): + """ + Takes in names of detectors. Collects class names that correspond to those + detectors and returns them in a dict. The dict maps detector name to class + names. Assumes the detectors have been imported. + """ + detectorConstructors = { + d : globals()[detectorNameToClass(d)] for d in detectors} + + return detectorConstructors + + +def get_nth_parent_dir(n, path): + """ + Return the Nth parent of `path` where the 0th parent is the direct parent + directory. 
+ """ + parent = os.path.dirname(path) + if n == 0: + return parent + + return get_nth_parent_dir(n-1, parent) + +def main(args): + + filepath = os.path.realpath(__file__) + + # Find the main NAB folder + # Assuming `filepath` is ~ <...>/NAB/nab/detectors/numenta/run.py + root = get_nth_parent_dir(3, filepath) + + numCPUs = int(args.numCPUs) if args.numCPUs is not None else None + + dataDir = os.path.join(root, args.dataDir) + windowsFile = os.path.join(root, args.windowsFile) + resultsDir = os.path.join(root, args.resultsDir) + profilesFile = os.path.join(root, args.profilesFile) + + runner = Runner(dataDir=dataDir, + labelPath=windowsFile, + resultsDir=resultsDir, + profilesPath=profilesFile, + numCPUs=numCPUs) + + runner.initialize() + + detectorConstructors = getDetectorClassConstructors(args.detectors) + runner.detect(detectorConstructors) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument("--skipConfirmation", + help="If specified will skip the user confirmation step", + default=False, + action="store_true") + + parser.add_argument("--dataDir", + default="data", + help="This holds all the label windows for the corpus.") + + parser.add_argument("--resultsDir", + default="results", + help="This will hold the results after running detectors " + "on the data") + + parser.add_argument("--windowsFile", + default=os.path.join("labels", "combined_windows.json"), + help="JSON file containing ground truth labels for the " + "corpus.") + + parser.add_argument("-d", "--detectors", + nargs="*", + type=str, + default=["numenta", "numentaTM"], + help="Comma separated list of detector(s) to use") + + parser.add_argument("-p", "--profilesFile", + default=os.path.join("config", "profiles.json"), + help="The configuration file to use while running the " + "benchmark.") + + parser.add_argument("-n", "--numCPUs", + default=None, + help="The number of CPUs to use to run the " + "benchmark. 
If not specified all CPUs will be used.") + + # In this version of run.py this is a no-op + # See https://github.com/numenta/NAB/issues/346 for why it was retained + parser.add_argument("--detect", + help="No-op. See: https://github.com/numenta/NAB/issues/346", + default=False, + action="store_true") + + + args = parser.parse_args() + + if len(args.detectors) == 1: + # Handle comma-seperated list argument. + args.detectors = args.detectors[0].split(",") + + if args.skipConfirmation or checkInputs(args): + main(args) diff --git a/nab/detectors/numenta/setup.py b/nab/detectors/numenta/setup.py new file mode 100644 index 000000000..eac3d56e9 --- /dev/null +++ b/nab/detectors/numenta/setup.py @@ -0,0 +1,115 @@ +# ---------------------------------------------------------------------- +# Copyright (C) 2014-2015, Numenta, Inc. Unless you have an agreement +# with Numenta, Inc., for a separate license for this software code, the +# following terms and conditions apply: +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero Public License version 3 as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU Affero Public License for more details. +# +# You should have received a copy of the GNU Affero Public License +# along with this program. If not, see http://www.gnu.org/licenses. +# +# http://numenta.org/licenses/ +# ---------------------------------------------------------------------- + +import os +import pkg_resources +import warnings +from setuptools import setup, find_packages + +REPO_DIR = os.path.dirname(os.path.realpath(__file__)) + + + +# Utility function to read the README file. +# Used for the long_description. 
It"s nice, because now 1) we have a top level +# README file and 2) it"s easier to type in the README file than to put a raw +# string in below ... +def read(fname): + with open(os.path.join(os.path.dirname(__file__), fname)) as f: + result = f.read() + return result + + + +def nupicInstalled(): + """ + Determine whether NuPIC is already installed. + :return: boolean + """ + try: + _ = pkg_resources.get_distribution("nupic") + return True + except pkg_resources.DistributionNotFound: + pass # Silently ignore. NuPIC will be installed later. + + return False + + + +def parseFile(requirementFile): + """ + Parse requirement file. + :return: list of requirements. + """ + try: + return [ + line.strip() + for line in open(requirementFile).readlines() + if not line.startswith("#") + ] + except IOError: + return [] + + + +def findRequirements(): + """ + Read the requirements.txt file and parse into requirements for setup's + install_requirements option. + """ + requirementsPath = os.path.join(REPO_DIR, "requirements.txt") + requirements = parseFile(requirementsPath) + + if nupicInstalled(): + # The user already has a version of NuPIC installed. We'll remove the entry + # in requirements.txt to not conflate the two and will issue a user warning. + reqs = [] + for req in requirements: + if "nupic" != req.split("==")[0]: + reqs.append(req) + else: + warnings.warn("NuPIC is already installed so %s from requirements.txt " + "will be not be installed." 
% req) + else: + reqs = requirements + return reqs + + + +if __name__ == "__main__": + requirements = findRequirements() + + setup( + name="nab", + version="1.0", + author="Alexander Lavin", + author_email="nab@numenta.org", + description=( + "Numenta Anomaly Benchmark: A benchmark for streaming anomaly prediction"), + license="AGPL", + packages=find_packages(), + long_description=read("README.md"), + install_requires=requirements, + entry_points={ + "console_scripts": [ + "nab-plot = nab.plot:main", + ], + }, + ) diff --git a/nab/detectors/random_cut_forest/random_cut_forest.py b/nab/detectors/random_cut_forest/random_cut_forest.py index ab5fae806..09b5c653c 100644 --- a/nab/detectors/random_cut_forest/random_cut_forest.py +++ b/nab/detectors/random_cut_forest/random_cut_forest.py @@ -206,15 +206,15 @@ def createApplication(): Create a new AWS Kinesis Analytics Application used to provide anomaly scores from NAB data files. See "random_cut_forest.sql" """ - print "Creating kinesis streams" + print("Creating kinesis streams") streams = createStreams() inputStream = streams[INPUT_STREAM_NAME] outputStream = streams[OUTPUT_STREAM_NAME] - print "Creating IAM Role" + print("Creating IAM Role") role = createRole(inputStream, outputStream) - print "Creating kinesis analytics application" + print("Creating kinesis analytics application") sourceCode = open(APPLICATION_SOURCE_FILE, "r").read() kinesisAnalytics = boto3.client("kinesisanalytics") kinesisAnalytics.create_application( @@ -301,13 +301,13 @@ def deleteApplication(): """ Deletes the application created via "createApplication" """ - print "Deleting IAM Role" + print("Deleting IAM Role") deleteRole() - print "Deleting kinesis streams" + print("Deleting kinesis streams") deleteStreams() - print "Deleting kinesis analytics application" + print("Deleting kinesis analytics application") kinesisAnalytics = boto3.client("kinesisanalytics") try: response = kinesisAnalytics.describe_application( @@ -332,7 +332,7 @@ def 
streamFile(corpus, corpusLabel, resultsdir, name): :param name: NAB data file name (i.e. "realKnownCause/nyc_taxi.csv") :return: The result file absolute path """ - print "Streaming", name + print("Streaming", name) startApplication() @@ -360,7 +360,13 @@ def streamFile(corpus, corpusLabel, resultsdir, name): response = kinesis.get_records(ShardIterator=shardIterator) records = response["Records"] if len(records) > 0: - rows.extend([rec["Data"].strip('\n').split(",") for rec in records]) + parsed_records = [] + for rec in records: + parsed_record = str(rec["Data"], "utf-8") + parsed_record = parsed_record.strip('\n') + parsed_record = parsed_record.split(",") + parsed_records.append(parsed_record) + rows.extend(parsed_records) shardIterator = response["NextShardIterator"] sys.stdout.write("\rProcessed {}/{} ".format(len(rows), total)) sys.stdout.flush() @@ -403,7 +409,7 @@ def streamAll(corpus, corpusLabel, resultsdir): 'scripts/create_new_detector.py --detector randomCutForest' first """ - for name in corpus.dataFiles.keys(): + for name in list(corpus.dataFiles.keys()): streamFile(corpus, corpusLabel, resultsdir, name) diff --git a/nab/detectors/relative_entropy/relative_entropy_detector.py b/nab/detectors/relative_entropy/relative_entropy_detector.py index cb7db3469..84f221ce9 100644 --- a/nab/detectors/relative_entropy/relative_entropy_detector.py +++ b/nab/detectors/relative_entropy/relative_entropy_detector.py @@ -174,7 +174,7 @@ def getAgreementHypothesis(self,P_hat): index = -1 minEntropy = float("inf") - for i in xrange(self.m): + for i in range(self.m): entropy = 2 * self.W * stats.entropy(P_hat,self.P[i]) if entropy < self.T and entropy < minEntropy: minEntropy = entropy diff --git a/nab/detectors/twitter/README.md b/nab/detectors/twitter/README.md new file mode 100644 index 000000000..a187618e9 --- /dev/null +++ b/nab/detectors/twitter/README.md @@ -0,0 +1,127 @@ +## Introduction + +[AnomalyDetection](https://github.com/twitter/AnomalyDetection) is an R 
+package developed by Twitter that detects anomalies in time-series data. The
+package implements their Seasonal Hybrid ESD algorithm, which extends the
+generalized ESD algorithm to allow for seasonality in the data, i.e. different
+periods of patterns in the data that represent macro-level changes rather than
+micro-level anomalies.
+
+To evaluate AnomalyDetection, written in R, on NAB, written in Python, we have
+three options: port the R code into Python, use an interface from R to Python
+like rpy2, or use the R code for anomaly detection and the Python code for
+evaluating the results. We elected to go with the third option, following
+"Path 3" in [this NAB
+figure](https://drive.google.com/a/numenta.com/file/d/0B1_XUjaAXeV3NmxhbEFtZVZ4TmM/view?pli=1).
+Thus the task reduced to converting the NAB data files into structures as
+expected by AnomalyDetection, and then converting the output of
+AnomalyDetection into the results format required by NAB.
+
+## Step 1 - run the detection algorithms
+
+We provide the R script (nab_anomaly_detection.r) we used to run the
+AnomalyDetection algorithms on NAB, which includes a few subtleties detailed
+below.
+
+### Handling NAB datasets in R
+
+As specified in [the NAB technical
+whitepaper](https://github.com/numenta/NAB/wiki#nab-whitepaper),
+datasets in NAB are CSV files with a "timestamp" column and a "value" column.
+The values are floats or integers, and the timestamps are strings of the form
+`YYYY-mm-dd HH:MM:SS.s` (in Python notation). In R notation, the timestamps
+are of the form `%Y-%m-%d %H:%M:%OS`. R provides a `read.csv` function to load
+NAB data into a dataframe that AnomalyDetection can use. Converting the
+timestamps in the CSV file to the appropriate datatype in R requires a bit of
+subtlety. 
With the path to the CSV file stored in `dataFilePath`,

    setClass("nabDate")
    setAs("character", "nabDate", function(from) as.POSIXlt(from, format="%Y-%m-%d %H:%M:%OS"))
    nab_data <- read.csv(dataFilePath, colClasses=c("nabDate", "numeric"))

Now `nab_data` can be passed into the AnomalyDetection functions.

### AnomalyDetectionTs issues

The Ts version of AnomalyDetection is intended to use the periodicity in time
series data to supplement the underlying algorithms. However, we found the
algorithm failed to detect the necessary periodicity params for a large subset
of the NAB data files. Researching the errors revealed open issues in the
AnomalyDetection source code, where the recommended course of action is to
defer to the Vec version. Therefore we do not include the Ts version in the
NAB results.

There are two error statements and corresponding AnomalyDetection issues:
["Anom detection needs at least 2 periods worth of
data"](https://github.com/twitter/AnomalyDetection/issues/15) and ["must
supply period length for time series
decomposition"](https://github.com/twitter/AnomalyDetection/issues/45).

### Tuning the AnomalyDetection parameters

We tuned the parameters of AnomalyDetectionVec to yield the best NAB results
possible (across all application profiles), and the AnomalyDetectionTs
parameters in an attempt to run it effectively on most of the dataset.

The parameters of significant consequence to the results of
AnomalyDetectionVec are `period` and `max_anoms`. The former defines the
number of records in a single period (used in seasonal decomposition), and the
latter captures the maximum percent of data points that will be labelled as
anomalous by the algorithm. We tuned these parameters manually in search of
the best final scores, finding `period=150` and `max_anoms = 0.0020` maximize
the scores for all three NAB application profiles (standard, reward low FP,
reward low FN). 
+ +## Step 2 - prepare to run NAB + +To prepare NAB for analyzing results from a new detector, we ran the following script: + + python scripts/create_new_detector.py --detector twitterADVec + +This script generates the necessary directories and creates an entry in the thresholds JSON. + +### Formatting the results for NAB + +NAB requires a CSV file with timestamp, value, anomaly_score, and label +columns, so we want to add these columns to our `nab_data` data frame. Because +AnomalyDetection identifies anomalies, rather than reporting an anomaly +probability or a raw score for each record, we used a binary anomaly_score: +the records flagged by AnomalyDetection as anomalous are represented by 1, and +all others 0. The label column is also binary, indicating whether or not a +record is within a true anomaly window. The true anomalies and their durations +are recorded in a [JSON file of the combined +windows](https://github.com/numenta/NAB/blob/master/labels/combined_windows.json). +We used the +[jsonlite](http://cran.r-project.org/web/packages/jsonlite/index.html) R +package for handling the JSON. + +With all columns added to the dataframe, `write.csv` lets us write the results +to a CSV file that can be passed into NAB. **Note:** Each CSV file must have +the name of the detector followed by an underscore at the beginning of the +filename, e.g. `twitterADVec_cpu_utilization_asg_misconfiguration.csv`. + +This is implemented in `addDetections()` and `addLabels()` of our script for running AnomalyDetection. + +## Step 3 - run NAB + +The results CSV files were placed in NAB/results/twitterADVec/ in categorical +subdirectories. Now we're ready to score the results, and in the top level of +NAB we run: + + python run.py -d twitterADVec --optimize (optional) --score --normalize + +This runs the scoring and normalization step for the twitterADVec detector. 
+The optimization step is optional because we can manually set the thresholds +(for all application profiles) arbitrarily between 0 and 1. That is, because +the anomaly_score entries are binary, we can use a threshold of 0.5 and skip +optimization. + +The final scores will be printed to the screen and written to +[nab/results/final_results.json](https://github.com/numenta/NAB/blob/master/results/final_results.json), +and results CSV files for each application profile will be written to the +twitterADVec directory. We obtained the following output for +AnomalyDetectionVec with optimized parameters: + + Final score for 'twitterADVec_reward_low_FP_rate_scores' = 33.61 + Final score for 'twitterADVec_reward_low_FN_rate_scores' = 53.50 + Final score for 'twitterADVec_standard_scores' = 47.06 diff --git a/nab/detectors/twitter/nab_anomaly_detection.r b/nab/detectors/twitter/nab_anomaly_detection.r new file mode 100644 index 000000000..b616ed38e --- /dev/null +++ b/nab/detectors/twitter/nab_anomaly_detection.r @@ -0,0 +1,135 @@ +################################################################################ +# This script runs the Twitter AnomalyDetection algorithms on the NAB data set. +# +# You must first install the AnomalyDetection package: +# https://github.com/twitter/AnomalyDetection#how-to-get-started +# +# You must also have NAB installed and specify the path at the bottom of this +# script. 
+################################################################################ + +library(methods) +library(AnomalyDetection) +library(jsonlite) + + + +addDetections <- function(anomalyDataFrame, detections, algorithmName) { + anomalyDataFrame$anomaly_score=0.0 + + if (length(detections$anoms) > 0) { + for (i in 1:nrow(detections$anoms)) { + if (algorithmName == "twitterADTs") { + idx = match(detections$anoms[i, 1], anomalyDataFrame$timestamp) + } + else if (algorithmName == "twitterADVec") { + idx = detections$anoms[i, 1] + } + anomalyDataFrame[idx,]$anomaly_score = 1.0 + } + } + return(anomalyDataFrame) +} + + +addLabels <- function(anomalyDataFrame, anomalyBounds) { + anomalyDataFrame$label = 0 + + if (length(anomalyBounds) != 0) { + for (i in 1:nrow(anomalyBounds)) { + lower = anomalyBounds[i, 1] + upper = anomalyBounds[i, 2] + idx = anomalyDataFrame$timestamp >= lower & anomalyDataFrame$timestamp <= upper + idx[is.na(idx)] = FALSE + anomalyDataFrame[idx,]$label = 1 + } + } + return(anomalyDataFrame) +} + + +runTwitter <- function(algorithmName, nab_data, filename) { + + if (algorithmName == "twitterADTs") { + results = tryCatch( + { + message(paste( + "Attempting detection w/ AnomalyDetectionTS on ", filename)) + AnomalyDetectionTs( + nab_data, max_anoms=0.0008, direction='both', plot=FALSE) + }, + error = function(cond) { + message(paste("Unable to run the algorithm for ", filename)) + return(NULL) + } + ) + } + else if (algorithmName == "twitterADVec") { + message(paste("Detecting w/ AnomalyDetectionVec on ", filename)) + results = AnomalyDetectionVec( + nab_data[,2], alpha=0.05, period=150, max_anoms=0.0020, direction='both', + plot=FALSE) + } + + message("Results...") + print(results$anoms) + + return(results) +} + + +main <- function(pathToNAB, algorithmName, skipFiles=list()) { + # pathToNAB (character): string specifying path to the NAB dir. + # algorithmName (character): either 'twitterADTs' or 'twitterADVec'. 
+ # skipFiles (list): file names to skip; useful in debugging. + + # Format dates: coerce from character class to nabDate class + setClass("nabDate") + setAs( + "character", + "nabDate", + function(from) as.POSIXlt(from, format="%Y-%m-%d %H:%M:%OS")) + + # Setup paths to NAB data and results + nabDataDir = paste(pathToNAB, "data", sep='/') + dataDirs = list.files(nabDataDir) + resultsDir = paste(pathToNAB, "results", algorithmName, sep='/') + + # Get the truth anomaly windows + windows = fromJSON(paste(pathToNAB, "labels/combined_windows.json", sep='/')) + + for (dDir in dataDirs) { + dataFiles = list.files(paste(nabDataDir, dDir, sep='/')) + for (dFile in dataFiles) { + if (is.element(dFile, skipFiles)) { + next + } + + # Get the data and run the detector + dataName = paste(dDir, dFile, sep='/') + dFilePath = paste(nabDataDir, dataName, sep='/') + nab_data = read.csv(dFilePath, colClasses=c("nabDate", "numeric")) + results = runTwitter(algorithmName, nab_data, dFilePath) + + # Populate dataframe with anomaly scores and truth labels + nab_data = addDetections(nab_data, results, algorithmName) + nab_data = addLabels(nab_data, windows[[dataName]]) + + # Write results to csv + resultsFileName = paste(algorithmName, dFile, sep='_') + write.csv( + nab_data, + paste(resultsDir, dDir, resultsFileName, sep='/'), + row.names=FALSE) + } + } +} + + + +pathToNAB = "path/to/nab" +skipFiles = list() +algorithmNames = list("twitterADVec", "twitterADTs") +for (alg in algorithmNames) { + main(pathToNAB, alg) +} diff --git a/nab/labeler.py b/nab/labeler.py index 3925f4bf0..629ad7f8b 100644 --- a/nab/labeler.py +++ b/nab/labeler.py @@ -85,7 +85,7 @@ def checkForOverlap(labels, buffer, labelsFileName, dataFileName): Raise a ValueError if the difference between any consecutive labels is smaller than the buffer. 
""" - for i in xrange(len(labels)-1): + for i in range(len(labels)-1): if labels[i+1] - labels[i] <= buffer: # import pdb; pdb.set_trace() raise ValueError("The labels {} and {} in \'{}\' labels for data file " @@ -129,7 +129,7 @@ def getWindows(self): a relative path and its corresponding list of windows. """ def found(t, data): - f = data["timestamp"][data["timestamp"] == pandas.tslib.Timestamp(t)] + f = data["timestamp"][data["timestamp"] == pandas.Timestamp(t)] exists = (len(f) == 1) return exists @@ -139,7 +139,7 @@ def found(t, data): self.windows = {} - for relativePath in windows.keys(): + for relativePath in list(windows.keys()): self.windows[relativePath] = deepmap(strp, windows[relativePath]) @@ -172,7 +172,7 @@ def validateLabels(self): self.windows = {} - for relativePath in windows.keys(): + for relativePath in list(windows.keys()): self.windows[relativePath] = deepmap(strp, windows[relativePath]) @@ -183,7 +183,7 @@ def validateLabels(self): if num_windows > 1: if not all([(self.windows[relativePath][i+1][0] - self.windows[relativePath][i][1]).total_seconds() >= 0 - for i in xrange(num_windows-1)]): + for i in range(num_windows-1)]): raise ValueError("In the label file %s, windows overlap." 
% self.path) @@ -195,8 +195,8 @@ def getLabels(self): """ self.labels = {} - for relativePath, dataSet in self.corpus.dataFiles.iteritems(): - if self.windows.has_key(relativePath): + for relativePath, dataSet in self.corpus.dataFiles.items(): + if relativePath in self.windows: windows = self.windows[relativePath] labels = pandas.DataFrame({"timestamp": dataSet.data["timestamp"]}) @@ -211,7 +211,7 @@ def getLabels(self): self.labels[relativePath] = labels else: - print "Warning: no label for datafile",relativePath + print("Warning: no label for datafile",relativePath) class LabelCombiner(object): @@ -326,7 +326,7 @@ def setTruthLabels(dataSet, trueAnomalies): self.labelTimestamps = {} self.labelIndices = {} - for relativePath, dataSet in self.corpus.dataFiles.iteritems(): + for relativePath, dataSet in self.corpus.dataFiles.items(): if ("Known" in relativePath) or ("artificial" in relativePath): knownAnomalies = self.knownLabels[0].windows[relativePath] self.labelTimestamps[relativePath] = [str(t) for t in knownAnomalies] @@ -366,11 +366,11 @@ def setTruthLabels(dataSet, trueAnomalies): self.labelIndices[relativePath] = setTruthLabels(dataSet, trueAnomalies) if self.verbosity>0: - print "----" - print "For %s the passed raw labels and qualified true labels are,"\ - " respectively:" % relativePath - print passedAnomalies - print trueAnomalies + print("----") + print("For %s the passed raw labels and qualified true labels are,"\ + " respectively:" % relativePath) + print(passedAnomalies) + print(trueAnomalies) return self.labelTimestamps, self.labelIndices @@ -382,7 +382,7 @@ def editPoorLabels(self): labels should be revised, or not included in the ground truth labels. 
""" count = 0 - for relativePath, indices in self.labelIndices.iteritems(): + for relativePath, indices in self.labelIndices.items(): if "iio_us-east-1_i-a2eb1cd9_NetworkIn" in relativePath: self.labelIndices[relativePath] = [249, 339] @@ -390,8 +390,8 @@ def editPoorLabels(self): count += len(indices) if self.verbosity > 0: - print "=============================================================" - print "Total ground truth anomalies in benchmark dataset =", count + print("=============================================================") + print("Total ground truth anomalies in benchmark dataset =", count) def applyWindows(self): @@ -403,7 +403,7 @@ def applyWindows(self): If verbosity = 2, the window metrics are printed to the console. """ allWindows = {} - for relativePath, anomalies in self.labelIndices.iteritems(): + for relativePath, anomalies in self.labelIndices.items(): data = self.corpus.dataFiles[relativePath].data length = len(data) num = len(anomalies) @@ -413,11 +413,11 @@ def applyWindows(self): windowLength = int(self.windowSize * length) if self.verbosity==2: - print "----" - print "Window metrics for file", relativePath - print "file length =", length, ";" \ + print("----") + print("Window metrics for file", relativePath) + print("file length =", length, ";" \ "number of windows =", num, ";" \ - "window length =", windowLength + "window length =", windowLength) windows = [] for a in anomalies: @@ -440,7 +440,7 @@ def checkWindows(self): and with the probationary period. Overlapping windows are merged into a single window. Windows overlapping with the probationary period are deleted. 
""" - for relativePath, windows in self.combinedWindows.iteritems(): + for relativePath, windows in self.combinedWindows.items(): numWindows = len(windows) if numWindows > 0: @@ -454,8 +454,8 @@ def checkWindows(self): if (pandas.to_datetime(windows[0][0]) -probationTimestamp).total_seconds() < 0: del windows[0] - print ("The first window in {} overlaps with the probationary period " - ", so we're deleting it.".format(relativePath)) + print(("The first window in {} overlaps with the probationary period " + ", so we're deleting it.".format(relativePath))) i = 0 while len(windows)-1 > i: diff --git a/nab/optimizer.py b/nab/optimizer.py index fa62c19b9..efe4f08c2 100644 --- a/nab/optimizer.py +++ b/nab/optimizer.py @@ -63,7 +63,7 @@ def optimizeThreshold(args): # First, get the sweep-scores for each row in each data set allAnomalyRows = [] - for relativePath, dataSet in resultsCorpus.dataFiles.iteritems(): + for relativePath, dataSet in resultsCorpus.dataFiles.items(): if "_scores.csv" in relativePath: continue @@ -91,9 +91,9 @@ def optimizeThreshold(args): scoresByThreshold,key=lambda x: x.score, reverse=True) bestParams = scoresByThreshold[0] - print("Optimizer found a max score of {} with anomaly threshold {}.".format( + print(("Optimizer found a max score of {} with anomaly threshold {}.".format( bestParams.score, bestParams.threshold - )) + ))) return { "threshold": bestParams.threshold, diff --git a/nab/plot.py b/nab/plot.py index 845d91d8e..bd45073fc 100644 --- a/nab/plot.py +++ b/nab/plot.py @@ -33,7 +33,7 @@ import plotly.plotly from plotly.graph_objs import ( - Bar, Data, Figure, Layout, Line, Margin, Marker, Scatter, XAxis, YAxis) + Bar, Figure, Layout, Line, Margin, Marker, Scatter) try: import simplejson as json @@ -137,7 +137,7 @@ def _addValues(data, start=None, end=None): return Scatter(x=data["timestamp"][mask], y=data["value"][mask], name="value", - line=Line( + line=dict( width=1.5 ), showlegend=False) @@ -184,7 +184,7 @@ def _addLabels(data, 
labels, target="value", start=None, end=None): mode="markers", name="Ground Truth Anomaly", text=["Anomalous Instance"], - marker=Marker( + marker=dict( color="rgb(200, 20, 20)", size=10, symbol=MARKERS[0] @@ -218,7 +218,7 @@ def _addWindows(self, start=None, end=None): return Bar(x=x, y=y, name="Anomaly Window", - marker=Marker( + marker=dict( color="rgb(220, 100, 100)" ), opacity=0.3) @@ -239,7 +239,7 @@ def _addProbation(self, start=None, end=None): return Bar(x=x, y=y, name="Probationary Period", - marker=Marker( + marker=dict( color="rgb(0, 0, 200)" ), opacity=0.2) @@ -255,14 +255,12 @@ def _createLayout(title=None, xLabel="Date", yLabel="Metric", fontSize=12, "showlegend": False, "width": width, "height": height, - "xaxis": XAxis( + "xaxis": dict( title=xLabel, ), - "yaxis": YAxis( + "yaxis": dict( title=yLabel, domain=[0, 1], - autorange=True, - autotick=True, ), "barmode": "stack", "bargap": 0} @@ -345,14 +343,13 @@ def plotMultipleDetectors(self, if withProbation: traces.append(self._addProbation()) - # Create plotly Data and Layout objects: - data = Data(traces) + # Create plotly Layout object: layout = self._createLayout("Anomaly Detections for " + self.dataName) # Query plotly - fig = Figure(data=data, layout=layout) + fig = Figure(data=traces, layout=layout) plot_url = self.py.plot(fig) - print "Detections plot URL: ", plot_url + print("Detections plot URL: ", plot_url) return plot_url @@ -408,12 +405,11 @@ def plot(self, if withProbation: traces.append(self._addProbation(start=start, end=end)) - # Create plotly Data and Layout objects: - data = Data(traces) + # Create plotly Layout object: layout = self._createLayout(self.dataName, xLabel=xLabel, yLabel=yLabel, fontSize=fontSize, width=width, height=height) # Query plotly - fig = Figure(data=data, layout=layout) + fig = Figure(data=traces, layout=layout) if plotPath is None: # We temporarily switch to a temp directory to avoid overwriting the # previous plot when in offline mode. 
@@ -423,7 +419,7 @@ def plot(self, try: os.chdir(tempDir) plotPath = self.py.plot(fig) - print "Data plot URL: ", plotPath + print("Data plot URL: ", plotPath) finally: os.chdir(cwd) else: @@ -470,11 +466,11 @@ def _addDetections(self, name, symbol, FP, TP): mode="markers", name=name, text=["anomalous data"], - marker=Marker( + marker=dict( color="rgb(200, 20, 20)", size=15.0, symbol=symbol, - line=Line( + line=dict( color="rgb(200, 20, 20)", width=2 ) @@ -485,11 +481,11 @@ def _addDetections(self, name, symbol, FP, TP): mode="markers", name=name, text=["anomalous data"], - marker=Marker( + marker=dict( color="rgb(20, 200, 20)", size=15.0, symbol=symbol, - line=Line( + line=dict( color="rgb(20, 200, 20)", width=2 ) @@ -536,7 +532,7 @@ def main(): parser.add_argument("file") args = parser.parse_args() if args.offline and args.output is not None: - print "Plots cannot be saved to file in offline mode." + print("Plots cannot be saved to file in offline mode.") sys.exit(-1) path = args.file title = args.title diff --git a/nab/runner.py b/nab/runner.py index 2d193be4f..3de15b135 100644 --- a/nab/runner.py +++ b/nab/runner.py @@ -106,14 +106,14 @@ def detect(self, detectors): detector name and its corresponding class constructor. """ - print "\nRunning detection step" + print("\nRunning detection step") count = 0 args = [] - for detectorName, detectorConstructor in detectors.iteritems(): - for relativePath, dataSet in self.corpus.dataFiles.iteritems(): + for detectorName, detectorConstructor in detectors.items(): + for relativePath, dataSet in self.corpus.dataFiles.items(): - if self.corpusLabel.labels.has_key(relativePath): + if relativePath in self.corpusLabel.labels: args.append( ( count, @@ -131,7 +131,7 @@ class constructor. # Using `map_async` instead of `map` so interrupts are properly handled. 
# See: http://stackoverflow.com/a/1408476 - self.pool.map_async(detectDataSet, args).get(99999999) + self.pool.map_async(detectDataSet, args).get(999999) def optimize(self, detectorNames): @@ -144,7 +144,7 @@ def optimize(self, detectorNames): dictionary containing the score and the threshold used to obtained that score. """ - print "\nRunning optimize step" + print("\nRunning optimize step") scoreFlag = False thresholds = {} @@ -155,7 +155,7 @@ def optimize(self, detectorNames): thresholds[detectorName] = {} - for profileName, profile in self.profiles.iteritems(): + for profileName, profile in self.profiles.items(): thresholds[detectorName][profileName] = optimizeThreshold( (detectorName, profile["CostMatrix"], @@ -183,7 +183,7 @@ def score(self, detectorNames, thresholds): another dictionary containing the score and the threshold used to obtained that score. """ - print "\nRunning scoring step" + print("\nRunning scoring step") scoreFlag = True baselines = {} @@ -193,7 +193,7 @@ def score(self, detectorNames, thresholds): resultsDetectorDir = os.path.join(self.resultsDir, detectorName) resultsCorpus = Corpus(resultsDetectorDir) - for profileName, profile in self.profiles.iteritems(): + for profileName, profile in self.profiles.items(): threshold = thresholds[detectorName][profileName]["threshold"] resultsDF = scoreCorpus(threshold, @@ -211,8 +211,8 @@ def score(self, detectorNames, thresholds): (detectorName, profileName)) resultsDF.to_csv(scorePath, index=False) - print "%s detector benchmark scores written to %s" %\ - (detectorName, scorePath) + print("%s detector benchmark scores written to %s" %\ + (detectorName, scorePath)) self.resultsFiles.append(scorePath) @@ -231,7 +231,7 @@ def normalize(self): Note the results CSVs still contain the original scores, not normalized. """ - print "\nRunning score normalization step" + print("\nRunning score normalization step") # Get baseline scores for each application profile. 
nullDir = os.path.join(self.resultsDir, "null") @@ -240,7 +240,7 @@ def normalize(self): "run the null detector before normalizing scores.") baselines = {} - for profileName, _ in self.profiles.iteritems(): + for profileName, _ in self.profiles.items(): fileName = os.path.join(nullDir, "null_" + profileName + "_scores.csv") with open(fileName) as f: @@ -251,13 +251,13 @@ def normalize(self): with open(self.labelPath, "rb") as f: labelsDict = json.load(f) tpCount = 0 - for labels in labelsDict.values(): + for labels in list(labelsDict.values()): tpCount += len(labels) # Normalize the score from each results file. finalResults = {} for resultsFile in self.resultsFiles: - profileName = [k for k in baselines.keys() if k in resultsFile][0] + profileName = [k for k in list(baselines.keys()) if k in resultsFile][0] base = baselines[profileName] with open(resultsFile) as f: @@ -275,10 +275,10 @@ def normalize(self): finalResults[detector] = {} finalResults[detector][profile] = score - print ("Final score for \'%s\' detector on \'%s\' profile = %.2f" - % (detector, profile, score)) + print(("Final score for \'%s\' detector on \'%s\' profile = %.2f" + % (detector, profile, score))) resultsPath = os.path.join(self.resultsDir, "final_results.json") updateFinalResults(finalResults, resultsPath) - print "Final scores have been written to %s." % resultsPath + print("Final scores have been written to %s." % resultsPath) diff --git a/nab/scorer.py b/nab/scorer.py index 96a92c861..2312f0016 100644 --- a/nab/scorer.py +++ b/nab/scorer.py @@ -66,7 +66,7 @@ def scoreCorpus(threshold, args): scoreFlag) = args args = [] - for relativePath, dataSet in resultsCorpus.dataFiles.iteritems(): + for relativePath, dataSet in resultsCorpus.dataFiles.items(): if "_scores.csv" in relativePath: continue @@ -102,12 +102,13 @@ def scoreCorpus(threshold, args): # Using `map_async` instead of `map` so interrupts are properly handled. 
# See: http://stackoverflow.com/a/1408476 - results = pool.map_async(scoreDataSet, args).get(99999999) + # Magic number is a timeout in seconds. + results = pool.map_async(scoreDataSet, args).get(999999) # Total the 6 scoring metrics for all data files totals = [None]*3 + [0]*6 for row in results: - for i in xrange(6): + for i in range(6): totals[i+3] += row[i+4] results.append(["Totals"] + totals) diff --git a/nab/test_helpers.py b/nab/test_helpers.py index c39741b96..4abcd9210 100644 --- a/nab/test_helpers.py +++ b/nab/test_helpers.py @@ -55,7 +55,7 @@ def writeCorpus(corpusDir, corpusData): """ makeDirsExist(corpusDir) - for relativePath, data in corpusData.iteritems(): + for relativePath, data in corpusData.items(): dataFilePath = os.path.join(corpusDir, relativePath) createPath(dataFilePath) data.to_csv(dataFilePath, index=False) @@ -69,7 +69,7 @@ def generateTimestamps(start, increment, length): @param length (int) Number of datetime objects """ timestamps = pandas.Series([start]) - for i in xrange(length - 1): + for i in range(length - 1): timestamps.loc[i + 1] = timestamps.loc[i] + increment return timestamps @@ -88,7 +88,7 @@ def generateWindows(timestamps, numWindows, windowSize): delta = timestamps[1] - timestamps[0] diff = int(round((len(timestamps) - numWindows * windowSize) / float(numWindows + 1))) windows = [] - for i in xrange(numWindows): + for i in range(numWindows): t1 = start + delta * diff * (i + 1) + (delta * windowSize * i) t2 = t1 + delta * (windowSize - 1) if not any(timestamps == t1) or not any(timestamps == t2): diff --git a/nab/util.py b/nab/util.py index b02bf044d..a285fc6e6 100644 --- a/nab/util.py +++ b/nab/util.py @@ -77,7 +77,7 @@ def updateFinalResults(newResults, resultsFilePath): """ results = getOldDict(resultsFilePath) - for detector, score in newResults.iteritems(): + for detector, score in newResults.items(): results[detector] = score writeJSON(resultsFilePath, results) @@ -101,13 +101,13 @@ def 
updateThresholds(newThresholds, thresholdsFilePath): """ oldThresholds = getOldDict(thresholdsFilePath) - for detector, profileDictionary in newThresholds.iteritems(): + for detector, profileDictionary in newThresholds.items(): if detector not in oldThresholds: # add an entry for a new detector oldThresholds[detector] = newThresholds[detector] continue - for profileName, data in profileDictionary.iteritems(): + for profileName, data in profileDictionary.items(): if profileName not in oldThresholds[detector]: # add an entry for a new scoring profile under this detector oldThresholds[detector][profileName] = data @@ -122,7 +122,7 @@ def updateThresholds(newThresholds, thresholdsFilePath): def checkInputs(args): """Function that displays a set of arguments and asks to proceed.""" pprint.pprint(vars(args)) - inp = raw_input("Proceed? (y/n): ") + inp = input("Proceed? (y/n): ") if inp == 'y': return True @@ -130,7 +130,7 @@ def checkInputs(args): if inp == 'n': return False - print "Incorrect input given\n" + print("Incorrect input given\n") return checkInputs(args) @@ -232,7 +232,7 @@ def osPathSplit(path, debug=False): while True: newpath, tail = os.path.split(path) if debug: - print repr(path), (newpath, tail) + print(repr(path), (newpath, tail)) if newpath == path: assert not tail if path: @@ -274,7 +274,7 @@ def flattenDict(dictionary, files={}, head=""): @param head (string) Prefix to each key """ - for key in dictionary.keys(): + for key in list(dictionary.keys()): concat = head + "/" + key if head != "" else key if type(dictionary[key]) is dict: flattenDict(dictionary[key], files, concat) @@ -312,7 +312,7 @@ def recur(function, value, n): @param n (int) Number of times to recurse. 
""" if n < 0 or int(n) != n: - print "incorrect input" + print("incorrect input") sys.exit() elif n == 0: diff --git a/requirements.txt b/requirements.txt index 87ed093f8..45c9c1011 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,7 @@ -nupic==1.0.5 -pandas==0.20.3 +Cython==0.29.14 +pandas==0.23.3 simplejson==3.11.1 -boto3==1.7.11 +boto3==1.9.134 +scikit-learn==0.21.1; python_version >= '3.5' +scikit-learn==0.20.1; python_version < '3.5' +plotly==2.0.0 diff --git a/run.py b/run.py index f4d34e1b6..1b235e6c1 100755 --- a/run.py +++ b/run.py @@ -132,11 +132,11 @@ def main(args): parser.add_argument("-d", "--detectors", nargs="*", type=str, - default=["null", "numenta", "random", "bayesChangePt", - "windowedGaussian", "expose", "relativeEntropy", - "earthgeckoSkyline"], + default=["null", "random", + "bayesChangePt", "windowedGaussian", "expose", + "relativeEntropy", "earthgeckoSkyline"], help="Comma separated list of detector(s) to use, e.g. " - "null,numenta") + "null, expose") parser.add_argument("-p", "--profilesFile", default=os.path.join("config", "profiles.json"), @@ -174,12 +174,6 @@ def main(args): if "bayesChangePt" in args.detectors: from nab.detectors.bayes_changept.bayes_changept_detector import ( BayesChangePtDetector) - if "numenta" in args.detectors: - from nab.detectors.numenta.numenta_detector import NumentaDetector - if "htmjava" in args.detectors: - from nab.detectors.htmjava.htmjava_detector import HtmjavaDetector - if "numentaTM" in args.detectors: - from nab.detectors.numenta.numentaTM_detector import NumentaTMDetector if "null" in args.detectors: from nab.detectors.null.null_detector import NullDetector if "random" in args.detectors: @@ -196,17 +190,11 @@ def main(args): if "relativeEntropy" in args.detectors: from nab.detectors.relative_entropy.relative_entropy_detector import ( RelativeEntropyDetector) - - # To run expose detector, you must have sklearn version 0.16.1 installed. 
- # Higher versions of sklearn may not be compatible with numpy version 1.9.2 - # required to run nupic. if "expose" in args.detectors: from nab.detectors.expose.expose_detector import ExposeDetector - if "contextOSE" in args.detectors: from nab.detectors.context_ose.context_ose_detector import ( ContextOSEDetector ) - if "earthgeckoSkyline" in args.detectors: from nab.detectors.earthgecko_skyline.earthgecko_skyline_detector import EarthgeckoSkylineDetector diff --git a/scripts/add_labels_to_data.py b/scripts/add_labels_to_data.py index 75cafd26a..56174c9fa 100755 --- a/scripts/add_labels_to_data.py +++ b/scripts/add_labels_to_data.py @@ -49,7 +49,7 @@ def main(args): corpusLabel.getEverything() columnData = {} - for relativePath in corpusLabel.labels.keys(): + for relativePath in list(corpusLabel.labels.keys()): columnData[relativePath] = pandas.Series( corpusLabel.labels[relativePath]["label"]) @@ -57,7 +57,7 @@ def main(args): corpus.copy(newRoot=args.destDir) - print "Done adding labels!" + print("Done adding labels!") if __name__ == "__main__": diff --git a/scripts/combine_labels.py b/scripts/combine_labels.py index 6b94ce449..d97f1589f 100755 --- a/scripts/combine_labels.py +++ b/scripts/combine_labels.py @@ -49,26 +49,26 @@ def main(args): probationaryPercent = 0.15 - print "Getting corpus." + print("Getting corpus.") corpus = Corpus(dataDir) - print "Creating LabelCombiner." + print("Creating LabelCombiner.") labelCombiner = LabelCombiner(labelDir, corpus, args.threshold, windowSize, probationaryPercent, args.verbosity) - print "Combining labels." + print("Combining labels.") labelCombiner.combine() - print "Writing combined labels files." + print("Writing combined labels files.") labelCombiner.write(args.combinedLabelsPath, args.combinedWindowsPath) - print "Attempting to load objects as a test." 
+ print("Attempting to load objects as a test.") corpusLabel = CorpusLabel(args.combinedWindowsPath, corpus) corpusLabel.validateLabels() - print "Successfully combined labels!" - print "Resulting windows stored in:", args.combinedWindowsPath + print("Successfully combined labels!") + print("Resulting windows stored in:", args.combinedWindowsPath) if __name__ == "__main__": diff --git a/scripts/create_empty_label_file.py b/scripts/create_empty_label_file.py index 4932b5101..5745da921 100755 --- a/scripts/create_empty_label_file.py +++ b/scripts/create_empty_label_file.py @@ -46,13 +46,13 @@ def main(args): corpus = Corpus(args.dataDir) - empty_labels = {p : [] for p in corpus.dataFiles.keys() if "Known" not in p} + empty_labels = {p : [] for p in list(corpus.dataFiles.keys()) if "Known" not in p} with open(args.labelFile, "w") as outFile: outFile.write(json.dumps(empty_labels, sort_keys=True, indent=4, separators=(',', ': '))) - print "Empty label file written to",args.labelFile + print("Empty label file written to",args.labelFile) if __name__ == "__main__": diff --git a/scripts/plot.py b/scripts/plot.py index 68efc3b5a..440859fc3 100644 --- a/scripts/plot.py +++ b/scripts/plot.py @@ -42,7 +42,7 @@ assert len(dataFiles) == len(dataNames) - for i in xrange(len(dataFiles)): + for i in range(len(dataFiles)): dataPlotter = PlotNAB( dataFile=dataFiles[i], dataName=dataNames[i], @@ -69,9 +69,9 @@ # "Ambient Temperature System Failure Data" # ) # detectors=["numenta", "null"] - # + # assert len(dataFiles) == len(dataNames) - # + # # Create the list of result filenames for each detector # allResultsFiles = [] # for f in dataFiles: @@ -80,12 +80,13 @@ # filename = d + "/"+f.replace("/","/"+d+"_") # resultFiles.append(filename) # allResultsFiles.append(resultFiles) - # + # # Now plot everything # for i in range(len(dataFiles)): # dataPlotter = PlotNAB( # dataFile=dataFiles[i], - # dataName=dataNames[i]) + # dataName=dataNames[i], + # offline=True) # 
dataPlotter.plotMultipleDetectors( # allResultsFiles[i], # detectors=detectors, diff --git a/scripts/sort_data.py b/scripts/sort_data.py index 7b35e41fa..e19136eac 100644 --- a/scripts/sort_data.py +++ b/scripts/sort_data.py @@ -52,7 +52,7 @@ def main(args): output_filename = os.path.join(args.destDir, datafile) sortData(input_filename, output_filename) - print "Sorted files written to ", args.destDir + print("Sorted files written to ", args.destDir) if __name__ == "__main__": diff --git a/setup.py b/setup.py index eac3d56e9..e521797e1 100644 --- a/setup.py +++ b/setup.py @@ -19,14 +19,11 @@ # ---------------------------------------------------------------------- import os -import pkg_resources -import warnings from setuptools import setup, find_packages REPO_DIR = os.path.dirname(os.path.realpath(__file__)) - # Utility function to read the README file. # Used for the long_description. It"s nice, because now 1) we have a top level # README file and 2) it"s easier to type in the README file than to put a raw @@ -37,22 +34,6 @@ def read(fname): return result - -def nupicInstalled(): - """ - Determine whether NuPIC is already installed. - :return: boolean - """ - try: - _ = pkg_resources.get_distribution("nupic") - return True - except pkg_resources.DistributionNotFound: - pass # Silently ignore. NuPIC will be installed later. - - return False - - - def parseFile(requirementFile): """ Parse requirement file. @@ -68,29 +49,13 @@ def parseFile(requirementFile): return [] - def findRequirements(): """ Read the requirements.txt file and parse into requirements for setup's install_requirements option. """ requirementsPath = os.path.join(REPO_DIR, "requirements.txt") - requirements = parseFile(requirementsPath) - - if nupicInstalled(): - # The user already has a version of NuPIC installed. We'll remove the entry - # in requirements.txt to not conflate the two and will issue a user warning. 
- reqs = [] - for req in requirements: - if "nupic" != req.split("==")[0]: - reqs.append(req) - else: - warnings.warn("NuPIC is already installed so %s from requirements.txt " - "will be not be installed." % req) - else: - reqs = requirements - return reqs - + return parseFile(requirementsPath) if __name__ == "__main__": @@ -98,7 +63,7 @@ def findRequirements(): setup( name="nab", - version="1.0", + version="1.1", author="Alexander Lavin", author_email="nab@numenta.org", description=( diff --git a/tests/integration/corpus_test.py b/tests/integration/corpus_test.py index dc25c5c31..6324ff2ae 100644 --- a/tests/integration/corpus_test.py +++ b/tests/integration/corpus_test.py @@ -53,7 +53,7 @@ def testGetDataFiles(self): is a dictionary containing DataFile objects containing pandas.DataFrame objects to represent the underlying data. """ - for df in self.corpus.dataFiles.values(): + for df in list(self.corpus.dataFiles.values()): self.assertIsInstance(df, nab.corpus.DataFile) self.assertIsInstance(df.data, pandas.DataFrame) self.assertEqual(set(df.data.columns.values), @@ -66,13 +66,13 @@ def testAddColumn(self): "test" is added. """ columnData = {} - for relativePath, df in self.corpus.dataFiles.iteritems(): + for relativePath, df in self.corpus.dataFiles.items(): rows, _ = df.data.shape columnData[relativePath] = pandas.Series(np.zeros(rows)) self.corpus.addColumn("test", columnData, write=False) - for df in self.corpus.dataFiles.values(): + for df in list(self.corpus.dataFiles.values()): self.assertEqual(set(df.data.columns.values), set(["timestamp", "value", "test"])) @@ -83,7 +83,7 @@ def testRemoveColumn(self): named "test" is removed. 
""" columnData = {} - for relativePath, df in self.corpus.dataFiles.iteritems(): + for relativePath, df in self.corpus.dataFiles.items(): rows, _ = df.data.shape columnData[relativePath] = pandas.Series(np.zeros(rows)) @@ -91,7 +91,7 @@ def testRemoveColumn(self): self.corpus.removeColumn("test", write=False) - for df in self.corpus.dataFiles.values(): + for df in list(self.corpus.dataFiles.values()): self.assertEqual(set(df.data.columns.values), set(["timestamp", "value"])) @@ -107,8 +107,8 @@ def testCopy(self): copyCorpus = nab.corpus.Corpus(copyLocation) - for relativePath in self.corpus.dataFiles.keys(): - self.assertIn(relativePath, copyCorpus.dataFiles.keys()) + for relativePath in list(self.corpus.dataFiles.keys()): + self.assertIn(relativePath, list(copyCorpus.dataFiles.keys())) self.assertTrue( all(self.corpus.dataFiles[relativePath].data == \ @@ -126,7 +126,7 @@ def testAddDataSet(self): copyLocation = os.path.join(tempfile.mkdtemp(), "test") copyCorpus = self.corpus.copy(copyLocation) - for relativePath, df in self.corpus.dataFiles.iteritems(): + for relativePath, df in self.corpus.dataFiles.items(): newPath = relativePath + "_copy" copyCorpus.addDataSet(newPath, copy.deepcopy(df)) @@ -144,7 +144,7 @@ def testGetDataSubset(self): subset1 = self.corpus.getDataSubset(query1) self.assertEqual(len(subset1), 2) - for relativePath in subset1.keys(): + for relativePath in list(subset1.keys()): self.assertIn(query1, relativePath) query2 = "artificialWithAnomaly" @@ -152,7 +152,7 @@ def testGetDataSubset(self): self.assertEqual(len(subset2), 1) - for relativePath in subset2.keys(): + for relativePath in list(subset2.keys()): self.assertIn(query2, relativePath) diff --git a/tests/integration/corpuslabel_test.py b/tests/integration/corpuslabel_test.py index 45c4f631b..612ebce14 100644 --- a/tests/integration/corpuslabel_test.py +++ b/tests/integration/corpuslabel_test.py @@ -105,7 +105,7 @@ def testRowsLabeledAnomalousWithinAWindow(self): corpusLabel = 
nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus) - for relativePath, lab in corpusLabel.labels.iteritems(): + for relativePath, lab in corpusLabel.labels.items(): windows = corpusLabel.windows[relativePath] for row in lab[lab["label"] == 1].iterrows(): @@ -151,7 +151,7 @@ def testGetLabels(self): corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus) - for relativePath, l in corpusLabel.labels.iteritems(): + for relativePath, l in corpusLabel.labels.items(): windows = corpusLabel.windows[relativePath] for t, lab in corpusLabel.labels["test_data_file.csv"].values: diff --git a/tests/integration/scorer_test.py b/tests/integration/scorer_test.py index 4d073f609..917601711 100644 --- a/tests/integration/scorer_test.py +++ b/tests/integration/scorer_test.py @@ -99,9 +99,9 @@ def testFalsePositiveScaling(self): # Make arbitrary detections, score, repeat scores = [] - for _ in xrange(20): + for _ in range(20): anomalyScores = pandas.Series([0]*length) - indices = random.sample(range(length), 10) + indices = random.sample(list(range(length)), 10) anomalyScores[indices] = 1 (scores, matchingRow) = sweeper.scoreDataSet( timestamps, @@ -245,7 +245,7 @@ def testScoringAllMetrics(self): threshold ) - self.assertAlmostEquals(matchingRow.score, -0.9540, 4) + self.assertAlmostEqual(matchingRow.score, -0.9540, 4) self._checkCounts(matchingRow, length-windowSize*numWindows-1, 2, 1, 8) diff --git a/tests/integration/true_positive_test.py b/tests/integration/true_positive_test.py index 1a38bff46..4dc48a840 100644 --- a/tests/integration/true_positive_test.py +++ b/tests/integration/true_positive_test.py @@ -269,7 +269,7 @@ def testTruePositiveAtRightEdgeOfWindow(self): # TP score + FP score + 1 should be very close to 0; the 1 is added to # account for the subsequent FN contribution. 
- self.assertAlmostEquals(matchingRow1.score + matchingRow2.score + 1, 0.0, 3) + self.assertAlmostEqual(matchingRow1.score + matchingRow2.score + 1, 0.0, 3) self._checkCounts(matchingRow1, length-windowSize*numWindows, 1, 0, windowSize*numWindows-1) self._checkCounts(matchingRow2, length-windowSize*numWindows-1, 0, 1,