diff --git a/.coveragerc b/.coveragerc
index 249fcf15ddb4b..3fa052108477e 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -23,3 +23,4 @@ omit =
 scripts/*
 dev/*
 airflow/migrations/*
+ airflow/www_rbac/node_modules/**
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000000..368fdb4331a12
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 110
+ignore = E731,W504
+exclude = .svn,CVS,.bzr,.hg,.git,__pycache__,.tox,.eggs,*.egg,*/_vendor/*
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 6000d0e5ff450..539ca28fca423 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -30,4 +30,4 @@ Make sure you have checked _all_ steps below.

 ### Code Quality

-- [ ] Passes `git diff upstream/master -u -- "*.py" | flake8 --diff`
+- [ ] Passes `flake8`
diff --git a/.gitignore b/.gitignore
index 0e53aaded07b6..0ccc81842735e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -83,6 +83,7 @@ instance/

 # Sphinx documentation
 docs/_build/
+docs/_api/

 # PyBuilder
 target/
@@ -138,3 +139,22 @@ rat-results.txt
 *.generated
 *.tar.gz
 scripts/ci/kubernetes/kube/.generated/airflow.yaml
+
+# Node & Webpack Stuff
+*.entry.js
+node_modules
+npm-debug.log*
+static/dist
+derby.log
+metastore_db
+
+# Airflow log files when airflow is run locally
+airflow-*.err
+airflow-*.out
+airflow-*.log
+airflow-*.pid
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
diff --git a/.rat-excludes b/.rat-excludes
index e5373aacd5fbc..fb9361f7f39fc 100644
--- a/.rat-excludes
+++ b/.rat-excludes
@@ -1,16 +1,21 @@
+# Note: these patterns are applied to single files or directories, not full paths
+# coverage/* will ignore any coverage dir, but airflow/www/static/coverage/* will match nothing
+
 .gitignore
 .gitattributes
+.airflowignore
 .coverage
 .coveragerc
 .codecov.yml
+.eslintrc
+.eslintignore
+.flake8
 .rat-excludes
 requirements.txt
 .*log
 .travis.yml
 .*pyc
 .*lock
-docs
-.*md
 dist
 build
 airflow.egg-info
@@ -18,16 +23,23 @@ apache_airflow.egg-info
 .idea
 metastore_db
 .*sql
+.*svg
 .*csv
 CHANGELOG.txt
 .*zip
 .*lock
+# Generated doc files
+.*html
+_build/*
+_static/*
+.buildinfo
+searchindex.js
+
 # Apache Rat does not detect BSD-2 clause properly
 # it is compatible according to http://www.apache.org/legal/resolved.html#category-a
 kerberos_auth.py
 airflow_api_auth_backend_kerberos_auth_py.html
 licenses/*
-airflow/www/static/docs
 parallel.js
 underscore.js
 jquery.dataTables.min.js
@@ -37,3 +49,16 @@ bootstrap-toggle.min.js
 bootstrap-toggle.min.css
 d3.v3.min.js
 ace.js
+node_modules/*
+.*json
+coverage/*
+git_version
+flake8_diff.sh
+coverage.xml
+
+rat-results.txt
+apache-airflow-.*\+source.tar.gz.*
+apache-airflow-.*\+bin.tar.gz.*
+
+# vendored modules
+_vendor/*
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 643548c03e07a..c6a4da8d690c8 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -6,9 +6,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,4 +22,4 @@ python: - doc - docker - gcp_api - - emr + - emr diff --git a/.travis.yml b/.travis.yml index 01c08d9537f64..594bbfd8288af 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,96 +16,73 @@ # specific language governing permissions and limitations # under the License. # -sudo: true -dist: trusty +dist: xenial language: python -jdk: - - oraclejdk8 -services: - - cassandra - - mysql - - postgresql - - rabbitmq -addons: - apt: - packages: - - slapd - - ldap-utils - - openssh-server - - mysql-server-5.6 - - mysql-client-core-5.6 - - mysql-client-5.6 - - krb5-user - - krb5-kdc - - krb5-admin-server - - oracle-java8-installer - - python-selinux - postgresql: "9.2" python: - - "2.7" - - "3.5" + - "3.6" env: global: - TRAVIS_CACHE=$HOME/.travis_cache/ - - KRB5_CONFIG=/etc/krb5.conf - - KRB5_KTNAME=/etc/airflow.keytab - # Travis on google cloud engine has a global /etc/boto.cfg that - # does not work with python 3 - - BOTO_CONFIG=/tmp/bogusvalue matrix: - - TOX_ENV=py27-backend_mysql - - TOX_ENV=py27-backend_sqlite - - TOX_ENV=py27-backend_postgres - - TOX_ENV=py35-backend_mysql - - TOX_ENV=py35-backend_sqlite - - TOX_ENV=py35-backend_postgres - - TOX_ENV=flake8 - - TOX_ENV=py27-backend_postgres KUBERNETES_VERSION=v1.9.0 - - TOX_ENV=py35-backend_postgres KUBERNETES_VERSION=v1.10.0 -matrix: - exclude: - - python: "3.5" - env: TOX_ENV=py27-backend_mysql - - python: "3.5" - env: TOX_ENV=py27-backend_sqlite - - python: "3.5" - env: TOX_ENV=py27-backend_postgres - - python: "2.7" - env: TOX_ENV=py35-backend_mysql - - python: "2.7" - env: TOX_ENV=py35-backend_sqlite - - python: "2.7" - env: TOX_ENV=py35-backend_postgres - - python: "2.7" - env: TOX_ENV=flake8 - - python: "3.5" - env: TOX_ENV=py27-backend_postgres KUBERNETES_VERSION=v1.9.0 - - python: "2.7" - env: TOX_ENV=py35-backend_postgres KUBERNETES_VERSION=v1.10.0 + - TOX_ENV=py27-backend_mysql-env_docker + - TOX_ENV=py27-backend_sqlite-env_docker + - TOX_ENV=py27-backend_postgres-env_docker + - TOX_ENV=py35-backend_mysql-env_docker PYTHON_VERSION=3 + - TOX_ENV=py35-backend_sqlite-env_docker PYTHON_VERSION=3 + - TOX_ENV=py35-backend_postgres-env_docker PYTHON_VERSION=3 + - TOX_ENV=py27-backend_postgres-env_kubernetes KUBERNETES_VERSION=v1.9.0 + - TOX_ENV=py35-backend_postgres-env_kubernetes KUBERNETES_VERSION=v1.13.0 PYTHON_VERSION=3 + +stages: + - pre-test + - test + +jobs: + include: + - name: Flake8 + stage: pre-test + install: pip install flake8 + script: flake8 + - name: mypy + stage: pre-test + install: pip install mypy + script: mypy airflow tests + - name: Check license header + stage: pre-test + install: skip + script: scripts/ci/6-check-license.sh + - name: Check docs + stage: pre-test + install: pip install -e .[doc] + script: docs/build.sh + cache: directories: - $HOME/.wheelhouse/ + - $HOME/.cache/pip - $HOME/.travis_cache/ before_install: - - yes | ssh-keygen -t rsa -C your_email@youremail.com -P '' -f ~/.ssh/id_rsa - - cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys - - ln -s ~/.ssh/authorized_keys ~/.ssh/authorized_keys2 - - chmod 600 ~/.ssh/* - - jdk_switcher use oraclejdk8 -install: - - pip install --upgrade pip - - pip install tox - - pip install codecov -before_script: - - cat "$TRAVIS_BUILD_DIR/scripts/ci/my.cnf" | sudo tee -a 
/etc/mysql/my.cnf - - mysql -e 'drop database if exists airflow; create database airflow' -u root - - sudo service mysql restart - - psql -c 'create database airflow;' -U postgres - - export PATH=${PATH}:/tmp/hive/bin # Required for K8s v1.10.x. See # https://github.com/kubernetes/kubernetes/issues/61058#issuecomment-372764783 - - sudo mount --make-shared / && sudo service docker restart + - if [ ! -z "$KUBERNETES_VERSION" ]; then sudo mount --make-shared / && sudo service docker restart; fi +install: + - pip install --upgrade pip + - docker-compose -f scripts/ci/docker-compose.yml pull --quiet --parallel script: - - ./scripts/ci/travis_script.sh -after_success: - - codecov + - if [ -z "$KUBERNETES_VERSION" ]; then + docker-compose --log-level ERROR -f scripts/ci/docker-compose.yml run airflow-testing /app/scripts/ci/run-ci.sh; + fi + - if [ ! -z "$KUBERNETES_VERSION" ]; then + ./scripts/ci/kubernetes/minikube/stop_minikube.sh && + ./scripts/ci/kubernetes/setup_kubernetes.sh && + ./scripts/ci/kubernetes/kube/deploy.sh -d persistent_mode && + MINIKUBE_IP=$(minikube ip) docker-compose --log-level ERROR -f scripts/ci/docker-compose.yml -f scripts/ci/docker-compose-kubernetes.yml run airflow-testing /app/scripts/ci/run-ci.sh; + fi + - if [ ! -z "$KUBERNETES_VERSION" ]; then + ./scripts/ci/kubernetes/minikube/stop_minikube.sh && + ./scripts/ci/kubernetes/setup_kubernetes.sh && + ./scripts/ci/kubernetes/kube/deploy.sh -d git_mode && + MINIKUBE_IP=$(minikube ip) docker-compose --log-level ERROR -f scripts/ci/docker-compose.yml -f scripts/ci/docker-compose-kubernetes.yml run airflow-testing /app/scripts/ci/run-ci.sh; + fi +before_cache: + - sudo chown -R travis:travis $HOME/.cache/pip $HOME/.wheelhouse/ diff --git a/CHANGELOG.txt b/CHANGELOG.txt index fa4e6547a7e1d..02221bfe0de65 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,920 +1,2438 @@ -AIRFLOW 1.9.0, 2018-01-02 +Airflow 1.10.3, 2019-04-09 +-------------------------- + +New Feature +""""""""""" + +- [AIRFLOW-4232] Add ``none_skipped`` trigger rule (#5032) +- [AIRFLOW-3971] Add Google Cloud Natural Language operators (#4980) +- [AIRFLOW-4069] Add Opsgenie Alert Hook and Operator (#4903) +- [AIRFLOW-3552] Fix encoding issue in ImapAttachmentToS3Operator (#5040) +- [AIRFLOW-3552] Add ImapToS3TransferOperator (#4476) +- [AIRFLOW-1526] Add dingding hook and operator (#4895) +- [AIRFLOW-3490] Add BigQueryHook's Ability to Patch Table/View (#4299) +- [AIRFLOW-3918] Add ssh private-key support to git-sync for KubernetesExecutor (#4777) +- [AIRFLOW-3659] Create Google Cloud Transfer Service Operators (#4792) +- [AIRFLOW-3939] Add Google Cloud Translate operator (#4755) +- [AIRFLOW-3541] Add Avro logical type conversion to bigquery hook (#4553) +- [AIRFLOW-4106] instrument staving tasks in pool (#4927) +- [AIRFLOW-2568] Azure Container Instances operator (#4121) +- [AIRFLOW-4107] instrument executor (#4928) +- [AIRFLOW-4033] record stats of task duration (#4858) +- [AIRFLOW-3892] Create Redis pub sub sensor (#4712) +- [AIRFLOW-4124] add get_table and get_table_location in aws_glue_hook and tests (#4942) +- [AIRFLOW-1262] Adds missing docs for email configuration (#4557) +- [AIRFLOW-3701] Add Google Cloud Vision Product Search operators (#4665) +- [AIRFLOW-3766] Add support for kubernetes annotations (#4589) +- [AIRFLOW-3741] Add extra config to Oracle hook (#4584) +- [AIRFLOW-1262] Allow configuration of email alert subject and body (#2338) +- [AIRFLOW-2985] Operators for S3 object copying/deleting (#3823) +- [AIRFLOW-2993] s3_to_sftp and 
sftp_to_s3 operators (#3828)
+- [AIRFLOW-3799] Add compose method to GoogleCloudStorageHook (#4641)
+- [AIRFLOW-3218] add support for poking a whole DAG (#4058)
+- [AIRFLOW-3315] Add ImapAttachmentSensor (#4161)
+- [AIRFLOW-2780] Add IMAP Hook to retrieve email attachments (#4119)
+- [AIRFLOW-3556] Add cross join set dependency function (#4356)
+
+Improvement
+"""""""""""
+
+- [AIRFLOW-4120] Modify SchedulerJob.manage_slas to respect zero timedelta SLAs (#4939)
+- [AIRFLOW-3823] Exclude branch's downstream tasks from the tasks to skip (#4666)
+- [AIRFLOW-3274] Add run_as_user and fs_group options for Kubernetes (#4648)
+- [AIRFLOW-4247] Template Region on the DataprocOperators (#5046)
+- [AIRFLOW-4008] add envFrom for Kubernetes Executor (#4952)
+- [AIRFLOW-3947] Flash msg for no DAG-level access error (#4767)
+- [AIRFLOW-3287] Moving database clean-up code into the CoreTest.tearDown() (#4122)
+- [AIRFLOW-4058] Name models test file to get automatically picked up (#4901)
+- [AIRFLOW-3830] Remove DagBag from /dag_details (#4831)
+- [AIRFLOW-3596] Clean up undefined template variables. (#4401)
+- [AIRFLOW-3573] Remove DagStat table (#4378)
+- [AIRFLOW-3623] Fix bugs in Download task logs (#5005)
+- [AIRFLOW-4173] Improve SchedulerJob.process_file() (#4993)
+- [AIRFLOW-3540] Warn if old airflow.cfg file is found (#5006)
+- [AIRFLOW-4000] Return response when no file (#4822)
+- [AIRFLOW-3383] Rotate fernet keys. (#4225)
+- [AIRFLOW-3003] Pull the krb5 image instead of building (#3844)
+- [AIRFLOW-3862] Check types with mypy. (#4685)
+- [AIRFLOW-251] Add option SQL_ALCHEMY_SCHEMA parameter to specify schema for metadata (#4199)
+- [AIRFLOW-1814] Template PythonOperator {op_args,op_kwargs} fields (#4691)
+- [AIRFLOW-3730] Standardize use of logging mechanisms (#4556)
+- [AIRFLOW-3770] Validation of documentation on CI (#4593)
+- [AIRFLOW-3866] Run docker-compose pull silently in CI (#4688)
+- [AIRFLOW-3685] Move licence header check (#4497)
+- [AIRFLOW-3670] Add stages to Travis build (#4477)
+- [AIRFLOW-3937] KubernetesPodOperator support for envFrom configMapRef and secretRef (#4772)
+- [AIRFLOW-3408] Remove outdated info from Systemd Instructions (#4269)
+- [AIRFLOW-3202] add missing documentation for AWS hooks/operator (#4048)
+- [AIRFLOW-3908] Add more Google Cloud Vision operators (#4791)
+- [AIRFLOW-2915] Add example DAG for GoogleCloudStorageToBigQueryOperator (#3763)
+- [AIRFLOW-3062] Add Qubole in integration docs (#3946)
+- [AIRFLOW-3288] Add SNS integration (#4123)
+- [AIRFLOW-3148] Remove unnecessary arg "parameters" in RedshiftToS3Transfer (#3995)
+- [AIRFLOW-3049] Add extra operations for Mongo hook (#3890)
+- [AIRFLOW-3559] Add missing options to DatadogHook. (#4362)
+- [AIRFLOW-1191] Simplify override of spark submit command. (#4360)
+- [AIRFLOW-3155] Add ability to filter by a last modified time in GCS Operator (#4008)
+- [AIRFLOW-2864] Fix docstrings for SubDagOperator (#3712)
+- [AIRFLOW-4062] Improve docs on install extra package commands (#4966)
+- [AIRFLOW-3743] Unify different methods of working out AIRFLOW_HOME (#4705)
+- [AIRFLOW-4002] Option to open debugger on errors in `airflow test`.
(#4828) +- [AIRFLOW-3997] Extend Variable.get so it can return None when var not found (#4819) +- [AIRFLOW-4009] Fix docstring issue in GCSToBQOperator (#4836) +- [AIRFLOW-3980] Unify logger (#4804) +- [AIRFLOW-4076] Correct port type of beeline_default in init_db (#4908) +- [AIRFLOW-4046] Add validations for poke_interval & timeout for Sensor (#4878) +- [AIRFLOW-3744] Abandon the use of obsolete aliases of methods (#4568) +- [AIRFLOW-3865] Add API endpoint to get python code of dag by id (#4687) +- [AIRFLOW-3516] Support to create k8 worker pods in batches (#4434) +- [AIRFLOW-2843] Add flag in ExternalTaskSensor to check if external DAG/task exists (#4547) +- [AIRFLOW-2224] Add support CSV files in MySqlToGoogleCloudStorageOperator (#4738) +- [AIRFLOW-3895] GoogleCloudStorageHook/Op create_bucket takes optional resource params (#4717) +- [AIRFLOW-3950] Improve AirflowSecurityManager.update_admin_perm_view (#4774) +- [AIRFLOW-4006] Make better use of Set in AirflowSecurityManager (#4833) +- [AIRFLOW-3917] Specify alternate kube config file/context when running out of cluster (#4859) +- [AIRFLOW-3911] Change Harvesting DAG parsing results to DEBUG log level (#4729) +- [AIRFLOW-3584] Use ORM DAGs for index view. (#4390) +- [AIRFLOW-2821] Refine Doc "Plugins" (#3664) +- [AIRFLOW-3561] Improve queries (#4368) +- [AIRFLOW-3600] Remove dagbag from trigger (#4407) +- [AIRFLOW-3713] Updated documentation for GCP optional project_id (#4541) +- [AIRFLOW-2767] - Upgrade gunicorn to 19.5.0 to avoid moderate-severity CVE (#4795) +- [AIRFLOW-3795] provide_context param is now used (#4735) +- [AIRFLOW-4012] - Upgrade tabulate to 0.8.3 (#4838) +- [AIRFLOW-3623] Support download logs by attempts from UI (#4425) +- [AIRFLOW-2715] Use region setting when launching Dataflow templates (#4139) +- [AIRFLOW-3932] Update unit tests and documentation for safe mode flag. (#4760) +- [AIRFLOW-3932] Optionally skip dag discovery heuristic. (#4746) +- [AIRFLOW-3258] K8S executor environment variables section. (#4627) +- [AIRFLOW-3931] set network, subnetwork when launching dataflow template (#4744) +- [AIRFLOW-4095] Add template_fields for S3CopyObjectOperator & S3DeleteObjectsOperator (#4920) +- [AIRFLOW-2798] Remove needless code from models.py +- [AIRFLOW-3731] Constrain mysqlclient to <1.4 (#4558) +- [AIRFLOW-3139] include parameters into log.info in SQL operators, if any (#3986) +- [AIRFLOW-3174] Refine Docstring for SQL Operators & Hooks (#4043) +- [AIRFLOW-3933] Fix various typos (#4747) +- [AIRFLOW-3905] Allow using "parameters" in SqlSensor (#4723) +- [AIRFLOW-2761] Parallelize enqueue in celery executor (#4234) +- [AIRFLOW-3540] Respect environment config when looking up config file. 
(#4340) +- [AIRFLOW-2156] Parallelize Celery Executor task state fetching (#3830) +- [AIRFLOW-3702] Add backfill option to run backwards (#4676) +- [AIRFLOW-3821] Add replicas logic to GCP SQL example DAG (#4662) +- [AIRFLOW-3547] Fixed Jinja templating in SparkSubmitOperator (#4347) +- [AIRFLOW-3647] Add archives config option to SparkSubmitOperator (#4467) +- [AIRFLOW-3802] Updated documentation for HiveServer2Hook (#4647) +- [AIRFLOW-3817] - Corrected task ids returned by BranchPythonOperator to match the dummy operator ids (#4659) +- [AIRFLOW-3782] Clarify docs around celery worker_autoscale in default_airflow.cfg (#4609) +- [AIRFLOW-1945] Add Autoscale config for Celery workers (#3989) +- [AIRFLOW-3590]: Change log message of executor exit status (#4616) +- [AIRFLOW-3591] Fix start date, end date, duration for rescheduled tasks (#4502) +- [AIRFLOW-3709] Validate `allowed_states` for ExternalTaskSensor (#4536) +- [AIRFLOW-3522] Add support for sending Slack attachments (#4332) +- [AIRFLOW-3569] Add "Trigger DAG" button in DAG page (/www only) (#4373) +- [AIRFLOW-3569] Add "Trigger DAG" button in DAG page (/www_rbac only) (#4373) +- [AIRFLOW-3044] Dataflow operators accept templated job_name param (#3887) +- [AIRFLOW-3023] Fix docstring datatypes +- [AIRFLOW-2928] Use uuid4 instead of uuid1 (#3779) +- [AIRFLOW-2988] Run specifically python2 for dataflow (#3826) +- [AIRFLOW-3697] Vendorize nvd3 and slugify (#4513) +- [AIRFLOW-3692] Remove ENV variables to avoid GPL (#4506) +- [AIRFLOW-3907] Upgrade flask and set cookie security flags. (#4725) +- [AIRFLOW-3698] Add documentation for AWS Connection (#4514) +- [AIRFLOW-3616][AIRFLOW-1215] Add aliases for schema with underscore (#4523) +- [AIRFLOW-3375] Support returning multiple tasks with BranchPythonOperator (#4215) +- [AIRFLOW-3742] Fix handling of "fallback" for AirflowConfigParsxer.getint/boolean (#4674) +- [AIRFLOW-3742] Respect the `fallback` arg in airflow.configuration.get (#4567) +- [AIRFLOW-3789] Fix flake8 3.7 errors. (#4617) +- [AIRFLOW-3602] Improve ImapHook handling of retrieving no attachments (#4475) +- [AIRFLOW-3631] Update flake8 and fix lint. 
(#4436) + +Bug fixes +""""""""" + +- [AIRFLOW-2994] Fix command status check in Qubole Check operator (#3790) +- [AIRFLOW-2563] Fix PigCliHook Python 3 string/bytes use +- [AIRFLOW-4248] Fix 'FileExistsError' makedirs race in file_processor_handler (#5047) +- [AIRFLOW-4240] State-changing actions should be POST requests (#5039) +- [AIRFLOW-4246] Flask-Oauthlib needs downstream dependencies pinning due to breaking changes (#5045) +- [AIRFLOW-3887] Downgrade dagre-d3 to 0.4.18 (#4713) +- [AIRFLOW-3419] Fix S3Hook.select_key on Python3 (#4970) +- [AIRFLOW-4127] Correct AzureContainerInstanceHook._get_instance_view's return (#4945) +- [AIRFLOW-4172] Fix changes for driver class path option in Spark Submit (#4992) +- [AIRFLOW-3615] Preserve case of UNIX socket paths in Connections (#4591) +- [AIRFLOW-3417] ECSOperator: pass platformVersion only for FARGATE launch type (#4256) +- [AIRFLOW-3884] Fixing doc checker, no warnings allowed anymore and fixed the current… (#4702) +- [AIRFLOW-2652] implement / enhance baseOperator deepcopy +- [AIRFLOW-4001] Update docs about how to run tests (#4826) +- [AIRFLOW-3699] Speed up Flake8 (#4515) +- [AIRFLOW-4160] Fix redirecting of 'Trigger Dag' Button in DAG Page (#4982) +- [AIRFLOW-3650] Skip running on mysql for the flaky test (#4457) +- [AIRFLOW-3423] Fix mongo hook to work with anonymous access (#4258) +- [AIRFLOW-3982] Fix race condition in CI test (#4968) +- [AIRFLOW-3982] Update DagRun state based on its own tasks (#4808) +- [AIRFLOW-3737] Kubernetes executor cannot handle long dag/task names (#4636) +- [AIRFLOW-3945] Stop inserting row when permission views unchanged (#4764) +- [AIRFLOW-4123] Add Exception handling for _change_state method in K8 Executor (#4941) +- [AIRFLOW-3771] Minor refactor securityManager (#4594) +- [AIRFLOW-987] pass kerberos cli args keytab and principal to kerberos.run() (#4238) +- [AIRFLOW-3736] Allow int value in SqoopOperator.extra_import_options(#4906) +- [AIRFLOW-4063] Fix exception string in BigQueryHook [2/2] (#4902) +- [AIRFLOW-4063] Fix exception string in BigQueryHook (#4899) +- [AIRFLOW-4037] Log response in SimpleHttpOperator even if the response check fails +- [AIRFLOW-4044] The documentation of `query_params` in `BigQueryOperator` is wrong. 
(#4876) +- [AIRFLOW-4015] Make missing API endpoints available in classic mode +- [AIRFLOW-3153] Send DAG processing stats to statsd (#4748) +- [AIRFLOW-2966] Catch ApiException in the Kubernetes Executor (#4209) +- [AIRFLOW-4129] Escape HTML in generated tooltips (#4950) +- [AIRFLOW-4070] AirflowException -> log.warning for duplicate task dependencies (#4904) +- [AIRFLOW-4054] Fix assertEqualIgnoreMultipleSpaces util & add tests (#4886) +- [AIRFLOW-3239] Fix test recovery further (#4074) +- [AIRFLOW-4053] Fix KubePodOperator Xcom on Kube 1.13.0 (#4883) +- [AIRFLOW-2961] Refactor tests.BackfillJobTest.test_backfill_examples test (#3811) +- [AIRFLOW-3606] Fix Flake8 test & fix the Flake8 errors introduced since Flake8 test was broken (#4415) +- [AIRFLOW-3543] Fix deletion of DAG with rescheduled tasks (#4646) +- [AIRFLOW-2548] Output plugin import errors to web UI (#3930) +- [AIRFLOW-4019] Fix AWS Athena Sensor object has no attribute 'mode' (#4844) +- [AIRFLOW-3758] Fix circular import in WasbTaskHandler (#4601) +- [AIRFLOW-3706] Fix tooltip max-width by correcting ordering of CSS files (#4947) +- [AIRFLOW-4100] Correctly JSON escape data for tree/graph views (#4921) +- [AIRFLOW-3636] Fix a test introduced in #4425 (#4446) +- [AIRFLOW-3977] Add examples of trigger rules in doc (#4805) +- [AIRFLOW-2511] Fix improper failed session commit handling causing deadlocks (#4769) +- [AIRFLOW-3962] Added graceful handling for creation of dag_run of a dag which doesn't have any task (#4781) +- [AIRFLOW-3881] Correct to_csv row number (#4699) +- [AIRFLOW-3875] Simplify SlackWebhookHook code and change docstring (#4696) +- [AIRFLOW-3733] Don't raise NameError in HQL hook to_csv when no rows returned (#4560) +- [AIRFLOW-3734] Fix hql not run when partition is None (#4561) +- [AIRFLOW-3767] Correct bulk insert function (#4773) +- [AIRFLOW-4087] remove sudo in basetaskrunner on_finish (#4916) +- [AIRFLOW-3768] Escape search parameter in pagination controls (#4911) +- [AIRFLOW-4045] Fix hard-coded URLs in FAB-based UI (#4914) +- [AIRFLOW-3123] Use a stack for DAG context management (#3956) +- [AIRFLOW-3060] DAG context manager fails to exit properly in certain circumstances +- [AIRFLOW-3924] Fix try number in alert emails (#4741) +- [AIRFLOW-4083] Add tests for link generation utils (#4912) +- [AIRFLOW-2190] Send correct HTTP status for base_url not found (#4910) +- [AIRFLOW-4015] Add get_dag_runs GET endpoint to "classic" API (#4884) +- [AIRFLOW-3239] Enable existing CI tests (#4131) +- [AIRFLOW-1390] Update Alembic to 0.9 (#3935) +- [AIRFLOW-3885] Fix race condition in scheduler test (#4737) +- [AIRFLOW-3885] ~10x speed-up of SchedulerJobTest suite (#4730) +- [AIRFLOW-3780] Fix some incorrect when base_url is used (#4643) +- [AIRFLOW-3807] Fix Graph View Highlighting of Tasks (#4653) +- [AIRFLOW-3009] Import Hashable from collection.abc to fix Python 3.7 deprecation warning (#3849) +- [AIRFLOW-2231] Fix relativedelta DAG schedule_interval (#3174) +- [AIRFLOW-2641] Fix MySqlToHiveTransfer to handle MySQL DECIMAL correctly +- [AIRFLOW-3751] Option to allow malformed schemas for LDAP authentication (#4574) +- [AIRFLOW-2888] Add deprecation path for task_runner config change (#4851) +- [AIRFLOW-2930] Fix celery excecutor scheduler crash (#3784) +- [AIRFLOW-2888] Remove shell=True and bash from task launch (#3740) +- [AIRFLOW-3885] ~2.5x speed-up for backfill tests (#4731) +- [AIRFLOW-3885] ~20x speed-up of slowest unit test (#4726) +- [AIRFLOW-2508] Handle non string types in Operators templatized fields 
(#4292) +- [AIRFLOW-3792] Fix validation in BQ for useLegacySQL & queryParameters (#4626) +- [AIRFLOW-3749] Fix Edit Dag Run page when using RBAC (#4613) +- [AIRFLOW-3801] Fix DagBag collect dags invocation to prevent examples to be loaded (#4677) +- [AIRFLOW-3774] Register blueprints with RBAC web app (#4598) +- [AIRFLOW-3719] Handle StopIteration in CloudWatch logs retrieval (#4516) +- [AIRFLOW-3108] Define get_autocommit method for MsSqlHook (#4525) +- [AIRFLOW-3074] Add relevant ECS options to ECS operator. (#3908) +- [AIRFLOW-3353] Upgrade Redis client (#4834) +- [AIRFLOW-3250] Fix for Redis Hook for not authorised connection calls (#4090) +- [AIRFLOW-2009] Fix dataflow hook connection-id (#4563) +- [AIRFLOW-2190] Fix TypeError when returning 404 (#4596) +- [AIRFLOW-2876] Update Tenacity to 4.12 (#3723) +- [AIRFLOW-3923] Update flask-admin dependency to 1.5.3 to resolve security vulnerabilities from safety (#4739) +- [AIRFLOW-3683] Fix formatting of error message for invalid TriggerRule (#4490) +- [AIRFLOW-2787] Allow is_backfill to handle NULL DagRun.run_id (#3629) +- [AIRFLOW-3780] Fix some incorrect when base_url is used (#4643) +- [AIRFLOW-3639] Fix request creation in Jenkins Operator (#4450) +- [AIRFLOW-3779] Don't install enum34 backport when not needed (#4620) +- [AIRFLOW-3079] Improve migration scripts to support MSSQL Server (#3964) +- [AIRFLOW-2735] Use equality, not identity, check for detecting AWS Batch failures[] +- [AIRFLOW-2706] AWS Batch Operator should use top-level job state to determine status +- [AIRFLOW-XXX] Fix typo in http_operator.py +- [AIRFLOW-XXX] Solve lodash security warning (#4820) +- [AIRFLOW-XXX] Pin version of tornado pulled in by Celery. (#4815) +- [AIRFLOW-XXX] Upgrade FAB to 1.12.3 (#4694) +- [AIRFLOW-XXX] Pin pinodb dependency (#4704) +- [AIRFLOW-XXX] Pin version of Pip in tests to work around pypa/pip#6163 (#4576) +- [AIRFLOW-XXX] Fix spark submit hook KeyError (#4578) +- [AIRFLOW-XXX] Pin psycopg2 due to breaking change (#5036) +- [AIRFLOW-XXX] Pin Sendgrid dep. 
(#5031)
+- [AIRFLOW-XXX] Fix flaky test - test_execution_unlimited_parallelism (#4988)
+
+Misc/Internal
+"""""""""""""
+
+- [AIRFLOW-4016] Clear runs for BackfillJobTest (#4839)
+- [AIRFLOW-4177] Check types in tests
+- [AIRFLOW-4144] add description of is_delete_operator_pod (#4943)
+- [AIRFLOW-3476,3477] Move Kube classes out of models.py (#4443)
+- [AIRFLOW-3464] Move SkipMixin out of models.py (#4386)
+- [AIRFLOW-3463] Move Log out of models.py (#4639)
+- [AIRFLOW-3458] Move connection tests (#4680)
+- [AIRFLOW-3461] Move TaskFail out of models.py (#4630)
+- [AIRFLOW-3462] Move TaskReschedule out of models.py (#4618)
+- [AIRFLOW-3474] Move SlaMiss out of models.py (#4608)
+- [AIRFLOW-3475] Move ImportError out of models.py (#4383)
+- [AIRFLOW-3459] Move DagPickle to separate file (#4374)
+- [AIRFLOW-3925] Don't pull docker-images on pretest (#4740)
+- [AIRFLOW-4154] Correct string formatting in jobs.py (#4972)
+- [AIRFLOW-3458] Deprecation path for moving models.Connection
+- [AIRFLOW-3458] Move models.Connection into separate file (#4335)
+- [AIRFLOW-XXX] Remove old/non-test files that nose ignores (#4930)
+
+Doc-only changes
+""""""""""""""""
+
+- [AIRFLOW-3996] Add view source link to included fragments
+- [AIRFLOW-3811] automatic generation of API Reference in docs (#4788)
+- [AIRFLOW-3810] Remove duplicate autoclass directive (#4656)
+- [AIRFLOW-XXX] Mention that statsd must be installed to gather metrics (#5038)
+- [AIRFLOW-XXX] Add contents to cli (#4825)
+- [AIRFLOW-XXX] fix check docs failure on CI (#4998)
+- [AIRFLOW-XXX] Fix syntax docs errors (#4789)
+- [AIRFLOW-XXX] Docs rendering improvement (#4684)
+- [AIRFLOW-XXX] Automatically link Jira/GH on doc's changelog page (#4587)
+- [AIRFLOW-XXX] Mention Oracle in the Extra Packages documentation (#4987)
+- [AIRFLOW-XXX] Drop deprecated sudo option; use default docker compose on Travis.
(#4732) +- [AIRFLOW-XXX] Update kubernetes.rst docs (#3875) +- [AIRFLOW-XXX] Improvements to formatted content in documentation (#4835) +- [AIRFLOW-XXX] Add Daniel to committer list (#4961) +- [AIRFLOW-XXX] Add Xiaodong Deng to committers list +- [AIRFLOW-XXX] Add history become ASF top level project (#4757) +- [AIRFLOW-XXX] Move out the examples from integration.rst (#4672) +- [AIRFLOW-XXX] Extract reverse proxy info to a separate file (#4657) +- [AIRFLOW-XXX] Reduction of the number of warnings in the documentation (#4585) +- [AIRFLOW-XXX] Fix GCS Operator docstrings (#4054) +- [AIRFLOW-XXX] Fix Docstrings in Hooks, Sensors & Operators (#4137) +- [AIRFLOW-XXX] Split guide for operators to multiple files (#4814) +- [AIRFLOW-XXX] Split connection guide to multiple files (#4824) +- [AIRFLOW-XXX] Remove almost all warnings from building docs (#4588) +- [AIRFLOW-XXX] Add backreference in docs between operator and integration (#4671) +- [AIRFLOW-XXX] Improve linking to classes (#4655) +- [AIRFLOW-XXX] Mock optional modules when building docs (#4586) +- [AIRFLOW-XXX] Update plugin macros documentation (#4971) +- [AIRFLOW-XXX] Add missing docstring for 'autodetect' in GCS to BQ Operator (#4979) +- [AIRFLOW-XXX] Add missing GCP operators to Docs (#4260) +- [AIRFLOW-XXX] Fixing the issue in Documentation (#3756) +- [AIRFLOW-XXX] Add Hint at user defined macros (#4885) +- [AIRFLOW-XXX] Correct schedule_interval in Scheduler docs (#4157) +- [AIRFLOW-XXX] Improve airflow-jira script to make RelManager's life easier (#4857) +- [AIRFLOW-XXX] Add missing class references to docs (#4644) +- [AIRFLOW-XXX] Fix typo (#4564) +- [AIRFLOW-XXX] Add a doc about fab security (#4595) +- [AIRFLOW-XXX] Speed up DagBagTest cases (#3974) +- [AIRFLOW-XXX] Reduction of the number of warnings in the documentation (#4585) + + +Airflow 1.10.2, 2019-01-19 +-------------------------- + +New features +"""""""""""" + +- [AIRFLOW-2658] Add GCP specific k8s pod operator (#3532) +- [AIRFLOW-2440] Google Cloud SQL import/export operator (#4251) +- [AIRFLOW-3212] Add AwsGlueCatalogPartitionSensor (#4112) +- [AIRFLOW-2750] Add subcommands to delete and list users +- [AIRFLOW-3480] Add GCP Spanner Database Operators (#4353) +- [AIRFLOW-3560] Add DayOfWeek Sensor (#4363) +- [AIRFLOW-3371] BigQueryHook's Ability to Create View (#4213) +- [AIRFLOW-3332] Add method to allow inserting rows into BQ table (#4179) +- [AIRFLOW-3055] add get_dataset and get_datasets_list to bigquery_hook (#3894) +- [AIRFLOW-2887] Added BigQueryCreateEmptyDatasetOperator and create_emty_dataset to bigquery_hook (#3876) +- [AIRFLOW-2758] Add a sensor for MongoDB +- [AIRFLOW-2640] Add Cassandra table sensor +- [AIRFLOW-3398] Google Cloud Spanner instance database query operator (#4314) +- [AIRFLOW-3310] Google Cloud Spanner deploy / delete operators (#4286) +- [AIRFLOW-3406] Implement an Azure CosmosDB operator (#4265) +- [AIRFLOW-3434] Allows creating intermediate dirs in SFTPOperator (#4270) +- [AIRFLOW-3345] Add Google Cloud Storage (GCS) operators for ACL (#4192) +- [AIRFLOW-3266] Add AWS Athena Hook and Operator (#4111) +- [AIRFLOW-3346] Add hook and operator for GCP transfer service (#4189) +- [AIRFLOW-2983] Add prev_ds_nodash and next_ds_nodash macro (#3821) +- [AIRFLOW-3403] Add AWS Athena Sensor (#4244) +- [AIRFLOW-3323] Support HTTP basic authentication for Airflow Flower (#4166) +- [AIRFLOW-3410] Add feature to allow Host Key Change for SSH Op (#4249) +- [AIRFLOW-3275] Add Google Cloud SQL Query operator (#4170) +- [AIRFLOW-2691] Manage JS dependencies 
via npm +- [AIRFLOW-2795] Oracle to Oracle Transfer Operator (#3639) +- [AIRFLOW-2596] Add Oracle to Azure Datalake Transfer Operator +- [AIRFLOW-3220] Add Instance Group Manager Operators for GCE (#4167) +- [AIRFLOW-2882] Add import and export for pool cli using JSON +- [AIRFLOW-2965] CLI tool to show the next execution datetime (#3834) +- [AIRFLOW-2874] Enables FAB's theme support (#3719) +- [AIRFLOW-3336] Add new TriggerRule for 0 upstream failures (#4182) + +Improvements +"""""""""""" + +- [AIRFLOW-3680] Consistency update in tests for All GCP-related operators (#4493) +- [AIRFLOW-3675] Use googlapiclient for google apis (#4484) +- [AIRFLOW-3205] Support multipart uploads to GCS (#4084) +- [AIRFLOW-2826] Add GoogleCloudKMSHook (#3677) +- [AIRFLOW-3676] Add required permission to CloudSQL export/import example (#4489) +- [AIRFLOW-3679] Added Google Cloud Base Hook to documentation (#4487) +- [AIRFLOW-3594] Unify different License Header +- [AIRFLOW-3197] Remove invalid parameter KeepJobFlowAliveWhenNoSteps in example DAG (#4404) +- [AIRFLOW-3504] Refine the functionality of "/health" endpoint (#4309) +- [AIRFLOW-3103][AIRFLOW-3147] Update flask-appbuilder (#3937) +- [AIRFLOW-3168] More resillient database use in CI (#4014) +- [AIRFLOW-3076] Remove preloading of MySQL testdata (#3911) +- [AIRFLOW-3035] Allow custom 'job_error_states' in dataproc ops (#3884) +- [AIRFLOW-3246] Make hmsclient optional in airflow.hooks.hive_hooks (#4080) +- [AIRFLOW-3059] Log how many rows are read from Postgres (#3905) +- [AIRFLOW-2463] Make task instance context available for hive queries +- [AIRFLOW-3190] Make flake8 compliant (#4035) +- [AIRFLOW-1998] Implemented DatabricksRunNowOperator for jobs/run-now … (#3813) +- [AIRFLOW-2267] Airflow DAG level access (#3197) +- [AIRFLOW-2359] Add set failed for DagRun and task in tree view (#3255) +- [AIRFLOW-3008] Move Kubernetes example DAGs to contrib +- [AIRFLOW-3402] Support global k8s affinity and toleration configs (#4247) +- [AIRFLOW-3610] Add region param for EMR jobflow creation (#4418) +- [AIRFLOW-3531] Fix test for GCS to GCS Transfer Hook (#4452) +- [AIRFLOW-3531] Add gcs to gcs transfer operator. 
(#4331) +- [AIRFLOW-3034]: Readme updates : Add Slack & Twitter, remove Gitter +- [AIRFLOW-3028] Update Text & Images in Readme.md +- [AIRFLOW-208] Add badge to show supported Python versions (#3839) +- [AIRFLOW-2238] Update PR tool to push directly to Github +- [AIRFLOW-2238] Flake8 fixes on dev/airflow-pr +- [AIRFLOW-2238] Update PR tool to remove outdated info (#3978) +- [AIRFLOW-3005] Replace 'Airbnb Airflow' with 'Apache Airflow' (#3845) +- [AIRFLOW-3150] Make execution_date templated in TriggerDagRunOperator (#4359) +- [AIRFLOW-1196][AIRFLOW-2399] Add templated field in TriggerDagRunOperator (#4228) +- [AIRFLOW-3340] Placeholder support in connections form (#4185) +- [AIRFLOW-3446] Add Google Cloud BigTable operators (#4354) +- [AIRFLOW-1921] Add support for https and user auth (#2879) +- [AIRFLOW-2770] Read `dags_in_image` config value as a boolean (#4319) +- [AIRFLOW-3022] Add volume mount to KubernetesExecutorConfig (#3855) +- [AIRFLOW-2917] Set AIRFLOW__CORE__SQL_ALCHEMY_CONN only when needed (#3766) +- [AIRFLOW-2712] Pass annotations to KubernetesExecutorConfig +- [AIRFLOW-461] Support autodetected schemas in BigQuery run_load (#3880) +- [AIRFLOW-2997] Support cluster fields in bigquery (#3838) +- [AIRFLOW-2916] Arg `verify` for AwsHook() & S3 sensors/operators (#3764) +- [AIRFLOW-491] Add feature to pass extra api configs to BQ Hook (#3733) +- [AIRFLOW-2889] Fix typos detected by github.com/client9/misspell (#3732) +- [AIRFLOW-850] Add a PythonSensor (#4349) +- [AIRFLOW-2747] Explicit re-schedule of sensors (#3596) +- [AIRFLOW-3392] Add index on dag_id in sla_miss table (#4235) +- [AIRFLOW-3001] Add index 'ti_dag_date' to taskinstance (#3885) +- [AIRFLOW-2861] Add index on log table (#3709) +- [AIRFLOW-3518] Performance fixes for topological_sort of Tasks (#4322) +- [AIRFLOW-3521] Fetch more than 50 items in `airflow-jira compare` script (#4300) +- [AIRFLOW-1919] Add option to query for DAG runs given a DAG ID +- [AIRFLOW-3444] Explicitly set transfer operator description. (#4279) +- [AIRFLOW-3411] Add OpenFaaS hook (#4267) +- [AIRFLOW-2785] Add context manager entry points to mongoHook +- [AIRFLOW-2524] Add SageMaker doc to AWS integration section (#4278) +- [AIRFLOW-3479] Keeps records in Log Table when DAG is deleted (#4287) +- [AIRFLOW-2948] Arg check & better doc - SSHOperator & SFTPOperator (#3793) +- [AIRFLOW-2245] Add remote_host of SSH/SFTP operator as templated field (#3765) +- [AIRFLOW-2670] Update SSH Operator's Hook to respect timeout (#3666) +- [AIRFLOW-3380] Add metrics documentation (#4219) +- [AIRFLOW-3361] Log the task_id in the PendingDeprecationWarning from BaseOperator (#4030) +- [AIRFLOW-3213] Create ADLS to GCS operator (#4134) +- [AIRFLOW-3395] added the REST API endpoints to the doc (#4236) +- [AIRFLOW-3294] Update connections form and integration docs (#4129) +- [AIRFLOW-3236] Create AzureDataLakeStorageListOperator (#4094) +- [AIRFLOW-3062] Add Qubole in integration docs (#3946) +- [AIRFLOW-3306] Disable flask-sqlalchemy modification tracking. 
(#4146) +- [AIRFLOW-2867] Refactor Code to conform standards (#3714) +- [AIRFLOW-2753] Add dataproc_job_id instance var holding actual DP jobId +- [AIRFLOW-3132] Enable specifying auto_remove option for DockerOperator (#3977) +- [AIRFLOW-2731] Raise psutil restriction to <6.0.0 +- [AIRFLOW-3384] Allow higher versions of Sqlalchemy and Jinja2 (#4227) +- [Airflow-2760] Decouple DAG parsing loop from scheduler loop (#3873) +- [AIRFLOW-3004] Add config disabling scheduler cron (#3899) +- [AIRFLOW-3175] Fix docstring format in airflow/jobs.py (#4025) +- [AIRFLOW-3589] Visualize reschedule state in all views (#4408) +- [AIRFLOW-2698] Simplify Kerberos code (#3563) +- [AIRFLOW-2499] Dockerise CI pipeline (#3393) +- [AIRFLOW-3432] Add test for feature "Delete DAG in UI" (#4266) +- [AIRFLOW-3301] Update DockerOperator CI test for PR #3977 (#4138) +- [AIRFLOW-3478] Make sure that the session is closed +- [AIRFLOW-3687] Add missing @apply_defaults decorators (#4498) +- [AIRFLOW-3691] Update notice to 2019 (#4503) +- [AIRFLOW-3689] Update pop-up message when deleting DAG in RBAC UI (#4505) +- [AIRFLOW-2801] Skip test_mark_success_no_kill in PostgreSQL on CI (#3642) +- [AIRFLOW-3693] Replace psycopg2-binary by psycopg2 (#4508) +- [AIRFLOW-3700] Change the lowest allowed version of "requests" (#4517) +- [AIRFLOW-3704] Support SSL Protection When Redis is Used as Broker for CeleryExecutor (#4521) +- [AIRFLOW-3681] All GCP operators have now optional GCP Project ID (#4500) +- [Airflow 2782] Upgrades Dagre D3 version to latest possible +- [Airflow 2783] Implement eslint for JS code check (#3641) +- [AIRFLOW-2805] Display multiple timezones on UI (#3687) +- [AIRFLOW-3302] Small CSS fixes (#4140) +- [Airflow-2766] Respect shared datetime across tabs +- [AIRFLOW-2776] Compress tree view JSON +- [AIRFLOW-2407] Use feature detection for reload() (#3298) +- [AIRFLOW-3452] Removed an unused/dangerous display-none (#4295) +- [AIRFLOW-3348] Update run statistics on dag refresh (#4197) +- [AIRFLOW-3125] Monitor Task Instances creation rates (#3966) + + +Bug fixes +""""""""" + +- [AIRFLOW-3191] Fix not being able to specify execution_date when creating dagrun (#4037) +- [AIRFLOW-3657] Fix zendesk integration (#4466) +- [AIRFLOW-3605] Load plugins from entry_points (#4412) +- [AIRFLOW-3646] Rename plugins_manager.py to test_xx to trigger tests (#4464) +- [AIRFLOW-3655] Escape links generated in model views (#4463) +- [AIRFLOW-3662] Add dependency for Enum (#4468) +- [AIRFLOW-3630] Cleanup of GCP Cloud SQL Connection (#4451) +- [AIRFLOW-1837] Respect task start_date when different from dag's (#4010) +- [AIRFLOW-2829] Brush up the CI script for minikube +- [AIRFLOW-3519] Fix example http operator (#4455) +- [AIRFLOW-2811] Fix scheduler_ops_metrics.py to work (#3653) +- [AIRFLOW-2751] add job properties update in hive to druid operator. 
+- [AIRFLOW-2918] Remove unused imports +- [AIRFLOW-2918] Fix Flake8 violations (#3931) +- [AIRFLOW-2771] Add except type to broad S3Hook try catch clauses +- [AIRFLOW-2918] Fix Flake8 violations (#3772) +- [AIRFLOW-2099] Handle getsource() calls gracefully +- [AIRFLOW-3397] Fix integrety error in rbac AirflowSecurityManager (#4305) +- [AIRFLOW-3281] Fix Kubernetes operator with git-sync (#3770) +- [AIRFLOW-2615] Limit DAGs parsing to once only +- [AIRFLOW-2952] Fix Kubernetes CI (#3922) +- [AIRFLOW-2933] Enable Codecov on Docker-CI Build (#3780) +- [AIRFLOW-2082] Resolve a bug in adding password_auth to api as auth method (#4343) +- [AIRFLOW-3612] Remove incubation/incubator mention (#4419) +- [AIRFLOW-3581] Fix next_ds/prev_ds semantics for manual runs (#4385) +- [AIRFLOW-3527] Update Cloud SQL Proxy to have shorter path for UNIX socket (#4350) +- [AIRFLOW-3316] For gcs_to_bq: add missing init of schema_fields var (#4430) +- [AIRFLOW-3583] Fix AirflowException import (#4389) +- [AIRFLOW-3578] Fix Type Error for BigQueryOperator (#4384) +- [AIRFLOW-2755] Added `kubernetes.worker_dags_folder` configuration (#3612) +- [AIRFLOW-2655] Fix inconsistency of default config of kubernetes worker +- [AIRFLOW-2645][AIRFLOW-2617] Add worker_container_image_pull_policy +- [AIRFLOW-2661] fix config dags_volume_subpath and logs_volume_subpath +- [AIRFLOW-3550] Standardize GKE hook (#4364) +- [AIRFLOW-2863] Fix GKEClusterHook catching wrong exception (#3711) +- [AIRFLOW-2939][AIRFLOW-3568] Fix TypeError in GCSToS3Op & S3ToGCSOp (#4371) +- [AIRFLOW-3327] Add support for location in BigQueryHook (#4324) +- [AIRFLOW-3438] Fix default values in BigQuery Hook & BigQueryOperator (… +- [AIRFLOW-3355] Fix BigQueryCursor.execute to work with Python3 (#4198) +- [AIRFLOW-3447] Add 2 options for ts_nodash Macro (#4323) +- [AIRFLOW-1552] Airflow Filter_by_owner not working with password_auth (#4276) +- [AIRFLOW-3484] Fix Over-logging in the k8s executor (#4296) +- [AIRFLOW-3309] Add MongoDB connection (#4154) +- [AIRFLOW-3414] Fix reload_module in DagFileProcessorAgent (#4253) +- [AIRFLOW-1252] API accept JSON when invoking a trigger dag (#2334) +- [AIRFLOW-3425] Fix setting default scope in hook (#4261) +- [AIRFLOW-3416] Fixes Python 3 compatibility with CloudSqlQueryOperator (#4254) +- [AIRFLOW-3263] Ignore exception when 'run' kills already killed job (#4108) +- [AIRFLOW-3264] URL decoding when parsing URI for connection (#4109) +- [AIRFLOW-3365][AIRFLOW-3366] Allow celery_broker_transport_options to be set with environment variables (#4211) +- [AIRFLOW-2642] fix wrong value git-sync initcontainer env GIT_SYNC_ROOT (#3519) +- [AIRFLOW-3353] Pin redis verison (#4195) +- [AIRFLOW-3251] KubernetesPodOperator now uses 'image_pull_secrets' argument when creating Pods (#4188) +- [AIRFLOW-2705] Move class-level moto decorator to method-level +- [AIRFLOW-3233] Fix deletion of DAGs in the UI (#4069) +- [AIRFLOW-2908] Allow retries with KubernetesExecutor. 
(#3758) +- [AIRFLOW-1561] Fix scheduler to pick up example DAGs without other DAGs (#2635) +- [AIRFLOW-3352] Fix expose_config not honoured on RBAC UI (#4194) +- [AIRFLOW-3592] Fix logs when task is in rescheduled state (#4492) +- [AIRFLOW-3634] Fix GCP Spanner Test (#4440) +- [AIRFLOW-XXX] Fix PythonVirtualenvOperator tests (#3968) +- [AIRFLOW-3239] Fix/refine tests for api/common/experimental/ (#4255) +- [AIRFLOW-2951] Update dag_run table end_date when state change (#3798) +- [AIRFLOW-2756] Fix bug in set DAG run state workflow (#3606) +- [AIRFLOW-3690] Fix bug to set state of a task for manually-triggered DAGs (#4504) +- [AIRFLOW-3319] KubernetsExecutor: Need in try_number in labels if getting them later (#4163) +- [AIRFLOW-3724] Fix the broken refresh button on Graph View in RBAC UI +- [AIRFLOW-3732] Fix issue when trying to edit connection in RBAC UI +- [AIRFLOW-2866] Fix missing CSRF token head when using RBAC UI (#3804) +- [AIRFLOW-3259] Fix internal server error when displaying charts (#4114) +- [AIRFLOW-3271] Fix issue with persistence of RBAC Permissions modified via UI (#4118) +- [AIRFLOW-3141] Handle duration View for missing dag (#3984) +- [AIRFLOW-2766] Respect shared datetime across tabs +- [AIRFLOW-1413] Fix FTPSensor failing on error message with unexpected (#2450) +- [AIRFLOW-3378] KubernetesPodOperator does not delete on timeout failure (#4218) +- [AIRFLOW-3245] Fix list processing in resolve_template_files (#4086) +- [AIRFLOW-2703] Catch transient DB exceptions from scheduler's heartbeat it does not crash (#3650) +- [AIRFLOW-1298] Clear UPSTREAM_FAILED using the clean cli (#3886) + +Doc-only changes +"""""""""""""""" + +- [AIRFLOW-XXX] GCP operators documentation clarifications (#4273) +- [AIRFLOW-XXX] Docs: Fix paths to GCS transfer operator (#4479) +- [AIRFLOW-XXX] Add missing GCP operators to Docs (#4260) +- [AIRFLOW-XXX] Fix Docstrings for Operators (#3820) +- [AIRFLOW-XXX] Fix inconsistent comment in example_python_operator.py (#4337) +- [AIRFLOW-XXX] Fix incorrect parameter in SFTPOperator example (#4344) +- [AIRFLOW-XXX] Add missing remote logging field (#4333) +- [AIRFLOW-XXX] Revise template variables documentation (#4172) +- [AIRFLOW-XXX] Fix typo in docstring of gcs_to_bq (#3833) +- [AIRFLOW-XXX] Fix display of SageMaker operators/hook docs (#4263) +- [AIRFLOW-XXX] Better instructions for airflow flower (#4214) +- [AIRFLOW-XXX] Make pip install commands consistent (#3752) +- [AIRFLOW-XXX] Add `BigQueryGetDataOperator` to Integration Docs (#4063) +- [AIRFLOW-XXX] Don't spam test logs with "bad cron expression" messages (#3973) +- [AIRFLOW-XXX] Update committer list based on latest TLP discussion (#4427) +- [AIRFLOW-XXX] Fix incorrect statement in contributing guide (#4104) +- [AIRFLOW-XXX] Fix Broken Link in CONTRIBUTING.md +- [AIRFLOW-XXX] Update Contributing Guide - Git Hooks (#4120) +- [AIRFLOW-3426] Correct Python Version Documentation Reference (#4259) +- [AIRFLOW-2663] Add instructions to install SSH dependencies +- [AIRFLOW-XXX] Clean up installation extra packages table (#3750) +- [AIRFLOW-XXX] Remove redundant space in Kerberos (#3866) +- [AIRFLOW-3086] Add extras group for google auth to setup.py (#3917) +- [AIRFLOW-XXX] Add Kubernetes Dependency in Extra Packages Doc (#4281) +- [AIRFLOW-3696] Add Version info to Airflow Documentation (#4512) +- [AIRFLOW-XXX] Correct Typo in sensor's exception (#4545) +- [AIRFLOW-XXX] Fix a typo of config (#4544) +- [AIRFLOW-XXX] Fix BashOperator Docstring (#4052) +- [AIRFLOW-3018] Fix Minor issues in Documentation 
+- [AIRFLOW-XXX] Fix Minor issues with Azure Cosmos Operator (#4289) +- [AIRFLOW-3382] Fix incorrect docstring in DatastoreHook (#4222) +- [AIRFLOW-XXX] Fix copy&paste mistake (#4212) +- [AIRFLOW-3260] Correct misleading BigQuery error (#4098) +- [AIRFLOW-XXX] Fix Typo in SFTPOperator docstring (#4016) +- [AIRFLOW-XXX] Fixing the issue in Documentation (#3998) +- [AIRFLOW-XXX] Fix undocumented params in S3_hook +- [AIRFLOW-XXX] Fix SlackWebhookOperator execute method comment (#3963) +- [AIRFLOW-3070] Refine web UI authentication-related docs (#3863) + +Airflow 1.10.1, 2018-11-13 +-------------------------- + +New features +"""""""""""" + +- [AIRFLOW-2524] Airflow integration with AWS Sagemaker +- [AIRFLOW-2657] Add ability to delete DAG from web ui +- [AIRFLOW-2780] Adds IMAP Hook to interact with a mail server +- [AIRFLOW-2794] Add delete support for Azure blob +- [AIRFLOW-2912] Add operators for Google Cloud Functions +- [AIRFLOW-2974] Add Start/Restart/Terminate methods Databricks Hook +- [AIRFLOW-2989] No Parameter to change bootDiskType for DataprocClusterCreateOperator +- [AIRFLOW-3078] Basic operators for Google Compute Engine +- [AIRFLOW-3147] Update Flask-AppBuilder version +- [AIRFLOW-3231] Basic operators for Google Cloud SQL (deploy / patch / delete) +- [AIRFLOW-3276] Google Cloud SQL database create / patch / delete operators + +Improvements +"""""""""""" + +- [AIRFLOW-393] Add progress callbacks for FTP downloads +- [AIRFLOW-520] Show Airflow version on web page +- [AIRFLOW-843] Exceptions now available in context during on_failure_callback +- [AIRFLOW-2476] Update tabulate dependency to v0.8.2 +- [AIRFLOW-2592] Bump Bleach dependency +- [AIRFLOW-2622] Add "confirm=False" option to SFTPOperator +- [AIRFLOW-2662] support affinity & nodeSelector policies for kubernetes executor/operator +- [AIRFLOW-2709] Improve error handling in Databricks hook +- [AIRFLOW-2723] Update lxml dependancy to >= 4.0. 
+- [AIRFLOW-2763] No precheck mechanism in place during worker initialisation for the connection to metadata database +- [AIRFLOW-2789] Add ability to create single node cluster to DataprocClusterCreateOperator +- [AIRFLOW-2797] Add ability to create Google Dataproc cluster with custom image +- [AIRFLOW-2854] kubernetes_pod_operator add more configuration items +- [AIRFLOW-2855] Need to Check Validity of Cron Expression When Process DAG File/Zip File +- [AIRFLOW-2904] Clean an unnecessary line in airflow/executors/celery_executor.py +- [AIRFLOW-2921] A trivial incorrectness in CeleryExecutor() +- [AIRFLOW-2922] Potential deal-lock bug in CeleryExecutor() +- [AIRFLOW-2932] GoogleCloudStorageHook - allow compression of file +- [AIRFLOW-2949] Syntax Highlight for Single Quote +- [AIRFLOW-2951] dag_run end_date Null after a dag is finished +- [AIRFLOW-2956] Kubernetes tolerations for pod operator +- [AIRFLOW-2997] Support for clustered tables in Bigquery hooks/operators +- [AIRFLOW-3006] Fix error when schedule_interval="None" +- [AIRFLOW-3008] Move Kubernetes related example DAGs to contrib/example_dags +- [AIRFLOW-3025] Allow to specify dns and dns-search parameters for DockerOperator +- [AIRFLOW-3067] (www_rbac) Flask flash messages are not displayed properly (no background color) +- [AIRFLOW-3069] Decode output of S3 file transform operator +- [AIRFLOW-3072] Assign permission get_logs_with_metadata to viewer role +- [AIRFLOW-3090] INFO logs are too verbose +- [AIRFLOW-3103] Update Flask-Login +- [AIRFLOW-3112] Align SFTP hook with SSH hook +- [AIRFLOW-3119] Enable loglevel on celery worker and inherit from airflow.cfg +- [AIRFLOW-3137] Make ProxyFix middleware optional +- [AIRFLOW-3173] Add _cmd options for more password config options +- [AIRFLOW-3177] Change scheduler_heartbeat metric from gauge to counter +- [AIRFLOW-3193] Pin docker requirement version to v3 +- [AIRFLOW-3195] Druid Hook: Log ingestion spec and task id +- [AIRFLOW-3197] EMR Hook is missing some parameters to valid on the AWS API +- [AIRFLOW-3232] Make documentation for GCF Functions operator more readable +- [AIRFLOW-3262] Can't get log containing Response when using SimpleHttpOperator +- [AIRFLOW-3265] Add support for "unix_socket" in connection extra for Mysql Hook + +Doc-only changes +"""""""""""""""" + +- [AIRFLOW-1441] Tutorial Inconsistencies Between Example Pipeline Definition and Recap +- [AIRFLOW-2682] Add how-to guide(s) for how to use basic operators like BashOperator and PythonOperator +- [AIRFLOW-3104] .airflowignore feature is not mentioned at all in documentation +- [AIRFLOW-3237] Refactor example DAGs +- [AIRFLOW-3187] Update airflow.gif file with a slower version +- [AIRFLOW-3159] Update Airflow documentation on GCP Logging +- [AIRFLOW-3030] Command Line docs incorrect subdir +- [AIRFLOW-2990] Docstrings for Hooks/Operators are in incorrect format +- [AIRFLOW-3127] Celery SSL Documentation is out-dated +- [AIRFLOW-2779] Add license headers to doc files +- [AIRFLOW-2779] Add project version to license + +Bug fixes +""""""""" + +- [AIRFLOW-839] docker_operator.py attempts to log status key without first checking existence +- [AIRFLOW-1104] Concurrency check in scheduler should count queued tasks as well as running +- [AIRFLOW-1163] Add support for x-forwarded-* headers to support access behind AWS ELB +- [AIRFLOW-1195] Cleared tasks in SubDagOperator do not trigger Parent dag_runs +- [AIRFLOW-1508] Skipped state not part of State.task_states +- [AIRFLOW-1762] Use key_file in SSHHook.create_tunnel() +- 
[AIRFLOW-1837] Differing start_dates on tasks not respected by scheduler. +- [AIRFLOW-1874] Support standard SQL in Check, ValueCheck and IntervalCheck BigQuery operators +- [AIRFLOW-1917] print() from python operators end up with extra new line +- [AIRFLOW-1970] Database cannot be initialized if an invalid fernet key is provided +- [AIRFLOW-2145] Deadlock after clearing a running task +- [AIRFLOW-2216] Cannot specify a profile for AWS Hook to load with s3 config file +- [AIRFLOW-2574] initdb fails when mysql password contains percent sign +- [AIRFLOW-2707] Error accessing log files from web UI +- [AIRFLOW-2716] Replace new Python 3.7 keywords +- [AIRFLOW-2744] RBAC app doesn't integrate plugins (blueprints etc) +- [AIRFLOW-2772] BigQuery hook does not allow specifying both the partition field name and table name at the same time +- [AIRFLOW-2778] Bad Import in collect_dag in DagBag +- [AIRFLOW-2786] Variables view fails to render if a variable has an empty key +- [AIRFLOW-2799] Filtering UI objects by datetime is broken +- [AIRFLOW-2800] Remove airflow/ low-hanging linting errors +- [AIRFLOW-2825] S3ToHiveTransfer operator may not may able to handle GZIP file with uppercase ext in S3 +- [AIRFLOW-2848] dag_id is missing in metadata table "job" for LocalTaskJob +- [AIRFLOW-2860] DruidHook: time variable is not updated correctly when checking for timeout +- [AIRFLOW-2865] Race condition between on_success_callback and LocalTaskJob's cleanup +- [AIRFLOW-2893] Stuck dataflow job due to jobName mismatch. +- [AIRFLOW-2895] Prevent scheduler from spamming heartbeats/logs +- [AIRFLOW-2900] Code not visible for Packaged DAGs +- [AIRFLOW-2905] Switch to regional dataflow job service. +- [AIRFLOW-2907] Sendgrid - Attachments - ERROR - Object of type 'bytes' is not JSON serializable +- [AIRFLOW-2938] Invalid 'extra' field in connection can raise an AttributeError when attempting to edit +- [AIRFLOW-2979] Deprecated Celery Option not in Options list +- [AIRFLOW-2981] TypeError in dataflow operators when using GCS jar or py_file +- [AIRFLOW-2984] Cannot convert naive_datetime when task has a naive start_date/end_date +- [AIRFLOW-2994] flatten_results in BigQueryOperator/BigQueryHook should default to None +- [AIRFLOW-3002] ValueError in dataflow operators when using GCS jar or py_file +- [AIRFLOW-3012] Email on sla miss is send only to first address on the list +- [AIRFLOW-3046] ECS Operator mistakenly reports success when task is killed due to EC2 host termination +- [AIRFLOW-3064] No output from `airflow test` due to default logging config +- [AIRFLOW-3072] Only admin can view logs in RBAC UI +- [AIRFLOW-3079] Improve initdb to support MSSQL Server +- [AIRFLOW-3089] Google auth doesn't work under http +- [AIRFLOW-3099] Errors raised when some blocs are missing in airflow.cfg +- [AIRFLOW-3109] Default user permission should contain 'can_clear' +- [AIRFLOW-3111] Confusing comments and instructions for log templates in UPDATING.md and default_airflow.cfg +- [AIRFLOW-3124] Broken webserver debug mode (RBAC) +- [AIRFLOW-3136] Scheduler Failing the Task retries run while processing Executor Events +- [AIRFLOW-3138] Migration cc1e65623dc7 creates issues with postgres +- [AIRFLOW-3161] Log Url link does not link to task instance logs in RBAC UI +- [AIRFLOW-3162] HttpHook fails to parse URL when port is specified +- [AIRFLOW-3183] Potential Bug in utils/dag_processing/DagFileProcessorManager.max_runs_reached() +- [AIRFLOW-3203] Bugs in DockerOperator & Some operator test scripts were named incorrectly +- 
[AIRFLOW-3238] Dags, removed from the filesystem, are not deactivated on initdb +- [AIRFLOW-3268] Cannot pass SSL dictionary to mysql connection via URL +- [AIRFLOW-3277] Invalid timezone transition handling for cron schedules +- [AIRFLOW-3295] Require encryption in DaskExecutor when certificates are configured. +- [AIRFLOW-3297] EmrStepSensor marks cancelled step as successful + +Airflow 1.10.0, 2018-08-03 +-------------------------- +- [AIRFLOW-2870] Use abstract TaskInstance for migration +- [AIRFLOW-2859] Implement own UtcDateTime (#3708) +- [AIRFLOW-2140] Don't require kubernetes for the SparkSubmit hook +- [AIRFLOW-2869] Remove smart quote from default config +- [AIRFLOW-2857] Fix Read the Docs env +- [AIRFLOW-2817] Force explicit choice on GPL dependency +- [AIRFLOW-2716] Replace async and await py3.7 keywords +- [AIRFLOW-2810] Fix typo in Xcom model timestamp +- [AIRFLOW-2710] Clarify fernet key value in documentation +- [AIRFLOW-2606] Fix DB schema and SQLAlchemy model +- [AIRFLOW-2646] Fix setup.py not to install snakebite on Python3 +- [AIRFLOW-2604] Add index to task_fail +- [AIRFLOW-2650] Mark SchedulerJob as succeed when hitting Ctrl-c +- [AIRFLOW-2678] Fix db schema unit test to remove checking fab models +- [AIRFLOW-2624] Fix webserver login as anonymous +- [AIRFLOW-2654] Fix incorret URL on refresh in Graph View of FAB UI +- [AIRFLOW-2668] Handle missing optional cryptography dependency +- [AIRFLOW-2681] Include last dag run of externally triggered DAGs in UI. +- [AIRFLOW-1840] Support back-compat on old celery config +- [AIRFLOW-2612][AIRFLOW-2534] Clean up Hive-related tests +- [AIRFLOW-2608] Implements/Standardize custom exceptions for experimental APIs +- [AIRFLOW-2607] Fix failing TestLocalClient +- [AIRFLOW-2638] dbapi_hook: support REPLACE INTO +- [AIRFLOW-2542][AIRFLOW-1790] Rename AWS Batch Operator queue to job_queue +- [AIRFLOW-2567] Extract result from the kubernetes pod as Xcom +- [AIRFLOW-XXX] Adding REA Group to readme +- [AIRFLOW-2601] Allow user to specify k8s config +- [AIRFLOW-2559] Azure Fileshare hook +- [AIRFLOW-1786] Enforce correct behavior for soft-fail sensors +- [AIRFLOW-2355] Airflow trigger tag parameters in subdag +- [AIRFLOW-2613] Fix Airflow searching .zip bug +- [AIRFLOW-2627] Add a sensor for Cassandra +- [AIRFLOW-2634][AIRFLOW-2534] Remove dependency for impyla +- [AIRFLOW-2611] Fix wrong dag volume mount path for kubernetes executor +- [AIRFLOW-2562] Add Google Kubernetes Engine Operators +- [AIRFLOW-2630] Fix classname in test_sql_sensor.py +- [AIRFLOW-2534] Fix bug in HiveServer2Hook +- [AIRFLOW-2586] Stop getting AIRFLOW_HOME value from config file in bash operator +- [AIRFLOW-2605] Fix autocommit for MySqlHook +- [AIRFLOW-2539][AIRFLOW-2359] Move remaing log config to configuration file +- [AIRFLOW-1656] Tree view dags query changed +- [AIRFLOW-2617] add imagePullPolicy config for kubernetes executor +- [AIRFLOW-2429] Fix security/task/sensors/ti_deps folders flake8 error +- [AIRFLOW-2550] Implements API endpoint to list DAG runs +- [AIRFLOW-2512][AIRFLOW-2522] Use google-auth instead of oauth2client +- [AIRFLOW-2429] Fix operators folder flake8 error +- [AIRFLOW-2585] Fix several bugs in CassandraHook and CassandraToGCSOperator +- [AIRFLOW-2597] Restore original dbapi.run() behavior +- [AIRFLOW-2590] Fix commit in DbApiHook.run() for no-autocommit DB +- [AIRFLOW-1115] fix github oauth api URL +- [AIRFLOW-2587] Add TIMESTAMP type mapping to MySqlToHiveTransfer +- [AIRFLOW-2591][AIRFLOW-2581] Set default value of autocommit to False 
in DbApiHook.run()
+- [AIRFLOW-59] Implement bulk_dump and bulk_load for the Postgres hook
+- [AIRFLOW-2533] Fix path to DAG's on kubernetes executor workers
+- [AIRFLOW-2581] Fix DbApiHook autocommit
+- [AIRFLOW-2578] Add option to use proxies in JiraHook
+- [AIRFLOW-2575] Make gcs to gcs operator work with large files
+- [AIRFLOW-437] Send TI context in kill zombies
+- [AIRFLOW-2566] Change backfill to rerun failed tasks
+- [AIRFLOW-1021] Fix double login for new users with LDAP
+- [AIRFLOW-XXX] Typo fix
+- [AIRFLOW-2561] Fix typo in EmailOperator
+- [AIRFLOW-2573] Cast BigQuery TIMESTAMP field to float
+- [AIRFLOW-2560] Adding support for internalIpOnly to DataprocClusterCreateOperator
+- [AIRFLOW-2565] templatize cluster_label
+- [AIRFLOW-83] add mongo hook and operator
+- [AIRFLOW-2558] Clear task/dag is clearing all executions
+- [AIRFLOW-XXX] Fix doc typos
+- [AIRFLOW-2513] Change `bql` to `sql` for BigQuery Hooks & Ops
+- [AIRFLOW-2557] Fix pagination for s3
+- [AIRFLOW-2545] Eliminate DeprecationWarning
+- [AIRFLOW-2500] Fix MySqlToHiveTransfer to transfer unsigned type properly
+- [AIRFLOW-2462] Change PasswordUser setter to correct syntax
+- [AIRFLOW-2525] Fix a bug introduced by commit dabf1b9
+- [AIRFLOW-2553] Add webserver.pid to .gitignore
+- [AIRFLOW-1863][AIRFLOW-2529] Add dag run selection widgets to gantt view
+- [AIRFLOW-2504] Log username correctly and add extra to search columns
+- [AIRFLOW-2551] Encode binary data with base64 standard rather than base64 url
+- [AIRFLOW-2537] Add reset-dagrun option to backfill command
+- [AIRFLOW-2526] dag_run.conf can override params
+- [AIRFLOW-2544][AIRFLOW-1967] Guard against next major release of Celery, Flower
+- [AIRFLOW-XXX] Add Yieldr to who is using airflow
+- [AIRFLOW-2547] Describe how to run tests using Docker
+- [AIRFLOW-2538] Update faq doc on how to reduce airflow scheduler latency
+- [AIRFLOW-2529] Improve graph view performance and usability
+- [AIRFLOW-2517] backfill support passing key values through CLI
+- [AIRFLOW-2532] Support logs_volume_subpath for KubernetesExecutor
+- [AIRFLOW-2466] consider task_id in _change_state_for_tis_without_dagrun
+- [AIRFLOW-2519] Fix CeleryExecutor with SQLAlchemy
+- [AIRFLOW-2402] Fix RBAC task log
+- [AIRFLOW-XXX] Add M4U to user list
+- [AIRFLOW-2536] docs about how to deal with airflow initdb failure
+- [AIRFLOW-2530] KubernetesOperator supports multiple clusters
+- [AIRFLOW-1499] Eliminate duplicate and unneeded code
+- [AIRFLOW-2521] backfill - make variable name and logging messages more accurate
+- [AIRFLOW-2429] Fix hook, macros folder flake8 error
+- [AIRFLOW-XXX] add Prime to company list
+- [AIRFLOW-2525] Fix PostgresHook.copy_expert to work with "COPY FROM"
+- [AIRFLOW-2515] Add dependency on thrift_sasl to hive extra
+- [AIRFLOW-2523] Add how-to for managing GCP connections
+- [AIRFLOW-2510] Introduce new macros: prev_ds and next_ds
+- [AIRFLOW-1730] Unpickle value of XCom queried from DB
+- [AIRFLOW-2518] Fix broken ToC links in integration.rst
+- [AIRFLOW-1472] Fix SLA misses triggering on skipped tasks.
+- [AIRFLOW-2520] CLI - make backfill less verbose
+- [AIRFLOW-2107] add time_partitioning to run_query on BigQueryBaseCursor
+- [AIRFLOW-1057][AIRFLOW-1380][AIRFLOW-2362] Update DockerOperator to new API
+- [AIRFLOW-2415] Make airflow DAG templating render numbers
+- [AIRFLOW-2473] Fix wrong skip condition for TransferTests
+- [AIRFLOW-2472] Implement MySqlHook.bulk_dump
+- [AIRFLOW-2419] Use default view for subdag operator
+- [AIRFLOW-2498] Fix Unexpected argument in SFTP Sensor
+- [AIRFLOW-2509] Separate config docs into how-to guides
+- [AIRFLOW-2429] Add BaseExecutor back
+- [AIRFLOW-2429] Fix dag, example_dags, executors flake8 error
+- [AIRFLOW-2502] Change Single triple quotes to double for docstrings
+- [AIRFLOW-2503] Fix broken links in CONTRIBUTING.md
+- [AIRFLOW-2501] Refer to devel instructions in docs contrib guide
+- [AIRFLOW-2429] Fix contrib folder's flake8 errors
+- [AIRFLOW-2471] Fix HiveCliHook.load_df to use unused parameters
+- [AIRFLOW-2495] Update celery to 4.1.1
+- [AIRFLOW-2429] Fix api, bin, config_templates folders flake8 error
+- [AIRFLOW-2493] Mark template_fields of all Operators in the API document as "templated"
+- [AIRFLOW-2489] Update FlaskAppBuilder to 1.11.1
+- [AIRFLOW-2448] Enhance HiveCliHook.load_df to work with datetime
+- [AIRFLOW-2487] Enhance druid ingestion hook
+- [AIRFLOW-2397] Support affinity policies for Kubernetes executor/operator
+- [AIRFLOW-2482] Add test for rewrite method in GCS Hook
+- [AIRFLOW-2481] Fix flaky Kubernetes test
+- [AIRFLOW-2479] Improve doc FAQ section
+- [AIRFLOW-2485] Fix Incorrect logging for Qubole Sensor
+- [AIRFLOW-2486] Remove unnecessary slash after port
+- [AIRFLOW-2429] Make Airflow flake8 compliant
+- [AIRFLOW-2491] Resolve flask version conflict
+- [AIRFLOW-2484] Remove duplicate key in MySQL to GCS Op
+- [AIRFLOW-2458] Add cassandra-to-gcs operator
+- [AIRFLOW-2477] Improve time units for task duration and landing times charts for RBAC UI
+- [AIRFLOW-2474] Only import snakebite if using py2
+- [AIRFLOW-48] Parse connection uri querystring
+- [AIRFLOW-2467][AIRFLOW-2] Update import direct warn message to use the module name
+- [AIRFLOW-XXX] Fix order of companies
+- [AIRFLOW-2452] Document field_dict must be OrderedDict
+- [AIRFLOW-2420] Azure Data Lake Hook
+- [AIRFLOW-2213] Add Qubole check operator
+- [AIRFLOW-2465] Fix wrong module names in the doc
+- [AIRFLOW-1929] Modifying TriggerDagRunOperator to accept execution_date
+- [AIRFLOW-2460] Users can now use volume mounts and volumes
+- [AIRFLOW-2110][AIRFLOW-2122] Enhance Http Hook
+- [AIRFLOW-XXX] Updated contributors list
+- [AIRFLOW-2435] Add launch_type to ECSOperator to allow FARGATE
+- [AIRFLOW-2451] Remove extra slash ('/') char when using wildcard in gcs_to_gcs operator
+- [AIRFLOW-2461] Add support for cluster scaling on dataproc operator
+- [AIRFLOW-2376] Fix no hive section error
+- [AIRFLOW-2425] Add lineage support
+- [AIRFLOW-2430] Extend query batching to additional slow queries
+- [AIRFLOW-2453] Add default nil value for kubernetes/git_subpath
+- [AIRFLOW-2396] Add support for resources in kubernetes operator
+- [AIRFLOW-2169] Encode binary data with base64 before importing to BigQuery
+- [AIRFLOW-XXX] Add spotahome in user list
+- [AIRFLOW-2457] Update FAB version requirement
+- [AIRFLOW-2454] Support imagePullPolicy for k8s
+- [AIRFLOW-2450] update supported k8s versions to 1.9 and 1.10
+- [AIRFLOW-2333] Add Segment Hook and TrackEventOperator
+- [AIRFLOW-2442][AIRFLOW-2] Airflow run command leaves
database connections open +- [AIRFLOW-2016] assign template_fields for Dataproc Workflow Template sub-classes, not base class +- [AIRFLOW-2446] Add S3ToRedshiftTransfer into the "Integration" doc +- [AIRFLOW-2449] Fix operators.py to run all test cases +- [AIRFLOW-2424] Add dagrun status endpoint and increased k8s test coverage +- [AIRFLOW-2441] Fix bugs in HiveCliHook.load_df +- [AIRFLOW-2358][AIRFLOW-201804] Make the Kubernetes example optional +- [AIRFLOW-2436] Remove cli_logger in initdb +- [AIRFLOW-2444] Remove unused option(include_adhoc) in cli backfill command +- [AIRFLOW-2447] Fix TestHiveMetastoreHook to run all cases +- [AIRFLOW-2445] Allow templating in kubernetes operator +- [AIRFLOW-2086][AIRFLOW-2393] Customize default dagrun number in tree view +- [AIRFLOW-2437] Add PubNub to list of current airflow users +- [AIRFLOW-XXX] Add Quantopian to list of Airflow users +- [AIRFLOW-1978] Add WinRM windows operator and hook +- [AIRFLOW-2427] Add tests to named hive sensor +- [AIRFLOW-2412] Fix HiveCliHook.load_file to address HIVE-10541 +- [AIRFLOW-2431] Add the navigation bar color parameter for RBAC UI +- [AIRFLOW-2407] Resolve Python undefined names +- [AIRFLOW-1952] Add the navigation bar color parameter +- [AIRFLOW-2222] Implement GoogleCloudStorageHook.rewrite +- [AIRFLOW-2426] Add Google Cloud Storage Hook tests +- [AIRFLOW-2418] Bump Flask-WTF +- [AIRFLOW-2417] Wait for pod is not running to end task +- [AIRFLOW-1914] Add other charset support to email utils +- [AIRFLOW-XXX] Update README.md with Craig@Work +- [AIRFLOW-1899] Fix Kubernetes tests +- [AIRFLOW-1812] Update logging example +- [AIRFLOW-2313] Add TTL parameters for Dataproc +- [AIRFLOW-2411] add dataproc_jars to templated_fields +- [AIRFLOW-XXX] Add Reddit to Airflow users +- [AIRFLOW-XXX] Fix wrong table header in scheduler.rst +- [AIRFLOW-2409] Supply password as a parameter +- [AIRFLOW-2410][AIRFLOW-75] Set the timezone in the RBAC Web UI +- [AIRFLOW-2394] default cmds and arguments in kubernetes operator +- [AIRFLOW-2406] Add Apache2 License Shield to Readme +- [AIRFLOW-2404] Add additional documentation for unqueued task +- [AIRFLOW-2400] Add Ability to set Environment Variables for K8s +- [AIRFLOW-XXX] Add Twine Labs as an Airflow user +- [AIRFLOW-1853] Show only the desired number of runs in tree view +- [AIRFLOW-2401] Document the use of variables in Jinja template +- [AIRFLOW-2403] Fix License Headers +- [AIRFLOW-1313] Fix license header +- [AIRFLOW-2398] Add BounceX to list of current airflow users +- [AIRFLOW-2363] Fix return type bug in TaskHandler +- [AIRFLOW-2389] Create a pinot db api hook +- [AIRFLOW-2390] Resolve FlaskWTFDeprecationWarning +- [AIRFLOW-1933] Fix some typos +- [AIRFLOW-1960] Add support for secrets in kubernetes operator +- [AIRFLOW-1313] Add vertica_to_mysql operator +- [AIRFLOW-1575] Add AWS Kinesis Firehose Hook for inserting batch records +- [AIRFLOW-2266][AIRFLOW-2343] Remove google-cloud-dataflow dependency +- [AIRFLOW-2370] Implement --use_random_password in create_user +- [AIRFLOW-2348] Strip path prefix from the destination_object when source_object contains a wildcard[] +- [AIRFLOW-2391] Fix to Flask 0.12.2 +- [AIRFLOW-2381] Fix the flaky ApiPasswordTests test +- [AIRFLOW-2378] Add Groupon to list of current users +- [AIRFLOW-2382] Fix wrong description for delimiter +- [AIRFLOW-2380] Add support for environment variables in Spark submit operator. 
+- [AIRFLOW-2377] Improve Sendgrid sender support
+- [AIRFLOW-2331] Support init action timeout on dataproc cluster create
+- [AIRFLOW-1835] Update docs: Variable file is json
+- [AIRFLOW-1781] Make search case-insensitive in LDAP group
+- [AIRFLOW-2042] Fix browser menu appearing over the autocomplete menu
+- [AIRFLOW-XXX] Remove wheelhouse files from travis not owned by travis
+- [AIRFLOW-2336] Use hmsclient in hive_hook
+- [AIRFLOW-2041] Correct Syntax in python examples
+- [AIRFLOW-74] SubdagOperators can consume all celeryd worker processes
+- [AIRFLOW-2369] Fix gcs tests
+- [AIRFLOW-2365] Fix autocommit attribute check
+- [AIRFLOW-2068] MesosExecutor allows optional Docker image
+- [AIRFLOW-1652] Push DatabricksRunSubmitOperator metadata into XCOM
+- [AIRFLOW-2234] Enable insert_rows for PrestoHook
+- [AIRFLOW-2208] Link to same DagRun graph from TaskInstance view
+- [AIRFLOW-1153] Allow HiveOperators to take hiveconfs
+- [AIRFLOW-775] Fix autocommit settings with Jdbc hook
+- [AIRFLOW-2364] Warn when setting autocommit on a connection which does not support it
+- [AIRFLOW-2357] Add persistent volume for the logs
+- [AIRFLOW-766] Skip conn.commit() when in Auto-commit
+- [AIRFLOW-2351] Check for valid default_args start_date
+- [AIRFLOW-1433] Set default rbac to initdb
+- [AIRFLOW-2270] Handle removed tasks in backfill
+- [AIRFLOW-2344] Fix `connections -l` to work with pipe/redirect
+- [AIRFLOW-2300] Add S3 Select functionality to S3ToHiveTransfer
+- [AIRFLOW-1314] Cleanup the config
+- [AIRFLOW-1314] Polish some of the Kubernetes docs/config
+- [AIRFLOW-1314] Improve error handling
+- [AIRFLOW-1999] Add per-task GCP service account support
+- [AIRFLOW-1314] Rebasing against master
+- [AIRFLOW-1314] Small cleanup to address PR comments (#24)
+- [AIRFLOW-1314] Add executor_config and tests
+- [AIRFLOW-1314] Improve k8s support
+- [AIRFLOW-1314] Use VolumeClaim for transporting DAGs
+- [AIRFLOW-1314] Create integration testing environment
+- [AIRFLOW-1314] Git Mode to pull in DAGs for Kubernetes Executor
+- [AIRFLOW-1314] Add support for volume mounts & Secrets in Kubernetes Executor
+- [AIRFLOW-1314] Basic Kubernetes Mode
+- [AIRFLOW-2326][AIRFLOW-2222] remove contrib.gcs_copy_operator
+- [AIRFLOW-2328] Fix empty GCS blob in S3ToGoogleCloudStorageOperator
+- [AIRFLOW-2350] Fix grammar in UPDATING.md
+- [AIRFLOW-2302] Fix documentation
+- [AIRFLOW-2345] pip is not used in this setup.py
+- [AIRFLOW-2347] Add Banco de Formaturas to Readme
+- [AIRFLOW-2346] Add Investorise as official user of Airflow
+- [AIRFLOW-2330] Do not append destination prefix if not given
+- [AIRFLOW-2240][DASK] Added TLS/SSL support for the dask-distributed scheduler.
+- [AIRFLOW-2309] Fix duration calculation on TaskFail +- [AIRFLOW-2335] fix issue with jdk8 download for ci +- [AIRFLOW-2184] Add druid_checker_operator +- [AIRFLOW-2299] Add S3 Select functionarity to S3FileTransformOperator +- [AIRFLOW-2254] Put header as first row in unload +- [AIRFLOW-610] Respect _cmd option in config before defaults +- [AIRFLOW-2287] Fix incorrect ASF headers +- [AIRFLOW-XXX] Add Zego as an Apache Airflow user +- [AIRFLOW-952] fix save empty extra field in UI +- [AIRFLOW-1325] Add ElasticSearch log handler and reader +- [AIRFLOW-2301] Sync files of an S3 key with a GCS path +- [AIRFLOW-2293] Fix S3FileTransformOperator to work with boto3 +- [AIRFLOW-3212][AIRFLOW-2314] Remove only leading slash in GCS path +- [AIRFLOW-1509][AIRFLOW-442] SFTP Sensor +- [AIRFLOW-2291] Add optional params to ML Engine +- [AIRFLOW-1774] Allow consistent templating of arguments in MLEngineBatchPredictionOperator +- [AIRFLOW-2302] Add missing operators and hooks +- [AIRFLOW-2312] Docs Typo Correction: Corresponding +- [AIRFLOW-1623] Trigger on_kill method in operators +- [AIRFLOW-2162] When impersonating another user, pass env variables to sudo +- [AIRFLOW-2304] Update quickstart doc to mention scheduler part +- [AIRFLOW-1633] docker_operator needs a way to set shm_size +- [AIRFLOW-1340] Add S3 to Redshift transfer operator +- [AIRFLOW-2303] Lists the keys inside an S3 bucket +- [AIRFLOW-2209] restore flask_login imports +- [AIRFLOW-2306] Add Bonnier Broadcasting to list of current users +- [AIRFLOW-2305][AIRFLOW-2027] Fix CI failure caused by [] +- [AIRFLOW-2281] Add support for Sendgrid categories +- [AIRFLOW-2027] Only trigger sleep in scheduler after all files have parsed +- [AIRFLOW-2256] SparkOperator: Add Client Standalone mode and retry mechanism +- [AIRFLOW-2284] GCS to S3 operator +- [AIRFLOW-2287] Update license notices +- [AIRFLOW-2296] Add Cinimex DataLab to Readme +- [AIRFLOW-2298] Add Kalibrr to who uses airflow +- [AIRFLOW-2292] Fix docstring for S3Hook.get_wildcard_key +- [AIRFLOW-XXX] Update PR template +- [AIRFLOW-XXX] Remove outdated migrations.sql +- [AIRFLOW-2287] Add license header to docs/Makefile +- [AIRFLOW-2286] Add tokopedia to the readme +- [AIRFLOW-2273] Add Discord webhook operator/hook +- [AIRFLOW-2282] Fix grammar in UPDATING.md +- [AIRFLOW-2200] Add snowflake operator with tests +- [AIRFLOW-2178] Add handling on SLA miss errors +- [AIRFLOW-2169] Fix type 'bytes' is not JSON serializable in python3 +- [AIRFLOW-2215] Pass environment to subproces.Popen in base_task_runner +- [AIRFLOW-2253] Add Airflow CLI instrumentation +- [AIRFLOW-2274] Fix Dataflow tests +- [AIRFLOW-2269] Add Custom Ink as an Airflow user +- [AIRFLOW-2259] Dataflow Hook Index out of range +- [AIRFLOW-2233] Update updating.md to include the info of hdfs_sensors renaming +- [AIRFLOW-2217] Add Slack webhook operator +- [AIRFLOW-1729] improve dagBag time +- [AIRFLOW-2264] Improve create_user cli help message +- [AIRFLOW-2260] [AIRFLOW-2260] SSHOperator add command template .sh files +- [AIRFLOW-2261] Check config/env for remote base log folder +- [AIRFLOW-2258] Allow import of Parquet-format files into BigQuery +- [AIRFLOW-1430] Include INSTALL instructions to avoid GPL +- [AIRFLOW-1430] Solve GPL dependency +- [AIRFLOW-2251] Add Thinknear as an Airflow user +- [AIRFLOW-2244] bugfix: remove legacy LongText code from models.py +- [AIRFLOW-2247] Fix RedshiftToS3Transfer not to fail with ValueError +- [AIRFLOW-2249] Add side-loading support for Zendesk Hook +- [AIRFLOW-XXX] Add Qplum to Airflow 
users +- [AIRFLOW-2228] Enhancements in ValueCheckOperator +- [AIRFLOW-1206] Typos +- [AIRFLOW-2060] Update pendulum version to 1.4.4 +- [AIRFLOW-2248] Fix wrong param name in RedshiftToS3Transfer doc +- [AIRFLOW-1433][AIRFLOW-85] New Airflow Webserver UI with RBAC support +- [AIRFLOW-1235] Fix webserver's odd behaviour +- [AIRFLOW-1460] Allow restoration of REMOVED TI's +- [airflow-2235] Fix wrong docstrings in two operators +- [AIRFLOW-XXX] Fix chronological order for companies using Airflow +- [AIRFLOW-2124] Upload Python file to a bucket for Dataproc +- [AIRFLOW-2212] Fix ungenerated sensor API reference +- [AIRFLOW-2226] Rename google_cloud_storage_default to google_cloud_default +- [AIRFLOW-2211] Rename hdfs_sensors.py to hdfs_sensor.py for consistency +- [AIRFLOW-2225] Update document to include DruidDbApiHook +- [Airflow-2202] Add filter support in HiveMetastoreHook().max_partition() +- [AIRFLOW-2220] Remove duplicate numeric list entry in security.rst +- [AIRFLOW-XXX] Update tutorial documentation +- [AIRFLOW-2215] Update celery task to preserve environment variables and improve logging on exception +- [AIRFLOW-2185] Use state instead of query param +- [AIRFLOW-2183] Refactor DruidHook to enable sql +- [AIRFLOW-2203] Defer cycle detection +- [AIRFLOW-2203] Remove Useless Commands. +- [AIRFLOW-2203] Cache signature in apply_defaults +- [AIRFLOW-2203] Speed up Operator Resources +- [AIRFLOW-2203] Cache static rules (trigger/weight) +- [AIRFLOW-2203] Store task ids as sets not lists +- [AIRFLOW-2205] Remove unsupported args from JdbcHook doc +- [AIRFLOW-2207] Fix flaky test that uses app.cached_app() +- [AIRFLOW-2206] Remove unsupported args from JdbcOperator doc +- [AIRFLOW-2140] Add Kubernetes scheduler to SparkSubmitOperator +- [AIRFLOW-XXX] Add Xero to list of users +- [AIRFLOW-2204] Fix webserver debug mode +- [AIRFLOW-102] Fix test_complex_template always succeeds +- [AIRFLOW-442] Add SFTPHook +- [AIRFLOW-2169] Add schema to MySqlToGoogleCloudStorageOperator +- [AIRFLOW-2184][AIRFLOW-2138] Google Cloud Storage allow wildcards +- [AIRFLOW-1588] Cast Variable value to string +- [AIRFLOW-2199] Fix invalid reference to logger +- [AIRFLOW-2191] Change scheduler heartbeat logs from info to debug +- [AIRFLOW-2106] SalesForce hook sandbox option +- [AIRFLOW-2197] Silence hostname_callable config error message +- [AIRFLOW-2150] Use lighter call in HiveMetastoreHook().max_partition() +- [AIRFLOW-2186] Change the way logging is carried out in few ops +- [AIRFLOW-2181] Convert password_auth and test_password_endpoints from DOS to UNIX +- [AIRFLOW-2187] Fix Broken Travis CI due to AIRFLOW-2123 +- [AIRFLOW-2175] Check that filepath is not None +- [AIRFLOW-2173] Don't check task IDs for concurrency reached check +- [AIRFLOW-2168] Remote logging for Azure Blob Storage +- [AIRFLOW-XXX] Add DocuTAP to list of users +- [AIRFLOW-2176] Change the way logging is carried out in BQ Get Data Operator +- [AIRFLOW-2177] Add mock test for GCS Download op +- [AIRFLOW-2123] Install CI dependencies from setup.py +- [AIRFLOW-2129] Presto hook calls _parse_exception_message but defines _get_pretty_exception_message +- [AIRFLOW-2174] Fix typos and wrongly rendered documents +- [AIRFLOW-2171] Store delegated credentials +- [AIRFLOW-2166] Restore BQ run_query dialect param +- [AIRFLOW-2163] Add HBC Digital to users of airflow +- [AIRFLOW-2065] Fix race-conditions when creating loggers +- [AIRFLOW-2147] Plugin manager: added 'sensors' attribute +- [AIRFLOW-2059] taskinstance query is awful, un-indexed, and does 
not scale
+- [AIRFLOW-2159] Fix a few typos in salesforce_hook
+- [AIRFLOW-2132] Add step to initialize database
+- [AIRFLOW-2160] Fix bad rowid deserialization
+- [AIRFLOW-2161] Add Vevo to list of companies using Airflow
+- [AIRFLOW-2149] Add link to apache Beam documentation to create self executing Jar
+- [AIRFLOW-2151] Allow getting the session from AwsHook
+- [AIRFLOW-2097] tz referenced before assignment
+- [AIRFLOW-2152] Add Multiply to list of companies using Airflow
+- [AIRFLOW-1551] Add operator to trigger Jenkins job
+- [AIRFLOW-2034] Fix mixup between %s and {} when using str.format; the convention is to use .format for string formatting outside logging and lazy formatting inside it (see https://github.com/apache/airflow/pull/2823/files; problematic cases were identified with `grep -r '%s' ./* | grep '\.format('`)
+- [AIRFLOW-2102] Add custom_args to Sendgrid personalizations
+- [AIRFLOW-1035][AIRFLOW-1053] import unicode_literals to parse Unicode in HQL
+- [AIRFLOW-2127] Keep loggers during DB migrations
+- [AIRFLOW-2146] Resolve issues with BQ using DbApiHook methods
+- [AIRFLOW-2087] Scheduler Report shows incorrect Total task number
+- [AIRFLOW-2139] Remove unnecessary boilerplate to get DataFrame using pandas_gbq
+- [AIRFLOW-2125] Using binary package psycopg2-binary
+- [AIRFLOW-2142] Include message on mkdir failure
+- [AIRFLOW-1615] SSHHook: use port specified by Connection
+- [AIRFLOW-2122] Handle boolean values in sshHook
+- [AIRFLOW-XXX] Add Tile to the list of users
+- [AIRFLOW-2130] Add missing Operators to API Reference docs
+- [AIRFLOW-XXX] Add timeout units (seconds)
+- [AIRFLOW-2134] Add Alan to the list of companies that use Airflow
+- [AIRFLOW-2133] Remove references to GitHub issues in CONTRIBUTING
+- [AIRFLOW-2131] Remove confusing AirflowImport docs
+- [AIRFLOW-1852] Allow hostname to be overridable.
+- [AIRFLOW-2126] Add Bluecore to active users
+- [AIRFLOW-1618] Add feature to create GCS bucket
+- [AIRFLOW-2108] Fix log indentation in BashOperator
+- [AIRFLOW-2115] Fix doc links to PythonHosted
+- [AIRFLOW-XXX] Add contributor from Easy company
+- [AIRFLOW-1882] Add ignoreUnknownValues option to gcs_to_bq operator
+- [AIRFLOW-2089] Add on kill for SparkSubmit in Standalone Cluster
+- [AIRFLOW-2113] Address missing DagRun callbacks; because handle_callback belongs to the DAG object, the tasks can be fetched directly with get_task, reducing database communication and making Airflow more lightweight.
+- [AIRFLOW-2112] Fix svg width for Recent Tasks on UI.
+- [AIRFLOW-2116] Set CI Cloudant version to <2.0 +- [AIRFLOW-XXX] Add PMC to list of companies using Airflow +- [AIRFLOW-2100] Fix Broken Documentation Links +- [AIRFLOW-1404] Add 'flatten_results' & 'maximum_bytes_billed' to BQ Operator +- [AIRFLOW-800] Initialize valid Google BigQuery Connection +- [AIRFLOW-1319] Fix misleading SparkSubmitOperator and SparkSubmitHook docstring +- [AIRFLOW-1983] Parse environment parameter as template +- [AIRFLOW-2095] Add operator to create External BigQuery Table +- [AIRFLOW-2085] Add SparkJdbc operator +- [AIRFLOW-1002] Add ability to clean all dependencies of removed DAG +- [AIRFLOW-2094] Jinjafied project_id, region & zone in DataProc{*} Operators +- [AIRFLOW-2092] Fixed incorrect parameter in docstring for FTPHook +- [AIRFLOW-XXX] Add SocialCops to Airflow users +- [AIRFLOW-2088] Fix duplicate keys in MySQL to GCS Helper function +- [AIRFLOW-2091] Fix incorrect docstring parameter in BigQuery Hook +- [AIRFLOW-2090] Fix typo in DataStore Hook +- [AIRFLOW-1157] Fix missing pools crashing the scheduler +- [AIRFLOW-713] Jinjafy {EmrCreateJobFlow,EmrAddSteps}Operator attributes +- [AIRFLOW-2083] Docs: Use "its" instead of "it's" where appropriate +- [AIRFLOW-2066] Add operator to create empty BQ table +- [AIRFLOW-XXX] add Karmic to list of companies +- [AIRFLOW-2073] Make FileSensor fail when the file doesn't exist +- [AIRFLOW-2078] Improve task_stats and dag_stats performance +- [AIRFLOW-2080] Use a log-out icon instead of a power button +- [AIRFLOW-2077] Fetch all pages of list_objects_v2 response +- [AIRFLOW-XXX] Add TM to list of companies +- [AIRFLOW-1985] Impersonation fixes for using `run_as_user` +- [AIRFLOW-2018][AIRFLOW-2] Make Sensors backward compatible +- [AIRFLOW-XXX] Fix typo in concepts doc (dag_md) +- [AIRFLOW-2069] Allow Bytes to be uploaded to S3 +- [AIRFLOW-2074] Fix log var name in GHE auth +- [AIRFLOW-1927] Convert naive datetimes for TaskInstances +- [AIRFLOW-1760] Password auth for experimental API +- [AIRFLOW-2038] Add missing kubernetes dependency for dev +- [AIRFLOW-2040] Escape special chars in task instance logs URL +- [AIRFLOW-1968][AIRFLOW-1520] Add role_arn and aws_account_id/aws_iam_role support back to aws hook +- [AIRFLOW-2048] Fix task instance failure string formatting +- [AIRFLOW-2046] Fix kerberos error to work with python 3.x +- [AIRFLOW-2063] Add missing docs for GCP +- [AIRFLOW-XXX] Fix typo in docs +- [AIRFLOW-1793] Use docker_url instead of invalid base_url +- [AIRFLOW-2055] Elaborate on slightly ambiguous documentation +- [AIRFLOW-2039] BigQueryOperator supports priority property +- [AIRFLOW-2053] Fix quote character bug in BQ hook +- [AIRFLOW-2057] Add Overstock to list of companies +- [AIRFLOW-XXX] Add Plaid to Airflow users +- [AIRFLOW-2044] Add SparkSubmitOperator to documentation +- [AIRFLOW-2037] Add methods to get Hash values of a GCS object +- [AIRFLOW-2050] Fix Travis permission problem +- [AIRFLOW-2043] Add Intercom to list of companies +- [AIRFLOW-2023] Add debug logging around number of queued files +- [AIRFLOW-XXX] Add Pernod-ricard as a airflow user +- [AIRFLOW-1453] Add 'steps' into template_fields in EmrAddSteps +- [AIRFLOW-2015] Add flag for interactive runs +- [AIRFLOW-1895] Fix primary key integrity for mysql +- [AIRFLOW-2030] Fix KeyError:`i` in DbApiHook for insert +- [AIRFLOW-1943] Add External BigQuery Table feature +- [AIRFLOW-2033] Add Google Cloud Storage List Operator +- [AIRFLOW-2006] Add local log catching to kubernetes operator +- [AIRFLOW-2031] Add missing gcp_conn_id in the 
example in DataFlow docstrings +- [AIRFLOW-2029] Fix AttributeError in BigQueryPandasConnector +- [AIRFLOW-2028] Add JobTeaser to official users list +- [AIRFLOW-2016] Add support for Dataproc Workflow Templates +- [AIRFLOW-2025] Reduced Logging verbosity +- [AIRFLOW-1267][AIRFLOW-1874] Add dialect parameter to BigQueryHook +- [AIRFLOW-XXX] Fixed a typo +- [AIRFLOW-XXX] Typo node to nodes +- [AIRFLOW-2019] Update DataflowHook for updating Streaming type job +- [AIRFLOW-2017][Airflow 2017] adding query output to PostgresOperator +- [AIRFLOW-1889] Split sensors into separate files +- [AIRFLOW-1950] Optionally pass xcom_pull task_ids +- [AIRFLOW-1755] Allow mount below root +- [AIRFLOW-511][Airflow 511] add success/failure callbacks on dag level +- [AIRFLOW-192] Add weight_rule param to BaseOperator +- [AIRFLOW-2008] Use callable for python column defaults +- [AIRFLOW-1984] Fix to AWS Batch operator +- [AIRFLOW-2000] Support non-main dataflow job class +- [AIRFLOW-2003] Use flask-caching instead of flask-cache +- [AIRFLOW-2002] Do not swallow exception on logging import +- [AIRFLOW-2004] Import flash from flask not flask.login +- [AIRFLOW-1997] Fix GCP operator doc strings +- [AIRFLOW-1996] Update DataflowHook waitfordone for Streaming type job[] +- [AIRFLOW-1995][Airflow 1995] add on_kill method to SqoopOperator +- [AIRFLOW-1770] Allow HiveOperator to take in a file +- [AIRFLOW-1994] Change background color of Scheduled state Task Instances +- [AIRFLOW-1436][AIRFLOW-1475] EmrJobFlowSensor considers Cancelled step as Successful +- [AIRFLOW-1517] Kubernetes operator PR fixes +- [AIRFLOW-1517] addressed PR comments +- [AIRFLOW-1517] started documentation of k8s operator +- [AIRFLOW-1517] Restore authorship of resources +- [AIRFLOW-1517] Remove authorship of resources +- [AIRFLOW-1517] Add minikube for kubernetes integration tests +- [AIRFLOW-1517] Restore authorship of resources +- [AIRFLOW-1517] fixed license issues +- [AIRFLOW-1517] Created more accurate failures for kube cluster issues +- [AIRFLOW-1517] Remove authorship of resources +- [AIRFLOW-1517] Add minikube for kubernetes integration tests +- [AIRFLOW-1988] Change BG color of None state TIs +- [AIRFLOW-790] Clean up TaskInstances without DagRuns +- [AIRFLOW-1949] Fix var upload, str() produces "b'...'" which is not json +- [AIRFLOW-1930] Convert func.now() to timezone.utcnow() +- [AIRFLOW-1688] Support load.time_partitioning in bigquery_hook +- [AIRFLOW-1975] Make TriggerDagRunOperator callback optional +- [AIRFLOW-1480] Render template attributes for ExternalTaskSensor fields +- [AIRFLOW-1958] Add kwargs to send_email +- [AIRFLOW-1976] Fix for missing log/logger attribute FileProcessHandler +- [AIRFLOW-1982] Fix Executor event log formatting +- [AIRFLOW-1971] Propagate hive config on impersonation +- [AIRFLOW-1969] Always use HTTPS URIs for Google OAuth2 +- [AIRFLOW-1954] Add DataFlowTemplateOperator +- [AIRFLOW-1963] Add config for HiveOperator mapred_queue +- [AIRFLOW-1946][AIRFLOW-1855] Create a BigQuery Get Data Operator +- [AIRFLOW-1953] Add labels to dataflow operators +- [AIRFLOW-1967] Update Celery to 4.0.2 +- [AIRFLOW-1964] Add Upsight to list of Airflow users +- [AIRFLOW-XXX] Changelog for 1.9.0 +- [AIRFLOW-1470] Implement BashSensor operator +- [AIRFLOW-XXX] Pin sqlalchemy dependency +- [AIRFLOW-1955] Do not reference unassigned variable +- [AIRFLOW-1957] Add contributor to BalanceHero in Readme +- [AIRFLOW-1517] Restore authorship of secrets and init container +- [AIRFLOW-1517] Remove authorship of secrets and init 
container +- [AIRFLOW-1935] Add BalanceHero to readme +- [AIRFLOW-1939] add astronomer contributors +- [AIRFLOW-1517] Kubernetes Operator +- [AIRFLOW-1928] Fix @once with catchup=False +- [AIRFLOW-1937] Speed up scheduling by committing in batch +- [AIRFLOW-1821] Enhance default logging config by removing extra loggers +- [AIRFLOW-1904] Correct DAG fileloc to the right filepath +- [AIRFLOW-1909] Update docs with supported versions of MySQL server +- [AIRFLOW-1915] Relax flask-wtf dependency specification +- [AIRFLOW-1920] Update CONTRIBUTING.md to reflect enforced linting rules +- [AIRFLOW-1942] Update Sphinx docs to remove deprecated import structure +- [AIRFLOW-1846][AIRFLOW-1697] Hide Ad Hoc Query behind secure_mode config +- [AIRFLOW-1948] Include details for on_kill failure +- [AIRFLOW-1938] Clean up unused exception +- [AIRFLOW-1932] Add GCP Pub/Sub Pull and Ack +- [AIRFLOW-XXX] Purge coveralls +- [AIRFLOW-XXX] Remove unused coveralls token +- [AIRFLOW-1938] Remove tag version check in setup.py +- [AIRFLOW-1916] Don't upload logs to remote from `run --raw` +- [AIRFLOW-XXX] Fix failing PubSub tests on Python3 +- [AIRFLOW-XXX] Upgrade to python 3.5 and disable dask tests +- [AIRFLOW-1913] Add new GCP PubSub operators +- [AIRFLOW-1525] Fix minor LICENSE and NOTICE issues +- [AIRFLOW-1687] fix fernet error without encryption +- [AIRFLOW-1912] airflow.processor should not propagate logging +- [AIRFLOW-1911] Rename celeryd_concurrency +- [AIRFLOW-1885] Fix IndexError in ready_prefix_on_cmdline +- [AIRFLOW-1854] Improve Spark Submit operator for standalone cluster mode +- [AIRFLOW-1908] Fix celery broker options config load +- [AIRFLOW-1907] Pass max_ingestion_time to Druid hook +- [AIRFLOW-1909] Add away to list of users +- [AIRFLOW-1893][AIRFLOW-1901] Propagate PYTHONPATH when using impersonation +- [AIRFLOW-1892] Modify BQ hook to extract data filtered by column +- [AIRFLOW-1829] Support for schema updates in query jobs +- [AIRFLOW-1840] Make celery configuration congruent with Celery 4 +- [AIRFLOW-1878] Fix stderr/stdout redirection for tasks +- [AIRFLOW-1897][AIRFLOW-1873] Task Logs for running instance not visible in WebUI +- [AIRFLOW-1896] FIX bleach <> html5lib incompatibility +- [AIRFLOW-1884][AIRFLOW-1059] Reset orphaned task state for external dagruns +- [AIRFLOW-XXX] Fix typo in comment +- [AIRFLOW-1869] Do not emit spurious warning on missing logs +- [AIRFLOW-1888] Add AWS Redshift Cluster Sensor +- [AIRFLOW-1887] Renamed endpoint url variable +- [AIRFLOW-1873] Set TI.try_number to right value depending TI state +- [AIRFLOW-1891] Fix non-ascii typo in default configuration template +- [AIRFLOW-1879] Handle ti log entirely within ti +- [AIRFLOW-1869] Write more error messages into gcs and file logs +- [AIRFLOW-1876] Write subtask id to task log header +- [AIRFLOW-1554] Fix wrong DagFileProcessor termination method call +- [AIRFLOW-342] Do not use amqp, rpc as result backend +- [AIRFLOW-966] Make celery broker_transport_options configurable +- [AIRFLOW-1881] Make operator log in task log +- [AIRFLOW-XXX] Added DataReply to the list of Airflow Users +- [AIRFLOW-1883] Get File Size for objects in Google Cloud Storage +- [AIRFLOW-1872] Set context for all handlers including parents +- [AIRFLOW-1855][AIRFLOW-1866] Add GCS Copy Operator to copy multiple files +- [AIRFLOW-1870] Enable flake8 tests +- [AIRFLOW-1785] Enable Python 3 tests +- [AIRFLOW-1850] Copy cmd before masking +- [AIRFLOW-1665] Reconnect on database errors +- [AIRFLOW-1559] Dispose SQLAlchemy engines on exit +- 
[AIRFLOW-1559] Close file handles in subprocesses +- [AIRFLOW-1559] Make database pooling optional +- [AIRFLOW-1848][Airflow-1848] Fix DataFlowPythonOperator py_file extension doc comment +- [AIRFLOW-1843] Add Google Cloud Storage Sensor with prefix +- [AIRFLOW-1803] Time zone documentation +- [AIRFLOW-1826] Update views to use timezone aware objects +- [AIRFLOW-1827] Fix api endpoint date parsing +- [AIRFLOW-1806] Use naive datetime when using cron +- [AIRFLOW-1809] Update tests to use timezone aware objects +- [AIRFLOW-1806] Use naive datetime for cron scheduling +- [AIRFLOW-1807] Force use of time zone aware db fields +- [AIRFLOW-1808] Convert all utcnow() to time zone aware +- [AIRFLOW-1804] Add time zone configuration options +- [AIRFLOW-1802] Convert database fields to timezone aware +- [AIRFLOW-XXX] Add dask lock files to excludes +- [AIRFLOW-1790] Add support for AWS Batch operator +- [AIRFLOW-XXX] Update README.md +- [AIRFLOW-1820] Remove timestamp from metric name +- [AIRFLOW-1810] Remove unused mysql import in migrations. +- [AIRFLOW-1838] Properly log collect_dags exception +- [AIRFLOW-1842] Fixed Super class name for the gcs to gcs copy operator +- [AIRFLOW-1845] Modal background now covers long or tall pages +- [AIRFLOW-1229] Add link to Run Id, incl execution_date +- [AIRFLOW-1842] Add gcs to gcs copy operator with renaming if required +- [AIRFLOW-1841] change False to None in operator and hook +- [AIRFLOW-1839] Fix more bugs in S3Hook boto -> boto3 migration +- [AIRFLOW-1830] Support multiple domains in Google authentication backend +- [AIRFLOW-1831] Add driver-classpath spark submit +- [AIRFLOW-1795] Correctly call S3Hook after migration to boto3 +- [AIRFLOW-1811] Fix render Druid operator +- [AIRFLOW-1819] Fix slack operator unittest bug +- [AIRFLOW-1805] Allow Slack token to be passed through connection +- [AIRFLOW-1816] Add region param to Dataproc operators +- [AIRFLOW-868] Add postgres_to_gcs operator and unittests +- [AIRFLOW-1613] make mysql_to_gcs_operator py3 compatible +- [AIRFLOW-1817] use boto3 for s3 dependency +- [AIRFLOW-1813] Bug SSH Operator empty buffer +- [AIRFLOW-1801][AIRFLOW-288] Url encode execution dates +- [AIRFLOW-1563] Catch OSError while symlinking the latest log directory +- [AIRFLOW-1794] Remove uses of Exception.message for Python 3 +- [AIRFLOW-1799] Fix logging line which raises errors +- [AIRFLOW-1102] Upgrade Gunicorn >=19.4.0 +- [AIRFLOW-1756] Fix S3TaskHandler to work with Boto3-based S3Hook +- [AIRFLOW-1797] S3Hook.load_string didn't work on Python3 +- [AIRFLOW-646] Add docutils to setup_requires +- [AIRFLOW-1792] Missing intervals DruidOperator +- [AIRFLOW-1789][AIRFLOW-1712] Log SSHOperator stderr to log.warning +- [AIRFLOW-1787] Fix task instance batch clear and set state bugs +- [AIRFLOW-1780] Fix long output lines with unicode from hanging parent +- [AIRFLOW-387] Close SQLAlchemy sessions properly +- [AIRFLOW-1779] Add keepalive packets to ssh hook +- [AIRFLOW-1669] Fix Docker and pin Moto to 1.1.19 +- [AIRFLOW-71] Add support for private Docker images +- [AIRFLOW-XXX] Give a clue what the 'ds' variable is +- [AIRFLOW-XXX] Correct typos in the faq docs page +- [AIRFLOW-1571] Add AWS Lambda Hook +- [AIRFLOW-1675] Fix docstrings for API docs +- [AIRFLOW-1712][AIRFLOW-756][AIRFLOW-751] Log SSHOperator output +- [AIRFLOW-1776] Capture stdout and stderr for logging +- [AIRFLOW-1765] Make experimental API securable without needing Kerberos. 
+- [AIRFLOW-1764] The web interface should not use the experimental API +- [AIRFLOW-1771] Rename heartbeat to avoid confusion +- [AIRFLOW-1769] Add support for templates in VirtualenvOperator +- [AIRFLOW-1763] Fix S3TaskHandler unit tests +- [AIRFLOW-1315] Add Qubole File & Partition Sensors +- [AIRFLOW-1018] Make processor use logging framework +- [AIRFLOW-1695] Add RedshiftHook using boto3 +- [AIRFLOW-1706] Fix query error for MSSQL backend +- [AIRFLOW-1711] Use ldap3 dict for group membership +- [AIRFLOW-1723] Make sendgrid a plugin +- [AIRFLOW-1757] Add missing options to SparkSubmitOperator +- [AIRFLOW-1734][Airflow 1734] Sqoop hook/operator enhancements +- [AIRFLOW-1761] Fix type in scheduler.rst +- [AIRFLOW-1731] Set pythonpath for logging +- [AIRFLOW-1641] Handle executor events in the scheduler +- [AIRFLOW-1744] Make sure max_tries can be set +- [AIRFLOW-1732] Improve dataflow hook logging +- [AIRFLOW-1736] Add HotelQuickly to Who Uses Airflow +- [AIRFLOW-1657] Handle failing qubole operator +- [AIRFLOW-1677] Fix typo in example_qubole_operator +- [AIRFLOW-926] Fix JDBC Hook +- [AIRFLOW-1520] Boto3 S3Hook, S3Log +- [AIRFLOW-1716] Fix multiple __init__ def in SimpleDag +- [AIRFLOW-XXX] Fix DateTime in Tree View +- [AIRFLOW-1719] Fix small typo +- [AIRFLOW-1432] Charts label for Y axis not visible +- [AIRFLOW-1743] Verify ldap filters correctly +- [AIRFLOW-1745] Restore default signal disposition +- [AIRFLOW-1741] Correctly hide second chart on task duration page +- [AIRFLOW-1728] Add networkUri, subnet, tags to Dataproc operator +- [AIRFLOW-1726] Add copy_expert psycopg2 method to PostgresHook +- [AIRFLOW-1330] Add conn_type argument to CLI when adding connection +- [AIRFLOW-1698] Remove SCHEDULER_RUNS env var in systemd +- [AIRFLOW-1694] Stop using itertools.izip +- [AIRFLOW-1692] Change test_views filename to support Windows +- [AIRFLOW-1722] Fix typo in scheduler autorestart output filename +- [AIRFLOW-1723] Support sendgrid in email backend +- [AIRFLOW-1718] Set num_retries on Dataproc job request execution +- [AIRFLOW-1727] Add unit tests for DataProcHook +- [AIRFLOW-1631] Fix timing issue in unit test +- [AIRFLOW-1631] Fix local executor unbound parallelism +- [AIRFLOW-1724] Add Fundera to Who uses Airflow? +- [AIRFLOW-1683] Cancel BigQuery job on timeout. 
+- [AIRFLOW-1714] Fix misspelling: s/seperate/separate/ +- [AIRFLOW-1681] Add batch clear in task instance view +- [AIRFLOW-1696] Fix dataproc version label error +- [AIRFLOW-1613] Handle binary field in MySqlToGoogleCloudStorageOperator +- [AIRFLOW-1697] Mode to disable charts endpoint +- [AIRFLOW-1691] Add better Google cloud logging documentation +- [AIRFLOW-1690] Add detail to gcs error messages +- [AIRFLOW-1682] Make S3TaskHandler write to S3 on close +- [AIRFLOW-1634] Adds task_concurrency feature +- [AIRFLOW-1676] Make GCSTaskHandler write to GCS on close +- [AIRFLOW-1678] Fix erroneously repeated word in function docstrings +- [AIRFLOW-1323] Made Dataproc operator parameter names consistent +- [AIRFLOW-1590] fix unused module and variable +- [AIRFLOW-1671] Add @apply_defaults back to gcs download operator +- [AIRFLOW-988] Fix repeating SLA miss callbacks +- [AIRFLOW-1611] Customize logging +- [AIRFLOW-1668] Expose keepalives_idle for Postgres connections +- [AIRFLOW-1658] Kill Druid task on timeout +- [AIRFLOW-1669][AIRFLOW-1368] Fix Docker import +- [AIRFLOW-891] Make webserver clock include date +- [AIRFLOW-1560] Add AWS DynamoDB hook and operator for inserting batch items +- [AIRFLOW-1654] Show tooltips for link icons in DAGs view +- [AIRFLOW-1660] Change webpage width to full-width +- [AIRFLOW-1664] write file as binary instead of str +- [AIRFLOW-1659] Fix invalid obj attribute bug in file_task_handler.py +- [AIRFLOW-1635] Allow creating GCP connection without requiring a JSON file +- [AIRFLOW-1650] Fix custom celery config loading +- [AIRFLOW-1647] Fix Spark-sql hook +- [AIRFLOW-1587] Fix CeleryExecutor import error +- [Airflow-1640][AIRFLOW-1640] Add qubole default connection +- [AIRFLOW-1576] Added region param to Dataproc{*}Operators +- [AIRFLOW-1643] Add healthjump to officially using list +- [AIRFLOW-1626] Add Azri Solutions to Airflow users +- [AIRFLOW-1636] Add AWS and EMR connection type +- [AIRFLOW-1527] Refactor celery config +- [AIRFLOW-1639] Fix Fernet error handling +- [AIRFLOW-1637] Fix Travis CI build status link +- [AIRFLOW-1628] Fix docstring of sqlsensor +- [AIRFLOW-1331] add SparkSubmitOperator option +- [AIRFLOW-1627] Only query pool in SubDAG init when necessary +- [AIRFLOW-1629] Make extra a textarea in edit connections form +- [AIRFLOW-1368] Automatically remove Docker container on exit +- [AIRFLOW-289] Make airflow timezone independent +- [AIRFLOW-1356] Add `--celery_hostname` to `airflow worker` +- [AIRFLOW-1247] Fix ignore_all_dependencies argument ignored +- [AIRFLOW-1621] Add tests for server side paging +- [AIRFLOW-1591] Avoid attribute error when rendering logging filename +- [AIRFLOW-1031] Replace hard-code to DagRun.ID_PREFIX +- [AIRFLOW-1604] Rename logger to log +- [AIRFLOW-1512] Add PythonVirtualenvOperator +- [AIRFLOW-1617] Fix XSS vulnerability in Variable endpoint +- [AIRFLOW-1497] Reset hidden fields when changing connection type +- [AIRFLOW-1619] Add poll_sleep parameter to GCP dataflow operator +- [AIRFLOW-XXX] Remove landscape.io config +- [AIRFLOW-XXX] Remove non working service badges +- [AIRFLOW-1177] Fix Variable.setdefault w/existing JSON +- [AIRFLOW-1600] Fix exception handling in get_fernet +- [AIRFLOW-1614] Replace inspect.stack() with sys._getframe() +- [AIRFLOW-1519] Add server side paging in DAGs list +- [AIRFLOW-1309] Allow hive_to_druid to take tblproperties +- [AIRFLOW-1613] Make MySqlToGoogleCloudStorageOperator compaitible with python3 +- [AIRFLOW-1603] add PAYMILL to companies list +- [AIRFLOW-1609] Fix gitignore to 
ignore all venvs +- [AIRFLOW-1601] Add configurable task cleanup time + +Airflow 1.9.0, 2018-01-02 ------------------------- -[AIRFLOW-1525] Fix minor LICENSE and NOTICE issues -[AIRFLOW-XXX] Bump version to 1.9.0 -[AIRFLOW-1897][AIRFLOW-1873] Task Logs for running instance not visible in WebUI -[AIRFLOW-XXX] Make sure session is committed -[AIRFLOW-1896] FIX bleach <> html5lib incompatibility -[AIRFLOW-XXX] Fix log handler test -[AIRFLOW-1873] Set TI.try_number to right value depending TI state -[AIRFLOW-1554] Fix wrong DagFileProcessor termination method call -[AIRFLOW-1872] Set context for all handlers including parents -[AIRFLOW-XXX] Add dask lock files to excludes -[AIRFLOW-1839] Fix more bugs in S3Hook boto -> boto3 migration -[AIRFLOW-1795] Correctly call S3Hook after migration to boto3 -[AIRFLOW-1813] Bug SSH Operator empty buffer -[AIRFLOW-1794] Remove uses of Exception.message for Python 3 -[AIRFLOW-1799] Fix logging line which raises errors -[AIRFLOW-1102] Upgrade Gunicorn >=19.4.0 -[AIRFLOW-1756] Fix S3TaskHandler to work with Boto3-based S3Hook -[AIRFLOW-1797] S3Hook.load_string didn't work on Python3 -[AIRFLOW-1792] Missing intervals DruidOperator -[AIRFLOW-1789][AIRFLOW-1712] Log SSHOperator stderr to log.warning -[AIRFLOW-1669] Fix Docker and pin Moto to 1.1.19 -[AIRFLOW-71] Add support for private Docker images -[AIRFLOW-1779] Add keepalive packets to ssh hook -[AIRFLOW-XXX] Give a clue what the 'ds' variable is -[AIRFLOW-XXX] Correct typos in the faq docs page -[AIRFLOW-1571] Add AWS Lambda Hook -[AIRFLOW-1675] Fix docstrings for API docs -[AIRFLOW-1712][AIRFLOW-756][AIRFLOW-751] Log SSHOperator output -[AIRFLOW-1776] Capture stdout and stderr for logging -[AIRFLOW-1765] Make experimental API securable without needing Kerberos. -[AIRFLOW-1764] The web interface should not use the experimental API -[AIRFLOW-1634] Adds task_concurrency feature -[AIRFLOW-1018] Make processor use logging framework -[AIRFLOW-1695] Add RedshiftHook using boto3 -[AIRFLOW-1706] Fix query error for MSSQL backend -[AIRFLOW-1711] Use ldap3 dict for group membership -[AIRFLOW-1757] Add missing options to SparkSubmitOperator -[AIRFLOW-1734][Airflow 1734] Sqoop hook/operator enhancements -[AIRFLOW-1731] Set pythonpath for logging -[AIRFLOW-1641] Handle executor events in the scheduler -[AIRFLOW-1744] Make sure max_tries can be set -[AIRFLOW-1330] Add conn_type argument to CLI when adding connection -[AIRFLOW-926] Fix JDBC Hook -[AIRFLOW-1520] Boto3 S3Hook, S3Log -[AIRFLOW-XXX] Fix DateTime in Tree View -[AIRFLOW-1432] Charts label for Y axis not visible -[AIRFLOW-1743] Verify ldap filters correctly -[AIRFLOW-1745] Restore default signal disposition -[AIRFLOW-1741] Correctly hide second chart on task duration page -[AIRFLOW-1726] Add copy_expert psycopg2 method to PostgresHook -[AIRFLOW-1698] Remove SCHEDULER_RUNS env var in systemd -[AIRFLOW-1694] Stop using itertools.izip -[AIRFLOW-1692] Change test_views filename to support Windows -[AIRFLOW-1722] Fix typo in scheduler autorestart output filename -[AIRFLOW-1691] Add better Google cloud logging documentation -[AIRFLOW-1690] Add detail to gcs error messages -[AIRFLOW-1682] Make S3TaskHandler write to S3 on close -[AIRFLOW-1676] Make GCSTaskHandler write to GCS on close -[AIRFLOW-1635] Allow creating GCP connection without requiring a JSON file -[AIRFLOW-1323] Made Dataproc operator parameter names consistent -[AIRFLOW-1590] fix unused module and variable -[AIRFLOW-988] Fix repeating SLA miss callbacks -[AIRFLOW-1611] Customize logging -[AIRFLOW-1668] 
Expose keepalives_idle for Postgres connections -[AIRFLOW-1658] Kill Druid task on timeout -[AIRFLOW-1669][AIRFLOW-1368] Fix Docker import -[AIRFLOW-1560] Add AWS DynamoDB hook and operator for inserting batch items -[AIRFLOW-1654] Show tooltips for link icons in DAGs view -[AIRFLOW-1660] Change webpage width to full-width -[AIRFLOW-1664] write file as binary instead of str -[AIRFLOW-1659] Fix invalid obj attribute bug in file_task_handler.py -[AIRFLOW-1650] Fix custom celery config loading -[AIRFLOW-1647] Fix Spark-sql hook -[AIRFLOW-1587] Fix CeleryExecutor import error -[AIRFLOW-1636] Add AWS and EMR connection type -[AIRFLOW-1527] Refactor celery config -[AIRFLOW-1639] Fix Fernet error handling -[AIRFLOW-1628] Fix docstring of sqlsensor -[AIRFLOW-1331] add SparkSubmitOperator option -[AIRFLOW-1627] Only query pool in SubDAG init when necessary -[AIRFLOW-1629] Make extra a textarea in edit connections form -[AIRFLOW-1621] Add tests for server side paging -[AIRFLOW-1519] Add server side paging in DAGs list -[AIRFLOW-289] Make airflow timezone independent -[AIRFLOW-1356] Add `--celery_hostname` to `airflow worker` -[AIRFLOW-1591] Avoid attribute error when rendering logging filename -[AIRFLOW-1031] Replace hard-code to DagRun.ID_PREFIX -[AIRFLOW-1604] Rename logger to log -[AIRFLOW-1512] Add PythonVirtualenvOperator -[AIRFLOW-1617] Fix XSS vulnerability in Variable endpoint -[AIRFLOW-1497] Reset hidden fields when changing connection type -[AIRFLOW-1177] Fix Variable.setdefault w/existing JSON -[AIRFLOW-1600] Fix exception handling in get_fernet -[AIRFLOW-1614] Replace inspect.stack() with sys._getframe() -[AIRFLOW-1613] Make MySqlToGoogleCloudStorageOperator compaitible with python3 -[AIRFLOW-1609] Fix gitignore to ignore all venvs -[AIRFLOW-1601] Add configurable task cleanup time -[AIRFLOW-XXX] Bumping Airflow 1.9.0alpha0 version -[AIRFLOW-1608] Handle pending job state in GCP Dataflow hook -[AIRFLOW-1606] Use non static DAG.sync_to_db -[AIRFLOW-1606][Airflow-1606][AIRFLOW-1605][AIRFLOW-160] DAG.sync_to_db is now a normal method -[AIRFLOW-1602] LoggingMixin in DAG class -[AIRFLOW-1593] expose load_string in WasbHook -[AIRFLOW-1597] Add GameWisp as Airflow user -[AIRFLOW-1594] Don't install test packages into python root.[] -[AIRFLOW-1582] Improve logging within Airflow -[AIRFLOW-1476] add INSTALL instruction for source releases -[AIRFLOW-XXX] Save username and password in airflow-pr -[AIRFLOW-1522] Increase text size for var field in variables for MySQL -[AIRFLOW-950] Missing AWS integrations on documentation::integrations -[AIRFLOW-XXX] 1.8.2 release notes -[AIRFLOW-1573] Remove `thrift < 0.10.0` requirement -[AIRFLOW-1584] Remove insecure /headers endpoint -[AIRFLOW-1586] Add mapping for date type to mysql_to_gcs operator -[AIRFLOW-1579] Adds support for jagged rows in Bigquery hook for BQ load jobs -[AIRFLOW-1577] Add token support to DatabricksHook -[AIRFLOW-1580] Error in string formating -[AIRFLOW-1567] Updated docs for Google ML Engine operators/hooks -[AIRFLOW-1574] add 'to' attribute to templated vars of email operator -[AIRFLOW-1572] add carbonite to company list -[AIRFLOW-1568] Fix typo in BigQueryHook -[AIRFLOW-1493][AIRFLOW-XXXX][WIP] fixed dumb thing -[AIRFLOW-1567][Airflow-1567] Renamed cloudml hook and operator to mlengine -[AIRFLOW-1568] Add datastore export/import operators -[AIRFLOW-1564] Use Jinja2 to render logging filename -[AIRFLOW-1562] Spark-sql logging contains deadlock -[AIRFLOW-1556][Airflow 1556] Add support for SQL parameters in BigQueryBaseCursor 
-[AIRFLOW-108] Add CreditCards.com to companies list -[AIRFLOW-1541] Add channel to template fields of slack_operator -[AIRFLOW-1535] Add service account/scopes in dataproc -[AIRFLOW-1384] Add to README.md CaDC/ARGO -[AIRFLOW-1546] add Zymergen 80to org list in README -[AIRFLOW-1545] Add Nextdoor to companies list -[AIRFLOW-1544] Add DataFox to companies list -[AIRFLOW-1529] Add logic supporting quoted newlines in Google BigQuery load jobs -[AIRFLOW-1521] Fix emplate rendering for BigqueryTableDeleteOperator -[AIRFLOW-1324] Generalize Druid operator and hook -[AIRFLOW-1516] Fix error handling getting fernet -[AIRFLOW-1420][AIRFLOW-1473] Fix deadlock check -[AIRFLOW-1495] Fix migration on index on job_id -[AIRFLOW-1483] Making page size consistent in list -[AIRFLOW-1495] Add TaskInstance index on job_id -[AIRFLOW-855] Replace PickleType with LargeBinary in XCom -[AIRFLOW-1505] Document when Jinja substitution occurs -[AIRFLOW-1504] Log dataproc cluster name -[AIRFLOW-1239] Fix unicode error for logs in base_task_runner -[AIRFLOW-1280] Fix Gantt chart height -[AIRFLOW-1507] Template parameters in file_to_gcs operator -[AIRFLOW-1452] workaround lock on method -[AIRFLOW-1385] Make Airflow task logging configurable -[AIRFLOW-940] Handle error on variable decrypt -[AIRFLOW-1492] Add gauge for task successes/failures -[AIRFLOW-1443] Update Airflow configuration documentation -[AIRFLOW-1486] Unexpected S3 writing log error -[AIRFLOW-1487] Added links to all companies officially using Airflow -[AIRFLOW-1489] Fix typo in BigQueryCheckOperator -[AIRFLOW-1349] Fix backfill to respect limits -[AIRFLOW-1478] Chart owner column should be sortable -[AIRFLOW-1397][AIRFLOW-1] No Last Run column data displyed in Airflow UI 1.8.1 -[AIRFLOW-1474] Add dag_id regex feature for `airflow clear` command -[AIRFLOW-1445] Changing HivePartitionSensor UI color to lighter shade -[AIRFLOW-1359] Use default_args in Cloud ML eval -[AIRFLOW-1389] Support createDisposition in BigQueryOperator -[AIRFLOW-1349] Refactor BackfillJob _execute -[AIRFLOW-1459] Fixed broken integration .rst formatting -[AIRFLOW-1448] Revert "Fix cli reading logfile in memory" -[AIRFLOW-1398] Allow ExternalTaskSensor to wait on multiple runs of a task -[AIRFLOW-1399] Fix cli reading logfile in memory -[AIRFLOW-1442] Remove extra space from ignore_all_deps generated command -[AIRFLOW-1438] Change batch size per query in scheduler -[AIRFLOW-1439] Add max billing tier for the BQ Hook and Operator -[AIRFLOW-1437] Modify BigQueryTableDeleteOperator -[Airflow 1332] Split logs based on try number -[AIRFLOW-1385] Create abstraction for Airflow task logging -[AIRFLOW-756][AIRFLOW-751] Replace ssh hook, operator & sftp operator with paramiko based -[AIRFLOW-1393][[AIRFLOW-1393] Enable Py3 tests in contrib/spark_submit_hook[ -[AIRFLOW-1345] Dont expire TIs on each scheduler loop -[AIRFLOW-1059] Reset orphaned tasks in batch for scheduler -[AIRFLOW-1255] Fix SparkSubmitHook output deadlock -[AIRFLOW-1359] Add Google CloudML utils for model evaluation -[AIRFLOW-1247] Fix ignore all dependencies argument ignored -[AIRFLOW-1401] Standardize cloud ml operator arguments -[AIRFLOW-1394] Add quote_character param to GCS hook and operator -[AIRFLOW-1402] Cleanup SafeConfigParser DeprecationWarning -[AIRFLOW-1326][[AIRFLOW-1326][AIRFLOW-1184] Don't split argument array -- it's already an array.[ -[AIRFLOW-1384] Add ARGO/CaDC as a Airflow user -[AIRFLOW-1357] Fix scheduler zip file support -[AIRFLOW-1382] Add working dir option to DockerOperator -[AIRFLOW-1388] Add Cloud 
ML Engine operators to integration doc -[AIRFLOW-1387] Add unicode string prefix -[AIRFLOW-1366] Add max_tries to task instance -[AIRFLOW-1300] Enable table creation with TBLPROPERTIES -[AIRFLOW-1271] Add Google CloudML Training Operator -[AIRFLOW-300] Add Google Pubsub hook and operator -[AIRFLOW-1343] Fix dataproc label format -[AIRFLOW-1367] Pass Content-ID To reference inline images in an email, we need to be able to add to the HTML. However currently the Content-ID (cid) is not passed, so we need to add it -[AIRFLOW-1265] Fix celery executor parsing CELERY_SSL_ACTIVE -[AIRFLOW-1272] Google Cloud ML Batch Prediction Operator -[AIRFLOW-1352][AIRFLOW-1335] Revert MemoryHandler change ()[] -[AIRFLOW-1350] Add query_uri param to Hive/SparkSQL DataProc operator -[AIRFLOW-1334] Check if tasks are backfill on scheduler in a join -[AIRFLOW-1343] Add Airflow default label to the dataproc operator -[AIRFLOW-1273] Add Google Cloud ML version and model operators -[AIRFLOW-1273]AIRFLOW-1273] Add Google Cloud ML version and model operators -[AIRFLOW-1321] Fix hidden field key to ignore case -[AIRFLOW-1337] Make log_format key names lowercase -[AIRFLOW-1338][AIRFLOW-782] Add GCP dataflow hook runner change to UPDATING.md -[AIRFLOW-801] Remove outdated docstring on BaseOperator -[AIRFLOW-1344] Fix text encoding bug when reading logs for Python 3.5 -[AIRFLOW-1338] Fix incompatible GCP dataflow hook -[AIRFLOW-1333] Enable copy function for Google Cloud Storage Hook -[AIRFLOW-1337] Allow log format customization via airflow.cfg -[AIRFLOW-1320] Update LetsBonus users in README -[AIRFLOW-1335] Use MemoryHandler for buffered logging -[AIRFLOW-1339] Add Drivy to the list of users -[AIRFLOW-1275] Put 'airflow pool' into API -[AIRFLOW-1296] Propagate SKIPPED to all downstream tasks -[AIRFLOW-1317] Fix minor issues in API reference -[AIRFLOW-1308] Disable nanny usage for Dask -[AIRFLOW-1172] Support nth weekday of the month cron expression -[AIRFLOW-936] Add clear/mark success for DAG in the UI -[AIRFLOW-1294] Backfills can loose tasks to execute -[AIRFLOW-1299] Support imageVersion in Google Dataproc cluster -[AIRFLOW-1291] Update NOTICE and LICENSE files to match ASF requirements -[AIRFLOW-1301] Add New Relic to list of companies -[AIRFLOW-1289] Removes restriction on number of scheduler threads -[AIRFLOW-1024] Ignore celery executor errors (#49) -[AIRFLOW-1265] Fix exception while loading celery configurations -[AIRFLOW-1290] set docs author to 'Apache Airflow' -[AIRFLOW-1242] Allowing project_id to have a colon in it. -[AIRFLOW-1282] Fix known event column sorting -[AIRFLOW-1166] Speed up _change_state_for_tis_without_dagrun -[AIRFLOW-1208] Speed-up cli tests -[AIRFLOW-1192] Some enhancements to qubole_operator -[AIRFLOW-1281] Sort variables by key field by default -[AIRFLOW-1277] Forbid KE creation with empty fields -[AIRFLOW-1276] Forbid event creation with end_data earlier than start_date -[AIRFLOW-1263] Dynamic height for charts -[AIRFLOW-1266] Increase width of gantt y axis -[AIRFLOW-1244] Forbid creation of a pool with empty name -[AIRFLOW-1274][HTTPSENSOR] Rename parameter params to data -[AIRFLOW-654] Add SSL Config Option for CeleryExecutor w/ RabbitMQ - Add BROKER_USE_SSL config to give option to send AMQP messages over SSL - Can be set using usual airflow options (e.g. airflow.cfg, env vars, etc.) 
-[AIRFLOW-1256] Add United Airlines to readme -[AIRFLOW-1251] Add eRevalue to Airflow users -[AIRFLOW-908] Print hostname at the start of cli run -[AIRFLOW-1237] Fix IN-predicate sqlalchemy warning -[AIRFLOW-1243] DAGs table has no default entries to show -[AIRFLOW-1245] Fix random failure in test_trigger_dag_for_date -[AIRFLOW-1248] Fix wrong conf name for worker timeout -[AIRFLOW-1197] : SparkSubmitHook on_kill error -[AIRFLOW-1191] : SparkSubmitHook custom cmd -[AIRFLOW-1234] Cover utils.operator_helpers with UTs -[AIRFLOW-1217] Enable Sqoop logging -[AIRFLOW-645] Support HTTPS connections in HttpHook -[AIRFLOW-1231] Use flask_wtf.CSRFProtect -[AIRFLOW-1232] Remove deprecated readfp warning -[AIRFLOW-1233] Cover utils.json with unit tests -[AIRFLOW-1227] Remove empty column on the Logs view -[AIRFLOW-1226] Remove empty column on the Jobs view -[AIRFLOW-1221] Fix templating bug with DatabricksSubmitRunOperator -[AIRFLOW-1210] Enable DbApiHook unit tests -[AIRFLOW-1199] Fix create modal -[AIRFLOW-1200] Forbid creation of a variable with an empty key -[AIRFLOW-1207] Enable utils.helpers unit tests -[AIRFLOW-1213] Add hcatalog parameters to sqoop -[AIRFLOW-1201] Update deprecated 'nose-parameterized' -[AIRFLOW-1186] Sort dag.get_task_instances by execution_date -[AIRFLOW-1203] Pin Google API client version to fix OAuth issue -[AIRFLOW-1145] Fix closest_date_partition function with before set to True If we're looking for the closest date before, we should take the latest date in the list of date before. -[AIRFLOW-1180] Fix flask-wtf version for test_csrf_rejection -[AIRFLOW-993] Update date inference logic -[AIRFLOW-1170] DbApiHook insert_rows inserts parameters separately -[AIRFLOW-1041] Do not shadow xcom_push method[] -[AIRFLOW-860][AIRFLOW-935] Fix plugin executor import cycle and executor selection -[AIRFLOW-1189] Fix get a DataFrame using BigQueryHook failing -[AIRFLOW-1184] SparkSubmitHook does not split args -[AIRFLOW-1182] SparkSubmitOperator template field -[AIRFLOW-823] Allow specifying execution date in task_info API -[AIRFLOW-1175] Add Pronto Tools to Airflow user list -[AIRFLOW-1150] Fix scripts execution in sparksql hook[] -[AIRFLOW-1141] remove crawl_for_tasks -[AIRFLOW-1193] Add Checkr to company using Airflow -[AIRFLOW-1168] Add closing() to all connections and cursors -[AIRFLOW-1188] Add max_bad_records param to GoogleCloudStorageToBigQueryOperator -[AIRFLOW-1187][AIRFLOW-1185] Fix PyPi package names in documents -[AIRFLOW-1185] Fix PyPi URL in templates -[AIRFLOW-XXX] Updating CHANGELOG, README, and UPDATING after 1.8.1 release -[AIRFLOW-1181] Add delete and list functionality to gcs_hook -[AIRFLOW-1179] Fix Pandas 0.2x breaking Google BigQuery change -[AIRFLOW-1167] Support microseconds in FTPHook modification time -[AIRFLOW-1173] Add Robinhood to who uses Airflow -[AIRFLOW-945][AIRFLOW-941] Remove psycopg2 connection workaround -[AIRFLOW-1140] DatabricksSubmitRunOperator should template the "json" field. 
-[AIRFLOW-1160] Update Spark parameters for Mesos -[AIRFLOW 1149][AIRFLOW-1149] Allow for custom filters in Jinja2 templates -[AIRFLOW-1036] Randomize exponential backoff -[AIRFLOW-1155] Add Tails.com to community -[AIRFLOW-1142] Do not reset orphaned state for backfills -[AIRFLOW-492] Make sure stat updates cannot fail a task -[AIRFLOW-1119] Fix unload query so headers are on first row[] -[AIRFLOW-1089] Add Spark application arguments -[AIRFLOW-1125] Document encrypted connections -[AIRFLOW-1122] Increase stroke width in UI -[AIRFLOW-1138] Add missing licenses to files in scripts directory -(AIRFLOW-11-38) [AIRFLOW-1136] Capture invalid arguments for Sqoop -[AIRFLOW-1127] Move license notices to LICENSE -[AIRFLOW-1118] Add evo.company to Airflow users -[AIRFLOW-1121][AIRFLOW-1004] Fix `airflow webserver --pid` to write out pid file -[AIRFLOW-1124] Do not set all tasks to scheduled in backfill -[AIRFLOW-1120] Update version view to include Apache prefix -[AIRFLOW-1091] Add script that can compare jira target against merges -[AIRFLOW-1107] Add support for ftps non-default port -[AIRFLOW-1000] Rebrand distribution to Apache Airflow -[AIRFLOW-1094] Run unit tests under contrib in Travis -[AIRFLOW-1112] Log which pool when pool is full in scheduler -[AIRFLOW-1106] Add Groupalia/Letsbonus to the ReadMe -[AIRFLOW-1109] Use kill signal to kill processes and log results -[AIRFLOW-1074] Don't count queued tasks for concurrency limits -[AIRFLOW-1095] Make ldap_auth memberOf come from configuration -[AIRFLOW-1090] Add HBO -[AIRFLOW-1035] Use binary exponential backoff -[AIRFLOW-1081] Improve performance of duration chart -[AIRFLOW-1078] Fix latest_runs endpoint for old flask versions -[AIRFLOW-1085] Enhance the SparkSubmitOperator -[AIRFLOW-1050] Do not count up_for_retry as not ready -[AIRFLOW-1028] Databricks Operator for Airflow -[AIRFLOW-1075] Security docs cleanup -[AIRFLOW-1033][AIFRLOW-1033] Fix ti_deps for no schedule dags -[AIRFLOW-1016] Allow HTTP HEAD request method on HTTPSensor -[AIRFLOW-970] Load latest_runs on homepage async -[AIRFLOW-111] Include queued tasks in scheduler concurrency check -[AIRFLOW-1001] Fix landing times if there is no following schedule -[AIRFLOW-1065] Add functionality for Azure Blob Storage over wasb:// -[AIRFLOW-947] Improve exceptions for unavailable Presto cluster -[AIRFLOW-1067] use example.com in examples -[AIRFLOW-1064] Change default sort to job_id for TaskInstanceModelView -[AIRFLOW-1030][AIRFLOW-1] Fix hook import for HttpSensor -[AIRFLOW-1051] Add a test for resetdb to CliTests -[AIRFLOW-1004][AIRFLOW-276] Fix `airflow webserver -D` to run in background -[AIRFLOW-1062] Fix DagRun#find to return correct result -[AIRFLOW-1011] Fix bug in BackfillJob._execute() for SubDAGs -[AIRFLOW-1038] Specify celery serialization options explicitly -[AIRFLOW-1054] Fix broken import in test_dag -[AIRFLOW-1007] Use Jinja sandbox for chart_data endpoint -[AIRFLOW-719] Fix race condition in ShortCircuit, Branch and LatestOnly -[AIRFLOW-1043] Fix doc strings of operators -[AIRFLOW-840] Make ticket renewer python3 compatible -[AIRFLOW-985] Extend the sqoop operator and hook -[AIRFLOW-1034] Make it possible to connect to S3 in sigv4 regions -[AIRFLOW-1045] Make log level configurable via airflow.cfg -[AIRFLOW-1047] Sanitize strings passed to Markup -[AIRFLOW-1040] Fix some small typos in comments and docstrings -[AIRFLOW-1017] get_task_instance shouldn't throw exception when no TI -[AIRFLOW-1006] Add config_templates to MANIFEST -[AIRFLOW-999] Add support for Redis database 
-[AIRFLOW-1009] Remove SQLOperator from Concepts page -[AIRFLOW-1006] Move config templates to separate files -[AIRFLOW-1005] Improve Airflow startup time -[AIRFLOW-1010] Add convenience script for signing releases -[AIRFLOW-995] Remove reference to actual Airflow issue -[AIRFLOW-681] homepage doc link should pointing to apache repo not airbnb repo -[AIRFLOW-705][AIRFLOW-706] Fix run_command bugs -[AIRFLOW-990] Fix Py27 unicode logging in DockerOperator -[AIRFLOW-963] Fix non-rendered code examples -[AIRFLOW-969] Catch bad python_callable argument -[AIRFLOW-984] Enable subclassing of SubDagOperator -[AIRFLOW-997] Update setup.cfg to point to Apache -[AIRFLOW-994] Add MiNODES to the official airflow user list -[AIRFLOW-995][AIRFLOW-1] Update GitHub PR Template -[AIRFLOW-989] Do not mark dag run successful if unfinished tasks -[AIRFLOW-903] New configuration setting for the default dag view -[AIRFLOW-979] Add GovTech GDS -[AIRFLOW-933] Replace eval with literal_eval to prevent RCE -[AIRFLOW-974] Fix mkdirs race condition -[AIRFLOW-917] Fix formatting of error message -[AIRFLOW-770] Refactor BaseHook so env vars are always read -[AIRFLOW-900] Double trigger should not kill original task instance -[AIRFLOW-900] Fixes bugs in LocalTaskJob for double run protection -[AIRFLOW-932][AIRFLOW-932][AIRFLOW-921][AIRFLOW-910] Do not mark tasks removed when backfilling[ -[AIRFLOW-961] run onkill when SIGTERMed -[AIRFLOW-910] Use parallel task execution for backfills -[AIRFLOW-967] Wrap strings in native for py2 ldap compatibility -[AIRFLOW-958] Improve tooltip readability -AIRFLOW-959 Cleanup and reorganize .gitignore -AIRFLOW-960 Add .editorconfig file -[AIRFLOW-931] Do not set QUEUED in TaskInstances -[AIRFLOW-956] Get docs working on readthedocs.org -[AIRFLOW-954] Fix configparser ImportError -[AIRFLOW-941] Use defined parameters for psycopg2 -[AIRFLOW-943] Update Digital First Media in users list -[AIRFLOW-942] Add mytaxi to Airflow users -[AIRFLOW-939] add .swp to gitginore -[AIRFLOW-719] Prevent DAGs from ending prematurely -[AIRFLOW-938] Use test for True in task_stats queries -[AIRFLOW-937] Improve performance of task_stats -[AIRFLOW-933] use ast.literal_eval rather eval because ast.literal_eval does not execute input. 
-[AIRFLOW-925] Revert airflow.hooks change that cherry-pick picked -[AIRFLOW-919] Running tasks with no start date shouldn't break a DAGs UI -[AIRFLOW-802][AIRFLOW-1] Add spark-submit operator/hook -[AIRFLOW-725] Use keyring to store credentials for JIRA -[AIRFLOW-916] Remove deprecated readfp function -[AIRFLOW-911] Add coloring and timing to tests -[AIRFLOW-906] Update Code icon from lightning bolt to file -[AIRFLOW-897] Prevent dagruns from failing with unfinished tasks -[AIRFLOW-896] Remove unicode to 8-bit conversion in BigQueryOperator -[AIRFLOW-899] Tasks in SCHEDULED state should be white in the UI instead of black -[AIRFLOW-895] Address Apache release incompliancies -[AIRFLOW-893][AIRFLOW-510] Fix crashing webservers when a dagrun has no start date -[AIRFLOW-880] Make webserver serve logs in a sane way for remote logs -[AIRFLOW-889] Fix minor error in the docstrings for BaseOperator -[AIRFLOW-809][AIRFLOW-1] Use __eq__ ColumnOperator When Testing Booleans -[AIRFLOW-875] Add template to HttpSensor params -[AIRFLOW-866] Add FTPSensor -[AIRFLOW-881] Check if SubDagOperator is in DAG context manager -[AIRFLOW-885] Add change.org to the users list -[AIRFLOW-836] Use POST and CSRF for state changing endpoints -[AIRFLOW-862] Fix Unit Tests for DaskExecutor -[AIRFLOW-887] Support future v0.16 -[AIRFLOW-886] Pass result to post_execute() hook -[AIRFLOW-871] change logging.warn() into warning() -[AIRFLOW-882] Remove unnecessary dag>>op assignment in docs -[AIRFLOW-861] make pickle_info endpoint be login_required -[AIRFLOW-869] Refactor mark success functionality -[AIRFLOW-877] Remove .sql template extension from GCS download operator -[AIRFLOW-826] Add Zendesk hook -[AIRFLOW-842] do not query the DB with an empty IN clause -[AIRFLOW-834] change raise StopIteration into return -[AIRFLOW-832] Let debug server run without SSL -[AIRFLOW-862] Add DaskExecutor -[AIRFLOW-858] Configurable database name for DB operators -[AIRFLOW-863] Example DAGs should have recent start dates -[AIRFLOW-853] use utf8 encoding for stdout line decode -[AIRFLOW-857] Use library assert statements instead of conditionals -[AIRFLOW-856] Make sure execution date is set for local client -[AIRFLOW-854] Add OKI as Airflow user -[AIRFLOW-830][AIRFLOW-829][AIRFLOW-88] Reduce Travis log verbosity -[AIRFLOW-814] Fix Presto*CheckOperator.__init__ -[AIRFLOW-793] Enable compressed loading in S3ToHiveTransfer -[AIRFLOW-844] Fix cgroups directory creation -[AIRFLOW-831] Restore import to fix broken tests -[AIRFLOW-794] Access DAGS_FOLDER and SQL_ALCHEMY_CONN exclusively from settings -[AIRFLOW-694] Fix config behaviour for empty envvar -[AIRFLOW-365] Set dag.fileloc explicitly and use for Code view -[AIRFLOW-781] Allow DataFlowOperators to accept jobs stored in GCS +- [AIRFLOW-1525] Fix minor LICENSE and NOTICE issues +- [AIRFLOW-XXX] Bump version to 1.9.0 +- [AIRFLOW-1897][AIRFLOW-1873] Task Logs for running instance not visible in WebUI +- [AIRFLOW-XXX] Make sure session is committed +- [AIRFLOW-1896] FIX bleach <> html5lib incompatibility +- [AIRFLOW-XXX] Fix log handler test +- [AIRFLOW-1873] Set TI.try_number to right value depending TI state +- [AIRFLOW-1554] Fix wrong DagFileProcessor termination method call +- [AIRFLOW-1872] Set context for all handlers including parents +- [AIRFLOW-XXX] Add dask lock files to excludes +- [AIRFLOW-1839] Fix more bugs in S3Hook boto -> boto3 migration +- [AIRFLOW-1795] Correctly call S3Hook after migration to boto3 +- [AIRFLOW-1813] Bug SSH Operator empty buffer +- [AIRFLOW-1794] Remove uses 
of Exception.message for Python 3 +- [AIRFLOW-1799] Fix logging line which raises errors +- [AIRFLOW-1102] Upgrade Gunicorn >=19.4.0 +- [AIRFLOW-1756] Fix S3TaskHandler to work with Boto3-based S3Hook +- [AIRFLOW-1797] S3Hook.load_string didn't work on Python3 +- [AIRFLOW-1792] Missing intervals DruidOperator +- [AIRFLOW-1789][AIRFLOW-1712] Log SSHOperator stderr to log.warning +- [AIRFLOW-1669] Fix Docker and pin Moto to 1.1.19 +- [AIRFLOW-71] Add support for private Docker images +- [AIRFLOW-1779] Add keepalive packets to ssh hook +- [AIRFLOW-XXX] Give a clue what the 'ds' variable is +- [AIRFLOW-XXX] Correct typos in the faq docs page +- [AIRFLOW-1571] Add AWS Lambda Hook +- [AIRFLOW-1675] Fix docstrings for API docs +- [AIRFLOW-1712][AIRFLOW-756][AIRFLOW-751] Log SSHOperator output +- [AIRFLOW-1776] Capture stdout and stderr for logging +- [AIRFLOW-1765] Make experimental API securable without needing Kerberos. +- [AIRFLOW-1764] The web interface should not use the experimental API +- [AIRFLOW-1634] Adds task_concurrency feature +- [AIRFLOW-1018] Make processor use logging framework +- [AIRFLOW-1695] Add RedshiftHook using boto3 +- [AIRFLOW-1706] Fix query error for MSSQL backend +- [AIRFLOW-1711] Use ldap3 dict for group membership +- [AIRFLOW-1757] Add missing options to SparkSubmitOperator +- [AIRFLOW-1734][Airflow 1734] Sqoop hook/operator enhancements +- [AIRFLOW-1731] Set pythonpath for logging +- [AIRFLOW-1641] Handle executor events in the scheduler +- [AIRFLOW-1744] Make sure max_tries can be set +- [AIRFLOW-1330] Add conn_type argument to CLI when adding connection +- [AIRFLOW-926] Fix JDBC Hook +- [AIRFLOW-1520] Boto3 S3Hook, S3Log +- [AIRFLOW-XXX] Fix DateTime in Tree View +- [AIRFLOW-1432] Charts label for Y axis not visible +- [AIRFLOW-1743] Verify ldap filters correctly +- [AIRFLOW-1745] Restore default signal disposition +- [AIRFLOW-1741] Correctly hide second chart on task duration page +- [AIRFLOW-1726] Add copy_expert psycopg2 method to PostgresHook +- [AIRFLOW-1698] Remove SCHEDULER_RUNS env var in systemd +- [AIRFLOW-1694] Stop using itertools.izip +- [AIRFLOW-1692] Change test_views filename to support Windows +- [AIRFLOW-1722] Fix typo in scheduler autorestart output filename +- [AIRFLOW-1691] Add better Google cloud logging documentation +- [AIRFLOW-1690] Add detail to gcs error messages +- [AIRFLOW-1682] Make S3TaskHandler write to S3 on close +- [AIRFLOW-1676] Make GCSTaskHandler write to GCS on close +- [AIRFLOW-1635] Allow creating GCP connection without requiring a JSON file +- [AIRFLOW-1323] Made Dataproc operator parameter names consistent +- [AIRFLOW-1590] fix unused module and variable +- [AIRFLOW-988] Fix repeating SLA miss callbacks +- [AIRFLOW-1611] Customize logging +- [AIRFLOW-1668] Expose keepalives_idle for Postgres connections +- [AIRFLOW-1658] Kill Druid task on timeout +- [AIRFLOW-1669][AIRFLOW-1368] Fix Docker import +- [AIRFLOW-1560] Add AWS DynamoDB hook and operator for inserting batch items +- [AIRFLOW-1654] Show tooltips for link icons in DAGs view +- [AIRFLOW-1660] Change webpage width to full-width +- [AIRFLOW-1664] write file as binary instead of str +- [AIRFLOW-1659] Fix invalid obj attribute bug in file_task_handler.py +- [AIRFLOW-1650] Fix custom celery config loading +- [AIRFLOW-1647] Fix Spark-sql hook +- [AIRFLOW-1587] Fix CeleryExecutor import error +- [AIRFLOW-1636] Add AWS and EMR connection type +- [AIRFLOW-1527] Refactor celery config +- [AIRFLOW-1639] Fix Fernet error handling +- [AIRFLOW-1628] Fix docstring of sqlsensor 
+- [AIRFLOW-1331] add SparkSubmitOperator option
+- [AIRFLOW-1627] Only query pool in SubDAG init when necessary
+- [AIRFLOW-1629] Make extra a textarea in edit connections form
+- [AIRFLOW-1621] Add tests for server side paging
+- [AIRFLOW-1519] Add server side paging in DAGs list
+- [AIRFLOW-289] Make airflow timezone independent
+- [AIRFLOW-1356] Add `--celery_hostname` to `airflow worker`
+- [AIRFLOW-1591] Avoid attribute error when rendering logging filename
+- [AIRFLOW-1031] Replace hard-code to DagRun.ID_PREFIX
+- [AIRFLOW-1604] Rename logger to log
+- [AIRFLOW-1512] Add PythonVirtualenvOperator
+- [AIRFLOW-1617] Fix XSS vulnerability in Variable endpoint
+- [AIRFLOW-1497] Reset hidden fields when changing connection type
+- [AIRFLOW-1177] Fix Variable.setdefault w/existing JSON
+- [AIRFLOW-1600] Fix exception handling in get_fernet
+- [AIRFLOW-1614] Replace inspect.stack() with sys._getframe()
+- [AIRFLOW-1613] Make MySqlToGoogleCloudStorageOperator compatible with python3
+- [AIRFLOW-1609] Fix gitignore to ignore all venvs
+- [AIRFLOW-1601] Add configurable task cleanup time
+- [AIRFLOW-XXX] Bumping Airflow 1.9.0alpha0 version
+- [AIRFLOW-1608] Handle pending job state in GCP Dataflow hook
+- [AIRFLOW-1606] Use non static DAG.sync_to_db
+- [AIRFLOW-1606][AIRFLOW-1605] DAG.sync_to_db is now a normal method
+- [AIRFLOW-1602] LoggingMixin in DAG class
+- [AIRFLOW-1593] expose load_string in WasbHook
+- [AIRFLOW-1597] Add GameWisp as Airflow user
+- [AIRFLOW-1594] Don't install test packages into python root.
+- [AIRFLOW-1582] Improve logging within Airflow
+- [AIRFLOW-1476] add INSTALL instruction for source releases
+- [AIRFLOW-XXX] Save username and password in airflow-pr
+- [AIRFLOW-1522] Increase text size for var field in variables for MySQL
+- [AIRFLOW-950] Missing AWS integrations on documentation::integrations
+- [AIRFLOW-XXX] 1.8.2 release notes
+- [AIRFLOW-1573] Remove `thrift < 0.10.0` requirement
+- [AIRFLOW-1584] Remove insecure /headers endpoint
+- [AIRFLOW-1586] Add mapping for date type to mysql_to_gcs operator
+- [AIRFLOW-1579] Adds support for jagged rows in Bigquery hook for BQ load jobs
+- [AIRFLOW-1577] Add token support to DatabricksHook
+- [AIRFLOW-1580] Error in string formatting
+- [AIRFLOW-1567] Updated docs for Google ML Engine operators/hooks
+- [AIRFLOW-1574] add 'to' attribute to templated vars of email operator
+- [AIRFLOW-1572] add carbonite to company list
+- [AIRFLOW-1568] Fix typo in BigQueryHook
+- [AIRFLOW-1493][AIRFLOW-XXXX][WIP] fixed dumb thing
+- [AIRFLOW-1567] Renamed cloudml hook and operator to mlengine
+- [AIRFLOW-1568] Add datastore export/import operators
+- [AIRFLOW-1564] Use Jinja2 to render logging filename
+- [AIRFLOW-1562] Spark-sql logging contains deadlock
+- [AIRFLOW-1556] Add support for SQL parameters in BigQueryBaseCursor
+- [AIRFLOW-108] Add CreditCards.com to companies list
+- [AIRFLOW-1541] Add channel to template fields of slack_operator
+- [AIRFLOW-1535] Add service account/scopes in dataproc
+- [AIRFLOW-1384] Add to README.md CaDC/ARGO
+- [AIRFLOW-1546] add Zymergen to org list in README
+- [AIRFLOW-1545] Add Nextdoor to companies list
+- [AIRFLOW-1544] Add DataFox to companies list
+- [AIRFLOW-1529] Add logic supporting quoted newlines in Google BigQuery load jobs
+- [AIRFLOW-1521] Fix template rendering for BigqueryTableDeleteOperator
+- [AIRFLOW-1324] Generalize Druid operator and hook
+- [AIRFLOW-1516] Fix error handling getting fernet
+- [AIRFLOW-1420][AIRFLOW-1473] Fix deadlock check
+- [AIRFLOW-1495] Fix migration on index on job_id
+- [AIRFLOW-1483] Making page size consistent in list
+- [AIRFLOW-1495] Add TaskInstance index on job_id
+- [AIRFLOW-855] Replace PickleType with LargeBinary in XCom
+- [AIRFLOW-1505] Document when Jinja substitution occurs
+- [AIRFLOW-1504] Log dataproc cluster name
+- [AIRFLOW-1239] Fix unicode error for logs in base_task_runner
+- [AIRFLOW-1280] Fix Gantt chart height
+- [AIRFLOW-1507] Template parameters in file_to_gcs operator
+- [AIRFLOW-1452] workaround lock on method
+- [AIRFLOW-1385] Make Airflow task logging configurable
+- [AIRFLOW-940] Handle error on variable decrypt
+- [AIRFLOW-1492] Add gauge for task successes/failures
+- [AIRFLOW-1443] Update Airflow configuration documentation
+- [AIRFLOW-1486] Unexpected S3 writing log error
+- [AIRFLOW-1487] Added links to all companies officially using Airflow
+- [AIRFLOW-1489] Fix typo in BigQueryCheckOperator
+- [AIRFLOW-1349] Fix backfill to respect limits
+- [AIRFLOW-1478] Chart owner column should be sortable
+- [AIRFLOW-1397][AIRFLOW-1] No Last Run column data displayed in Airflow UI 1.8.1
+- [AIRFLOW-1474] Add dag_id regex feature for `airflow clear` command
+- [AIRFLOW-1445] Changing HivePartitionSensor UI color to lighter shade
+- [AIRFLOW-1359] Use default_args in Cloud ML eval
+- [AIRFLOW-1389] Support createDisposition in BigQueryOperator
+- [AIRFLOW-1349] Refactor BackfillJob _execute
+- [AIRFLOW-1459] Fixed broken integration .rst formatting
+- [AIRFLOW-1448] Revert "Fix cli reading logfile in memory"
+- [AIRFLOW-1398] Allow ExternalTaskSensor to wait on multiple runs of a task
+- [AIRFLOW-1399] Fix cli reading logfile in memory
+- [AIRFLOW-1442] Remove extra space from ignore_all_deps generated command
+- [AIRFLOW-1438] Change batch size per query in scheduler
+- [AIRFLOW-1439] Add max billing tier for the BQ Hook and Operator
+- [AIRFLOW-1437] Modify BigQueryTableDeleteOperator
+- [AIRFLOW-1332] Split logs based on try number
+- [AIRFLOW-1385] Create abstraction for Airflow task logging
+- [AIRFLOW-756][AIRFLOW-751] Replace ssh hook, operator & sftp operator with paramiko based
+- [AIRFLOW-1393] Enable Py3 tests in contrib/spark_submit_hook
+- [AIRFLOW-1345] Don't expire TIs on each scheduler loop
+- [AIRFLOW-1059] Reset orphaned tasks in batch for scheduler
+- [AIRFLOW-1255] Fix SparkSubmitHook output deadlock
+- [AIRFLOW-1359] Add Google CloudML utils for model evaluation
+- [AIRFLOW-1247] Fix ignore all dependencies argument ignored
+- [AIRFLOW-1401] Standardize cloud ml operator arguments
+- [AIRFLOW-1394] Add quote_character param to GCS hook and operator
+- [AIRFLOW-1402] Cleanup SafeConfigParser DeprecationWarning
+- [AIRFLOW-1326][AIRFLOW-1184] Don't split argument array -- it's already an array.
+- [AIRFLOW-1384] Add ARGO/CaDC as an Airflow user
+- [AIRFLOW-1357] Fix scheduler zip file support
+- [AIRFLOW-1382] Add working dir option to DockerOperator
+- [AIRFLOW-1388] Add Cloud ML Engine operators to integration doc
+- [AIRFLOW-1387] Add unicode string prefix
+- [AIRFLOW-1366] Add max_tries to task instance
+- [AIRFLOW-1300] Enable table creation with TBLPROPERTIES
+- [AIRFLOW-1271] Add Google CloudML Training Operator
+- [AIRFLOW-300] Add Google Pubsub hook and operator
+- [AIRFLOW-1343] Fix dataproc label format
+- [AIRFLOW-1367] Pass Content-ID To reference inline images in an email, we need to be able to add to the HTML. However currently the Content-ID (cid) is not passed, so we need to add it
+- [AIRFLOW-1265] Fix celery executor parsing CELERY_SSL_ACTIVE
+- [AIRFLOW-1272] Google Cloud ML Batch Prediction Operator
+- [AIRFLOW-1352][AIRFLOW-1335] Revert MemoryHandler change
+- [AIRFLOW-1350] Add query_uri param to Hive/SparkSQL DataProc operator
+- [AIRFLOW-1334] Check if tasks are backfill on scheduler in a join
+- [AIRFLOW-1343] Add Airflow default label to the dataproc operator
+- [AIRFLOW-1273] Add Google Cloud ML version and model operators
+- [AIRFLOW-1321] Fix hidden field key to ignore case
+- [AIRFLOW-1337] Make log_format key names lowercase
+- [AIRFLOW-1338][AIRFLOW-782] Add GCP dataflow hook runner change to UPDATING.md
+- [AIRFLOW-801] Remove outdated docstring on BaseOperator
+- [AIRFLOW-1344] Fix text encoding bug when reading logs for Python 3.5
+- [AIRFLOW-1338] Fix incompatible GCP dataflow hook
+- [AIRFLOW-1333] Enable copy function for Google Cloud Storage Hook
+- [AIRFLOW-1337] Allow log format customization via airflow.cfg
+- [AIRFLOW-1320] Update LetsBonus users in README
+- [AIRFLOW-1335] Use MemoryHandler for buffered logging
+- [AIRFLOW-1339] Add Drivy to the list of users
+- [AIRFLOW-1275] Put 'airflow pool' into API
+- [AIRFLOW-1296] Propagate SKIPPED to all downstream tasks
+- [AIRFLOW-1317] Fix minor issues in API reference
+- [AIRFLOW-1308] Disable nanny usage for Dask
+- [AIRFLOW-1172] Support nth weekday of the month cron expression
+- [AIRFLOW-936] Add clear/mark success for DAG in the UI
+- [AIRFLOW-1294] Backfills can lose tasks to execute
+- [AIRFLOW-1299] Support imageVersion in Google Dataproc cluster
+- [AIRFLOW-1291] Update NOTICE and LICENSE files to match ASF requirements
+- [AIRFLOW-1301] Add New Relic to list of companies
+- [AIRFLOW-1289] Removes restriction on number of scheduler threads
+- [AIRFLOW-1024] Ignore celery executor errors (#49)
+- [AIRFLOW-1265] Fix exception while loading celery configurations
+- [AIRFLOW-1290] set docs author to 'Apache Airflow'
+- [AIRFLOW-1242] Allowing project_id to have a colon in it.
+- [AIRFLOW-1282] Fix known event column sorting
+- [AIRFLOW-1166] Speed up _change_state_for_tis_without_dagrun
+- [AIRFLOW-1208] Speed-up cli tests
+- [AIRFLOW-1192] Some enhancements to qubole_operator
+- [AIRFLOW-1281] Sort variables by key field by default
+- [AIRFLOW-1277] Forbid KE creation with empty fields
+- [AIRFLOW-1276] Forbid event creation with end_date earlier than start_date
+- [AIRFLOW-1263] Dynamic height for charts
+- [AIRFLOW-1266] Increase width of gantt y axis
+- [AIRFLOW-1244] Forbid creation of a pool with empty name
+- [AIRFLOW-1274][HTTPSENSOR] Rename parameter params to data
+- [AIRFLOW-654] Add SSL Config Option for CeleryExecutor w/ RabbitMQ - Add BROKER_USE_SSL config to give option to send AMQP messages over SSL - Can be set using usual airflow options (e.g. airflow.cfg, env vars, etc.)
+- [AIRFLOW-1256] Add United Airlines to readme +- [AIRFLOW-1251] Add eRevalue to Airflow users +- [AIRFLOW-908] Print hostname at the start of cli run +- [AIRFLOW-1237] Fix IN-predicate sqlalchemy warning +- [AIRFLOW-1243] DAGs table has no default entries to show +- [AIRFLOW-1245] Fix random failure in test_trigger_dag_for_date +- [AIRFLOW-1248] Fix wrong conf name for worker timeout +- [AIRFLOW-1197] : SparkSubmitHook on_kill error +- [AIRFLOW-1191] : SparkSubmitHook custom cmd +- [AIRFLOW-1234] Cover utils.operator_helpers with UTs +- [AIRFLOW-1217] Enable Sqoop logging +- [AIRFLOW-645] Support HTTPS connections in HttpHook +- [AIRFLOW-1231] Use flask_wtf.CSRFProtect +- [AIRFLOW-1232] Remove deprecated readfp warning +- [AIRFLOW-1233] Cover utils.json with unit tests +- [AIRFLOW-1227] Remove empty column on the Logs view +- [AIRFLOW-1226] Remove empty column on the Jobs view +- [AIRFLOW-1221] Fix templating bug with DatabricksSubmitRunOperator +- [AIRFLOW-1210] Enable DbApiHook unit tests +- [AIRFLOW-1199] Fix create modal +- [AIRFLOW-1200] Forbid creation of a variable with an empty key +- [AIRFLOW-1207] Enable utils.helpers unit tests +- [AIRFLOW-1213] Add hcatalog parameters to sqoop +- [AIRFLOW-1201] Update deprecated 'nose-parameterized' +- [AIRFLOW-1186] Sort dag.get_task_instances by execution_date +- [AIRFLOW-1203] Pin Google API client version to fix OAuth issue +- [AIRFLOW-1145] Fix closest_date_partition function with before set to True If we're looking for the closest date before, we should take the latest date in the list of date before. +- [AIRFLOW-1180] Fix flask-wtf version for test_csrf_rejection +- [AIRFLOW-993] Update date inference logic +- [AIRFLOW-1170] DbApiHook insert_rows inserts parameters separately +- [AIRFLOW-1041] Do not shadow xcom_push method[] +- [AIRFLOW-860][AIRFLOW-935] Fix plugin executor import cycle and executor selection +- [AIRFLOW-1189] Fix get a DataFrame using BigQueryHook failing +- [AIRFLOW-1184] SparkSubmitHook does not split args +- [AIRFLOW-1182] SparkSubmitOperator template field +- [AIRFLOW-823] Allow specifying execution date in task_info API +- [AIRFLOW-1175] Add Pronto Tools to Airflow user list +- [AIRFLOW-1150] Fix scripts execution in sparksql hook[] +- [AIRFLOW-1141] remove crawl_for_tasks +- [AIRFLOW-1193] Add Checkr to company using Airflow +- [AIRFLOW-1168] Add closing() to all connections and cursors +- [AIRFLOW-1188] Add max_bad_records param to GoogleCloudStorageToBigQueryOperator +- [AIRFLOW-1187][AIRFLOW-1185] Fix PyPi package names in documents +- [AIRFLOW-1185] Fix PyPi URL in templates +- [AIRFLOW-XXX] Updating CHANGELOG, README, and UPDATING after 1.8.1 release +- [AIRFLOW-1181] Add delete and list functionality to gcs_hook +- [AIRFLOW-1179] Fix Pandas 0.2x breaking Google BigQuery change +- [AIRFLOW-1167] Support microseconds in FTPHook modification time +- [AIRFLOW-1173] Add Robinhood to who uses Airflow +- [AIRFLOW-945][AIRFLOW-941] Remove psycopg2 connection workaround +- [AIRFLOW-1140] DatabricksSubmitRunOperator should template the "json" field. 
+- [AIRFLOW-1160] Update Spark parameters for Mesos +- [AIRFLOW 1149][AIRFLOW-1149] Allow for custom filters in Jinja2 templates +- [AIRFLOW-1036] Randomize exponential backoff +- [AIRFLOW-1155] Add Tails.com to community +- [AIRFLOW-1142] Do not reset orphaned state for backfills +- [AIRFLOW-492] Make sure stat updates cannot fail a task +- [AIRFLOW-1119] Fix unload query so headers are on first row[] +- [AIRFLOW-1089] Add Spark application arguments +- [AIRFLOW-1125] Document encrypted connections +- [AIRFLOW-1122] Increase stroke width in UI +- [AIRFLOW-1138] Add missing licenses to files in scripts directory +- (AIRFLOW-11-38) [AIRFLOW-1136] Capture invalid arguments for Sqoop +- [AIRFLOW-1127] Move license notices to LICENSE +- [AIRFLOW-1118] Add evo.company to Airflow users +- [AIRFLOW-1121][AIRFLOW-1004] Fix `airflow webserver --pid` to write out pid file +- [AIRFLOW-1124] Do not set all tasks to scheduled in backfill +- [AIRFLOW-1120] Update version view to include Apache prefix +- [AIRFLOW-1091] Add script that can compare jira target against merges +- [AIRFLOW-1107] Add support for ftps non-default port +- [AIRFLOW-1000] Rebrand distribution to Apache Airflow +- [AIRFLOW-1094] Run unit tests under contrib in Travis +- [AIRFLOW-1112] Log which pool when pool is full in scheduler +- [AIRFLOW-1106] Add Groupalia/Letsbonus to the ReadMe +- [AIRFLOW-1109] Use kill signal to kill processes and log results +- [AIRFLOW-1074] Don't count queued tasks for concurrency limits +- [AIRFLOW-1095] Make ldap_auth memberOf come from configuration +- [AIRFLOW-1090] Add HBO +- [AIRFLOW-1035] Use binary exponential backoff +- [AIRFLOW-1081] Improve performance of duration chart +- [AIRFLOW-1078] Fix latest_runs endpoint for old flask versions +- [AIRFLOW-1085] Enhance the SparkSubmitOperator +- [AIRFLOW-1050] Do not count up_for_retry as not ready +- [AIRFLOW-1028] Databricks Operator for Airflow +- [AIRFLOW-1075] Security docs cleanup +- [AIRFLOW-1033][AIFRLOW-1033] Fix ti_deps for no schedule dags +- [AIRFLOW-1016] Allow HTTP HEAD request method on HTTPSensor +- [AIRFLOW-970] Load latest_runs on homepage async +- [AIRFLOW-111] Include queued tasks in scheduler concurrency check +- [AIRFLOW-1001] Fix landing times if there is no following schedule +- [AIRFLOW-1065] Add functionality for Azure Blob Storage over wasb:// +- [AIRFLOW-947] Improve exceptions for unavailable Presto cluster +- [AIRFLOW-1067] use example.com in examples +- [AIRFLOW-1064] Change default sort to job_id for TaskInstanceModelView +- [AIRFLOW-1030][AIRFLOW-1] Fix hook import for HttpSensor +- [AIRFLOW-1051] Add a test for resetdb to CliTests +- [AIRFLOW-1004][AIRFLOW-276] Fix `airflow webserver -D` to run in background +- [AIRFLOW-1062] Fix DagRun#find to return correct result +- [AIRFLOW-1011] Fix bug in BackfillJob._execute() for SubDAGs +- [AIRFLOW-1038] Specify celery serialization options explicitly +- [AIRFLOW-1054] Fix broken import in test_dag +- [AIRFLOW-1007] Use Jinja sandbox for chart_data endpoint +- [AIRFLOW-719] Fix race condition in ShortCircuit, Branch and LatestOnly +- [AIRFLOW-1043] Fix doc strings of operators +- [AIRFLOW-840] Make ticket renewer python3 compatible +- [AIRFLOW-985] Extend the sqoop operator and hook +- [AIRFLOW-1034] Make it possible to connect to S3 in sigv4 regions +- [AIRFLOW-1045] Make log level configurable via airflow.cfg +- [AIRFLOW-1047] Sanitize strings passed to Markup +- [AIRFLOW-1040] Fix some small typos in comments and docstrings +- [AIRFLOW-1017] get_task_instance shouldn't 
throw exception when no TI +- [AIRFLOW-1006] Add config_templates to MANIFEST +- [AIRFLOW-999] Add support for Redis database +- [AIRFLOW-1009] Remove SQLOperator from Concepts page +- [AIRFLOW-1006] Move config templates to separate files +- [AIRFLOW-1005] Improve Airflow startup time +- [AIRFLOW-1010] Add convenience script for signing releases +- [AIRFLOW-995] Remove reference to actual Airflow issue +- [AIRFLOW-681] homepage doc link should pointing to apache repo not airbnb repo +- [AIRFLOW-705][AIRFLOW-706] Fix run_command bugs +- [AIRFLOW-990] Fix Py27 unicode logging in DockerOperator +- [AIRFLOW-963] Fix non-rendered code examples +- [AIRFLOW-969] Catch bad python_callable argument +- [AIRFLOW-984] Enable subclassing of SubDagOperator +- [AIRFLOW-997] Update setup.cfg to point to Apache +- [AIRFLOW-994] Add MiNODES to the official airflow user list +- [AIRFLOW-995][AIRFLOW-1] Update GitHub PR Template +- [AIRFLOW-989] Do not mark dag run successful if unfinished tasks +- [AIRFLOW-903] New configuration setting for the default dag view +- [AIRFLOW-979] Add GovTech GDS +- [AIRFLOW-933] Replace eval with literal_eval to prevent RCE +- [AIRFLOW-974] Fix mkdirs race condition +- [AIRFLOW-917] Fix formatting of error message +- [AIRFLOW-770] Refactor BaseHook so env vars are always read +- [AIRFLOW-900] Double trigger should not kill original task instance +- [AIRFLOW-900] Fixes bugs in LocalTaskJob for double run protection +- [AIRFLOW-932][AIRFLOW-932][AIRFLOW-921][AIRFLOW-910] Do not mark tasks removed when backfilling[ +- [AIRFLOW-961] run onkill when SIGTERMed +- [AIRFLOW-910] Use parallel task execution for backfills +- [AIRFLOW-967] Wrap strings in native for py2 ldap compatibility +- [AIRFLOW-958] Improve tooltip readability +- AIRFLOW-959 Cleanup and reorganize .gitignore +- AIRFLOW-960 Add .editorconfig file +- [AIRFLOW-931] Do not set QUEUED in TaskInstances +- [AIRFLOW-956] Get docs working on readthedocs.org +- [AIRFLOW-954] Fix configparser ImportError +- [AIRFLOW-941] Use defined parameters for psycopg2 +- [AIRFLOW-943] Update Digital First Media in users list +- [AIRFLOW-942] Add mytaxi to Airflow users +- [AIRFLOW-939] add .swp to gitginore +- [AIRFLOW-719] Prevent DAGs from ending prematurely +- [AIRFLOW-938] Use test for True in task_stats queries +- [AIRFLOW-937] Improve performance of task_stats +- [AIRFLOW-933] use ast.literal_eval rather eval because ast.literal_eval does not execute input. 
+- [AIRFLOW-925] Revert airflow.hooks change that cherry-pick picked +- [AIRFLOW-919] Running tasks with no start date shouldn't break a DAGs UI +- [AIRFLOW-802][AIRFLOW-1] Add spark-submit operator/hook +- [AIRFLOW-725] Use keyring to store credentials for JIRA +- [AIRFLOW-916] Remove deprecated readfp function +- [AIRFLOW-911] Add coloring and timing to tests +- [AIRFLOW-906] Update Code icon from lightning bolt to file +- [AIRFLOW-897] Prevent dagruns from failing with unfinished tasks +- [AIRFLOW-896] Remove unicode to 8-bit conversion in BigQueryOperator +- [AIRFLOW-899] Tasks in SCHEDULED state should be white in the UI instead of black +- [AIRFLOW-895] Address Apache release incompliancies +- [AIRFLOW-893][AIRFLOW-510] Fix crashing webservers when a dagrun has no start date +- [AIRFLOW-880] Make webserver serve logs in a sane way for remote logs +- [AIRFLOW-889] Fix minor error in the docstrings for BaseOperator +- [AIRFLOW-809][AIRFLOW-1] Use __eq__ ColumnOperator When Testing Booleans +- [AIRFLOW-875] Add template to HttpSensor params +- [AIRFLOW-866] Add FTPSensor +- [AIRFLOW-881] Check if SubDagOperator is in DAG context manager +- [AIRFLOW-885] Add change.org to the users list +- [AIRFLOW-836] Use POST and CSRF for state changing endpoints +- [AIRFLOW-862] Fix Unit Tests for DaskExecutor +- [AIRFLOW-887] Support future v0.16 +- [AIRFLOW-886] Pass result to post_execute() hook +- [AIRFLOW-871] change logging.warn() into warning() +- [AIRFLOW-882] Remove unnecessary dag>>op assignment in docs +- [AIRFLOW-861] make pickle_info endpoint be login_required +- [AIRFLOW-869] Refactor mark success functionality +- [AIRFLOW-877] Remove .sql template extension from GCS download operator +- [AIRFLOW-826] Add Zendesk hook +- [AIRFLOW-842] do not query the DB with an empty IN clause +- [AIRFLOW-834] change raise StopIteration into return +- [AIRFLOW-832] Let debug server run without SSL +- [AIRFLOW-862] Add DaskExecutor +- [AIRFLOW-858] Configurable database name for DB operators +- [AIRFLOW-863] Example DAGs should have recent start dates +- [AIRFLOW-853] use utf8 encoding for stdout line decode +- [AIRFLOW-857] Use library assert statements instead of conditionals +- [AIRFLOW-856] Make sure execution date is set for local client +- [AIRFLOW-854] Add OKI as Airflow user +- [AIRFLOW-830][AIRFLOW-829][AIRFLOW-88] Reduce Travis log verbosity +- [AIRFLOW-814] Fix Presto*CheckOperator.__init__ +- [AIRFLOW-793] Enable compressed loading in S3ToHiveTransfer +- [AIRFLOW-844] Fix cgroups directory creation +- [AIRFLOW-831] Restore import to fix broken tests +- [AIRFLOW-794] Access DAGS_FOLDER and SQL_ALCHEMY_CONN exclusively from settings +- [AIRFLOW-694] Fix config behaviour for empty envvar +- [AIRFLOW-365] Set dag.fileloc explicitly and use for Code view +- [AIRFLOW-781] Allow DataFlowOperators to accept jobs stored in GCS -AIRFLOW 1.8.2, 2017-09-04 +Airflow 1.8.2, 2017-09-04 ------------------------- -9a53e66 [AIRFLOW-809][AIRFLOW-1] Use __eq__ ColumnOperator When Testing Booleans -333e0b3 [AIRFLOW-1296] Propagate SKIPPED to all downstream tasks -93825d5 Re-enable caching for hadoop components -33a9dcb Pin Hive and Hadoop to a specific version and create writable warehouse dir -7cff6cd [AIRFLOW-1308] Disable nanny usage for Dask -c6a09c4 Updating CHANGELOG for 1.8.2rc1 -570b2ed [AIRFLOW-1294] Backfills can loose tasks to execute -3f48d48 [AIRFLOW-1291] Update NOTICE and LICENSE files to match ASF requirements -e10af9a [AIRFLOW-XXX] Set version to 1.8.2rc1 -69bd269 [AIRFLOW-1160] Update Spark 
parameters for Mesos -9692510 [AIRFLOW 1149][AIRFLOW-1149] Allow for custom filters in Jinja2 templates -6de5330 [AIRFLOW-1119] Fix unload query so headers are on first row[] -b4e9eb8 [AIRFLOW-1089] Add Spark application arguments -a4083f3 [AIRFLOW-1078] Fix latest_runs endpoint for old flask versions -7a02841 [AIRFLOW-1074] Don't count queued tasks for concurrency limits -a2c18a5 [AIRFLOW-1064] Change default sort to job_id for TaskInstanceModelView -d1c64ab [AIRFLOW-1038] Specify celery serialization options explicitly -b4ee88a [AIRFLOW-1036] Randomize exponential backoff -9fca409 [AIRFLOW-993] Update date inference logic -272c2f5 [AIRFLOW-1167] Support microseconds in FTPHook modification time -c7c0b72 [AIRFLOW-1179] Fix Pandas 0.2x breaking Google BigQuery change -acd0166 [AIRFLOW-1263] Dynamic height for charts -7f33f6e [AIRFLOW-1266] Increase width of gantt y axis -fc33c04 [AIRFLOW-1290] set docs author to 'Apache Airflow' -2e9eee3 [AIRFLOW-1282] Fix known event column sorting -2389a8a [AIRFLOW-1166] Speed up _change_state_for_tis_without_dagrun -bf966e6 [AIRFLOW-1192] Some enhancements to qubole_operator -57d5bcd [AIRFLOW-1281] Sort variables by key field by default -802fc15 [AIRFLOW-1244] Forbid creation of a pool with empty name -1232b6a [AIRFLOW-1243] DAGs table has no default entries to show -b0ba3c9 [AIRFLOW-1227] Remove empty column on the Logs view -c406652 [AIRFLOW-1226] Remove empty column on the Jobs view -51a83cc [AIRFLOW-1199] Fix create modal -cac7d4c [AIRFLOW-1200] Forbid creation of a variable with an empty key -5f3ee52 [AIRFLOW-1186] Sort dag.get_task_instances by execution_date -f446c08 [AIRFLOW-1145] Fix closest_date_partition function with before set to True If we're looking for the closest date before, we should take the latest date in the list of date before. 
-93b8e96 [AIRFLOW-1180] Fix flask-wtf version for test_csrf_rejection -bb56805 [AIRFLOW-1170] DbApiHook insert_rows inserts parameters separately -093b2f0 [AIRFLOW-1150] Fix scripts execution in sparksql hook[] -777f181 [AIRFLOW-1168] Add closing() to all connections and cursors -bc8e912 [AIRFLOW-XXX] Updating CHANGELOG, README, and UPDATING after 1.8.1 release +- [AIRFLOW-809][AIRFLOW-1] Use __eq__ ColumnOperator When Testing Booleans +- [AIRFLOW-1296] Propagate SKIPPED to all downstream tasks +- Re-enable caching for hadoop components +- Pin Hive and Hadoop to a specific version and create writable warehouse dir +- [AIRFLOW-1308] Disable nanny usage for Dask +- Updating CHANGELOG for 1.8.2rc1 +- [AIRFLOW-1294] Backfills can loose tasks to execute +- [AIRFLOW-1291] Update NOTICE and LICENSE files to match ASF requirements +- [AIRFLOW-XXX] Set version to 1.8.2rc1 +- [AIRFLOW-1160] Update Spark parameters for Mesos +- [AIRFLOW 1149][AIRFLOW-1149] Allow for custom filters in Jinja2 templates +- [AIRFLOW-1119] Fix unload query so headers are on first row[] +- [AIRFLOW-1089] Add Spark application arguments +- [AIRFLOW-1078] Fix latest_runs endpoint for old flask versions +- [AIRFLOW-1074] Don't count queued tasks for concurrency limits +- [AIRFLOW-1064] Change default sort to job_id for TaskInstanceModelView +- [AIRFLOW-1038] Specify celery serialization options explicitly +- [AIRFLOW-1036] Randomize exponential backoff +- [AIRFLOW-993] Update date inference logic +- [AIRFLOW-1167] Support microseconds in FTPHook modification time +- [AIRFLOW-1179] Fix Pandas 0.2x breaking Google BigQuery change +- [AIRFLOW-1263] Dynamic height for charts +- [AIRFLOW-1266] Increase width of gantt y axis +- [AIRFLOW-1290] set docs author to 'Apache Airflow' +- [AIRFLOW-1282] Fix known event column sorting +- [AIRFLOW-1166] Speed up _change_state_for_tis_without_dagrun +- [AIRFLOW-1192] Some enhancements to qubole_operator +- [AIRFLOW-1281] Sort variables by key field by default +- [AIRFLOW-1244] Forbid creation of a pool with empty name +- [AIRFLOW-1243] DAGs table has no default entries to show +- [AIRFLOW-1227] Remove empty column on the Logs view +- [AIRFLOW-1226] Remove empty column on the Jobs view +- [AIRFLOW-1199] Fix create modal +- [AIRFLOW-1200] Forbid creation of a variable with an empty key +- [AIRFLOW-1186] Sort dag.get_task_instances by execution_date +- [AIRFLOW-1145] Fix closest_date_partition function with before set to True If we're looking for the closest date before, we should take the latest date in the list of date before. 
+- [AIRFLOW-1180] Fix flask-wtf version for test_csrf_rejection +- [AIRFLOW-1170] DbApiHook insert_rows inserts parameters separately +- [AIRFLOW-1150] Fix scripts execution in sparksql hook[] +- [AIRFLOW-1168] Add closing() to all connections and cursors +- [AIRFLOW-XXX] Updating CHANGELOG, README, and UPDATING after 1.8.1 release -AIRFLOW 1.8.1, 2017-05-09 +Airflow 1.8.1, 2017-05-09 ------------------------- -[AIRFLOW-1142] SubDAG Tasks Not Executed Even Though All Dependencies Met -[AIRFLOW-1138] Add licenses to files in scripts directory -[AIRFLOW-1127] Move license notices to LICENSE instead of NOTICE -[AIRFLOW-1124] Do not set all task instances to scheduled on backfill -[AIRFLOW-1120] Update version view to include Apache prefix -[AIRFLOW-1062] DagRun#find returns wrong result if external_trigger=False is specified -[AIRFLOW-1054] Fix broken import on test_dag -[AIRFLOW-1050] Retries ignored - regression -[AIRFLOW-1033] TypeError: can't compare datetime.datetime to None -[AIRFLOW-1017] get_task_instance should return None instead of throw an exception for non-existent TIs -[AIRFLOW-1011] Fix bug in BackfillJob._execute() for SubDAGs -[AIRFLOW-1004] `airflow webserver -D` runs in foreground -[AIRFLOW-1001] Landing Time shows "unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'" on example_subdag_operator -[AIRFLOW-1000] Rebrand to Apache Airflow instead of Airflow -[AIRFLOW-989] Clear Task Regression -[AIRFLOW-974] airflow.util.file mkdir has a race condition -[AIRFLOW-906] Update Code icon from lightning bolt to file -[AIRFLOW-858] Configurable database name for DB operators -[AIRFLOW-853] ssh_execute_operator.py stdout decode default to ASCII -[AIRFLOW-832] Fix debug server -[AIRFLOW-817] Trigger dag fails when using CLI + API -[AIRFLOW-816] Make sure to pull nvd3 from local resources -[AIRFLOW-815] Add previous/next execution dates to available default variables. 
-[AIRFLOW-813] Fix unterminated unit tests in tests.job (tests/job.py) -[AIRFLOW-812] Scheduler job terminates when there is no dag file -[AIRFLOW-806] UI should properly ignore DAG doc when it is None -[AIRFLOW-794] Consistent access to DAGS_FOLDER and SQL_ALCHEMY_CONN -[AIRFLOW-785] ImportError if cgroupspy is not installed -[AIRFLOW-784] Cannot install with funcsigs > 1.0.0 -[AIRFLOW-780] The UI no longer shows broken DAGs -[AIRFLOW-777] dag_is_running is initlialized to True instead of False -[AIRFLOW-719] Skipped operations make DAG finish prematurely -[AIRFLOW-694] Empty env vars do not overwrite non-empty config values -[AIRFLOW-492] Insert into dag_stats table results into failed task while task itself succeeded -[AIRFLOW-139] Executing VACUUM with PostgresOperator -[AIRFLOW-111] DAG concurrency is not honored -[AIRFLOW-88] Improve clarity Travis CI reports +- [AIRFLOW-1142] SubDAG Tasks Not Executed Even Though All Dependencies Met +- [AIRFLOW-1138] Add licenses to files in scripts directory +- [AIRFLOW-1127] Move license notices to LICENSE instead of NOTICE +- [AIRFLOW-1124] Do not set all task instances to scheduled on backfill +- [AIRFLOW-1120] Update version view to include Apache prefix +- [AIRFLOW-1062] DagRun#find returns wrong result if external_trigger=False is specified +- [AIRFLOW-1054] Fix broken import on test_dag +- [AIRFLOW-1050] Retries ignored - regression +- [AIRFLOW-1033] TypeError: can't compare datetime.datetime to None +- [AIRFLOW-1017] get_task_instance should return None instead of throw an exception for non-existent TIs +- [AIRFLOW-1011] Fix bug in BackfillJob._execute() for SubDAGs +- [AIRFLOW-1004] `airflow webserver -D` runs in foreground +- [AIRFLOW-1001] Landing Time shows "unsupported operand type(s) for -: 'datetime.datetime' and 'NoneType'" on example_subdag_operator +- [AIRFLOW-1000] Rebrand to Apache Airflow instead of Airflow +- [AIRFLOW-989] Clear Task Regression +- [AIRFLOW-974] airflow.util.file mkdir has a race condition +- [AIRFLOW-906] Update Code icon from lightning bolt to file +- [AIRFLOW-858] Configurable database name for DB operators +- [AIRFLOW-853] ssh_execute_operator.py stdout decode default to ASCII +- [AIRFLOW-832] Fix debug server +- [AIRFLOW-817] Trigger dag fails when using CLI + API +- [AIRFLOW-816] Make sure to pull nvd3 from local resources +- [AIRFLOW-815] Add previous/next execution dates to available default variables. 
+- [AIRFLOW-813] Fix unterminated unit tests in tests.job (tests/job.py) +- [AIRFLOW-812] Scheduler job terminates when there is no dag file +- [AIRFLOW-806] UI should properly ignore DAG doc when it is None +- [AIRFLOW-794] Consistent access to DAGS_FOLDER and SQL_ALCHEMY_CONN +- [AIRFLOW-785] ImportError if cgroupspy is not installed +- [AIRFLOW-784] Cannot install with funcsigs > 1.0.0 +- [AIRFLOW-780] The UI no longer shows broken DAGs +- [AIRFLOW-777] dag_is_running is initlialized to True instead of False +- [AIRFLOW-719] Skipped operations make DAG finish prematurely +- [AIRFLOW-694] Empty env vars do not overwrite non-empty config values +- [AIRFLOW-492] Insert into dag_stats table results into failed task while task itself succeeded +- [AIRFLOW-139] Executing VACUUM with PostgresOperator +- [AIRFLOW-111] DAG concurrency is not honored +- [AIRFLOW-88] Improve clarity Travis CI reports -AIRFLOW 1.8.0, 2017-03-12 +Airflow 1.8.0, 2017-03-12 ------------------------- -[AIRFLOW-900] Double trigger should not kill original task instance -[AIRFLOW-900] Fixes bugs in LocalTaskJob for double run protection -[AIRFLOW-932] Do not mark tasks removed when backfilling -[AIRFLOW-961] run onkill when SIGTERMed -[AIRFLOW-910] Use parallel task execution for backfills -[AIRFLOW-967] Wrap strings in native for py2 ldap compatibility -[AIRFLOW-941] Use defined parameters for psycopg2 -[AIRFLOW-719] Prevent DAGs from ending prematurely -[AIRFLOW-938] Use test for True in task_stats queries -[AIRFLOW-937] Improve performance of task_stats -[AIRFLOW-933] use ast.literal_eval rather eval because ast.literal_eval does not execute input. -[AIRFLOW-925] Revert airflow.hooks change that cherry-pick picked -[AIRFLOW-919] Running tasks with no start date shouldn't break a DAGs UI -[AIRFLOW-802] Add spark-submit operator/hook -[AIRFLOW-897] Prevent dagruns from failing with unfinished tasks -[AIRFLOW-861] make pickle_info endpoint be login_required -[AIRFLOW-853] use utf8 encoding for stdout line decode -[AIRFLOW-856] Make sure execution date is set for local client -[AIRFLOW-830][AIRFLOW-829][AIRFLOW-88] Reduce Travis log verbosity -[AIRFLOW-831] Restore import to fix broken tests -[AIRFLOW-794] Access DAGS_FOLDER and SQL_ALCHEMY_CONN exclusively from settings -[AIRFLOW-694] Fix config behaviour for empty envvar -[AIRFLOW-365] Set dag.fileloc explicitly and use for Code view -[AIRFLOW-931] Do not set QUEUED in TaskInstances -[AIRFLOW-899] Tasks in SCHEDULED state should be white in the UI instead of black -[AIRFLOW-895] Address Apache release incompliancies -[AIRFLOW-893][AIRFLOW-510] Fix crashing webservers when a dagrun has no start date -[AIRFLOW-793] Enable compressed loading in S3ToHiveTransfer -[AIRFLOW-863] Example DAGs should have recent start dates -[AIRFLOW-869] Refactor mark success functionality -[AIRFLOW-856] Make sure execution date is set for local client -[AIRFLOW-814] Fix Presto*CheckOperator.__init__ -[AIRFLOW-844] Fix cgroups directory creation -[AIRFLOW-816] Use static nvd3 and d3 -[AIRFLOW-821] Fix py3 compatibility -[AIRFLOW-817] Check for None value of execution_date in endpoint -[AIRFLOW-822] Close db before exception -[AIRFLOW-815] Add prev/next execution dates to template variables -[AIRFLOW-813] Fix unterminated unit tests in SchedulerJobTest -[AIRFLOW-813] Fix unterminated scheduler unit tests -[AIRFLOW-806] UI should properly ignore DAG doc when it is None -[AIRFLOW-812] Fix the scheduler termination bug. 
-[AIRFLOW-780] Fix dag import errors no longer working -[AIRFLOW-783] Fix py3 incompatibility in BaseTaskRunner -[AIRFLOW-810] Correct down_revision dag_id/state index creation -[AIRFLOW-807] Improve scheduler performance for large DAGs -[AIRFLOW-798] Check return_code before forcing termination -[AIRFLOW-139] Let psycopg2 handle autocommit for PostgresHook -[AIRFLOW-776] Add missing cgroups devel dependency -[AIRFLOW-777] Fix expression to check if a DagRun is in running state -[AIRFLOW-785] Don't import CgroupTaskRunner at global scope -[AIRFLOW-784] Pin funcsigs to 1.0.0 -[AIRFLOW-624] Fix setup.py to not import airflow.version as version -[AIRFLOW-779] Task should fail with specific message when deleted -[AIRFLOW-778] Fix completey broken MetastorePartitionSensor -[AIRFLOW-739] Set pickle_info log to debug -[AIRFLOW-771] Make S3 logs append instead of clobber -[AIRFLOW-773] Fix flaky datetime addition in api test -[AIRFLOW-219][AIRFLOW-398] Cgroups + impersonation -[AIRFLOW-683] Add jira hook, operator and sensor -[AIRFLOW-762] Add Google DataProc delete operator -[AIRFLOW-760] Update systemd config -[AIRFLOW-759] Use previous dag_run to verify depend_on_past -[AIRFLOW-757] Set child_process_log_directory default more sensible -[AIRFLOW-692] Open XCom page to super-admins only -[AIRFLOW-737] Fix HDFS Sensor directory. -[AIRFLOW-747] Fix retry_delay not honoured -[AIRFLOW-558] Add Support for dag.catchup=(True|False) Option -[AIRFLOW-489] Allow specifying execution date in trigger_dag API -[AIRFLOW-738] Commit deleted xcom items before insert -[AIRFLOW-729] Add Google Cloud Dataproc cluster creation operator -[AIRFLOW-728] Add Google BigQuery table sensor -[AIRFLOW-741] Log to debug instead of info for app.py -[AIRFLOW-731] Fix period bug for NamedHivePartitionSensor -[AIRFLOW-740] Pin jinja2 to < 2.9.0 -[AIRFLOW-663] Improve time units for task performance charts -[AIRFLOW-665] Fix email attachments -[AIRFLOW-734] Fix SMTP auth regression when not using user/pass -[AIRFLOW-702] Fix LDAP Regex Bug -[AIRFLOW-717] Add Cloud Storage updated sensor -[AIRFLOW-695] Retries do not execute because dagrun is in FAILED state -[AIRFLOW-673] Add operational metrics test for SchedulerJob -[AIRFLOW-727] try_number is not increased -[AIRFLOW-715] A more efficient HDFS Sensor: -[AIRFLOW-716] Allow AVRO BigQuery load-job without schema -[AIRFLOW-718] Allow the query URI for DataProc Pig -Log needs to be part of try/catch block -[AIRFLOW-721] Descendant process can disappear before termination -[AIRFLOW-403] Bash operator's kill method leaves underlying processes running -[AIRFLOW-657] Add AutoCommit Parameter for MSSQL -[AIRFLOW-641] Improve pull request instructions -[AIRFLOW-685] Add test for MySqlHook.bulk_load() -[AIRFLOW-686] Match auth backend config section -[AIRFLOW-691] Add SSH KeepAlive option to SSH_hook -[AIRFLOW-709] Use same engine for migrations and reflection -[AIRFLOW-700] Update to reference to web authentication documentation -[AIRFLOW-649] Support non-sched DAGs in LatestOnlyOp -[AIRFLOW-712] Fix AIRFLOW-667 to use proper HTTP error properties -[AIRFLOW-710] Add OneFineStay as official user -[AIRFLOW-703][AIRFLOW-1] Stop Xcom being cleared too early -[AIRFLOW-679] Stop concurrent task instances from running -[AIRFLOW-704][AIRFLOW-1] Fix invalid syntax in BQ hook -[AIRFLOW-667] Handle BigQuery 503 error -[AIRFLOW-680] Disable connection pool for commands -[AIRFLOW-678] Prevent scheduler from double triggering TIs -[AIRFLOW-677] Kill task if it fails to heartbeat -[AIRFLOW-674] Ability 
to add descriptions for DAGs -[AIRFLOW-682] Bump MAX_PERIODS to make mark_success work for large DAGs -Use jdk selector to set required jdk -[AIRFLOW-647] Restore dag.get_active_runs -[AIRFLOW-662] Change seasons to months in project description -[AIRFLOW-656] Add dag/task/date index to xcom table -[AIRFLOW-658] Improve schema_update_options in GCP -[AIRFLOW-41] Fix pool oversubscription -[AIRFLOW-489] Add API Framework -[AIRFLOW-653] Add some missing endpoint tests -[AIRFLOW-652] Remove obsolete endpoint -[AIRFLOW-345] Add contrib ECSOperator -[AIRFLOW-650] Adding Celect to user list -[AIRFLOW-510] Filter Paused Dags, show Last Run & Trigger Dag -[AIRFLOW-643] Improve date handling for sf_hook -[AIRFLOW-638] Add schema_update_options to GCP ops -[AIRFLOW-640] Install and enable nose-ignore-docstring -[AIRFLOW-639]AIRFLOW-639] Alphasort package names -[AIRFLOW-375] Fix pylint errors -[AIRFLOW-347] Show empty DAG runs in tree view -[AIRFLOW-628] Adding SalesforceHook to contrib/hooks -[AIRFLOW-514] hive hook loads data from pandas DataFrame into hive and infers types -[AIRFLOW-565] Fixes DockerOperator on Python3.x -[AIRFLOW-635] Encryption option for S3 hook -[AIRFLOW-137] Fix max_active_runs on clearing tasks -[AIRFLOW-343] Fix schema plumbing in HiveServer2Hook -[AIRFLOW-130] Fix ssh operator macosx -[AIRFLOW-633] Show TI attributes in TI view -[AIRFLOW-626][AIRFLOW-1] HTML Content does not show up when sending email with attachment -[AIRFLOW-533] Set autocommit via set_autocommit -[AIRFLOW-629] stop pinning lxml -[AIRFLOW-464] Add setdefault method to Variable -[AIRFLOW-626][AIRFLOW-1] HTML Content does not show up when sending email with attachment -[AIRFLOW-591] Add datadog hook & sensor -[AIRFLOW-561] Add RedshiftToS3Transfer operator -[AIRFLOW-570] Pass root to date form on gantt -[AIRFLOW-504] Store fractional seconds in MySQL tables -[AIRFLOW-623] LDAP attributes not always a list -[AIRFLOW-611] source_format in BigQueryBaseCursor -[AIRFLOW-619] Fix exception in Gannt chart -[AIRFLOW-618] Cast DateTimes to avoid sqllite errors -[AIRFLOW-422] Add JSON endpoint for task info -[AIRFLOW-616][AIRFLOW-617] Minor fixes to PR tool UX -[AIRFLOW-179] Fix DbApiHook with non-ASCII chars -[AIRFLOW-566] Add timeout while fetching logs -[AIRFLOW-615] Set graph glyphicon first -[AIRFLOW-609] Add application_name to PostgresHook -[AIRFLOW-604] Revert .first() to .one() -[AIRFLOW-370] Create AirflowConfigException in exceptions.py -[AIRFLOW-582] Fixes TI.get_dagrun filter (removes start_date) -[AIRFLOW-568] Fix double task_stats count if a DagRun is active -[AIRFLOW-585] Fix race condition in backfill execution loop -[AIRFLOW-580] Prevent landscape warning on .format -[AIRFLOW-597] Check if content is None, not false-equivalent -[AIRFLOW-586] test_dag_v1 fails from 0 to 3 a.m. 
-[AIRFLOW-453] Add XCom Admin Page -[AIRFLOW-588] Add Google Cloud Storage Object sensor[] -[AIRFLOW-592] example_xcom import Error -[AIRFLOW-587] Fix incorrect scope for Google Auth[] -[AIRFLOW-589] Add templatable job_name[] -[AIRFLOW-227] Show running config in config view -[AIRFLOW-319]AIRFLOW-319] xcom push response in HTTP Operator -[AIRFLOW-385] Add symlink to latest scheduler log directory -[AIRFLOW-583] Fix decode error in gcs_to_bq -[AIRFLOW-96] s3_conn_id using environment variable -[AIRFLOW-575] Clarify tutorial and FAQ about `schedule_interval` always inheriting from DAG object -[AIRFLOW-577] Output BigQuery job for improved debugging -[AIRFLOW-560] Get URI & SQLA engine from Connection -[AIRFLOW-518] Require DataProfilingMixin for Variables CRUD -[AIRFLOW-553] Fix load path for filters.js -[AIRFLOW-554] Add Jinja support to Spark-sql -[AIRFLOW-550] Make ssl config check empty string safe -[AIRFLOW-500] Use id for github allowed teams -[AIRFLOW-556] Add UI PR guidelines -[AIRFLOW-358][AIRFLOW-430] Add `connections` cli -[AIRFLOW-548] Load DAGs immediately & continually -[AIRFLOW-539] Updated BQ hook and BQ operator to support Standard SQL. -[AIRFLOW-378] Add string casting to params of spark-sql operator -[AIRFLOW-544] Add Pause/Resume toggle button -[AIRFLOW-333][AIRFLOW-258] Fix non-module plugin components -[AIRFLOW-542] Add tooltip to DAGs links icons -[AIRFLOW-530] Update docs to reflect connection environment var has to be in uppercase -[AIRFLOW-525] Update template_fields in Qubole Op -[AIRFLOW-480] Support binary file download from GCS -[AIRFLOW-198] Implement latest_only_operator -[AIRFLOW-91] Add SSL config option for the webserver -[AIRFLOW-191] Fix connection leak with PostgreSQL backend -[AIRFLOW-512] Fix 'bellow' typo in docs & comments -[AIRFLOW-509][AIRFLOW-1] Create operator to delete tables in BigQuery -[AIRFLOW-498] Remove hard-coded gcp project id -[AIRFLOW-505] Support unicode characters in authors' names -[AIRFLOW-494] Add per-operator success/failure metrics -[AIRFLOW-488] Fix test_simple fail -[AIRFLOW-468] Update Panda requirement to 0.17.1 -[AIRFLOW-159] Add cloud integration section + GCP documentation -[AIRFLOW-477][AIRFLOW-478] Restructure security section for clarity -[AIRFLOW-467] Allow defining of project_id in BigQueryHook -[AIRFLOW-483] Change print to logging statement -[AIRFLOW-475] make the segment granularity in Druid hook configurable +- [AIRFLOW-900] Double trigger should not kill original task instance +- [AIRFLOW-900] Fixes bugs in LocalTaskJob for double run protection +- [AIRFLOW-932] Do not mark tasks removed when backfilling +- [AIRFLOW-961] run onkill when SIGTERMed +- [AIRFLOW-910] Use parallel task execution for backfills +- [AIRFLOW-967] Wrap strings in native for py2 ldap compatibility +- [AIRFLOW-941] Use defined parameters for psycopg2 +- [AIRFLOW-719] Prevent DAGs from ending prematurely +- [AIRFLOW-938] Use test for True in task_stats queries +- [AIRFLOW-937] Improve performance of task_stats +- [AIRFLOW-933] use ast.literal_eval rather eval because ast.literal_eval does not execute input. 
+- [AIRFLOW-925] Revert airflow.hooks change that cherry-pick picked +- [AIRFLOW-919] Running tasks with no start date shouldn't break a DAGs UI +- [AIRFLOW-802] Add spark-submit operator/hook +- [AIRFLOW-897] Prevent dagruns from failing with unfinished tasks +- [AIRFLOW-861] make pickle_info endpoint be login_required +- [AIRFLOW-853] use utf8 encoding for stdout line decode +- [AIRFLOW-856] Make sure execution date is set for local client +- [AIRFLOW-830][AIRFLOW-829][AIRFLOW-88] Reduce Travis log verbosity +- [AIRFLOW-831] Restore import to fix broken tests +- [AIRFLOW-794] Access DAGS_FOLDER and SQL_ALCHEMY_CONN exclusively from settings +- [AIRFLOW-694] Fix config behaviour for empty envvar +- [AIRFLOW-365] Set dag.fileloc explicitly and use for Code view +- [AIRFLOW-931] Do not set QUEUED in TaskInstances +- [AIRFLOW-899] Tasks in SCHEDULED state should be white in the UI instead of black +- [AIRFLOW-895] Address Apache release incompliancies +- [AIRFLOW-893][AIRFLOW-510] Fix crashing webservers when a dagrun has no start date +- [AIRFLOW-793] Enable compressed loading in S3ToHiveTransfer +- [AIRFLOW-863] Example DAGs should have recent start dates +- [AIRFLOW-869] Refactor mark success functionality +- [AIRFLOW-856] Make sure execution date is set for local client +- [AIRFLOW-814] Fix Presto*CheckOperator.__init__ +- [AIRFLOW-844] Fix cgroups directory creation +- [AIRFLOW-816] Use static nvd3 and d3 +- [AIRFLOW-821] Fix py3 compatibility +- [AIRFLOW-817] Check for None value of execution_date in endpoint +- [AIRFLOW-822] Close db before exception +- [AIRFLOW-815] Add prev/next execution dates to template variables +- [AIRFLOW-813] Fix unterminated unit tests in SchedulerJobTest +- [AIRFLOW-813] Fix unterminated scheduler unit tests +- [AIRFLOW-806] UI should properly ignore DAG doc when it is None +- [AIRFLOW-812] Fix the scheduler termination bug. +- [AIRFLOW-780] Fix dag import errors no longer working +- [AIRFLOW-783] Fix py3 incompatibility in BaseTaskRunner +- [AIRFLOW-810] Correct down_revision dag_id/state index creation +- [AIRFLOW-807] Improve scheduler performance for large DAGs +- [AIRFLOW-798] Check return_code before forcing termination +- [AIRFLOW-139] Let psycopg2 handle autocommit for PostgresHook +- [AIRFLOW-776] Add missing cgroups devel dependency +- [AIRFLOW-777] Fix expression to check if a DagRun is in running state +- [AIRFLOW-785] Don't import CgroupTaskRunner at global scope +- [AIRFLOW-784] Pin funcsigs to 1.0.0 +- [AIRFLOW-624] Fix setup.py to not import airflow.version as version +- [AIRFLOW-779] Task should fail with specific message when deleted +- [AIRFLOW-778] Fix completey broken MetastorePartitionSensor +- [AIRFLOW-739] Set pickle_info log to debug +- [AIRFLOW-771] Make S3 logs append instead of clobber +- [AIRFLOW-773] Fix flaky datetime addition in api test +- [AIRFLOW-219][AIRFLOW-398] Cgroups + impersonation +- [AIRFLOW-683] Add jira hook, operator and sensor +- [AIRFLOW-762] Add Google DataProc delete operator +- [AIRFLOW-760] Update systemd config +- [AIRFLOW-759] Use previous dag_run to verify depend_on_past +- [AIRFLOW-757] Set child_process_log_directory default more sensible +- [AIRFLOW-692] Open XCom page to super-admins only +- [AIRFLOW-737] Fix HDFS Sensor directory. 
+- [AIRFLOW-747] Fix retry_delay not honoured +- [AIRFLOW-558] Add Support for dag.catchup=(True|False) Option +- [AIRFLOW-489] Allow specifying execution date in trigger_dag API +- [AIRFLOW-738] Commit deleted xcom items before insert +- [AIRFLOW-729] Add Google Cloud Dataproc cluster creation operator +- [AIRFLOW-728] Add Google BigQuery table sensor +- [AIRFLOW-741] Log to debug instead of info for app.py +- [AIRFLOW-731] Fix period bug for NamedHivePartitionSensor +- [AIRFLOW-740] Pin jinja2 to < 2.9.0 +- [AIRFLOW-663] Improve time units for task performance charts +- [AIRFLOW-665] Fix email attachments +- [AIRFLOW-734] Fix SMTP auth regression when not using user/pass +- [AIRFLOW-702] Fix LDAP Regex Bug +- [AIRFLOW-717] Add Cloud Storage updated sensor +- [AIRFLOW-695] Retries do not execute because dagrun is in FAILED state +- [AIRFLOW-673] Add operational metrics test for SchedulerJob +- [AIRFLOW-727] try_number is not increased +- [AIRFLOW-715] A more efficient HDFS Sensor: +- [AIRFLOW-716] Allow AVRO BigQuery load-job without schema +- [AIRFLOW-718] Allow the query URI for DataProc Pig +- Log needs to be part of try/catch block +- [AIRFLOW-721] Descendant process can disappear before termination +- [AIRFLOW-403] Bash operator's kill method leaves underlying processes running +- [AIRFLOW-657] Add AutoCommit Parameter for MSSQL +- [AIRFLOW-641] Improve pull request instructions +- [AIRFLOW-685] Add test for MySqlHook.bulk_load() +- [AIRFLOW-686] Match auth backend config section +- [AIRFLOW-691] Add SSH KeepAlive option to SSH_hook +- [AIRFLOW-709] Use same engine for migrations and reflection +- [AIRFLOW-700] Update to reference to web authentication documentation +- [AIRFLOW-649] Support non-sched DAGs in LatestOnlyOp +- [AIRFLOW-712] Fix AIRFLOW-667 to use proper HTTP error properties +- [AIRFLOW-710] Add OneFineStay as official user +- [AIRFLOW-703][AIRFLOW-1] Stop Xcom being cleared too early +- [AIRFLOW-679] Stop concurrent task instances from running +- [AIRFLOW-704][AIRFLOW-1] Fix invalid syntax in BQ hook +- [AIRFLOW-667] Handle BigQuery 503 error +- [AIRFLOW-680] Disable connection pool for commands +- [AIRFLOW-678] Prevent scheduler from double triggering TIs +- [AIRFLOW-677] Kill task if it fails to heartbeat +- [AIRFLOW-674] Ability to add descriptions for DAGs +- [AIRFLOW-682] Bump MAX_PERIODS to make mark_success work for large DAGs +- Use jdk selector to set required jdk +- [AIRFLOW-647] Restore dag.get_active_runs +- [AIRFLOW-662] Change seasons to months in project description +- [AIRFLOW-656] Add dag/task/date index to xcom table +- [AIRFLOW-658] Improve schema_update_options in GCP +- [AIRFLOW-41] Fix pool oversubscription +- [AIRFLOW-489] Add API Framework +- [AIRFLOW-653] Add some missing endpoint tests +- [AIRFLOW-652] Remove obsolete endpoint +- [AIRFLOW-345] Add contrib ECSOperator +- [AIRFLOW-650] Adding Celect to user list +- [AIRFLOW-510] Filter Paused Dags, show Last Run & Trigger Dag +- [AIRFLOW-643] Improve date handling for sf_hook +- [AIRFLOW-638] Add schema_update_options to GCP ops +- [AIRFLOW-640] Install and enable nose-ignore-docstring +- [AIRFLOW-639]AIRFLOW-639] Alphasort package names +- [AIRFLOW-375] Fix pylint errors +- [AIRFLOW-347] Show empty DAG runs in tree view +- [AIRFLOW-628] Adding SalesforceHook to contrib/hooks +- [AIRFLOW-514] hive hook loads data from pandas DataFrame into hive and infers types +- [AIRFLOW-565] Fixes DockerOperator on Python3.x +- [AIRFLOW-635] Encryption option for S3 hook +- [AIRFLOW-137] Fix max_active_runs 
on clearing tasks +- [AIRFLOW-343] Fix schema plumbing in HiveServer2Hook +- [AIRFLOW-130] Fix ssh operator macosx +- [AIRFLOW-633] Show TI attributes in TI view +- [AIRFLOW-626][AIRFLOW-1] HTML Content does not show up when sending email with attachment +- [AIRFLOW-533] Set autocommit via set_autocommit +- [AIRFLOW-629] stop pinning lxml +- [AIRFLOW-464] Add setdefault method to Variable +- [AIRFLOW-626][AIRFLOW-1] HTML Content does not show up when sending email with attachment +- [AIRFLOW-591] Add datadog hook & sensor +- [AIRFLOW-561] Add RedshiftToS3Transfer operator +- [AIRFLOW-570] Pass root to date form on gantt +- [AIRFLOW-504] Store fractional seconds in MySQL tables +- [AIRFLOW-623] LDAP attributes not always a list +- [AIRFLOW-611] source_format in BigQueryBaseCursor +- [AIRFLOW-619] Fix exception in Gannt chart +- [AIRFLOW-618] Cast DateTimes to avoid sqllite errors +- [AIRFLOW-422] Add JSON endpoint for task info +- [AIRFLOW-616][AIRFLOW-617] Minor fixes to PR tool UX +- [AIRFLOW-179] Fix DbApiHook with non-ASCII chars +- [AIRFLOW-566] Add timeout while fetching logs +- [AIRFLOW-615] Set graph glyphicon first +- [AIRFLOW-609] Add application_name to PostgresHook +- [AIRFLOW-604] Revert .first() to .one() +- [AIRFLOW-370] Create AirflowConfigException in exceptions.py +- [AIRFLOW-582] Fixes TI.get_dagrun filter (removes start_date) +- [AIRFLOW-568] Fix double task_stats count if a DagRun is active +- [AIRFLOW-585] Fix race condition in backfill execution loop +- [AIRFLOW-580] Prevent landscape warning on .format +- [AIRFLOW-597] Check if content is None, not false-equivalent +- [AIRFLOW-586] test_dag_v1 fails from 0 to 3 a.m. +- [AIRFLOW-453] Add XCom Admin Page +- [AIRFLOW-588] Add Google Cloud Storage Object sensor[] +- [AIRFLOW-592] example_xcom import Error +- [AIRFLOW-587] Fix incorrect scope for Google Auth[] +- [AIRFLOW-589] Add templatable job_name[] +- [AIRFLOW-227] Show running config in config view +- [AIRFLOW-319]AIRFLOW-319] xcom push response in HTTP Operator +- [AIRFLOW-385] Add symlink to latest scheduler log directory +- [AIRFLOW-583] Fix decode error in gcs_to_bq +- [AIRFLOW-96] s3_conn_id using environment variable +- [AIRFLOW-575] Clarify tutorial and FAQ about `schedule_interval` always inheriting from DAG object +- [AIRFLOW-577] Output BigQuery job for improved debugging +- [AIRFLOW-560] Get URI & SQLA engine from Connection +- [AIRFLOW-518] Require DataProfilingMixin for Variables CRUD +- [AIRFLOW-553] Fix load path for filters.js +- [AIRFLOW-554] Add Jinja support to Spark-sql +- [AIRFLOW-550] Make ssl config check empty string safe +- [AIRFLOW-500] Use id for github allowed teams +- [AIRFLOW-556] Add UI PR guidelines +- [AIRFLOW-358][AIRFLOW-430] Add `connections` cli +- [AIRFLOW-548] Load DAGs immediately & continually +- [AIRFLOW-539] Updated BQ hook and BQ operator to support Standard SQL. 
+- [AIRFLOW-378] Add string casting to params of spark-sql operator +- [AIRFLOW-544] Add Pause/Resume toggle button +- [AIRFLOW-333][AIRFLOW-258] Fix non-module plugin components +- [AIRFLOW-542] Add tooltip to DAGs links icons +- [AIRFLOW-530] Update docs to reflect connection environment var has to be in uppercase +- [AIRFLOW-525] Update template_fields in Qubole Op +- [AIRFLOW-480] Support binary file download from GCS +- [AIRFLOW-198] Implement latest_only_operator +- [AIRFLOW-91] Add SSL config option for the webserver +- [AIRFLOW-191] Fix connection leak with PostgreSQL backend +- [AIRFLOW-512] Fix 'bellow' typo in docs & comments +- [AIRFLOW-509][AIRFLOW-1] Create operator to delete tables in BigQuery +- [AIRFLOW-498] Remove hard-coded gcp project id +- [AIRFLOW-505] Support unicode characters in authors' names +- [AIRFLOW-494] Add per-operator success/failure metrics +- [AIRFLOW-488] Fix test_simple fail +- [AIRFLOW-468] Update Panda requirement to 0.17.1 +- [AIRFLOW-159] Add cloud integration section + GCP documentation +- [AIRFLOW-477][AIRFLOW-478] Restructure security section for clarity +- [AIRFLOW-467] Allow defining of project_id in BigQueryHook +- [AIRFLOW-483] Change print to logging statement +- [AIRFLOW-475] make the segment granularity in Druid hook configurable -AIRFLOW 1.7.2 +Airflow 1.7.2 ------------- -[AIRFLOW-463] Link Airflow icon to landing page -[AIRFLOW-149] Task Dependency Engine + Why Isn't My Task Running View -[AIRFLOW-361] Add default failure handler for the Qubole Operator -[AIRFLOW-353] Fix dag run status update failure -[AIRFLOW-447] Store source URIs in Python 3 compatible list -[AIRFLOW-443] Make module names unique when importing -[AIRFLOW-444] Add Google authentication backend -[AIRFLOW-446][AIRFLOW-445] Adds missing dataproc submit options -[AIRFLOW-431] Add CLI for CRUD operations on pools -[AIRFLOW-329] Update Dag Overview Page with Better Status Columns -[AIRFLOW-360] Fix style warnings in models.py -[AIRFLOW-425] Add white fill for null state tasks in tree view. 
-[AIRFLOW-69] Use dag runs in backfill jobs -[AIRFLOW-415] Make dag_id not found error clearer -[AIRFLOW-416] Use ordinals in README's company list -[AIRFLOW-369] Allow setting default DAG orientation -[AIRFLOW-410] Add 2 Q/A to the FAQ in the docs -[AIRFLOW-407] Add different colors for some sensors -[AIRFLOW-414] Improve error message for missing FERNET_KEY -[AIRFLOW-406] Sphinx/rst fixes -[AIRFLOW-412] Fix lxml dependency -[AIRFLOW-413] Fix unset path bug when backfilling via pickle -[AIRFLOW-78] Airflow clear leaves dag_runs -[AIRFLOW-402] Remove NamedHivePartitionSensor static check, add docs -[AIRFLOW-394] Add an option to the Task Duration graph to show cumulative times -[AIRFLOW-404] Retry download if unpacking fails for hive -[AIRFLOW-276] Gunicorn rolling restart -[AIRFLOW-399] Remove dags/testdruid.py -[AIRFLOW-400] models.py/DAG.set_dag_runs_state() does not correctly set state -[AIRFLOW-395] Fix colon/equal signs typo for resources in default config -[AIRFLOW-397] Documentation: Fix typo "instatiating" to "instantiating" -[AIRFLOW-395] Remove trailing commas from resources in config -[AIRFLOW-388] Add a new chart for Task_Tries for each DAG -[AIRFLOW-322] Fix typo in FAQ section -[AIRFLOW-375] Pylint fixes -limit scope to user email only AIRFLOW-386 -[AIRFLOW-383] Cleanup example qubole operator dag -[AIRFLOW-160] Parse DAG files through child processes -[AIRFLOW-381] Manual UI Dag Run creation: require dag_id field -[AIRFLOW-373] Enhance CLI variables functionality -[AIRFLOW-379] Enhance Variables page functionality: import/export variables -[AIRFLOW-331] modify the LDAP authentication config lines in 'Security' sample codes -[AIRFLOW-356][AIRFLOW-355][AIRFLOW-354] Replace nobr, enable DAG only exists locally message, change edit DAG icon -[AIRFLOW-362] Import __future__ division -[AIRFLOW-359] Pin flask-login to 0.2.11 -[AIRFLOW-261] Add bcc and cc fields to EmailOperator -[AIRFLOW-348] Fix code style warnings -[AIRFLOW-349] Add metric for number of zombies killed -[AIRFLOW-340] Remove unused dependency on Babel -[AIRFLOW-339]: Ability to pass a flower conf file -[AIRFLOW-341][operators] Add resource requirement attributes to operators -[AIRFLOW-335] Fix simple style errors/warnings -[AIRFLOW-337] Add __repr__ to VariableAccessor and VariableJsonAccessor -[AIRFLOW-334] Fix using undefined variable -[AIRFLOW-315] Fix blank lines code style warnings -[AIRFLOW-306] Add Spark-sql Hook and Operator -[AIRFLOW-327] Add rename method to the FTPHook -[AIRFLOW-321] Fix a wrong code example about tests/dags -[AIRFLOW-316] Always check DB state for Backfill Job execution -[AIRFLOW-264] Adding workload management for Hive -[AIRFLOW-297] support exponential backoff option for retry delay -[AIRFLOW-31][AIRFLOW-200] Add note to updating.md -[AIRFLOW-307] There is no __neq__ python magic method. -[AIRFLOW-309] Add requirements of develop dependencies to docs -[AIRFLOW-307] Rename __neq__ to __ne__ python magic method. 
-[AIRFLOW-313] Fix code style for sqoop_hook.py -[AIRFLOW-311] Fix wrong path in CONTRIBUTING.md -[AIRFLOW-24] DataFlow Java Operator -[AIRFLOW-308] Add link to refresh DAG within DAG view header -[AIRFLOW-314] Fix BigQuery cursor run_table_upsert method -[AIRFLOW-298] fix incubator diclaimer in docs -[AIRFLOW-284] HiveServer2Hook fix for cursor scope for get_results -[AIRFLOW-260] More graceful exit when issues can't be closed -[AIRFLOW-260] Handle case when no version is found -[AIRFLOW-228] Handle empty version list in PR tool -[AIRFLOW-302] Improve default squash commit message -[AIRFLOW-187] Improve prompt styling -[AIRFLOW-187] Fix typo in argument name -[AIRFLOW-187] Move "Close XXX" message to end of squash commit -[AIRFLOW-247] Add EMR hook, operators and sensors. Add AWS base hook -[AIRFLOW-301] Fix broken unit test -[AIRFLOW-100] Add execution_date_fn to ExternalTaskSensor -[AIRFLOW-282] Remove PR Tool logic that depends on version formatting -[AIRFLOW-291] Add index for state in TI table -[AIRFLOW-269] Add some unit tests for PostgreSQL -[AIRFLOW-296] template_ext is being treated as a string rather than a tuple in qubole operator -[AIRFLOW-286] Improve FTPHook to implement context manager interface -[AIRFLOW-243] Create NamedHivePartitionSensor -[AIRFLOW-246] Improve dag_stats endpoint query -[AIRFLOW-189] Highlighting of Parent/Child nodes in Graphs -[ARFLOW-255] Check dagrun timeout when comparing active runs -[AIRFLOW-281] Add port to mssql_hook -[AIRFLOW-285] Use Airflow 2.0 style imports for all remaining hooks/operators -[AIRFLOW-40] Add LDAP group filtering feature. -[AIRFLOW-277] Multiple deletions does not work in Task Instances view if using SQLite backend -[AIRFLOW-200] Make hook/operator imports lazy, and print proper exceptions -[AIRFLOW-283] Make store_to_xcom_key a templated field in GoogleCloudStorageDownloadOperator -[AIRFLOW-278] Support utf-8 ecoding for SQL -[AIRFLOW-280] clean up tmp druid table no matter if an ingestion job succeeds or not -[AIRFLOW-274] Add XCom functionality to GoogleCloudStorageDownloadOperator -[AIRFLOW-273] Create an svg version of the airflow logo. 
-[AIRFLOW-275] Update contributing guidelines -[AIRFLOW-244] Modify hive operator to inject analysis data -[AIRFLOW-162] Allow variable to be accessible into templates -[AIRFLOW-248] Add Apache license header to all files -[AIRFLOW-263] Remove temp backtick file -[AIRFLOW-252] Raise Sqlite exceptions when deleting tasks instance in WebUI -[AIRFLOW-180] Fix timeout behavior for sensors -[AIRFLOW-262] Simplify commands in MANIFEST.in -[AIRFLOW-31] Add zope dependency -[AIRFLOW-6] Remove dependency on Highcharts -[AIRFLOW-234] make task that aren't `running` self-terminate -[AIRFLOW-256] Fix test_scheduler_reschedule heartrate -Add Python 3 compatibility fix -[AIRFLOW-31] Use standard imports for hooks/operators -[AIRFLOW-173] Initial implementation of FileSensor -[AIRFLOW-224] Collect orphaned tasks and reschedule them -[AIRFLOW-239] Fix tests indentation -[AIRFLOW-225] Better units for task duration graph -[AIRFLOW-241] Add testing done section to PR template -[AIRFLOW-222] Show duration of task instances in ui -[AIRFLOW-231] Do not eval user input in PrestoHook -[AIRFLOW-216] Add Sqoop Hook and Operator -[AIRFLOW-171] Add upgrade notes on email and S3 to 1.7.1.2 -[AIRFLOW-238] Make compatible with flask-admin 1.4.1 -[AIRFLOW-230] [HiveServer2Hook] adding multi statements support -[AIRFLOW-142] setup_env.sh doesn't download hive tarball if hdp is specified as distro -[AIRFLOW-223] Make parametrable the IP on which Flower binds to -[AIRFLOW-218] Added option to enable webserver gunicorn access/err logs -[AIRFLOW-213] Add "Closes #X" phrase to commit messages -[AIRFLOW-68] Align start_date with the schedule_interval -[AIRFLOW-9] Improving docs to meet Apache's standards -[AIRFLOW-131] Make XCom.clear more selective -[AIRFLOW-214] Fix occasion of detached taskinstance -[AIRFLOW-206] Add commit to close PR -[AIRFLOW-206] Always load local log files if they exist -[AIRFLOW-211] Fix JIRA "resolve" vs "close" behavior -[AIRFLOW-64] Add note about relative DAGS_FOLDER -[AIRFLOW-114] Sort plugins dropdown -[AIRFLOW-209] Add scheduler tests and improve lineage handling -[AIRFLOW-207] Improve JIRA auth workflow -[AIRFLOW-187] Improve PR tool UX -[AIRFLOW-155] Documentation of Qubole Operator -Optimize and refactor process_dag -[AIRFLOW-185] Handle empty versions list -[AIRFLOW-201] Fix for HiveMetastoreHook + kerberos -[AIRFLOW-202]: Fixes stray print line -[AIRFLOW-196] Fix bug that exception is not handled in HttpSensor -[AIRFLOW-195] : Add toggle support to subdag clearing in the CLI -[AIRFLOW-23] Support for Google Cloud DataProc -[AIRFLOW-25] Configuration for Celery always required -[AIRFLOW-190] Add codecov and remove download count -[AIRFLOW-168] Correct evaluation of @once schedule -[AIRFLOW-183] Fetch log from remote when worker returns 4xx/5xx response -[AIRFLOW-181] Fix failing unpacking of hadoop by redownloading -[AIRFLOW-176] remove unused formatting key -[AIRFLOW-167]: Add dag_state option in cli -[AIRFLOW-178] Fix bug so that zip file is detected in DAG folder -[AIRFLOW-176] Improve PR Tool JIRA workflow -AIRFLOW-45: Support Hidden Airflow Variables -[AIRFLOW-175] Run git-reset before checkout in PR tool -[AIRFLOW-157] Make PR tool Py3-compat; add JIRA command -[AIRFLOW-170] Add missing @apply_defaults - +- [AIRFLOW-463] Link Airflow icon to landing page +- [AIRFLOW-149] Task Dependency Engine + Why Isn't My Task Running View +- [AIRFLOW-361] Add default failure handler for the Qubole Operator +- [AIRFLOW-353] Fix dag run status update failure +- [AIRFLOW-447] Store source URIs in 
Python 3 compatible list +- [AIRFLOW-443] Make module names unique when importing +- [AIRFLOW-444] Add Google authentication backend +- [AIRFLOW-446][AIRFLOW-445] Adds missing dataproc submit options +- [AIRFLOW-431] Add CLI for CRUD operations on pools +- [AIRFLOW-329] Update Dag Overview Page with Better Status Columns +- [AIRFLOW-360] Fix style warnings in models.py +- [AIRFLOW-425] Add white fill for null state tasks in tree view. +- [AIRFLOW-69] Use dag runs in backfill jobs +- [AIRFLOW-415] Make dag_id not found error clearer +- [AIRFLOW-416] Use ordinals in README's company list +- [AIRFLOW-369] Allow setting default DAG orientation +- [AIRFLOW-410] Add 2 Q/A to the FAQ in the docs +- [AIRFLOW-407] Add different colors for some sensors +- [AIRFLOW-414] Improve error message for missing FERNET_KEY +- [AIRFLOW-406] Sphinx/rst fixes +- [AIRFLOW-412] Fix lxml dependency +- [AIRFLOW-413] Fix unset path bug when backfilling via pickle +- [AIRFLOW-78] Airflow clear leaves dag_runs +- [AIRFLOW-402] Remove NamedHivePartitionSensor static check, add docs +- [AIRFLOW-394] Add an option to the Task Duration graph to show cumulative times +- [AIRFLOW-404] Retry download if unpacking fails for hive +- [AIRFLOW-276] Gunicorn rolling restart +- [AIRFLOW-399] Remove dags/testdruid.py +- [AIRFLOW-400] models.py/DAG.set_dag_runs_state() does not correctly set state +- [AIRFLOW-395] Fix colon/equal signs typo for resources in default config +- [AIRFLOW-397] Documentation: Fix typo "instatiating" to "instantiating" +- [AIRFLOW-395] Remove trailing commas from resources in config +- [AIRFLOW-388] Add a new chart for Task_Tries for each DAG +- [AIRFLOW-322] Fix typo in FAQ section +- [AIRFLOW-375] Pylint fixes +- limit scope to user email only AIRFLOW-386 +- [AIRFLOW-383] Cleanup example qubole operator dag +- [AIRFLOW-160] Parse DAG files through child processes +- [AIRFLOW-381] Manual UI Dag Run creation: require dag_id field +- [AIRFLOW-373] Enhance CLI variables functionality +- [AIRFLOW-379] Enhance Variables page functionality: import/export variables +- [AIRFLOW-331] modify the LDAP authentication config lines in 'Security' sample codes +- [AIRFLOW-356][AIRFLOW-355][AIRFLOW-354] Replace nobr, enable DAG only exists locally message, change edit DAG icon +- [AIRFLOW-362] Import __future__ division +- [AIRFLOW-359] Pin flask-login to 0.2.11 +- [AIRFLOW-261] Add bcc and cc fields to EmailOperator +- [AIRFLOW-348] Fix code style warnings +- [AIRFLOW-349] Add metric for number of zombies killed +- [AIRFLOW-340] Remove unused dependency on Babel +- [AIRFLOW-339]: Ability to pass a flower conf file +- [AIRFLOW-341][operators] Add resource requirement attributes to operators +- [AIRFLOW-335] Fix simple style errors/warnings +- [AIRFLOW-337] Add __repr__ to VariableAccessor and VariableJsonAccessor +- [AIRFLOW-334] Fix using undefined variable +- [AIRFLOW-315] Fix blank lines code style warnings +- [AIRFLOW-306] Add Spark-sql Hook and Operator +- [AIRFLOW-327] Add rename method to the FTPHook +- [AIRFLOW-321] Fix a wrong code example about tests/dags +- [AIRFLOW-316] Always check DB state for Backfill Job execution +- [AIRFLOW-264] Adding workload management for Hive +- [AIRFLOW-297] support exponential backoff option for retry delay +- [AIRFLOW-31][AIRFLOW-200] Add note to updating.md +- [AIRFLOW-307] There is no __neq__ python magic method. +- [AIRFLOW-309] Add requirements of develop dependencies to docs +- [AIRFLOW-307] Rename __neq__ to __ne__ python magic method. 
+- [AIRFLOW-313] Fix code style for sqoop_hook.py +- [AIRFLOW-311] Fix wrong path in CONTRIBUTING.md +- [AIRFLOW-24] DataFlow Java Operator +- [AIRFLOW-308] Add link to refresh DAG within DAG view header +- [AIRFLOW-314] Fix BigQuery cursor run_table_upsert method +- [AIRFLOW-298] fix incubator diclaimer in docs +- [AIRFLOW-284] HiveServer2Hook fix for cursor scope for get_results +- [AIRFLOW-260] More graceful exit when issues can't be closed +- [AIRFLOW-260] Handle case when no version is found +- [AIRFLOW-228] Handle empty version list in PR tool +- [AIRFLOW-302] Improve default squash commit message +- [AIRFLOW-187] Improve prompt styling +- [AIRFLOW-187] Fix typo in argument name +- [AIRFLOW-187] Move "Close XXX" message to end of squash commit +- [AIRFLOW-247] Add EMR hook, operators and sensors. Add AWS base hook +- [AIRFLOW-301] Fix broken unit test +- [AIRFLOW-100] Add execution_date_fn to ExternalTaskSensor +- [AIRFLOW-282] Remove PR Tool logic that depends on version formatting +- [AIRFLOW-291] Add index for state in TI table +- [AIRFLOW-269] Add some unit tests for PostgreSQL +- [AIRFLOW-296] template_ext is being treated as a string rather than a tuple in qubole operator +- [AIRFLOW-286] Improve FTPHook to implement context manager interface +- [AIRFLOW-243] Create NamedHivePartitionSensor +- [AIRFLOW-246] Improve dag_stats endpoint query +- [AIRFLOW-189] Highlighting of Parent/Child nodes in Graphs +- [ARFLOW-255] Check dagrun timeout when comparing active runs +- [AIRFLOW-281] Add port to mssql_hook +- [AIRFLOW-285] Use Airflow 2.0 style imports for all remaining hooks/operators +- [AIRFLOW-40] Add LDAP group filtering feature. +- [AIRFLOW-277] Multiple deletions does not work in Task Instances view if using SQLite backend +- [AIRFLOW-200] Make hook/operator imports lazy, and print proper exceptions +- [AIRFLOW-283] Make store_to_xcom_key a templated field in GoogleCloudStorageDownloadOperator +- [AIRFLOW-278] Support utf-8 ecoding for SQL +- [AIRFLOW-280] clean up tmp druid table no matter if an ingestion job succeeds or not +- [AIRFLOW-274] Add XCom functionality to GoogleCloudStorageDownloadOperator +- [AIRFLOW-273] Create an svg version of the airflow logo. 
+- [AIRFLOW-275] Update contributing guidelines +- [AIRFLOW-244] Modify hive operator to inject analysis data +- [AIRFLOW-162] Allow variable to be accessible into templates +- [AIRFLOW-248] Add Apache license header to all files +- [AIRFLOW-263] Remove temp backtick file +- [AIRFLOW-252] Raise Sqlite exceptions when deleting tasks instance in WebUI +- [AIRFLOW-180] Fix timeout behavior for sensors +- [AIRFLOW-262] Simplify commands in MANIFEST.in +- [AIRFLOW-31] Add zope dependency +- [AIRFLOW-6] Remove dependency on Highcharts +- [AIRFLOW-234] make task that aren't `running` self-terminate +- [AIRFLOW-256] Fix test_scheduler_reschedule heartrate +- Add Python 3 compatibility fix +- [AIRFLOW-31] Use standard imports for hooks/operators +- [AIRFLOW-173] Initial implementation of FileSensor +- [AIRFLOW-224] Collect orphaned tasks and reschedule them +- [AIRFLOW-239] Fix tests indentation +- [AIRFLOW-225] Better units for task duration graph +- [AIRFLOW-241] Add testing done section to PR template +- [AIRFLOW-222] Show duration of task instances in ui +- [AIRFLOW-231] Do not eval user input in PrestoHook +- [AIRFLOW-216] Add Sqoop Hook and Operator +- [AIRFLOW-171] Add upgrade notes on email and S3 to 1.7.1.2 +- [AIRFLOW-238] Make compatible with flask-admin 1.4.1 +- [AIRFLOW-230] [HiveServer2Hook] adding multi statements support +- [AIRFLOW-142] setup_env.sh doesn't download hive tarball if hdp is specified as distro +- [AIRFLOW-223] Make parametrable the IP on which Flower binds to +- [AIRFLOW-218] Added option to enable webserver gunicorn access/err logs +- [AIRFLOW-213] Add "Closes #X" phrase to commit messages +- [AIRFLOW-68] Align start_date with the schedule_interval +- [AIRFLOW-9] Improving docs to meet Apache's standards +- [AIRFLOW-131] Make XCom.clear more selective +- [AIRFLOW-214] Fix occasion of detached taskinstance +- [AIRFLOW-206] Add commit to close PR +- [AIRFLOW-206] Always load local log files if they exist +- [AIRFLOW-211] Fix JIRA "resolve" vs "close" behavior +- [AIRFLOW-64] Add note about relative DAGS_FOLDER +- [AIRFLOW-114] Sort plugins dropdown +- [AIRFLOW-209] Add scheduler tests and improve lineage handling +- [AIRFLOW-207] Improve JIRA auth workflow +- [AIRFLOW-187] Improve PR tool UX +- [AIRFLOW-155] Documentation of Qubole Operator +- Optimize and refactor process_dag +- [AIRFLOW-185] Handle empty versions list +- [AIRFLOW-201] Fix for HiveMetastoreHook + kerberos +- [AIRFLOW-202]: Fixes stray print line +- [AIRFLOW-196] Fix bug that exception is not handled in HttpSensor +- [AIRFLOW-195] : Add toggle support to subdag clearing in the CLI +- [AIRFLOW-23] Support for Google Cloud DataProc +- [AIRFLOW-25] Configuration for Celery always required +- [AIRFLOW-190] Add codecov and remove download count +- [AIRFLOW-168] Correct evaluation of @once schedule +- [AIRFLOW-183] Fetch log from remote when worker returns 4xx/5xx response +- [AIRFLOW-181] Fix failing unpacking of hadoop by redownloading +- [AIRFLOW-176] remove unused formatting key +- [AIRFLOW-167]: Add dag_state option in cli +- [AIRFLOW-178] Fix bug so that zip file is detected in DAG folder +- [AIRFLOW-176] Improve PR Tool JIRA workflow +- AIRFLOW-45: Support Hidden Airflow Variables +- [AIRFLOW-175] Run git-reset before checkout in PR tool +- [AIRFLOW-157] Make PR tool Py3-compat; add JIRA command +- [AIRFLOW-170] Add missing @apply_defaults -AIRFLOW 1.7.1, 2016-05-19 +Airflow 1.7.1, 2016-05-19 ------------------------- - Fix : Don't treat premature tasks as could_not_run tasks -- AIRFLOW-92 Avoid 
unneeded upstream_failed session closes apache/incubator-airflow#1485 +- AIRFLOW-92 Avoid unneeded upstream_failed session closes apache/airflow#1485 - Add logic to lock DB and avoid race condition - Handle queued tasks from multiple jobs/executors - AIRFLOW-52 Warn about overwriting tasks in a DAG @@ -978,7 +2496,7 @@ AIRFLOW 1.7.1, 2016-05-19 - Show only Airflow's deprecation warnings - Set DAG_FOLDER for unit tests - Missing comma in setup.py -- Deprecate *args and **kwargs in BaseOperator +- Deprecate args and kwargs in BaseOperator - Raise deep scheduler exceptions to force a process restart. - Change inconsistent example DAG owners - Fix module path of send_email_smtp in configuration diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e45060bfe75c0..56258d5d75142 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,9 +1,27 @@ + + # Contributing Contributions are welcome and are greatly appreciated! Every little bit helps, and credit will always be given. - # Table of Contents * [TOC](#table-of-contents) * [Types of Contributions](#types-of-contributions) @@ -15,11 +33,10 @@ little bit helps, and credit will always be given. * [Documentation](#documentation) * [Development and Testing](#development-and-testing) - [Setting up a development environment](#setting-up-a-development-environment) - - [Pull requests guidelines](#pull-request-guidelines) - - [Testing Locally](#testing-locally) + - [Running unit tests](#running-unit-tests) + * [Pull requests guidelines](#pull-request-guidelines) * [Changing the Metadata Database](#changing-the-metadata-database) - ## Types of Contributions ### Report Bugs @@ -64,11 +81,13 @@ If you are proposing a feature: ## Documentation The latest API documentation is usually available -[here](https://airflow.incubator.apache.org/). To generate a local version, +[here](https://airflow.apache.org/). To generate a local version, you need to have set up an Airflow development environemnt (see below). Also install the `doc` extra. - pip install -e .[doc] +``` +pip install -e '.[doc]' +``` Generate the documentation by running: @@ -79,65 +98,170 @@ extras to build the full API reference. ## Development and Testing -### Set up a development env using Docker +### Setting up a development environment -Go to your Airflow directory and start a new docker container. You can choose between Python 2 or 3, whatever you prefer. +There are three ways to setup an Apache Airflow development environment. -``` -# Start docker in your Airflow directory -docker run -t -i -v `pwd`:/airflow/ python:2 bash +1. Using tools and libraries installed directly on your system. + + Install Python (2.7.x or 3.4.x), MySQL, and libxml by using system-level package + managers like yum, apt-get for Linux, or Homebrew for Mac OS at first. Refer to the [base CI Dockerfile](https://github.com/apache/incubator-airflow-ci/blob/master/Dockerfile) for + a comprehensive list of required packages. + + Then install python development requirements. It is usually best to work in a virtualenv: + + ```bash + cd $AIRFLOW_HOME + virtualenv env + source env/bin/activate + pip install -e '.[devel]' + ``` + +2. Using a Docker container + + Go to your Airflow directory and start a new docker container. You can choose between Python 2 or 3, whatever you prefer. 
+ + ``` + # Start docker in your Airflow directory + docker run -t -i -v `pwd`:/airflow/ -w /airflow/ python:3 bash + + + # Install Airflow with all the required dependencies, + # including the devel which will provide the development tools + pip install -e '.[hdfs,hive,druid,devel]' + + # Init the database + airflow initdb + + nosetests -v tests/hooks/test_druid_hook.py + + test_get_first_record (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok + test_get_records (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok + test_get_uri (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok + test_get_conn_url (tests.hooks.test_druid_hook.TestDruidHook) ... ok + test_submit_gone_wrong (tests.hooks.test_druid_hook.TestDruidHook) ... ok + test_submit_ok (tests.hooks.test_druid_hook.TestDruidHook) ... ok + test_submit_timeout (tests.hooks.test_druid_hook.TestDruidHook) ... ok + test_submit_unknown_response (tests.hooks.test_druid_hook.TestDruidHook) ... ok + + ---------------------------------------------------------------------- + Ran 8 tests in 3.036s + + OK + ``` + + The Airflow code is mounted inside of the Docker container, so if you change something using your favorite IDE, you can directly test is in the container. + +3. Using [Docker Compose](https://docs.docker.com/compose/) and Airflow's CI scripts. + + Start a docker container through Compose for development to avoid installing the packages directly on your system. The following will give you a shell inside a container, run all required service containers (MySQL, PostgresSQL, krb5 and so on) and install all the dependencies: + + ```bash + docker-compose -f scripts/ci/docker-compose.yml run airflow-testing bash + # From the container + export TOX_ENV=py27-backend_mysql-env_docker + /app/scripts/ci/run-ci.sh + ``` -# Go to the Airflow directory -cd /airflow/ + If you wish to run individual tests inside of Docker environment you can do as follows: -# Install Airflow with all the required dependencies, -# including the devel which will provide the development tools -pip install -e ".[hdfs,hive,druid,devel]" + ```bash + # From the container (with your desired environment) with druid hook + export TOX_ENV=py27-backend_mysql-env_docker + /app/scripts/ci/run-ci.sh -- tests/hooks/test_druid_hook.py + ``` -# Init the database -airflow initdb -nosetests -v tests/hooks/test_druid_hook.py +### Running unit tests - test_get_first_record (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok - test_get_records (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok - test_get_uri (tests.hooks.test_druid_hook.TestDruidDbApiHook) ... ok - test_get_conn_url (tests.hooks.test_druid_hook.TestDruidHook) ... ok - test_submit_gone_wrong (tests.hooks.test_druid_hook.TestDruidHook) ... ok - test_submit_ok (tests.hooks.test_druid_hook.TestDruidHook) ... ok - test_submit_timeout (tests.hooks.test_druid_hook.TestDruidHook) ... ok - test_submit_unknown_response (tests.hooks.test_druid_hook.TestDruidHook) ... ok +To run tests locally, once your unit test environment is setup (directly on your +system or through our Docker setup) you should be able to simply run +``./run_unit_tests.sh`` at will. 
+
+For example, in order to just execute the "core" unit tests, run the following:
+
+```
+./run_unit_tests.sh tests.core:CoreTest -s --logging-level=DEBUG
+```
-    ----------------------------------------------------------------------
-    Ran 8 tests in 3.036s
+or a single test method:
-    OK
+```
+./run_unit_tests.sh tests.core:CoreTest.test_check_operators -s --logging-level=DEBUG
 ```
-The Airflow code is mounted inside of the Docker container, so if you change something using your favorite IDE, you can directly test is in the container.
+To run the whole test suite with Docker Compose, do:
-### Set up a development env using Virtualenv
+```
+# Install Docker Compose first, then this will run the tests
+docker-compose -f scripts/ci/docker-compose.yml run airflow-testing /app/scripts/ci/run-ci.sh
+```
-Please install python(2.7.x or 3.4.x), mysql, and libxml by using system-level package
-managers like yum, apt-get for Linux, or homebrew for Mac OS at first.
-It is usually best to work in a virtualenv and tox. Install development requirements:
+Alternatively, you can also set up [Travis CI](https://travis-ci.org/) on your repo to automate this.
+It is free for open source projects.
-    cd $AIRFLOW_HOME
-    virtualenv env
-    source env/bin/activate
-    pip install -e .[devel]
-    tox
+Another great way of automating linting and testing is to use [Git Hooks](https://git-scm.com/book/uz/v2/Customizing-Git-Git-Hooks). For example, you could create a `pre-commit` file based on the Travis CI pipeline so that a local pipeline is triggered before each commit; if this pipeline fails (returns an exit code other than `0`), the commit does not go through.
+In theory this has the advantage that you cannot commit any failing code, which in turn reduces the number of errors in the Travis CI pipelines.
+
+Since there are a lot of tests, the script would take very long to run, so you should probably only test your new feature locally.
+
+The following example of a `pre-commit` file allows you:
+- to lint your code via flake8
+- to test your code via nosetests in a Docker container based on Python 2
+- to test your code via nosetests in a Docker container based on Python 3
+
+```
+#!/bin/sh
+
+GREEN='\033[0;32m'
+NO_COLOR='\033[0m'
+
+setup_python_env() {
+    local venv_path=${1}
+
+    echo -e "${GREEN}Activating python virtual environment ${venv_path}..${NO_COLOR}"
+    source ${venv_path}
+}
+run_linting() {
+    local project_dir=$(git rev-parse --show-toplevel)
+
+    echo -e "${GREEN}Running flake8 over directory ${project_dir}..${NO_COLOR}"
+    flake8 ${project_dir}
+}
+run_testing_in_docker() {
+    local feature_path=${1}
+    local airflow_py2_container=${2}
+    local airflow_py3_container=${3}
+
+    echo -e "${GREEN}Running tests in ${feature_path} in airflow python 2 docker container..${NO_COLOR}"
+    docker exec -i -w /airflow/ ${airflow_py2_container} nosetests -v ${feature_path}
+    echo -e "${GREEN}Running tests in ${feature_path} in airflow python 3 docker container..${NO_COLOR}"
+    docker exec -i -w /airflow/ ${airflow_py3_container} nosetests -v ${feature_path}
+}
+
+set -e
+# NOTE: Before running this make sure you have set the function arguments correctly.
+setup_python_env /Users/feluelle/venv/bin/activate
+run_linting
+run_testing_in_docker tests/contrib/hooks/test_imap_hook.py dazzling_chatterjee quirky_stallman
+
+```
+
+For more information on how to run a subset of the tests, take a look at the
+nosetests docs.
+
+See also the list of test classes and methods in `tests/core.py`.
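As a rough illustration of what "a subset of the tests" can look like in practice, the sketch below reuses the selectors already shown above and passes them through `run_unit_tests.sh`, which the examples above suggest forwards its arguments to nosetests. The module paths are only examples taken from the Docker section earlier, not a required layout, and `-x`/`--stop` is a standard nose flag rather than anything this PR prescribes.

```bash
# Run every test in a single module (file paths work as selectors too)
./run_unit_tests.sh tests/hooks/test_druid_hook.py -s --logging-level=DEBUG

# Stop at the first failure while iterating on a fix
./run_unit_tests.sh tests.core:CoreTest -x -s --logging-level=DEBUG
```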
Feel free to customize based on the extras available in [setup.py](./setup.py) -### Pull Request Guidelines +## Pull Request Guidelines Before you submit a pull request from your forked repo, check that it meets these guidelines: 1. The pull request should include tests, either as doctests, unit tests, or -both. The airflow repo uses [Travis CI](https://travis-ci.org/apache/incubator-airflow) -to run the tests and [codecov](https://codecov.io/gh/apache/incubator-airflow) +both. The airflow repo uses [Travis CI](https://travis-ci.org/apache/airflow) +to run the tests and [codecov](https://codecov.io/gh/apache/airflow) to track coverage. You can set up both for free on your fork. It will help you making sure you do not break the build with your PR and that you help increase coverage. @@ -149,7 +273,7 @@ The JIRA link should also be contained in the PR description. 4. Preface your commit's subject & PR's title with **[AIRFLOW-XXX]** where *XXX* is the JIRA number. We compose release notes (i.e. for Airflow releases) from all commit titles in a release. By placing the JIRA number in the commit title and hence in the release notes, -Airflow users can look into JIRA and Github PRs for more details about a particular change. +Airflow users can look into JIRA and GitHub PRs for more details about a particular change. 5. Add an [Apache License](http://www.apache.org/legal/src-headers.html) header to all new files 6. If the pull request adds functionality, the docs should be updated as part @@ -161,7 +285,7 @@ writing code that works in both Python 2 and 3, see the documentation at the Airflow requirement and should be used where possible). 8. As Airflow grows as a project, we try to enforce a more consistent style and try to follow the Python community guidelines. We track this -using [landscape.io](https://landscape.io/github/apache/incubator-airflow/), +using [landscape.io](https://landscape.io/github/apache/airflow/), which you can setup on your fork as well to check before you submit your PR. We currently enforce most [PEP8](https://www.python.org/dev/peps/pep-0008/) and a few other linting rules. It is usually a good idea to lint locally @@ -171,74 +295,93 @@ using `flake8 airflow tests`. `git diff upstream/master -u -- "*.py" | flake8 -- commit messages and adhere to them. It makes the lives of those who come after you a lot easier. -### Testing locally - -#### TL;DR -Tests can then be run with (see also the [Running unit tests](#running-unit-tests) section below): - - ./run_unit_tests.sh +### Changing the Metadata Database -Individual test files can be run with: +When developing features the need may arise to persist information to the the +metadata database. Airflow has [Alembic](https://bitbucket.org/zzzeek/alembic) +built-in to handle all schema changes. Alembic must be installed on your +development machine before continuing. - nosetests [path to file] +``` +# starting at the root of the project +$ pwd +~/airflow +# change to the airflow directory +$ cd airflow +$ alembic revision -m "add new field to db" + Generating +~/airflow/airflow/migrations/versions/12341123_add_new_field_to_db.py +``` -#### Running unit tests +## Setting up the node / npm javascript environment (ONLY FOR www_rbac) -We *highly* recommend setting up [Travis CI](https://travis-ci.org/) on -your repo to automate this. It is free for open source projects. If for -some reason you cannot, you can use the steps below to run tests. +`airflow/www_rbac/` contains all npm-managed, front end assets. 
+Flask-Appbuilder itself comes bundled with jQuery and bootstrap. +While these may be phased out over time, these packages are currently not +managed with npm. -Here are loose guidelines on how to get your environment to run the unit tests. -We do understand that no one out there can run the full test suite since -Airflow is meant to connect to virtually any external system and that you most -likely have only a subset of these in your environment. You should run the -CoreTests and tests related to things you touched in your PR. +### Node/npm versions +Make sure you are using recent versions of node and npm. No problems have been found with node>=8.11.3 and npm>=6.1.3 -To set up a unit test environment, first take a look at `run_unit_tests.sh` and -understand that your ``AIRFLOW_CONFIG`` points to an alternate config file -while running the tests. You shouldn't have to alter this config file but -you may if need be. +### Using npm to generate bundled files -From that point, you can actually export these same environment variables in -your shell, start an Airflow webserver ``airflow webserver -d`` and go and -configure your connection. Default connections that are used in the tests -should already have been created, you just need to point them to the systems -where you want your tests to run. +#### npm +First, npm must be available in your environment. If it is not you can run the following commands +(taken from [this source](https://gist.github.com/DanHerbert/9520689)) +``` +brew install node --without-npm +echo prefix=~/.npm-packages >> ~/.npmrc +curl -L https://www.npmjs.com/install.sh | sh +``` -Once your unit test environment is setup, you should be able to simply run -``./run_unit_tests.sh`` at will. +The final step is to add `~/.npm-packages/bin` to your `PATH` so commands you install globally are usable. +Add something like this to your `.bashrc` file, then `source ~/.bashrc` to reflect the change. +``` +export PATH="$HOME/.npm-packages/bin:$PATH" +``` -For example, in order to just execute the "core" unit tests, run the following: +#### npm packages +To install third party libraries defined in `package.json`, run the +following within the `airflow/www_rbac/` directory which will install them in a +new `node_modules/` folder within `www_rbac/`. -``` -./run_unit_tests.sh tests.core:CoreTest -s --logging-level=DEBUG +```bash +# from the root of the repository, move to where our JS package.json lives +cd airflow/www_rbac/ +# run npm install to fetch all the dependencies +npm install ``` -or a single test method: +To parse and generate bundled files for airflow, run either of the +following commands. The `dev` flag will keep the npm script running and +re-run it upon any changes within the assets directory. ``` -./run_unit_tests.sh tests.core:CoreTest.test_check_operators -s --logging-level=DEBUG +# Compiles the production / optimized js & css +npm run prod + +# Start a web server that manages and updates your assets as you modify them +npm run dev ``` -For more information on how to run a subset of the tests, take a look at the -nosetests docs. +#### Upgrading npm packages -See also the list of test classes and methods in `tests/core.py`. +Should you add or upgrade a npm package, which involves changing `package.json`, you'll need to re-run `npm install` +and push the newly generated `package-lock.json` file so we get the reproducible build. 
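For illustration, the upgrade workflow described above might look roughly like the sketch below; the package name and version are placeholders, not actual dependencies of Airflow.

```bash
# from the directory that contains our package.json
cd airflow/www_rbac/

# add or bump a dependency (placeholder name/version), then refresh node_modules
npm install some-package@1.2.3
npm install

# rebuild the bundled assets
npm run prod

# commit both manifests so the build stays reproducible
git add package.json package-lock.json
```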
-### Changing the Metadata Database +#### Javascript Style Guide -When developing features the need may arise to persist information to the the -metadata database. Airflow has [Alembic](https://bitbucket.org/zzzeek/alembic) -built-in to handle all schema changes. Alembic must be installed on your -development machine before continuing. +We try to enforce a more consistent style and try to follow the JS community guidelines. +Once you add or modify any javascript code in the project, please make sure it follows the guidelines +defined in [Airbnb JavaScript Style Guide](https://github.com/airbnb/javascript). +Apache Airflow uses [ESLint](https://eslint.org/) as a tool for identifying and reporting on patterns in JavaScript, +which can be used by running any of the following commands. +```bash +# Check JS code in .js and .html files, and report any errors/warnings +npm run lint + +# Check JS code in .js and .html files, report any errors/warnings and fix them if possible +npm run lint:fix ``` -# starting at the root of the project -$ pwd -~/airflow -# change to the airflow directory -$ cd airflow -$ alembic revision -m "add new field to db" - Generating -~/airflow/airflow/migrations/versions/12341123_add_new_field_to_db.py -``` + diff --git a/DISCLAIMER b/DISCLAIMER deleted file mode 100644 index 8fe69887c3638..0000000000000 --- a/DISCLAIMER +++ /dev/null @@ -1 +0,0 @@ -Apache Airflow is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. diff --git a/INSTALL b/INSTALL index 5c8f03eb663e6..c3d15a23eaa08 100644 --- a/INSTALL +++ b/INSTALL @@ -1,13 +1,22 @@ -# INSTALL / BUILD instruction for Apache Airflow (incubating) -# fetch the tarball and untar the source +# INSTALL / BUILD instructions for Apache Airflow + +# [required] fetch the tarball and untar the source +# change into the directory that was untarred. # [optional] run Apache RAT (release audit tool) to validate license headers -# RAT docs here: https://creadur.apache.org/rat/ +# RAT docs here: https://creadur.apache.org/rat/. Requires Java and Apache Rat java -jar apache-rat.jar -E ./.rat-excludes -d . -# [optional] by default one of Apache Airflow's dependencies pulls in a GPL -# library. If this is a concern issue (also every upgrade): -# export SLUGIFY_USES_TEXT_UNIDECODE=yes +# [optional] Airflow pulls in quite a lot of dependencies in order +# to connect to other services. You might want to test or run Airflow +# from a virtual env to make sure those dependencies are separated +# from your system wide versions +python -m my_env +source my_env/bin/activate + +# [required] building and installing +# by pip (preferred) +pip install . -# install the release +# or directly python setup.py install diff --git a/LICENSE b/LICENSE index 405540c64fb87..e3335acb8014a 100644 --- a/LICENSE +++ b/LICENSE @@ -209,6 +209,19 @@ limitations under the License. licenses. 
+======================================================================== +Third party Apache 2.0 licenses +======================================================================== + +The following components are provided under the Apache 2.0 License. +See project link for details. The text of each license is also included +at licenses/LICENSE-[project].txt. + + (ALv2 License) hue v4.3.0 (https://github.com/cloudera/hue/) + (ALv2 License) jqclock v2.3.0 (https://github.com/JohnRDOrazio/jQuery-Clock-Plugin) + (ALv2 License) bootstrap3-typeahead v4.0.2 (https://github.com/bassjobsen/Bootstrap-3-Typeahead) + (ALv2 License) airflow.contrib.auth.backends.github_enterprise_auth + ======================================================================== MIT licenses ======================================================================== @@ -216,16 +229,19 @@ MIT licenses The following components are provided under the MIT License. See project link for details. The text of each license is also included at licenses/LICENSE-[project].txt. - (MIT License) jquery (https://jquery.org/license/) - (MIT License) dagre-d3 (https://github.com/cpettitt/dagre-d3) - (MIT License) bootstrap (https://github.com/twbs/bootstrap/) - (MIT License) d3-tip (https://github.com/Caged/d3-tip) - (MIT License) dataTables (https://datatables.net) - (MIT License) WebGL-2D (https://github.com/gameclosure/webgl-2d) - (MIT License) Underscorejs (http://underscorejs.org) - (MIT License) Bootstrap Toggle (http://www.bootstraptoggle.com) - (MIT License) normalize.css (http://necolas.github.io/normalize.css/) - (MIT License) ElasticMock (https://github.com/vrcmarcos/elasticmock) + (MIT License) jquery v2.1.4 (https://jquery.org/license/) + (MIT License) dagre-d3 v0.6.1 (https://github.com/cpettitt/dagre-d3) + (MIT License) bootstrap v3.2 (https://github.com/twbs/bootstrap/) + (MIT License) d3-tip v0.6.3 (https://github.com/Caged/d3-tip) + (MIT License) dataTables v1.10.10 (https://datatables.net) + (MIT License) WebGL-2D (git-commit 9a7ec26) (https://github.com/gameclosure/webgl-2d) + (MIT License) Underscorejs v1.5.0 (http://underscorejs.org) + (MIT License) Bootstrap Toggle v2.2.0 (http://www.bootstraptoggle.com) + (MIT License) normalize.css v3.0.2 (http://necolas.github.io/normalize.css/) + (MIT License) ElasticMock v1.3.2 (https://github.com/vrcmarcos/elasticmock) + (MIT License) MomentJS v2.22.2 (http://momentjs.com/) + (MIT License) python-slugify v2.0.1 (https://github.com/un33k/python-slugify) + (MIT License) python-nvd3 v0.15.0 (https://github.com/areski/python-nvd3) ======================================================================== BSD 2-Clause licenses @@ -234,15 +250,16 @@ The following components are provided under the BSD 2-Clause license. See file headers and project links for details. The text of each license is also included at licenses/LICENSE-[project].txt. - (BSD 2 License) flask-kerberos (https://github.com/mkomitee/flask-kerberos) + (BSD 2 License) flask-kerberos v1.0.4 (https://github.com/mkomitee/flask-kerberos) ======================================================================== BSD 3-Clause licenses ======================================================================== -The following components are provided under the BSD 2-Clause license. -See file headers and project links for details. +The following components are provided under the BSD 3-Clause license. See project links for details. The text of each license is also included at licenses/LICENSE-[project].txt. 
- (BSD 3 License) Ace (https://github.com/ajaxorg/ace) - (BSD 3 License) d3js (https://d3js.org) - (BSD 3 License) parallel-coordinates (http://syntagmatic.github.com/parallel-coordinates/) + (BSD 3 License) Ace v1.1.8 (https://github.com/ajaxorg/ace) + (BSD 3 License) d3js v3.5.17 (https://d3js.org) + (BSD 3 License) parallel-coordinates v0.7.0 (http://syntagmatic.github.com/parallel-coordinates/) + (BSD 3 License) scikit-learn v0.19.1 (https://github.com/scikit-learn/scikit-learn) + diff --git a/MANIFEST.in b/MANIFEST.in index c04b80617d18d..5f0921dabf979 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,15 +18,19 @@ include NOTICE include LICENSE -include DISCLAIMER include CHANGELOG.txt include README.md +graft licenses/ graft airflow/www/templates graft airflow/www/static +graft airflow/www_rbac graft airflow/www_rbac/static graft airflow/www_rbac/templates graft airflow/www_rbac/translations +graft airflow/_vendor/ include airflow/alembic.ini graft scripts/systemd graft scripts/upstart graft airflow/config_templates +recursive-exclude airflow/www_rbac/node_modules * +global-exclude __pycache__ *.pyc diff --git a/NOTICE b/NOTICE index b1e78ad9c3380..2e6202e30e809 100644 --- a/NOTICE +++ b/NOTICE @@ -1,6 +1,38 @@ Apache Airflow -Copyright 2016 and onwards The Apache Software Foundation - +Copyright 2016-2019 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). + +======================================================================= + +Apache Airflow contains subcomponents with separate copyright notices and +license terms. Your use of the source code for the these subcomponents +is subject to the terms and conditions of their respective licenses. + +See the LICENSE file for a list of subcomponents and dependencies and +their respective licenses. + +airflow.contrib.auth.backends.github_enterprise_auth: +----------------------------------------------------- + +* Copyright 2015 Matthew Pelland (matt@pelland.io) + +hue: +----- +This product contains a modified portion of 'Hue' developed by Cloudera, Inc. +(https://github.com/cloudera/hue/). + +* Copyright 2009-2017 Cloudera Inc. + +python-slugify: +--------------- + +* Copyright (c) Val Neekman @ Neekware Inc. 
http://neekware.com + +python-nvd3: +------------ + +* Copyright (c) 2013 Arezqui Belaid and other contributors + + diff --git a/README.md b/README.md index e68d26cc8cfa2..fceb2a7d7bffb 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,43 @@ -# Airflow + + +# Apache Airflow [![PyPI version](https://badge.fury.io/py/apache-airflow.svg)](https://badge.fury.io/py/apache-airflow) -[![Build Status](https://travis-ci.org/apache/incubator-airflow.svg?branch=master)](https://travis-ci.org/apache/incubator-airflow) -[![Coverage Status](https://img.shields.io/codecov/c/github/apache/incubator-airflow/master.svg)](https://codecov.io/github/apache/incubator-airflow?branch=master) +[![Build Status](https://travis-ci.org/apache/airflow.svg?branch=master)](https://travis-ci.org/apache/airflow) +[![Coverage Status](https://img.shields.io/codecov/c/github/apache/airflow/master.svg)](https://codecov.io/github/apache/airflow?branch=master) [![Documentation Status](https://readthedocs.org/projects/airflow/badge/?version=latest)](https://airflow.readthedocs.io/en/latest/?badge=latest) [![License](http://img.shields.io/:license-Apache%202-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.txt) -[![Join the chat at https://gitter.im/apache/incubator-airflow](https://badges.gitter.im/apache/incubator-airflow.svg)](https://gitter.im/apache/incubator-airflow?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/apache-airflow.svg)](https://pypi.org/project/apache-airflow/) +[![Twitter Follow](https://img.shields.io/twitter/follow/ApacheAirflow.svg?style=social&label=Follow)](https://twitter.com/ApacheAirflow) -_NOTE: The transition from 1.8.0 (or before) to 1.8.1 (or after) requires uninstalling Airflow before installing the new version. The package name was changed from `airflow` to `apache-airflow` as of version 1.8.1._ +_NOTE: The transition from 1.8.0 (or before) to 1.8.1 (or after) requires uninstalling Apache Airflow before installing the new version. The package name was changed from `airflow` to `apache-airflow` as of version 1.8.1._ -Airflow is a platform to programmatically author, schedule, and monitor -workflows. +Apache Airflow (or simply Airflow) is a platform to programmatically author, schedule, and monitor workflows. When workflows are defined as code, they become more maintainable, versionable, testable, and collaborative. -Use Airflow to author workflows as directed acyclic graphs (DAGs) of tasks. -The Airflow scheduler executes your tasks on an array of workers while -following the specified dependencies. Rich command line utilities make -performing complex surgeries on DAGs a snap. The rich user interface -makes it easy to visualize pipelines running in production, -monitor progress, and troubleshoot issues when needed. +Use Airflow to author workflows as directed acyclic graphs (DAGs) of tasks. The Airflow scheduler executes your tasks on an array of workers while following the specified dependencies. Rich command line utilities make performing complex surgeries on DAGs a snap. The rich user interface makes it easy to visualize pipelines running in production, monitor progress, and troubleshoot issues when needed. 
## Getting started -Please visit the Airflow Platform documentation (latest **stable** release) for help with [installing Airflow](https://airflow.incubator.apache.org/installation.html), getting a [quick start](https://airflow.incubator.apache.org/start.html), or a more complete [tutorial](https://airflow.incubator.apache.org/tutorial.html). +Please visit the Airflow Platform documentation (latest **stable** release) for help with [installing Airflow](https://airflow.apache.org/installation.html), getting a [quick start](https://airflow.apache.org/start.html), or a more complete [tutorial](https://airflow.apache.org/tutorial.html). Documentation of GitHub master (latest development branch): [ReadTheDocs Documentation](https://airflow.readthedocs.io/en/latest/) @@ -49,7 +63,7 @@ unit of work and continuity. - **Dynamic**: Airflow pipelines are configuration as code (Python), allowing for dynamic pipeline generation. This allows for writing code that instantiates pipelines dynamically. - **Extensible**: Easily define your own operators, executors and extend the library so that it fits the level of abstraction that suits your environment. - **Elegant**: Airflow pipelines are lean and explicit. Parameterizing your scripts is built into the core of Airflow using the powerful **Jinja** templating engine. -- **Scalable**: Airflow has a modular architecture and uses a message queue to orchestrate an arbitrary number of workers. Airflow is ready to scale to infinity. +- **Scalable**: Airflow has a modular architecture and uses a message queue to orchestrate an arbitrary number of workers. ## User Interface @@ -71,9 +85,14 @@ unit of work and continuity. - **Code View**: Quick way to view source code of a DAG. ![](/docs/img/code.png) +## Contributing + +Want to help build Apache Airflow? Check out our [contributing documentation](https://github.com/apache/airflow/blob/master/CONTRIBUTING.md). + + ## Who uses Airflow? -As the Airflow community grows, we'd like to keep track of who is using +As the Apache Airflow community grows, we'd like to keep track of who is using the platform. Please send a PR with your company name and @githubhandle if you may. @@ -233,6 +252,7 @@ Currently **officially** using Airflow: 1. [Tile](https://tile.com/) [[@ranjanmanish](https://github.com/ranjanmanish)] 1. [Tokopedia](https://www.tokopedia.com/) [@topedmaria](https://github.com/topedmaria) 1. [Twine Labs](https://www.twinelabs.com/) [[@ivorpeles](https://github.com/ivorpeles)] +1. [Twitter](https://www.twitter.com/) [[@aoen](https://github.com/aoen)] 1. [T2 Systems](http://t2systems.com) [[@unclaimedpants](https://github.com/unclaimedpants)] 1. [Ubisoft](https://www.ubisoft.com/) [[@Walkoss](https://github.com/Walkoss)] 1. [United Airlines](https://www.united.com/) [[@ilopezfr](https://github.com/ilopezfr)] @@ -256,10 +276,17 @@ Currently **officially** using Airflow: 1. [Zymergen](https://www.zymergen.com/) 1. [99](https://99taxis.com) [[@fbenevides](https://github.com/fbenevides), [@gustavoamigo](https://github.com/gustavoamigo) & [@mmmaia](https://github.com/mmmaia)] +## Who Maintains Apache Airflow? + +Airflow is the work of the [community](https://github.com/apache/airflow/graphs/contributors), +but the [core committers/maintainers](https://people.apache.org/committers-by-project.html#airflow) +are responsible for reviewing and merging PRs as well as steering conversation around new feature requests. 
+If you would like to become a maintainer, please review the Apache Airflow +[committer requirements](https://cwiki.apache.org/confluence/display/AIRFLOW/Committers). + ## Links -* [Documentation](https://airflow.incubator.apache.org/) -* [Chat](https://gitter.im/apache/incubator-airflow) -* [Apache Airflow Incubation Status](http://incubator.apache.org/projects/airflow.html) -* [More](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Links) +- [Documentation](https://airflow.apache.org/) +- [Chat](https://apache-airflow-slack.herokuapp.com/) +- [More](https://cwiki.apache.org/confluence/display/AIRFLOW/Airflow+Links) diff --git a/TODO.md b/TODO.md index cf19035e1b995..1e4e6ed249e1a 100644 --- a/TODO.md +++ b/TODO.md @@ -1,3 +1,22 @@ + + #### Roadmap items * UI page answering "Why isn't this task instance running?" * Attempt removing DagBag caching for the web server diff --git a/UPDATING.md b/UPDATING.md index 3a66e735c3ac4..bf48ee1a708f3 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -1,11 +1,334 @@ + + # Updating Airflow This file documents any backwards-incompatible changes in Airflow and assists users migrating to a new version. -## Airflow Master +## Airflow 1.10.3 + +### RedisPy dependency updated to v3 series +If you are using the Redis Sensor or Hook you may have to update your code. See +[redis-py porting instructions] to check if your code might be affected (MSET, +MSETNX, ZADD, and ZINCRBY all were, but read the full doc). + +[redis-py porting instructions]: https://github.com/andymccurdy/redis-py/tree/3.2.0#upgrading-from-redis-py-2x-to-30 + +### SLUGIFY_USES_TEXT_UNIDECODE or AIRFLOW_GPL_UNIDECODE no longer required + +It is no longer required to set one of the environment variables to avoid +a GPL dependency. Airflow will now always use text-unidecode if unidecode +was not installed before. + +### new `sync_parallelism` config option in celery section + +The new `sync_parallelism` config option will control how many processes CeleryExecutor will use to +fetch celery task state in parallel. Default value is max(1, number of cores - 1) + +### Rename of BashTaskRunner to StandardTaskRunner + +BashTaskRunner has been renamed to StandardTaskRunner. It is the default task runner +so you might need to update your config. + +`task_runner = StandardTaskRunner` + +### Modification to config file discovery + +If the `AIRFLOW_CONFIG` environment variable was not set and the +`~/airflow/airflow.cfg` file existed, airflow previously used +`~/airflow/airflow.cfg` instead of `$AIRFLOW_HOME/airflow.cfg`. Now airflow +will discover its config file using the `$AIRFLOW_CONFIG` and `$AIRFLOW_HOME` +environment variables rather than checking for the presence of a file. + +### New `dag_discovery_safe_mode` config option + +If `dag_discovery_safe_mode` is enabled, only check files for DAGs if +they contain the strings "airflow" and "DAG". For backwards +compatibility, this option is enabled by default. + +### Changes in Google Cloud Platform related operators + +Most GCP-related operators have now optional `PROJECT_ID` parameter. In case you do not specify it, +the project id configured in +[GCP Connection](https://airflow.apache.org/howto/manage-connections.html#connection-type-gcp) is used. +There will be an `AirflowException` thrown in case `PROJECT_ID` parameter is not specified and the +connection used has no project id defined. This change should be backwards compatible as earlier version +of the operators had `PROJECT_ID` mandatory. 
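As a sketch of what this looks like in a DAG file (the argument values below are illustrative, and the exact parameter names should be checked against the 1.10.3 operator docstrings), omitting `project_id` makes the operator fall back to the project configured on the connection:

```python
from airflow.contrib.operators.gcp_compute_operator import GceInstanceStartOperator

# project_id is intentionally omitted: the operator falls back to the project id
# configured on the GCP connection (gcp_conn_id). If that connection has no
# project id either, an AirflowException is raised.
start_instance = GceInstanceStartOperator(
    task_id="start_instance",
    zone="europe-west1-b",            # illustrative zone
    resource_id="example-instance",   # illustrative GCE instance name
    gcp_conn_id="google_cloud_default",
)
```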
+ +Operators involved: + + * GCP Compute Operators + * GceInstanceStartOperator + * GceInstanceStopOperator + * GceSetMachineTypeOperator + * GCP Function Operators + * GcfFunctionDeployOperator + * GCP Cloud SQL Operators + * CloudSqlInstanceCreateOperator + * CloudSqlInstancePatchOperator + * CloudSqlInstanceDeleteOperator + * CloudSqlInstanceDatabaseCreateOperator + * CloudSqlInstanceDatabasePatchOperator + * CloudSqlInstanceDatabaseDeleteOperator + +Other GCP operators are unaffected. + +### Changes in Google Cloud Platform related hooks + +The change in GCP operators implies that GCP Hooks for those operators now require keyword parameters rather +than positional ones in all methods where `project_id` is used. The methods throw an explanatory exception +in case they are called using positional parameters. + +Hooks involved: + + * GceHook + * GcfHook + * CloudSqlHook + +Other GCP hooks are unaffected. + +### Changed behaviour of using default value when accessing variables +It's now possible to use `None` as a default value with the `default_var` parameter when getting a variable, e.g. + +```python +foo = Variable.get("foo", default_var=None) +if foo is None: + handle_missing_foo() +``` + +(Note: there is already `Variable.setdefault()` which may be helpful in some cases.) + +This changes the behaviour if you previously explicitly provided `None` as a default value. If your code expects a `KeyError` to be thrown, then don't pass the `default_var` argument. + +### Removal of `airflow_home` config setting + +There were previously two ways of specifying the Airflow "home" directory +(`~/airflow` by default): the `AIRFLOW_HOME` environment variable, and the +`airflow_home` config setting in the `[core]` section. + +If they had two different values, different parts of the code base would end up +with different values. The config setting has been deprecated, and you should +remove the value from the config file and set the `AIRFLOW_HOME` environment +variable if you need to use a non-default value for this. + +(Since this setting is used to calculate what config file to load, it is not +possible to keep just the config option.) + +### Change of two method signatures in `GCPTransferServiceHook` + +The signature of the `create_transfer_job` method in `GCPTransferServiceHook` +class has changed. The change does not alter the behavior of the method. + +Old signature: +```python +def create_transfer_job(self, description, schedule, transfer_spec, project_id=None): +``` +New signature: +```python +def create_transfer_job(self, body): +``` + +It is necessary to rewrite calls to the method. The new call looks like this: +```python +body = { + 'status': 'ENABLED', + 'projectId': project_id, + 'description': description, + 'transferSpec': transfer_spec, + 'schedule': schedule, +} +gct_hook.create_transfer_job(body) +``` +The change results from the unification of all hooks and adjusts to +[the official recommendations](https://lists.apache.org/thread.html/e8534d82be611ae7bcb21ba371546a4278aad117d5e50361fd8f14fe@%3Cdev.airflow.apache.org%3E) +for the Google Cloud Platform. + +The signature of the `wait_for_transfer_job` method in `GCPTransferServiceHook` has changed.
+ +Old signature: +```python +def wait_for_transfer_job(self, job): +``` +New signature: +```python +def wait_for_transfer_job(self, job, expected_statuses=(GcpTransferOperationStatus.SUCCESS, )): +``` + +The behavior of `wait_for_transfer_job` has changed: + +Old behavior: + +`wait_for_transfer_job` would wait for the SUCCESS status in specified jobs operations. + +New behavior: + +You can now specify an array of expected statuses. `wait_for_transfer_job` now waits for any of them. + +The default value of `expected_statuses` is SUCCESS so that change is backwards compatible. + +### Moved two classes to different modules + +The class `GoogleCloudStorageToGoogleCloudStorageTransferOperator` has been moved from +`airflow.contrib.operators.gcs_to_gcs_transfer_operator` to `airflow.contrib.operators.gcp_transfer_operator` + +the class `S3ToGoogleCloudStorageTransferOperator` has been moved from +`airflow.contrib.operators.s3_to_gcs_transfer_operator` to `airflow.contrib.operators.gcp_transfer_operator` + +The change was made to keep all the operators related to GCS Transfer Services in one file. + +The previous imports will continue to work until Airflow 2.0 + +### Fixed typo in --driver-class-path in SparkSubmitHook + +The `driver_classapth` argument to SparkSubmit Hook and Operator was +generating `--driver-classpath` on the spark command line, but this isn't a +valid option to spark. + +The argument has been renamed to `driver_class_path` and the option it +generates has been fixed. + +## Airflow 1.10.2 + +### DAG level Access Control for new RBAC UI + +Extend and enhance new Airflow RBAC UI to support DAG level ACL. Each dag now has two permissions(one for write, one for read) associated('can_dag_edit', 'can_dag_read'). +The admin will create new role, associate the dag permission with the target dag and assign that role to users. That user can only access / view the certain dags on the UI +that he has permissions on. If a new role wants to access all the dags, the admin could associate dag permissions on an artificial view(``all_dags``) with that role. + +We also provide a new cli command(``sync_perm``) to allow admin to auto sync permissions. + +### Modification to `ts_nodash` macro +`ts_nodash` previously contained TimeZone information along with execution date. For Example: `20150101T000000+0000`. This is not user-friendly for file or folder names which was a popular use case for `ts_nodash`. Hence this behavior has been changed and using `ts_nodash` will no longer contain TimeZone information, restoring the pre-1.10 behavior of this macro. And a new macro `ts_nodash_with_tz` has been added which can be used to get a string with execution date and timezone info without dashes. + +Examples: + * `ts_nodash`: `20150101T000000` + * `ts_nodash_with_tz`: `20150101T000000+0000` + +### Semantics of next_ds/prev_ds changed for manually triggered runs + +next_ds/prev_ds now map to execution_date instead of the next/previous schedule-aligned execution date for DAGs triggered in the UI. + +### User model changes +This patch changes the `User.superuser` field from a hardcoded boolean to a `Boolean()` database column. `User.superuser` will default to `False`, which means that this privilege will have to be granted manually to any users that may require it. 
+ +For example, open a Python shell and +```python +from airflow import models, settings + +session = settings.Session() +users = session.query(models.User).all() # [admin, regular_user] + +users[1].superuser # False + +admin = users[0] +admin.superuser = True +session.add(admin) +session.commit() +``` + +### Custom auth backends interface change + +We have updated the version of flask-login we depend upon, and as a result any +custom auth backends might need a small change: `is_active`, +`is_authenticated`, and `is_anonymous` should now be properties. What this means is that if +you previously had this in your user class + + def is_active(self): + return self.active + +then you need to change it like this + + @property + def is_active(self): + return self.active + +## Airflow 1.10.1 + +### New `dag_processor_manager_log_location` config option + +The DAG parsing manager log is now written to a file by default, and its location is +controlled by the new `dag_processor_manager_log_location` config option in the core section. + +### StatsD Metrics + +The `scheduler_heartbeat` metric has been changed from a gauge to a counter. Each loop of the scheduler will increment the counter by 1. This provides a higher degree of visibility and allows for better integration with Prometheus using the [StatsD Exporter](https://github.com/prometheus/statsd_exporter). Scheduler liveness can be determined by graphing and alerting using a rate. If the scheduler goes down, the rate will drop to 0. + +### EMRHook now passes all of connection's extra to CreateJobFlow API + +EMRHook.create_job_flow has been changed to pass all keys to the create_job_flow API, rather than +just specific known keys, for greater flexibility. + +However, prior to this release the "emr_default" sample connection that was created had invalid +configuration, so creating EMR clusters might fail until your connection is updated. (Ec2KeyName, +Ec2SubnetId, TerminationProtection and KeepJobFlowAliveWhenNoSteps were all top-level keys when they +should be inside the "Instances" dict.) + +### LDAP Auth Backend now requires TLS + +Connecting to an LDAP server over plain text is no longer supported. The +certificate presented by the LDAP server must be signed by a trusted +certificate, or you must provide the `cacert` option under `[ldap]` in the +config file. + +If you want to use the LDAP auth backend without TLS then you will have to create a +custom auth backend based on +https://github.com/apache/airflow/blob/1.10.0/airflow/contrib/auth/backends/ldap_auth.py + +### Custom auth backends interface change + +We have updated the version of flask-login we depend upon, and as a result any +custom auth backends might need a small change: `is_active`, +`is_authenticated`, and `is_anonymous` should now be properties. What this means is that if +you previously had this in your user class + + def is_active(self): + return self.active + +then you need to change it like this + + @property + def is_active(self): + return self.active + +## Airflow 1.10 + +Installation and upgrading requires setting `SLUGIFY_USES_TEXT_UNIDECODE=yes` in your environment or +`AIRFLOW_GPL_UNIDECODE=yes`. In case of the latter a GPL runtime dependency will be installed due to a +dependency (python-nvd3 -> python-slugify -> unidecode).
+ +### Replace DataProcHook.await calls to DataProcHook.wait + +The method name was changed to be compatible with the Python 3.7 async/await keywords + +### DAG level Access Control for new RBAC UI + +Extend and enhance new Airflow RBAC UI to support DAG level ACL. Each dag now has two permissions(one for write, one for read) associated('can_dag_edit', 'can_dag_read'). +The admin will create new role, associate the dag permission with the target dag and assign that role to users. That user can only access / view the certain dags on the UI +that he has permissions on. If a new role wants to access all the dags, the admin could associate dag permissions on an artificial view(``all_dags``) with that role. + +We also provide a new cli command(``sync_perm``) to allow admin to auto sync permissions. + +### Setting UTF-8 as default mime_charset in email utils ### Add a configuration variable(default_dag_run_display_number) to control numbers of dag run for display + Add a configuration variable(default_dag_run_display_number) under webserver section to control num of dag run to show in UI. ### Default executor for SubDagOperator is changed to SequentialExecutor @@ -35,11 +358,13 @@ Run `airflow webserver` to start the new UI. This will bring up a log in page, e There are five roles created for Airflow by default: Admin, User, Op, Viewer, and Public. To configure roles/permissions, go to the `Security` tab and click `List Roles` in the new UI. #### Breaking changes + - AWS Batch Operator renamed property queue to job_queue to prevent conflict with the internal queue from CeleryExecutor - AIRFLOW-2542 - Users created and stored in the old users table will not be migrated automatically. FAB's built-in authentication support must be reconfigured. - Airflow dag home page is now `/home` (instead of `/admin`). - All ModelViews in Flask-AppBuilder follow a different pattern from Flask-Admin. The `/admin` part of the url path will no longer exist. For example: `/admin/connection` becomes `/connection/list`, `/admin/connection/new` becomes `/connection/add`, `/admin/connection/edit` becomes `/connection/edit`, etc. - Due to security concerns, the new webserver will no longer support the features in the `Data Profiling` menu of old UI, including `Ad Hoc Query`, `Charts`, and `Known Events`. +- HiveServer2Hook.get_results() always returns a list of tuples, even when a single column is queried, as per Python API 2. ### airflow.contrib.sensors.hdfs_sensors renamed to airflow.contrib.sensors.hdfs_sensor @@ -53,33 +378,43 @@ to have specified `explicit_defaults_for_timestamp=1` in your my.cnf under `[mys ### Celery config To make the config of Airflow compatible with Celery, some properties have been renamed: + ``` celeryd_concurrency -> worker_concurrency celery_result_backend -> result_backend +celery_ssl_active -> ssl_active +celery_ssl_cert -> ssl_cert +celery_ssl_key -> ssl_key ``` + Resulting in the same config parameters as Celery 4, with more transparency. ### GCP Dataflow Operators + Dataflow job labeling is now supported in Dataflow{Java,Python}Operator with a default "airflow-version" label, please upgrade your google-cloud-dataflow or apache-beam version to 2.2.0 or greater. ### BigQuery Hooks and Operator + The `bql` parameter passed to `BigQueryOperator` and `BigQueryBaseCursor.run_query` has been deprecated and renamed to `sql` for consistency purposes. 
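For example, a task that previously passed `bql=` now passes `sql=` (a minimal sketch; the query and connection id are illustrative, and attaching the operator to a DAG is omitted for brevity):

```python
from airflow.contrib.operators.bigquery_operator import BigQueryOperator

# Same operator as before, but the query is passed via `sql` instead of the
# deprecated `bql` argument.
aggregate_events = BigQueryOperator(
    task_id="aggregate_events",
    sql="SELECT COUNT(*) AS n FROM `my_project.my_dataset.events`",
    use_legacy_sql=False,
    bigquery_conn_id="bigquery_default",
)
```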
Using `bql` will still work (and raise a `DeprecationWarning`), but is no longer supported and will be removed entirely in Airflow 2.0 ### Redshift to S3 Operator + With Airflow 1.9 or lower, Unload operation always included header row. In order to include header row, we need to turn off parallel unload. It is preferred to perform unload operation using all nodes so that it is faster for larger tables. So, parameter called `include_header` is added and default is set to False. -Header row will be added only if this parameter is set True and also in that case parallel will be automatically turned off (`PARALLEL OFF`) +Header row will be added only if this parameter is set True and also in that case parallel will be automatically turned off (`PARALLEL OFF`) ### Google cloud connection string With Airflow 1.9 or lower, there were two connection strings for the Google Cloud operators, both `google_cloud_storage_default` and `google_cloud_default`. This can be confusing and therefore the `google_cloud_storage_default` connection id has been replaced with `google_cloud_default` to make the connection id consistent across Airflow. ### Logging Configuration + With Airflow 1.9 or lower, `FILENAME_TEMPLATE`, `PROCESSOR_FILENAME_TEMPLATE`, `LOG_ID_TEMPLATE`, `END_OF_LOG_MARK` were configured in `airflow_local_settings.py`. These have been moved into the configuration file, and hence if you were using a custom configuration file the following defaults need to be added. + ``` [core] fab_logging_level = WARN @@ -91,23 +426,36 @@ elasticsearch_log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_ elasticsearch_end_of_log_mark = end_of_log ``` +The previous setting of `log_task_reader` is not needed in many cases now when using the default logging config with remote storages. (Previously it needed to be set to `s3.task` or similar. This is not needed with the default config anymore) + +#### Change of per-task log path + +With the change to Airflow core to be timezone aware the default log path for task instances will now include timezone information. This will by default mean all previous task logs won't be found. You can get the old behaviour back by setting the following config options: + +``` +[core] +log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ execution_date.strftime("%%Y-%%m-%%dT%%H:%%M:%%S") }}/{{ try_number }}.log +``` + ## Airflow 1.9 ### SSH Hook updates, along with new SSH Operator & SFTP Operator SSH Hook now uses the Paramiko library to create an ssh client connection, instead of the sub-process based ssh command execution previously (<1.9.0), so this is backward incompatible. - - update SSHHook constructor - - use SSHOperator class in place of SSHExecuteOperator which is removed now. Refer to test_ssh_operator.py for usage info. - - SFTPOperator is added to perform secure file transfer from serverA to serverB. Refer to test_sftp_operator.py.py for usage info. - - No updates are required if you are using ftpHook, it will continue to work as is. + +- update SSHHook constructor +- use SSHOperator class in place of SSHExecuteOperator which is removed now. Refer to test_ssh_operator.py for usage info. +- SFTPOperator is added to perform secure file transfer from serverA to serverB. Refer to test_sftp_operator.py.py for usage info. +- No updates are required if you are using ftpHook, it will continue to work as is. ### S3Hook switched to use Boto3 The airflow.hooks.S3_hook.S3Hook has been switched to use boto3 instead of the older boto (a.k.a. boto2). 
This results in a few backwards incompatible changes to the following classes: S3Hook: - - the constructors no longer accepts `s3_conn_id`. It is now called `aws_conn_id`. - - the default connection is now "aws_default" instead of "s3_default" - - the return type of objects returned by `get_bucket` is now boto3.s3.Bucket - - the return type of `get_key`, and `get_wildcard_key` is now an boto3.S3.Object. + +- the constructors no longer accepts `s3_conn_id`. It is now called `aws_conn_id`. +- the default connection is now "aws_default" instead of "s3_default" +- the return type of objects returned by `get_bucket` is now boto3.s3.Bucket +- the return type of `get_key`, and `get_wildcard_key` is now an boto3.S3.Object. If you are using any of these in your DAGs and specify a connection ID you will need to update the parameter name for the connection to "aws_conn_id": S3ToHiveTransfer, S3PrefixSensor, S3KeySensor, RedshiftToS3Transfer. @@ -145,17 +493,22 @@ The config can be taken from `airflow/config_templates/airflow_local_settings.py ``` # -*- coding: utf-8 -*- # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. import os @@ -280,13 +633,14 @@ The `file_task_handler` logger has been made more flexible. The default format c #### I'm using S3Log or GCSLogs, what do I do!? -If you are logging to Google cloud storage, please see the [Google cloud platform documentation](https://airflow.incubator.apache.org/integration.html#gcp-google-cloud-platform) for logging instructions. +If you are logging to Google cloud storage, please see the [Google cloud platform documentation](https://airflow.apache.org/integration.html#gcp-google-cloud-platform) for logging instructions. If you are using S3, the instructions should be largely the same as the Google cloud platform instructions above. You will need a custom logging config. The `REMOTE_BASE_LOG_FOLDER` configuration key in your airflow config has been removed, therefore you will need to take the following steps: - - Copy the logging configuration from [`airflow/config_templates/airflow_logging_settings.py`](https://github.com/apache/incubator-airflow/blob/master/airflow/config_templates/airflow_local_settings.py). - - Place it in a directory inside the Python import path `PYTHONPATH`. 
If you are using Python 2.7, ensuring that any `__init__.py` files exist so that it is importable. - - Update the config by setting the path of `REMOTE_BASE_LOG_FOLDER` explicitly in the config. The `REMOTE_BASE_LOG_FOLDER` key is not used anymore. - - Set the `logging_config_class` to the filename and dict. For example, if you place `custom_logging_config.py` on the base of your pythonpath, you will need to set `logging_config_class = custom_logging_config.LOGGING_CONFIG` in your config as Airflow 1.8. + +- Copy the logging configuration from [`airflow/config_templates/airflow_logging_settings.py`](https://github.com/apache/airflow/blob/master/airflow/config_templates/airflow_local_settings.py). +- Place it in a directory inside the Python import path `PYTHONPATH`. If you are using Python 2.7, ensuring that any `__init__.py` files exist so that it is importable. +- Update the config by setting the path of `REMOTE_BASE_LOG_FOLDER` explicitly in the config. The `REMOTE_BASE_LOG_FOLDER` key is not used anymore. +- Set the `logging_config_class` to the filename and dict. For example, if you place `custom_logging_config.py` on the base of your pythonpath, you will need to set `logging_config_class = custom_logging_config.LOGGING_CONFIG` in your config as Airflow 1.8. ### New Features @@ -295,8 +649,10 @@ If you are using S3, the instructions should be largely the same as the Google c A new DaskExecutor allows Airflow tasks to be run in Dask Distributed clusters. ### Deprecated Features + These features are marked for deprecation. They may still work (and raise a `DeprecationWarning`), but are no longer supported and will be removed entirely in Airflow 2.0 + - If you're using the `google_cloud_conn_id` or `dataproc_cluster` argument names explicitly in `contrib.operators.Dataproc{*}Operator`(s), be sure to rename them to `gcp_conn_id` or `cluster_name`, respectively. We've renamed these arguments for consistency. (AIRFLOW-1323) - `post_execute()` hooks now take two arguments, `context` and `result` @@ -320,30 +676,36 @@ a previously installed version of Airflow before installing 1.8.1. ## Airflow 1.8 ### Database + The database schema needs to be upgraded. Make sure to shutdown Airflow and make a backup of your database. To upgrade the schema issue `airflow upgradedb`. ### Upgrade systemd unit files + Systemd unit files have been updated. If you use systemd please make sure to update these. > Please note that the webserver does not detach properly, this will be fixed in a future version. ### Tasks not starting although dependencies are met due to stricter pool checking + Airflow 1.7.1 has issues with being able to over subscribe to a pool, ie. more slots could be used than were available. This is fixed in Airflow 1.8.0, but due to past issue jobs may fail to start although their dependencies are met after an upgrade. To workaround either temporarily increase the amount of slots above the amount of queued tasks or use a new pool. ### Less forgiving scheduler on dynamic start_date + Using a dynamic start_date (e.g. `start_date = datetime.now()`) is not considered a best practice. The 1.8.0 scheduler is less forgiving in this area. If you encounter DAGs not being scheduled you can try using a fixed start_date and renaming your DAG. The last step is required to make sure you start with a clean slate, otherwise the old schedule can interfere. ### New and updated scheduler options + Please read through the new scheduler options, defaults have changed since 1.7.1. 
#### child_process_log_directory + In order to increase the robustness of the scheduler, DAGS are now processed in their own process. Therefore each DAG has its own log file for the scheduler. These log files are placed in `child_process_log_directory` which defaults to `/scheduler/latest`. You will need to make sure these log files are removed. @@ -351,24 +713,30 @@ DAG has its own log file for the scheduler. These log files are placed in `child > DAG logs or processor logs ignore and command line settings for log file locations. #### run_duration + Previously the command line option `num_runs` was used to let the scheduler terminate after a certain amount of loops. This is now time bound and defaults to `-1`, which means run continuously. See also num_runs. #### num_runs + Previously `num_runs` was used to let the scheduler terminate after a certain amount of loops. Now num_runs specifies the number of times to try to schedule each DAG file within `run_duration` time. Defaults to `-1`, which means try indefinitely. This is only available on the command line. #### min_file_process_interval + After how much time should an updated DAG be picked up from the filesystem. #### min_file_parsing_loop_time +CURRENTLY DISABLED DUE TO A BUG How many seconds to wait between file-parsing loops to prevent the logs from being spammed. #### dag_dir_list_interval + The frequency with which the scheduler should relist the contents of the DAG directory. If while developing +dags, they are not being picked up, have a look at this number and decrease it when necessary. #### catchup_by_default + By default the scheduler will fill any missing interval DAG Runs between the last execution date and the current date. This setting changes that behavior to only execute the latest interval. This can also be specified per DAG as `catchup = False / True`. Command line backfills will still work. @@ -399,6 +767,7 @@ required to whitelist these variables by adding the following to your configurat airflow\.ctx\..* ``` + ### Google Cloud Operator and Hook alignment All Google Cloud Operators and Hooks are aligned and use the same client library. Now you have a single connection @@ -410,6 +779,7 @@ Also the old P12 key file type is not supported anymore and only the new JSON ke account. ### Deprecated Features + These features are marked for deprecation. They may still work (and raise a `DeprecationWarning`), but are no longer supported and will be removed entirely in Airflow 2.0 @@ -421,11 +791,12 @@ supported and will be removed entirely in Airflow 2.0 - Operators no longer accept arbitrary arguments Previously, `Operator.__init__()` accepted any arguments (either positional `*args` or keyword `**kwargs`) without - complaint. Now, invalid arguments will be rejected. (https://github.com/apache/incubator-airflow/pull/1285) + complaint. Now, invalid arguments will be rejected. (https://github.com/apache/airflow/pull/1285) - The config value secure_mode will default to True which will disable some insecure endpoints/features ### Known Issues + There is a report that the default of "-1" for num_runs creates an issue where errors are reported while parsing tasks. It was not confirmed, but a workaround was found by changing the default back to `None`. 
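To illustrate the per-DAG catchup override mentioned under `catchup_by_default` above, it is just a constructor argument (dag id and schedule here are illustrative):

```python
from datetime import datetime

from airflow import DAG

# Only the latest interval is scheduled for this DAG, regardless of the global
# catchup_by_default setting.
dag = DAG(
    dag_id="example_no_catchup",
    start_date=datetime(2018, 1, 1),
    schedule_interval="@daily",
    catchup=False,
)
```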
@@ -452,7 +823,9 @@ To continue using the default smtp email backend, change the email_backend line [email] email_backend = airflow.utils.send_email_smtp ``` + to: + ``` [email] email_backend = airflow.utils.email.send_email_smtp @@ -465,7 +838,9 @@ To continue using S3 logging, update your config file so: ``` s3_log_folder = s3://my-airflow-log-bucket/logs ``` + becomes: + ``` remote_base_log_folder = s3://my-airflow-log-bucket/logs remote_log_conn_id = diff --git a/airflow/__init__.py b/airflow/__init__.py index f40b08aab5e77..4434dc4413e1f 100644 --- a/airflow/__init__.py +++ b/airflow/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,6 +25,8 @@ `airflow.www.login` """ from builtins import object +from typing import Any + from airflow import version from airflow.utils.log.logging_mixin import LoggingMixin @@ -32,17 +34,16 @@ import sys -from airflow import configuration as conf -from airflow import settings +# flake8: noqa: F401 +from airflow import settings, configuration as conf from airflow.models import DAG from flask_admin import BaseView from importlib import import_module from airflow.exceptions import AirflowException -if settings.DAGS_FOLDER not in sys.path: - sys.path.append(settings.DAGS_FOLDER) +settings.initialize() -login = None +login = None # type: Any def load_login(): @@ -80,11 +81,12 @@ class AirflowMacroPlugin(object): def __init__(self, namespace): self.namespace = namespace -from airflow import operators + +from airflow import operators # noqa: E402 from airflow import sensors # noqa: E402 -from airflow import hooks -from airflow import executors -from airflow import macros +from airflow import hooks # noqa: E402 +from airflow import executors # noqa: E402 +from airflow import macros # noqa: E402 operators._integrate_plugins() sensors._integrate_plugins() # noqa: E402 diff --git a/airflow/_vendor/README b/airflow/_vendor/README new file mode 100644 index 0000000000000..a79ea89eae536 --- /dev/null +++ b/airflow/_vendor/README @@ -0,0 +1,13 @@ +Original files in this directory were created with the following commands:: + + mkdir -p slugify/ + curl -fsSL -O https://files.pythonhosted.org/packages/1f/9c/8b07d625e9c9df567986d887f0375075abb1923e49d074a7803cd1527dae/python-slugify-2.0.1.tar.gz + tar -xzf python-slugify-*.tar.gz --strip-components=2 -C slugify/ '*/slugify/*' + tar -xzf python-slugify-*.tar.gz --strip-components=1 -C slugify/ '*/LICENSE' + rm *.tar.gz + + mkdir -p nvd3/ + curl -fsSL -O https://files.pythonhosted.org/packages/0b/aa/97165daa6e319409c5c2582e62736a7353bda3c90d90fdcb0b11e116dd2d/python-nvd3-0.15.0.tar.gz + tar -xzf python-nvd3-*.tar.gz --strip-components=2 -C nvd3/ '*/nvd3/*' + tar -xzf python-nvd3-*.tar.gz --strip-components=1 -C nvd3/ '*/LICENSE' + rm *.tar.gz diff --git a/.codecov.yml b/airflow/_vendor/__init__.py similarity index 96% rename from .codecov.yml rename to airflow/_vendor/__init__.py index 4094f35dccf0d..114d189da14ab 100644 --- a/.codecov.yml +++ b/airflow/_vendor/__init__.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -6,14 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# keep default diff --git a/airflow/_vendor/nvd3/LICENSE b/airflow/_vendor/nvd3/LICENSE new file mode 100644 index 0000000000000..1add6249e57b4 --- /dev/null +++ b/airflow/_vendor/nvd3/LICENSE @@ -0,0 +1,24 @@ +The MIT License (MIT) + +Python-nvd3 + +Copyright (c) 2013 Arezqui Belaid and other contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/airflow/_vendor/nvd3/NVD3Chart.py b/airflow/_vendor/nvd3/NVD3Chart.py new file mode 100644 index 0000000000000..faefe5d3a0fcf --- /dev/null +++ b/airflow/_vendor/nvd3/NVD3Chart.py @@ -0,0 +1,506 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from __future__ import unicode_literals +from optparse import OptionParser +from jinja2 import Environment, PackageLoader +from airflow._vendor.slugify import slugify + +try: + import simplejson as json +except ImportError: + import json + +CONTENT_FILENAME = "./content.html" +PAGE_FILENAME = "./page.html" + + +pl = PackageLoader('airflow._vendor.nvd3', 'templates') +jinja2_env = Environment(lstrip_blocks=True, trim_blocks=True, loader=pl) + +template_content = jinja2_env.get_template(CONTENT_FILENAME) +template_page = jinja2_env.get_template(PAGE_FILENAME) + + +def stab(tab=1): + """ + create space tabulation + """ + return ' ' * 4 * tab + + +class NVD3Chart(object): + """ + NVD3Chart Base class. 
+ """ + #: chart count + count = 0 + #: directory holding the assets (bower_components) + assets_directory = './bower_components/' + + # this attribute is overriden by children of this + # class + CHART_FILENAME = None + template_environment = Environment(lstrip_blocks=True, trim_blocks=True, + loader=pl) + + def __init__(self, **kwargs): + """ + This is the base class for all the charts. The following keywords are + accepted: + + :keyword: **display_container** - default: ``True`` + :keyword: **jquery_on_ready** - default: ``False`` + :keyword: **charttooltip_dateformat** - default: ``'%d %b %Y'`` + :keyword: **name** - default: the class name + ``model`` - set the model (e.g. ``pieChart``, ` + ``LineWithFocusChart``, ``MultiBarChart``). + :keyword: **color_category** - default - ``None`` + :keyword: **color_list** - default - ``None`` + used by pieChart (e.g. ``['red', 'blue', 'orange']``) + :keyword: **margin_bottom** - default - ``20`` + :keyword: **margin_left** - default - ``60`` + :keyword: **margin_right** - default - ``60`` + :keyword: **margin_top** - default - ``30`` + :keyword: **height** - default - ``''`` + :keyword: **width** - default - ``''`` + :keyword: **stacked** - default - ``False`` + :keyword: **focus_enable** - default - ``False`` + :keyword: **resize** - define - ``False`` + :keyword: **show_legend** - default - ``True`` + :keyword: **show_labels** - default - ``True`` + :keyword: **tag_script_js** - default - ``True`` + :keyword: **use_interactive_guideline** - default - ``False`` + :keyword: **chart_attr** - default - ``None`` + :keyword: **extras** - default - ``None`` + + Extra chart modifiers. Use this to modify different attributes of + the chart. + :keyword: **x_axis_date** - default - False + Signal that x axis is a date axis + :keyword: **date_format** - default - ``%x`` + see https://github.com/mbostock/d3/wiki/Time-Formatting + :keyword: **x_axis_format** - default - ``''``. + :keyword: **y_axis_format** - default - ``''``. + :keyword: **style** - default - ``''`` + Style modifiers for the DIV container. + :keyword: **color_category** - default - ``category10`` + + Acceptable values are nvd3 categories such as + ``category10``, ``category20``, ``category20c``. 
+ """ + # set the model + self.model = self.__class__.__name__ #: The chart model, + + #: an Instance of Jinja2 template + self.template_page_nvd3 = template_page + self.template_content_nvd3 = template_content + self.series = [] + self.axislist = {} + # accepted keywords + self.display_container = kwargs.get('display_container', True) + self.charttooltip_dateformat = kwargs.get('charttooltip_dateformat', + '%d %b %Y') + self._slugify_name(kwargs.get('name', self.model)) + self.jquery_on_ready = kwargs.get('jquery_on_ready', False) + self.color_category = kwargs.get('color_category', None) + self.color_list = kwargs.get('color_list', None) + self.margin_bottom = kwargs.get('margin_bottom', 20) + self.margin_left = kwargs.get('margin_left', 60) + self.margin_right = kwargs.get('margin_right', 60) + self.margin_top = kwargs.get('margin_top', 30) + self.height = kwargs.get('height', '') + self.width = kwargs.get('width', '') + self.stacked = kwargs.get('stacked', False) + self.focus_enable = kwargs.get('focus_enable', False) + self.resize = kwargs.get('resize', False) + self.show_legend = kwargs.get('show_legend', True) + self.show_labels = kwargs.get('show_labels', True) + self.tag_script_js = kwargs.get('tag_script_js', True) + self.use_interactive_guideline = kwargs.get("use_interactive_guideline", + False) + self.chart_attr = kwargs.get("chart_attr", {}) + self.extras = kwargs.get('extras', None) + self.style = kwargs.get('style', '') + self.date_format = kwargs.get('date_format', '%x') + self.x_axis_date = kwargs.get('x_axis_date', False) + #: x-axis contain date format or not + # possible duplicate of x_axis_date + self.date_flag = kwargs.get('date_flag', False) + self.x_axis_format = kwargs.get('x_axis_format', '') + # Load remote JS assets or use the local bower assets? + self.remote_js_assets = kwargs.get('remote_js_assets', True) + + # None keywords attribute that should be modified by methods + # We should change all these to _attr + + self.htmlcontent = '' #: written by buildhtml + self.htmlheader = '' + #: Place holder for the graph (the HTML div) + #: Written by ``buildcontainer`` + self.container = u'' + #: Header for javascript code + self.containerheader = u'' + # CDN http://cdnjs.com/libraries/nvd3/ needs to make sure it's up to + # date + self.header_css = [ + '' % h for h in + ( + 'https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css' if self.remote_js_assets else self.assets_directory + 'nvd3/src/nv.d3.css', + ) + ] + + self.header_js = [ + '' % h for h in + ( + 'https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js' if self.remote_js_assets else self.assets_directory + 'd3/d3.min.js', + 'https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.js' if self.remote_js_assets else self.assets_directory + 'nvd3/nv.d3.min.js' + ) + ] + + #: Javascript code as string + self.jschart = None + self.custom_tooltip_flag = False + self.tooltip_condition_string = '' + self.charttooltip = '' + self.serie_no = 1 + + def _slugify_name(self, name): + """Slufigy name with underscore""" + self.name = slugify(name, separator='_') + + def add_serie(self, y, x, name=None, extra=None, **kwargs): + """ + add serie - Series are list of data that will be plotted + y {1, 2, 3, 4, 5} / x {1, 2, 3, 4, 5} + + **Attributes**: + + * ``name`` - set Serie name + * ``x`` - x-axis data + * ``y`` - y-axis data + + kwargs: + + * ``shape`` - for scatterChart, you can set different shapes + (circle, triangle etc...) 
+ * ``size`` - for scatterChart, you can set size of different shapes + * ``type`` - for multiChart, type should be bar + * ``bar`` - to display bars in Chart + * ``color_list`` - define list of colors which will be + used by pieChart + * ``color`` - set axis color + * ``disabled`` - + + extra: + + * ``tooltip`` - set tooltip flag + * ``date_format`` - set date_format for tooltip if x-axis is in + date format + + """ + if not name: + name = "Serie %d" % (self.serie_no) + + # For scatterChart shape & size fields are added in serie + if 'shape' in kwargs or 'size' in kwargs: + csize = kwargs.get('size', 1) + cshape = kwargs.get('shape', 'circle') + + serie = [{ + 'x': x[i], + 'y': j, + 'shape': cshape, + 'size': csize[i] if isinstance(csize, list) else csize + } for i, j in enumerate(y)] + else: + if self.model == 'pieChart': + serie = [{'label': x[i], 'value': y} for i, y in enumerate(y)] + else: + serie = [{'x': x[i], 'y': y} for i, y in enumerate(y)] + + data_keyvalue = {'values': serie, 'key': name} + + # multiChart + # Histogram type='bar' for the series + if 'type' in kwargs and kwargs['type']: + data_keyvalue['type'] = kwargs['type'] + + # Define on which Y axis the serie is related + # a chart can have 2 Y axis, left and right, by default only one Y Axis is used + if 'yaxis' in kwargs and kwargs['yaxis']: + data_keyvalue['yAxis'] = kwargs['yaxis'] + else: + if self.model != 'pieChart': + data_keyvalue['yAxis'] = '1' + + if 'bar' in kwargs and kwargs['bar']: + data_keyvalue['bar'] = 'true' + + if 'disabled' in kwargs and kwargs['disabled']: + data_keyvalue['disabled'] = 'true' + + if 'color' in kwargs and kwargs['color']: + data_keyvalue['color'] = kwargs['color'] + + if extra: + if self.model == 'pieChart': + if 'color_list' in extra and extra['color_list']: + self.color_list = extra['color_list'] + + if extra.get('date_format'): + self.charttooltip_dateformat = extra['date_format'] + + if extra.get('tooltip'): + self.custom_tooltip_flag = True + + if self.model != 'pieChart': + _start = extra['tooltip']['y_start'] + _end = extra['tooltip']['y_end'] + _start = ("'" + str(_start) + "' + ") if _start else '' + _end = (" + '" + str(_end) + "'") if _end else '' + + if self.model == 'linePlusBarChart': + if self.tooltip_condition_string: + self.tooltip_condition_string += stab(5) + self.tooltip_condition_string += stab(0) + "if(key.indexOf('" + name + "') > -1 ){\n" +\ + stab(6) + "var y = " + _start + " String(graph.point.y) " + _end + ";\n" +\ + stab(5) + "}\n" + elif self.model == 'cumulativeLineChart': + self.tooltip_condition_string += stab(0) + "if(key == '" + name + "'){\n" +\ + stab(6) + "var y = " + _start + " String(e) " + _end + ";\n" +\ + stab(5) + "}\n" + else: + self.tooltip_condition_string += stab(5) + "if(key == '" + name + "'){\n" +\ + stab(6) + "var y = " + _start + " String(graph.point.y) " + _end + ";\n" +\ + stab(5) + "}\n" + + if self.model == 'pieChart': + _start = extra['tooltip']['y_start'] + _end = extra['tooltip']['y_end'] + _start = ("'" + str(_start) + "' + ") if _start else '' + _end = (" + '" + str(_end) + "'") if _end else '' + self.tooltip_condition_string += "var y = " + _start + " String(y) " + _end + ";\n" + + # Increment series counter & append + self.serie_no += 1 + self.series.append(data_keyvalue) + + def add_chart_extras(self, extras): + """ + Use this method to add extra d3 properties to your chart. 
+ For example, you want to change the text color of the graph:: + + chart = pieChart(name='pieChart', color_category='category20c', height=400, width=400) + + xdata = ["Orange", "Banana", "Pear", "Kiwi", "Apple", "Strawberry", "Pineapple"] + ydata = [3, 4, 0, 1, 5, 7, 3] + + extra_serie = {"tooltip": {"y_start": "", "y_end": " cal"}} + chart.add_serie(y=ydata, x=xdata, extra=extra_serie) + + The above code will create graph with a black text, the following will change it:: + + text_white="d3.selectAll('#pieChart text').style('fill', 'white');" + chart.add_chart_extras(text_white) + + The above extras will be appended to the java script generated. + + Alternatively, you can use the following initialization:: + + chart = pieChart(name='pieChart', + color_category='category20c', + height=400, width=400, + extras=text_white) + """ + self.extras = extras + + def set_graph_height(self, height): + """Set Graph height""" + self.height = str(height) + + def set_graph_width(self, width): + """Set Graph width""" + self.width = str(width) + + def set_containerheader(self, containerheader): + """Set containerheader""" + self.containerheader = containerheader + + def set_date_flag(self, date_flag=False): + """Set date flag""" + self.date_flag = date_flag + + def set_custom_tooltip_flag(self, custom_tooltip_flag): + """Set custom_tooltip_flag & date_flag""" + self.custom_tooltip_flag = custom_tooltip_flag + + def __str__(self): + """return htmlcontent""" + self.buildhtml() + return self.htmlcontent + + def buildcontent(self): + """Build HTML content only, no header or body tags. To be useful this + will usually require the attribute `juqery_on_ready` to be set which + will wrap the js in $(function(){};) + """ + self.buildcontainer() + # if the subclass has a method buildjs this method will be + # called instead of the method defined here + # when this subclass method is entered it does call + # the method buildjschart defined here + self.buildjschart() + self.htmlcontent = self.template_content_nvd3.render(chart=self) + + def buildhtml(self): + """Build the HTML page + Create the htmlheader with css / js + Create html page + Add Js code for nvd3 + """ + self.buildcontent() + self.content = self.htmlcontent + self.htmlcontent = self.template_page_nvd3.render(chart=self) + + # this is used by django-nvd3 + def buildhtmlheader(self): + """generate HTML header content""" + self.htmlheader = '' + # If the JavaScript assets have already been injected, don't bother re-sourcing them. + global _js_initialized + if '_js_initialized' not in globals() or not _js_initialized: + for css in self.header_css: + self.htmlheader += css + for js in self.header_js: + self.htmlheader += js + + def buildcontainer(self): + """generate HTML div""" + if self.container: + return + + # Create SVG div with style + if self.width: + if self.width[-1] != '%': + self.style += 'width:%spx;' % self.width + else: + self.style += 'width:%s;' % self.width + if self.height: + if self.height[-1] != '%': + self.style += 'height:%spx;' % self.height + else: + self.style += 'height:%s;' % self.height + if self.style: + self.style = 'style="%s"' % self.style + + self.container = self.containerheader + \ + '
\n' % (self.name, self.style) + + def buildjschart(self): + """generate javascript code for the chart""" + self.jschart = '' + + # add custom tooltip string in jschart + # default condition (if build_custom_tooltip is not called explicitly with date_flag=True) + if self.tooltip_condition_string == '': + self.tooltip_condition_string = 'var y = String(graph.point.y);\n' + + # Include data + self.series_js = json.dumps(self.series) + + def create_x_axis(self, name, label=None, format=None, date=False, custom_format=False): + """Create X-axis""" + axis = {} + if custom_format and format: + axis['tickFormat'] = format + elif format: + if format == 'AM_PM': + axis['tickFormat'] = "function(d) { return get_am_pm(parseInt(d)); }" + else: + axis['tickFormat'] = "d3.format(',%s')" % format + + if label: + axis['axisLabel'] = "'" + label + "'" + + # date format : see https://github.com/mbostock/d3/wiki/Time-Formatting + if date: + self.dateformat = format + axis['tickFormat'] = ("function(d) { return d3.time.format('%s')" + "(new Date(parseInt(d))) }\n" + "" % self.dateformat) + # flag is the x Axis is a date + if name[0] == 'x': + self.x_axis_date = True + + # Add new axis to list of axis + self.axislist[name] = axis + + # Create x2Axis if focus_enable + if name == "xAxis" and self.focus_enable: + self.axislist['x2Axis'] = axis + + def create_y_axis(self, name, label=None, format=None, custom_format=False): + """ + Create Y-axis + """ + axis = {} + + if custom_format and format: + axis['tickFormat'] = format + elif format: + axis['tickFormat'] = "d3.format(',%s')" % format + + if label: + axis['axisLabel'] = "'" + label + "'" + + # Add new axis to list of axis + self.axislist[name] = axis + + +class TemplateMixin(object): + """ + A mixin that override buildcontent. Instead of building the complex + content template we exploit Jinja2 inheritance. Thus each chart class + renders it's own chart template which inherits from content.html + """ + def buildcontent(self): + """Build HTML content only, no header or body tags. To be useful this + will usually require the attribute `juqery_on_ready` to be set which + will wrap the js in $(function(){};) + """ + self.buildcontainer() + # if the subclass has a method buildjs this method will be + # called instead of the method defined here + # when this subclass method is entered it does call + # the method buildjschart defined here + self.buildjschart() + self.htmlcontent = self.template_chart_nvd3.render(chart=self) + + +def _main(): + """ + Parse options and process commands + """ + # Parse arguments + usage = "usage: nvd3.py [options]" + parser = OptionParser(usage=usage, + version=("python-nvd3 - Charts generator with " + "nvd3.js and d3.js")) + parser.add_option("-q", "--quiet", + action="store_false", dest="verbose", default=True, + help="don't print messages to stdout") + + (options, args) = parser.parse_args() + + +if __name__ == '__main__': + _main() diff --git a/airflow/_vendor/nvd3/__init__.py b/airflow/_vendor/nvd3/__init__.py new file mode 100755 index 0000000000000..5b737b45361ad --- /dev/null +++ b/airflow/_vendor/nvd3/__init__.py @@ -0,0 +1,29 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. 
+ +Project location : https://github.com/areski/python-nvd3 +""" + +__version__ = '0.15.0' +__all__ = ['lineChart', 'pieChart', 'lineWithFocusChart', + 'stackedAreaChart', 'multiBarHorizontalChart', + 'linePlusBarChart', 'cumulativeLineChart', + 'scatterChart', 'discreteBarChart', 'multiBarChart'] + + +from .lineChart import lineChart +from .pieChart import pieChart +from .lineWithFocusChart import lineWithFocusChart +from .stackedAreaChart import stackedAreaChart +from .multiBarHorizontalChart import multiBarHorizontalChart +from .linePlusBarChart import linePlusBarChart +from .cumulativeLineChart import cumulativeLineChart +from .scatterChart import scatterChart +from .discreteBarChart import discreteBarChart +from .multiBarChart import multiBarChart +from . import ipynb diff --git a/airflow/_vendor/nvd3/cumulativeLineChart.py b/airflow/_vendor/nvd3/cumulativeLineChart.py new file mode 100644 index 0000000000000..d98d0867e4d99 --- /dev/null +++ b/airflow/_vendor/nvd3/cumulativeLineChart.py @@ -0,0 +1,104 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class cumulativeLineChart(TemplateMixin, NVD3Chart): + """ + A cumulative line chart is used when you have one important grouping representing + an ordered set of data and one value to show, summed over time. + + Python example:: + + from nvd3 import cumulativeLineChart + chart = cumulativeLineChart(name='cumulativeLineChart', x_is_date=True) + xdata = [1365026400000000, 1365026500000000, 1365026600000000] + ydata = [6, 5, 1] + y2data = [36, 55, 11] + + extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}} + chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) + + extra_serie = {"tooltip": {"y_start": "", "y_end": " mins"}} + chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./cumulativelinechart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(cumulativeLineChart, self).__init__(**kwargs) + self.model = 'cumulativeLineChart' + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', '%d %b %Y'), + date=True) + self.set_custom_tooltip_flag(True) + else: + self.create_x_axis('xAxis', format=kwargs.get( + 'x_axis_format', '.2f')) + + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.1%')) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/discreteBarChart.py b/airflow/_vendor/nvd3/discreteBarChart.py new file mode 100644 index 0000000000000..cf6c8a4a8ff4b --- /dev/null +++ b/airflow/_vendor/nvd3/discreteBarChart.py @@ -0,0 +1,91 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class discreteBarChart(TemplateMixin, NVD3Chart): + """ + A discrete bar chart or bar graph is a chart with rectangular bars with + lengths proportional to the values that they represent. + + Python example:: + + from nvd3 import discreteBarChart + chart = discreteBarChart(name='discreteBarChart', height=400, width=400) + + xdata = ["A", "B", "C", "D", "E", "F"] + ydata = [3, 4, 0, -3, 5, 7] + + chart.add_serie(y=ydata, x=xdata) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + + """ + CHART_FILENAME = "./discretebarchart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(discreteBarChart, self).__init__(**kwargs) + self.model = 'discreteBarChart' + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', + "%d %b %Y %H %S"), + date=True) + else: + self.create_x_axis('xAxis', format=None) + + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', ".0f")) + + self.set_custom_tooltip_flag(True) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/ipynb.py b/airflow/_vendor/nvd3/ipynb.py new file mode 100644 index 0000000000000..f421afc0a8a50 --- /dev/null +++ b/airflow/_vendor/nvd3/ipynb.py @@ -0,0 +1,91 @@ +''' +ipython compatability module for nvd3-python +This adds simple ipython compatibility to the nvd3-python package, without making any +major modifications to how the main package is structured. It utilizes the IPython +display-formatter functionality, as described at: +http://nbviewer.ipython.org/github/ipython/ipython/blob/master/examples/notebooks/Custom%20Display%20Logic.ipynb +For additional examples, see: +https://github.com/sympy/sympy/blob/master/sympy/interactive/printing.py +''' + +try: + _ip = get_ipython() +except: + _ip = None +if _ip and _ip.__module__.lower().startswith('ipy'): + global _js_initialized + _js_initialized = False + + def _print_html(chart): + '''Function to return the HTML code for the div container plus the javascript + to generate the chart. This function is bound to the ipython formatter so that + charts are displayed inline.''' + global _js_initialized + if not _js_initialized: + print('js not initialized - pausing to allow time for it to load...') + initialize_javascript() + import time + time.sleep(5) + chart.buildhtml() + return chart.htmlcontent + + def _setup_ipython_formatter(ip): + ''' Set up the ipython formatter to display HTML formatted output inline''' + from IPython import __version__ as IPython_version + from nvd3 import __all__ as nvd3_all + + if IPython_version >= '0.11': + html_formatter = ip.display_formatter.formatters['text/html'] + for chart_type in nvd3_all: + html_formatter.for_type_by_name('nvd3.' + chart_type, chart_type, _print_html) + + def initialize_javascript(d3_js_url='https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js', + nvd3_js_url='https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.js', + nvd3_css_url='https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css', + use_remote=False): + '''Initialize the ipython notebook to be able to display nvd3 results. + by instructing IPython to load the nvd3 JS and css files, and the d3 JS file. + + by default, it looks for the files in your IPython Notebook working directory. 
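The notebook integration described above boils down to registering an HTML formatter for each chart class with IPython's display machinery. A rough standalone sketch of that registration pattern; MyChart and render_html are made-up names, only get_ipython() and for_type_by_name() are real IPython APIs::

    from IPython import get_ipython

    def render_html(chart):
        # hypothetical renderer: any callable returning an HTML string will do
        return '<div>%s</div>' % chart.name

    ip = get_ipython()
    if ip is not None:
        html_formatter = ip.display_formatter.formatters['text/html']
        # arguments: module name, class name, formatter callable;
        # instances of that class are then rendered inline by the notebook
        html_formatter.for_type_by_name('mymodule', 'MyChart', render_html)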
+ + Takes the following options: + + use_remote: use remote hosts for d3.js, nvd3.js, and nv.d3.css (default False) + * Note: the following options are ignored if use_remote is False: + nvd3_css_url: location of nvd3 css file (default https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css) + nvd3_js_url: location of nvd3 javascript file (default https://cdnjs.cloudflare.com/ajax/libs/nvd3/1.7.0/nv.d3.min.css) + d3_js_url: location of d3 javascript file (default https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js) + ''' + from IPython.display import display, Javascript, HTML + + if not use_remote: + # these file locations are for IPython 1.x, and will probably change when 2.x is released + d3_js_url = 'files/d3.v3.js' + nvd3_js_url = 'files/nv.d3.js' + nvd3_css_url = 'files/nv.d3.css' + + # load the required javascript files + + #display(Javascript('''$.getScript("%s")''' %(d3_js_url))) + display(HTML('''''' % (nvd3_css_url))) + # The following two methods for loading the script file are redundant. + # This is intentional. + # Ipython's loading of javscript in version 1.x is a bit squirrely, especially + # when creating demos to view in nbviewer. + # by trying twice, in two different ways (one using jquery and one using plain old + # HTML), we maximize our chances of successfully loading the script. + display(Javascript('''$.getScript("%s")''' % (nvd3_js_url))) + display(Javascript('''$.getScript("%s", function() { + $.getScript("%s", function() {})});''' % (d3_js_url, nvd3_js_url))) + display(HTML('' % (d3_js_url))) + display(HTML('' % (nvd3_js_url))) + + global _js_initialized + _js_initialized = True + + print('loaded nvd3 IPython extension\n' + 'run nvd3.ipynb.initialize_javascript() to set up the notebook\n' + 'help(nvd3.ipynb.initialize_javascript) for options') + + _setup_ipython_formatter(_ip) diff --git a/airflow/_vendor/nvd3/lineChart.py b/airflow/_vendor/nvd3/lineChart.py new file mode 100644 index 0000000000000..c237d069802ad --- /dev/null +++ b/airflow/_vendor/nvd3/lineChart.py @@ -0,0 +1,120 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class lineChart(TemplateMixin, NVD3Chart): + + """ + A line chart or line graph is a type of chart which displays information + as a series of data points connected by straight line segments. + + Python example:: + + from nvd3 import lineChart + chart = lineChart(name="lineChart", x_is_date=False, x_axis_format="AM_PM") + + xdata = range(24) + ydata = [0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 4, 3, 3, 5, 7, 5, 3, 16, 6, 9, 15, 4, 12] + ydata2 = [9, 8, 11, 8, 3, 7, 10, 8, 6, 6, 9, 6, 5, 4, 3, 10, 0, 6, 3, 1, 0, 0, 0, 1] + + extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}} + chart.add_serie(y=ydata, x=xdata, name='sine', extra=extra_serie, **kwargs1) + extra_serie = {"tooltip": {"y_start": "", "y_end": " min"}} + chart.add_serie(y=ydata2, x=xdata, name='cose', extra=extra_serie, **kwargs2) + chart.buildhtml() + + Javascript renderd to: + + .. raw:: html + +
+ + + See the source code of this page, to see the underlying javascript. + """ + CHART_FILENAME = "./linechart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(lineChart, self).__init__(**kwargs) + self.model = 'lineChart' + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', '%d %b %Y'), + date=True) + self.set_custom_tooltip_flag(True) + else: + if kwargs.get('x_axis_format') == 'AM_PM': + self.x_axis_format = format = 'AM_PM' + else: + format = kwargs.get('x_axis_format', 'r') + self.create_x_axis('xAxis', format=format, + custom_format=kwargs.get('x_custom_format', + False)) + self.create_y_axis( + 'yAxis', + format=kwargs.get('y_axis_format', '.02f'), + custom_format=kwargs.get('y_custom_format', False)) + + # must have a specified height, otherwise it superimposes both chars + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/linePlusBarChart.py b/airflow/_vendor/nvd3/linePlusBarChart.py new file mode 100644 index 0000000000000..4eaa5fc6ffdbf --- /dev/null +++ b/airflow/_vendor/nvd3/linePlusBarChart.py @@ -0,0 +1,131 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class linePlusBarChart(TemplateMixin, NVD3Chart): + + """ + A linePlusBarChart Chart is a type of chart which displays information + as a series of data points connected by straight line segments + and with some series with rectangular bars with lengths proportional + to the values that they represent. + + Python example:: + + from nvd3 import linePlusBarChart + chart = linePlusBarChart(name="linePlusBarChart", + width=500, height=400, x_axis_format="%d %b %Y", + x_is_date=True, focus_enable=True, + yaxis2_format="function(d) { return d3.format(',0.3f')(d) }") + + xdata = [1338501600000, 1345501600000, 1353501600000] + ydata = [6, 5, 1] + y2data = [0.002, 0.003, 0.004] + + extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " calls"}, + "date_format": "%d %b %Y %H:%S" } + chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie, + bar=True) + + extra_serie = {"tooltip": {"y_start": "There are ", "y_end": " min"}} + chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) + chart.buildcontent() + + Note that in case you have two data serie with extreme different numbers, + that you would like to format in different ways, + you can pass a keyword *yaxis1_format* or *yaxis2_format* when + creating the graph. + + In the example above the graph created presents the values of the second + data series with three digits right of the decimal point. + + Javascript generated: + + .. raw:: html + +
+ + + """ + CHART_FILENAME = "./lineplusbarchart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(linePlusBarChart, self).__init__(**kwargs) + self.model = 'linePlusBarChart' + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + self.yaxis1_format = kwargs.get('yaxis1_format', + "function(d) { return d3.format(',f')(d) }") + self.yaxis2_format = kwargs.get('yaxis2_format', + "function(d) { return d3.format(',f')(d) }") + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', + '%d %b %Y %H %S'), + date=True) + self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', + '%d %b %Y %H %S'), + date=True) + self.set_custom_tooltip_flag(True) + else: + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', + '.2f')) + self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', + '.2f')) + + self.create_y_axis('y1Axis', format=self.yaxis1_format, + custom_format=True) + self.create_y_axis('y2Axis', format=self.yaxis2_format, + custom_format=True) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/lineWithFocusChart.py b/airflow/_vendor/nvd3/lineWithFocusChart.py new file mode 100644 index 0000000000000..cd26cd4716652 --- /dev/null +++ b/airflow/_vendor/nvd3/lineWithFocusChart.py @@ -0,0 +1,105 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class lineWithFocusChart(TemplateMixin, NVD3Chart): + """ + A lineWithFocusChart or line graph is a type of chart which displays information + as a series of data points connected by straight line segments. + The lineWithFocusChart provide a smaller chart that act as a selector, + this is very useful if you want to zoom on a specific time period. + + Python example:: + + from nvd3 import lineWithFocusChart + chart = lineWithFocusChart(name='lineWithFocusChart', x_is_date=True, x_axis_format="%d %b %Y") + xdata = [1365026400000000, 1365026500000000, 1365026600000000, 1365026700000000, 1365026800000000, 1365026900000000, 1365027000000000] + ydata = [-6, 5, -1, 2, 4, 8, 10] + + extra_serie = {"tooltip": {"y_start": "", "y_end": " ext"}, + "date_format": "%d %b %Y"} + chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./linewfocuschart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(lineWithFocusChart, self).__init__(**kwargs) + self.model = 'lineWithFocusChart' + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', + '%d %b %Y %H %S'), + date=True) + self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', + '%d %b %Y %H %S'), + date=True) + self.set_custom_tooltip_flag(True) + else: + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', + '.2f')) + self.create_x_axis('x2Axis', format=kwargs.get('x_axis_format', + '.2f')) + + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) + self.create_y_axis('y2Axis', format=kwargs.get('y_axis_format', '.2f')) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/multiBarChart.py b/airflow/_vendor/nvd3/multiBarChart.py new file mode 100644 index 0000000000000..cf335919a84c7 --- /dev/null +++ b/airflow/_vendor/nvd3/multiBarChart.py @@ -0,0 +1,95 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class multiBarChart(TemplateMixin, NVD3Chart): + """ + A multiple bar graph contains comparisons of two or more categories or bars. + One axis represents a quantity and the other axis identifies a specific feature + about the categories. Reading a multiple bar graph includes looking at extremes + (tallest/longest vs. shortest) in each grouping. + + Python example:: + + from nvd3 import multiBarChart + chart = multiBarChart(width=500, height=400, x_axis_format=None) + xdata = ['one', 'two', 'three', 'four'] + ydata1 = [6, 12, 9, 16] + ydata2 = [8, 14, 7, 11] + + chart.add_serie(name="Serie 1", y=ydata1, x=xdata) + chart.add_serie(name="Serie 2", y=ydata2, x=xdata) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./multibarchart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(multiBarChart, self).__init__(**kwargs) + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', '%d %b %Y'), + date=True) + self.set_custom_tooltip_flag(True) + else: + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.2f')) + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/multiBarHorizontalChart.py b/airflow/_vendor/nvd3/multiBarHorizontalChart.py new file mode 100644 index 0000000000000..ac969c31b548c --- /dev/null +++ b/airflow/_vendor/nvd3/multiBarHorizontalChart.py @@ -0,0 +1,100 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class multiBarHorizontalChart(TemplateMixin, NVD3Chart): + """ + A multiple horizontal bar graph contains comparisons of two or more categories or bars. + + Python example:: + + from nvd3 import multiBarHorizontalChart + chart = multiBarHorizontalChart(name='multiBarHorizontalChart', height=400, width=400) + xdata = [-14, -7, 7, 14] + ydata = [-6, 5, -1, 9] + y2data = [-23, -6, -32, 9] + + extra_serie = {"tooltip": {"y_start": "", "y_end": " balls"}} + chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) + + extra_serie = {"tooltip": {"y_start": "", "y_end": " calls"}} + chart.add_serie(name="Serie 2", y=y2data, x=xdata, extra=extra_serie) + chart.buildcontent() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./multibarcharthorizontal.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(multiBarHorizontalChart, self).__init__(**kwargs) + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.2f')) + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/pieChart.py b/airflow/_vendor/nvd3/pieChart.py new file mode 100644 index 0000000000000..1db76bdb3424c --- /dev/null +++ b/airflow/_vendor/nvd3/pieChart.py @@ -0,0 +1,101 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class pieChart(TemplateMixin, NVD3Chart): + + """ + A pie chart (or a circle graph) is a circular chart divided into sectors, + illustrating numerical proportion. In chart, the arc length of each sector + is proportional to the quantity it represents. + + Python example:: + + from nvd3 import pieChart + chart = pieChart(name='pieChart', color_category='category20c', + height=400, width=400) + + xdata = ["Orange", "Banana", "Pear", "Kiwi", "Apple", "Strawbery", + "Pineapple"] + ydata = [3, 4, 0, 1, 5, 7, 3] + + extra_serie = {"tooltip": {"y_start": "", "y_end": " cal"}} + chart.add_serie(y=ydata, x=xdata, extra=extra_serie) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + CHART_FILENAME = "./piechart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(pieChart, self).__init__(**kwargs) + + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + self.donut = kwargs.get('donut', False) + self.donutRatio = kwargs.get('donutRatio', 0.35) + self.color_list = [] + self.create_x_axis('xAxis', format=None) + self.create_y_axis('yAxis', format=None) + # must have a specified height, otherwise it superimposes both chars + if height: + self.set_graph_height(height) + if width: + self.set_graph_width(width) + self.donut = kwargs.get('donut', False) + self.donutRatio = kwargs.get('donutRatio', 0.35) diff --git a/airflow/_vendor/nvd3/scatterChart.py b/airflow/_vendor/nvd3/scatterChart.py new file mode 100644 index 0000000000000..c3a87d2908bde --- /dev/null +++ b/airflow/_vendor/nvd3/scatterChart.py @@ -0,0 +1,121 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class scatterChart(TemplateMixin, NVD3Chart): + + """ + A scatter plot or scattergraph is a type of mathematical diagram using Cartesian + coordinates to display values for two variables for a set of data. + The data is displayed as a collection of points, each having the value of one variable + determining the position on the horizontal axis and the value of the other variable + determining the position on the vertical axis. + + Python example:: + + from nvd3 import scatterChart + chart = scatterChart(name='scatterChart', height=400, width=400) + xdata = [3, 4, 0, -3, 5, 7] + ydata = [-1, 2, 3, 3, 15, 2] + ydata2 = [1, -2, 4, 7, -5, 3] + + kwargs1 = {'shape': 'circle', 'size': '1'} + kwargs2 = {'shape': 'cross', 'size': '10'} + + extra_serie = {"tooltip": {"y_start": "", "y_end": " call"}} + chart.add_serie(name="series 1", y=ydata, x=xdata, extra=extra_serie, **kwargs1) + + extra_serie = {"tooltip": {"y_start": "", "y_end": " min"}} + chart.add_serie(name="series 2", y=ydata2, x=xdata, extra=extra_serie, **kwargs2) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./scatterchart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(scatterChart, self).__init__(**kwargs) + self.model = 'scatterChart' + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', '.02f'), + label=kwargs.get('x_axis_label', None)) + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.02f'), + label=kwargs.get('y_axis_label', None)) + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/stackedAreaChart.py b/airflow/_vendor/nvd3/stackedAreaChart.py new file mode 100644 index 0000000000000..8346cd2c53879 --- /dev/null +++ b/airflow/_vendor/nvd3/stackedAreaChart.py @@ -0,0 +1,99 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Python-nvd3 is a Python wrapper for NVD3 graph library. +NVD3 is an attempt to build re-usable charts and chart components +for d3.js without taking away the power that d3.js gives you. + +Project location : https://github.com/areski/python-nvd3 +""" + +from .NVD3Chart import NVD3Chart, TemplateMixin + + +class stackedAreaChart(TemplateMixin, NVD3Chart): + """ + The stacked area chart is identical to the area chart, except the areas are stacked + on top of each other, rather than overlapping. This can make the chart much easier to read. + + Python example:: + + from nvd3 import stackedAreaChart + chart = stackedAreaChart(name='stackedAreaChart', height=400, width=400) + + xdata = [100, 101, 102, 103, 104, 105, 106,] + ydata = [6, 11, 12, 7, 11, 10, 11] + ydata2 = [8, 20, 16, 12, 20, 28, 28] + + extra_serie = {"tooltip": {"y_start": "There is ", "y_end": " min"}} + chart.add_serie(name="Serie 1", y=ydata, x=xdata, extra=extra_serie) + chart.add_serie(name="Serie 2", y=ydata2, x=xdata, extra=extra_serie) + chart.buildhtml() + + Javascript generated: + + .. raw:: html + +
+ + + """ + + CHART_FILENAME = "./stackedareachart.html" + template_chart_nvd3 = NVD3Chart.template_environment.get_template(CHART_FILENAME) + + def __init__(self, **kwargs): + super(stackedAreaChart, self).__init__(**kwargs) + height = kwargs.get('height', 450) + width = kwargs.get('width', None) + self.model = 'stackedAreaChart' + + if kwargs.get('x_is_date', False): + self.set_date_flag(True) + self.create_x_axis('xAxis', + format=kwargs.get('x_axis_format', '%d %b %Y'), + date=True) + self.set_custom_tooltip_flag(True) + else: + self.create_x_axis('xAxis', format=kwargs.get('x_axis_format', + '.2f')) + self.create_y_axis('yAxis', format=kwargs.get('y_axis_format', '.2f')) + + self.set_graph_height(height) + if width: + self.set_graph_width(width) diff --git a/airflow/_vendor/nvd3/templates/base.html b/airflow/_vendor/nvd3/templates/base.html new file mode 100644 index 0000000000000..e2d39dd7642cb --- /dev/null +++ b/airflow/_vendor/nvd3/templates/base.html @@ -0,0 +1,35 @@ +{% block container %} +{% endblock %} + +{% block start_script %} + {% if chart.tag_script_js %} + + {% endif %} +{% endblock endscript %} diff --git a/airflow/_vendor/nvd3/templates/content.html b/airflow/_vendor/nvd3/templates/content.html new file mode 100644 index 0000000000000..787f39b555a4a --- /dev/null +++ b/airflow/_vendor/nvd3/templates/content.html @@ -0,0 +1,123 @@ +{% extends "base.html" %} +{% block container %} +{% if chart.display_container %} + {{ chart.container }} +{% endif %} +{% endblock container %} + +{% block body %} + {% block data %} + data_{{ chart.name }}={{ chart.series_js }}; + {% endblock data %} + + {% block init %} + nv.addGraph(function() { + var chart = nv.models.{{ chart.model }}(){% if chart.use_interactive_guideline %}.useInteractiveGuideline(true){% endif %}; + + chart.margin({top: {{ chart.margin_top }}, right: {{ chart.margin_right }}, bottom: {{ chart.margin_bottom }}, left: {{ chart.margin_left }}}); + + var datum = data_{{ chart.name }}; + + {% if not chart.color_list and chart.color_category %} + chart.color(d3.scale.{{ chart.color_category }}().range()); + {% endif %} + {% endblock init %} + + {% if chart.stacked %} + chart.stacked(true); + {% endif %} + + {% block focus %} + {% endblock focus %} + + + {% block axes %} + {% for axis, a in chart.axislist.items() %} + {% if a.items() %} + chart.{{ axis }} + {% for attr, value in a.items() %} + .{{ attr}}({{ value}}){% if loop.last %}; + {% endif %} + {% endfor %} + {% endif %} + {% endfor %} + {% endblock axes %} + + {# generate custom tooltip for the chart #} + {% block tooltip %} + {% if chart.custom_tooltip_flag %} + {% if not chart.date_flag %} + {% if chart.model == 'pieChart' %} + {% block pietooltip %} + {% endblock pietooltip %} + {% else %} + chart.tooltipContent(function(key, y, e, graph) { + var x = String(graph.point.x); + var y = String(graph.point.y); + {{ chart.tooltip_condition_string }} + tooltip_str = '
<center><b>'+key+'</b></center>
' + y + ' at ' + x; + return tooltip_str; + }); + {% endif %} + {% else %} + chart.tooltipContent(function(key, y, e, graph) { + var x = d3.time.format("{{ chart.charttooltip_dateformat }}")(new Date(parseInt(graph.point.x))); + var y = String(graph.point.y); + {{ chart.tooltip_condition_string }} + tooltip_str = '
<center><b>'+key+'</b></center>
' + y + ' on ' + x; + return tooltip_str; + }); + {% endif %} + {% endif %} + {% endblock tooltip %} + + {# the shape attribute in kwargs is not applied when #} + {# not allowing other shapes to be rendered #} + {% block legend %} + chart.showLegend({{chart.show_legend|lower}}); + {% endblock legend %} + + {% block custoattr %} + {# add custom chart attributes #} + {% for attr, value in chart.chart_attr.items() %} + {% if value is string and value.startswith(".") %}: + chart.{{ attr }}{{ value }}; + {% else %} + chart.{{ attr }}({{ value }}); + {% endif %} + {% endfor %} + + {% if chart.resize %} + nv.utils.windowResize(chart.update); + {% endif %} + + {# include specific subchart #} + {{ chart.jschart }} + + {% endblock custoattr %} + + {% block inject %} + {# Inject data to D3 #} + d3.select('#{{ chart.name }} svg') + .datum(datum) + .transition().duration(500) + {% if chart.width %} + .attr('width', {{ chart.width}}) + {% endif %} + {% if chart.height %} + .attr('height', {{ chart.height}}) + {% endif %} + .call(chart); + {% endblock inject %} + + {# extra chart attributes #} + {% if chart.extras %} + {{ chart.extras }} + {% endif %} + + {# closing nv.addGraph #} + {% block close %} + }); + {% endblock close %} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/cumulativelinechart.html b/airflow/_vendor/nvd3/templates/cumulativelinechart.html new file mode 100644 index 0000000000000..546a3e8e55171 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/cumulativelinechart.html @@ -0,0 +1,11 @@ +{# This template adds attributes unique + to cumulativeLineChart #} + +{% extends "content.html" %} +{% block body %} + +{# calling super guarantees everying in content is also found here ...#} +{{super()}} + +{% endblock body %} + diff --git a/airflow/_vendor/nvd3/templates/discretebarchart.html b/airflow/_vendor/nvd3/templates/discretebarchart.html new file mode 100644 index 0000000000000..2e31ae4874be0 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/discretebarchart.html @@ -0,0 +1,31 @@ +{# This is a dummy template, we can use that template to add attributes unique + to discreteBarChart #} + +{% extends "content.html" %} +{% block body %} + + {% block data %} + {{super()}} + {% endblock data %} + + {% block init %} + {{super()}} + {% endblock init %} + + {% block axes %} + {{super()}} + {% endblock axes %} + + {% block custoattr %} + {{super()}} + {% endblock custoattr %} + + {% block inject %} + {{ super() }} + {% endblock inject %} + + {% block close %} + {{ super() }} + {% endblock close %} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/linebarwfocuschart.html b/airflow/_vendor/nvd3/templates/linebarwfocuschart.html new file mode 100644 index 0000000000000..ad4866c8153f9 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/linebarwfocuschart.html @@ -0,0 +1,60 @@ +{# This template adds attributes unique + to lineChart #} + +{% extends "content.html" %} +{% block body %} + {% block data %} + data_{{ chart.name }}={{ chart.series_js }}; + {% endblock data %} + + + {% block init %} + {{super()}} + {% endblock init %} + {% block axes %} + {{super()}} + {% endblock axes %} + {% block tooltip %} + {{super()}} + {% endblock tooltip %} + + chart.showLegend({{chart.show_legend|lower}}); + + {# add custom chart attributes #} + {% for attr, value in chart.chart_attr.items() %} + {% if value is string and value.startswith(".") %}: + chart.{{ attr }}{{ value }}; + {% else %} + chart.{{ attr }}({{ value }}); + {% endif %} + {% endfor %} + + {% if chart.x_axis_format == 
'AM_PM' %} + function get_am_pm(d){ + if (d > 12) { + d = d - 12; return (String(d) + 'PM'); + } + else { + return (String(d) + 'AM'); + } + }; + {% else %} + chart.x(function(d,i) { return i }); + {% endif %} + + {% if chart.resize %} + nv.utils.windowResize(chart.update); + {% endif %} + {% block inject %} + {{super()}} + {% endblock inject %} + + {% if chart.extras %} + {{ chart.extras }} + {% endif %} + + {% block close %} + }); + {% endblock close %} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/linechart.html b/airflow/_vendor/nvd3/templates/linechart.html new file mode 100644 index 0000000000000..cf15d33041558 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/linechart.html @@ -0,0 +1,47 @@ +{# This template adds attributes unique + to lineChart #} + +{% extends "content.html" %} +{% block body %} + + {% block data %} + {{super()}} + {% endblock data %} + + {% block init %} + {{super()}} + {% endblock init %} + + {% block axes %} + {{super()}} + {% endblock axes %} + + {% if chart.x_axis_format == 'AM_PM' %} + function get_am_pm(d){ + if (d > 12) { + d = d - 12; return (String(d) + 'PM'); + } + else { + return (String(d) + 'AM'); + } + }; + {% endif %} + + {% block legend %} + {{super()}} + {% endblock legend %} + + {% block custoattr %} + {{super()}} + {% endblock custoattr %} + + {% block inject %} + {{ super() }} + {% endblock inject %} + + {% block close %} + {{ super() }} + {% endblock close %} + +{% endblock body %} + diff --git a/airflow/_vendor/nvd3/templates/lineplusbarchart.html b/airflow/_vendor/nvd3/templates/lineplusbarchart.html new file mode 100644 index 0000000000000..73aeceacd2419 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/lineplusbarchart.html @@ -0,0 +1,44 @@ +{# This template adds attributes unique + to linePlusBarChart #} + +{% extends "content.html" %} +{% block body %} + + {% block data %} + {{super()}} + {% endblock data %} + + {% block init %} + {{super()}} + {% endblock init %} + + {% block focus %} + {% if chart.focus_enable %} + chart.focusEnable(true); + {% else %} + chart.focusEnable(false); + {% endif %} + {% endblock focus %} + + {% block axes %} + {{super()}} + {% endblock axes %} + + {% block legend %} + {{super()}} + {% endblock legend %} + + {% block custoattr %} + {{super()}} + {% endblock custoattr %} + + {% block inject %} + {{ super() }} + {% endblock inject %} + + {% block close %} + {{ super() }} + {% endblock close %} + +{% endblock body %} + diff --git a/airflow/_vendor/nvd3/templates/linewfocuschart.html b/airflow/_vendor/nvd3/templates/linewfocuschart.html new file mode 100644 index 0000000000000..5abe983d7efc3 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/linewfocuschart.html @@ -0,0 +1,10 @@ +{# This template adds attributes unique + to lineWithFocusChart #} + +{% extends "content.html" %} +{% block body %} + +{# calling super guarantees everying in content is also found here ...#} +{{super()}} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/multibarchart.html b/airflow/_vendor/nvd3/templates/multibarchart.html new file mode 100644 index 0000000000000..17eae7a634fef --- /dev/null +++ b/airflow/_vendor/nvd3/templates/multibarchart.html @@ -0,0 +1,10 @@ +{# This template adds attributes unique + to multiBarChart #} + +{% extends "content.html" %} +{% block body %} + +{# calling super guarantees everying in content is also found here ...#} +{{super()}} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html 
b/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html new file mode 100644 index 0000000000000..17eae7a634fef --- /dev/null +++ b/airflow/_vendor/nvd3/templates/multibarcharthorizontal.html @@ -0,0 +1,10 @@ +{# This template adds attributes unique + to multiBarChart #} + +{% extends "content.html" %} +{% block body %} + +{# calling super guarantees everying in content is also found here ...#} +{{super()}} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/page.html b/airflow/_vendor/nvd3/templates/page.html new file mode 100644 index 0000000000000..2dd0f5d16f829 --- /dev/null +++ b/airflow/_vendor/nvd3/templates/page.html @@ -0,0 +1,12 @@ + + + + + {% for header_element in chart.header_css+chart.header_js %} + {{ header_element }} + {% endfor %} + + + {{ chart.content }} + + diff --git a/airflow/_vendor/nvd3/templates/piechart.html b/airflow/_vendor/nvd3/templates/piechart.html new file mode 100644 index 0000000000000..a200e6d4a21bb --- /dev/null +++ b/airflow/_vendor/nvd3/templates/piechart.html @@ -0,0 +1,80 @@ +{# This template adds attributes unique + to pieChart #} + +{% extends "content.html" %} +{% block body %} + + data_{{ chart.name }}={{ chart.series_js }}; + + nv.addGraph(function() { + var chart = nv.models.{{ chart.model }}(){% if chart.use_interactive_guideline %}.useInteractiveGuideline(true){% endif %}; + chart.margin({top: {{ chart.margin_top }}, right: {{ chart.margin_right }}, bottom: {{ chart.margin_bottom }}, left: {{ chart.margin_left }}}); + var datum = data_{{ chart.name }}[0].values; + + {% if not chart.color_list and chart.color_category %} + chart.color(d3.scale.{{ chart.color_category }}().range()); + {% endif %} + + chart.tooltipContent(function(key, y, e, graph) { + var x = String(key); + {{ chart.tooltip_condition_string }} + tooltip_str = '
<center><b>'+x+'</b></center>
' + y; + return tooltip_str; + }); + {# showLabels only supported in pieChart #} + chart.showLabels({{chart.show_labels|lower}}); + + {% if chart.donut %} + chart.donut(true); + chart.donutRatio({{ chart.donutRatio }}); + {% else %} + chart.donut(false); + {% endif %} + + chart.showLegend({{chart.show_legend|lower}}); + + {# add custom chart attributes #} + {% for attr, value in chart.chart_attr.items() %} + {% if value is string and value.startswith(".") %}: + chart.{{ attr }}{{ value }}; + {% else %} + chart.{{ attr }}({{ value }}); + {% endif %} + {% endfor %} + + {% if chart.resize %} + nv.utils.windowResize(chart.update); + {% endif %} + + {% if chart.color_list %} + var mycolor = new Array(); + {% for color in chart.color_list %} + mycolor[{{ loop.index - 1}}] = "{{ color }}"; + {% endfor %} + {% endif %} + + chart + .x(function(d) { return d.label }) + .y(function(d) { return d.value }); + + {% if chart.width %} + chart.width({{ chart.width }}); + {% endif %} + + {% if chart.height %} + chart.height({{ chart.height }}); + {% endif %} + + {% if chart.color_list %} + chart.color(mycolor); + {% endif %} + + {% block inject %} + {{super()}} + {% endblock inject %} + + {% block close %} + {{ super() }} + {% endblock close %} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/scatterchart.html b/airflow/_vendor/nvd3/templates/scatterchart.html new file mode 100644 index 0000000000000..8c2adaae34cee --- /dev/null +++ b/airflow/_vendor/nvd3/templates/scatterchart.html @@ -0,0 +1,52 @@ +{# This template adds attributes unique + to scatterChart #} + +{% extends "content.html" %} +{% block body %} + + {% block data %} + {{super()}} + {% endblock data %} + + {% block init %} + {{super()}} + {% endblock init %} + + {% block axes %} + {{super()}} + {% endblock axes %} + + {% if chart.x_axis_format == 'AM_PM' %} + function get_am_pm(d){ + if (d > 12) { + d = d - 12; return (String(d) + 'PM'); + } + else { + return (String(d) + 'AM'); + } + }; + {% endif %} + + {% block legend %} + {{super()}} + {% endblock legend %} + + {% block custoattr %} + {{super()}} + {% endblock custoattr %} + + {% block inject %} + + chart + .showDistX(true) + .showDistY(true) + .color(d3.scale.category10().range()); + + {{ super() }} + {% endblock inject %} + + {% block close %} + {{ super() }} + {% endblock close %} + +{% endblock body %} diff --git a/airflow/_vendor/nvd3/templates/stackedareachart.html b/airflow/_vendor/nvd3/templates/stackedareachart.html new file mode 100644 index 0000000000000..b70833d2b385d --- /dev/null +++ b/airflow/_vendor/nvd3/templates/stackedareachart.html @@ -0,0 +1,7 @@ +{# This is a dummy template, we can use that template to add attributes unique + to stackedareachart #} + +{% extends "content.html" %} +{% block body %} + {{ super() }} +{% endblock body %} diff --git a/airflow/_vendor/nvd3/translator.py b/airflow/_vendor/nvd3/translator.py new file mode 100644 index 0000000000000..ffde2c2a1cec9 --- /dev/null +++ b/airflow/_vendor/nvd3/translator.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + + +class Tag(object): + """Tag class""" + + def __init__(self, content=None): + self.content = content + self.attrs = ' '.join(['%s="%s"' % (attr, value) + for attr, value in self.attrs]) + + def __str__(self): + return '<%s%s>\n %s\n' % (self.name, + ' ' + self.attrs if self.attrs else '', + self.content, + self.name) + + +class ScriptTag(Tag): + name = 'script' + attrs = (('type', 'text/javascript'),) + + +class AnonymousFunction(object): + def __init__(self, 
arguments, content): + self.arguments = arguments + self.content = content + + def __str__(self): + return 'function(%s) { %s }' % (self.arguments, self.content) + + +class Function(object): + + def __init__(self, name): + self.name = name + self._calls = [] + + def __str__(self): + operations = [self.name] + operations.extend(str(call) for call in self._calls) + return '%s' % ('.'.join(operations),) + + def __getattr__(self, attr): + self._calls.append(attr) + return self + + def __call__(self, *args): + if not args: + self._calls[-1] = self._calls[-1] + '()' + else: + arguments = ','.join([str(arg) for arg in args]) + self._calls[-1] = self._calls[-1] + '(%s)' % (arguments,) + return self + + +class Assignment(object): + + def __init__(self, key, value, scoped=True): + self.key = key + self.value = value + self.scoped = scoped + + def __str__(self): + return '%s%s = %s;' % ('var ' if self.scoped else '', self.key, self.value) + + +def indent(func): + # TODO: Add indents to function str + return str(func) diff --git a/airflow/_vendor/slugify/LICENSE b/airflow/_vendor/slugify/LICENSE new file mode 100644 index 0000000000000..82af695f594e8 --- /dev/null +++ b/airflow/_vendor/slugify/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) Val Neekman @ Neekware Inc. http://neekware.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/airflow/_vendor/slugify/__init__.py b/airflow/_vendor/slugify/__init__.py new file mode 100644 index 0000000000000..7358b998cd543 --- /dev/null +++ b/airflow/_vendor/slugify/__init__.py @@ -0,0 +1,6 @@ +from .slugify import * + + +__author__ = 'Val Neekman @ Neekware Inc. 
[@vneekman]' +__description__ = 'A Python slugify application that also handles Unicode' +__version__ = '2.0.1' diff --git a/airflow/_vendor/slugify/slugify.py b/airflow/_vendor/slugify/slugify.py new file mode 100644 index 0000000000000..0e9886d827138 --- /dev/null +++ b/airflow/_vendor/slugify/slugify.py @@ -0,0 +1,185 @@ +import re +import unicodedata +import types +import sys + +try: + from htmlentitydefs import name2codepoint + _unicode = unicode + _unicode_type = types.UnicodeType +except ImportError: + from html.entities import name2codepoint + _unicode = str + _unicode_type = str + unichr = chr + +import text_unidecode as unidecode + +__all__ = ['slugify', 'smart_truncate'] + + +CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint)) +DECIMAL_PATTERN = re.compile(r'&#(\d+);') +HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);') +QUOTE_PATTERN = re.compile(r'[\']+') +ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+') +ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+') +DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}') +NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)') +DEFAULT_SEPARATOR = '-' + + +def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False): + """ + Truncate a string. + :param string (str): string for modification + :param max_length (int): output string length + :param word_boundary (bool): + :param save_order (bool): if True then word order of output string is like input string + :param separator (str): separator between words + :return: + """ + + string = string.strip(separator) + + if not max_length: + return string + + if len(string) < max_length: + return string + + if not word_boundary: + return string[:max_length].strip(separator) + + if separator not in string: + return string[:max_length] + + truncated = '' + for word in string.split(separator): + if word: + next_len = len(truncated) + len(word) + if next_len < max_length: + truncated += '{0}{1}'.format(word, separator) + elif next_len == max_length: + truncated += '{0}'.format(word) + break + else: + if save_order: + break + if not truncated: # pragma: no cover + truncated = string[:max_length] + return truncated.strip(separator) + + +def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False, + separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True, + replacements=()): + """ + Make a slug from the given text. + :param text (str): initial text + :param entities (bool): + :param decimal (bool): + :param hexadecimal (bool): + :param max_length (int): output string length + :param word_boundary (bool): + :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order + :param separator (str): separator between words + :param stopwords (iterable): words to discount + :param regex_pattern (str): regex pattern for allowed characters + :param lowercase (bool): activate case sensitivity by setting it to False + :param replacements (iterable): list of replacement rules e.g. 
[['|', 'or'], ['%', 'percent']] + :return (str): + """ + + # user-specific replacements + if replacements: + for old, new in replacements: + text = text.replace(old, new) + + # ensure text is unicode + if not isinstance(text, _unicode_type): + text = _unicode(text, 'utf-8', 'ignore') + + # replace quotes with dashes - pre-process + text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text) + + # decode unicode + text = unidecode.unidecode(text) + + # ensure text is still in unicode + if not isinstance(text, _unicode_type): + text = _unicode(text, 'utf-8', 'ignore') + + # character entity reference + if entities: + text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text) + + # decimal character reference + if decimal: + try: + text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text) + except Exception: + pass + + # hexadecimal character reference + if hexadecimal: + try: + text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text) + except Exception: + pass + + # translate + text = unicodedata.normalize('NFKD', text) + if sys.version_info < (3,): + text = text.encode('ascii', 'ignore') + + # make the text lowercase (optional) + if lowercase: + text = text.lower() + + # remove generated quotes -- post-process + text = QUOTE_PATTERN.sub('', text) + + # cleanup numbers + text = NUMBERS_PATTERN.sub('', text) + + # replace all other unwanted characters + if lowercase: + pattern = regex_pattern or ALLOWED_CHARS_PATTERN + else: + pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE + text = re.sub(pattern, DEFAULT_SEPARATOR, text) + + # remove redundant + text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR) + + # remove stopwords + if stopwords: + if lowercase: + stopwords_lower = [s.lower() for s in stopwords] + words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower] + else: + words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords] + text = DEFAULT_SEPARATOR.join(words) + + # finalize user-specific replacements + if replacements: + for old, new in replacements: + text = text.replace(old, new) + + # smart truncate if requested + if max_length > 0: + text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order) + + if separator != DEFAULT_SEPARATOR: + text = text.replace(DEFAULT_SEPARATOR, separator) + + return text + + +def main(): # pragma: no cover + if len(sys.argv) < 2: + print("Usage %s TEXT TO SLUGIFY" % sys.argv[0]) + else: + text = ' '.join(sys.argv[1:]) + print(slugify(text)) diff --git a/airflow/api/__init__.py b/airflow/api/__init__.py index b4a2f8f5bc36f..67cb51d199e63 100644 --- a/airflow/api/__init__.py +++ b/airflow/api/__init__.py @@ -19,13 +19,15 @@ from __future__ import print_function +from typing import Any + from airflow.exceptions import AirflowException from airflow import configuration as conf from importlib import import_module from airflow.utils.log.logging_mixin import LoggingMixin -api_auth = None +api_auth = None # type: Any log = LoggingMixin().log diff --git a/airflow/api/auth/backend/kerberos_auth.py b/airflow/api/auth/backend/kerberos_auth.py index 7e560fb296dad..1b80e2efeb0cd 100644 --- a/airflow/api/auth/backend/kerberos_auth.py +++ b/airflow/api/auth/backend/kerberos_auth.py @@ -27,7 +27,7 @@ from airflow import configuration as conf from flask import Response -from flask import _request_ctx_stack as stack +from flask import _request_ctx_stack as stack # type: ignore from flask import make_response from flask import 
request from flask import g diff --git a/airflow/api/common/experimental/delete_dag.py b/airflow/api/common/experimental/delete_dag.py index b9ce736b48dfc..365997638b64c 100644 --- a/airflow/api/common/experimental/delete_dag.py +++ b/airflow/api/common/experimental/delete_dag.py @@ -19,13 +19,22 @@ from sqlalchemy import or_ -from airflow import models, settings +from airflow import models +from airflow.models.taskfail import TaskFail +from airflow.utils.db import provide_session from airflow.exceptions import DagNotFound, DagFileExists -def delete_dag(dag_id): - session = settings.Session() - +@provide_session +def delete_dag(dag_id, keep_records_in_log=True, session=None): + """ + :param dag_id: the dag_id of the DAG to delete + :type dag_id: str + :param keep_records_in_log: whether keep records of the given dag_id + in the Log table in the backend database (for reasons like auditing). + The default value is True. + :type keep_records_in_log: bool + """ DM = models.DagModel dag = session.query(DM).filter(DM.dag_id == dag_id).first() if dag is None: @@ -39,16 +48,16 @@ def delete_dag(dag_id): count = 0 # noinspection PyUnresolvedReferences,PyProtectedMember - for m in models.Base._decl_class_registry.values(): + for m in models.base.Base._decl_class_registry.values(): if hasattr(m, "dag_id"): + if keep_records_in_log and m.__name__ == 'Log': + continue cond = or_(m.dag_id == dag_id, m.dag_id.like(dag_id + ".%")) count += session.query(m).filter(cond).delete(synchronize_session='fetch') if dag.is_subdag: p, c = dag_id.rsplit(".", 1) - for m in models.DagRun, models.TaskFail, models.TaskInstance: + for m in models.DagRun, TaskFail, models.TaskInstance: count += session.query(m).filter(m.dag_id == p, m.task_id == c).delete() - session.commit() - return count diff --git a/airflow/api/common/experimental/get_code.py b/airflow/api/common/experimental/get_code.py new file mode 100644 index 0000000000000..f082cb03837da --- /dev/null +++ b/airflow/api/common/experimental/get_code.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.exceptions import AirflowException, DagNotFound +from airflow import models, settings +from airflow.www import utils as wwwutils + + +def get_code(dag_id): + """Return python code of a given dag_id.""" + session = settings.Session() + DM = models.DagModel + dag = session.query(DM).filter(DM.dag_id == dag_id).first() + session.close() + # Check DAG exists. 
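+    # (a missing DagModel row means the dag_id is unknown to the metadata DB;
+    #  otherwise its fileloc, read below, may point inside a packaged .zip DAG,
+    #  which is why open_maybe_zipped is used rather than plain open())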
+ if dag is None: + error_message = "Dag id {} not found".format(dag_id) + raise DagNotFound(error_message) + + try: + with wwwutils.open_maybe_zipped(dag.fileloc, 'r') as f: + code = f.read() + return code + except IOError as e: + error_message = "Error {} while reading Dag id {} Code".format(str(e), dag_id) + raise AirflowException(error_message) diff --git a/airflow/api/common/experimental/get_dag_runs.py b/airflow/api/common/experimental/get_dag_runs.py index 63b1f993d392d..739905d5df903 100644 --- a/airflow/api/common/experimental/get_dag_runs.py +++ b/airflow/api/common/experimental/get_dag_runs.py @@ -22,7 +22,7 @@ from airflow.models import DagBag, DagRun -def get_dag_runs(dag_id, state=None): +def get_dag_runs(dag_id, state=None, run_url_route='Airflow.graph'): """ Returns a list of Dag Runs for a specific DAG ID. :param dag_id: String identifier of a DAG @@ -48,7 +48,7 @@ def get_dag_runs(dag_id, state=None): 'execution_date': run.execution_date.isoformat(), 'start_date': ((run.start_date or '') and run.start_date.isoformat()), - 'dag_run_url': url_for('Airflow.graph', dag_id=run.dag_id, + 'dag_run_url': url_for(run_url_route, dag_id=run.dag_id, execution_date=run.execution_date) }) diff --git a/airflow/api/common/experimental/mark_tasks.py b/airflow/api/common/experimental/mark_tasks.py index e9e4fec2235a5..b761d8dc5b594 100644 --- a/airflow/api/common/experimental/mark_tasks.py +++ b/airflow/api/common/experimental/mark_tasks.py @@ -17,15 +17,15 @@ # specific language governing permissions and limitations # under the License. +from sqlalchemy import or_ + from airflow.jobs import BackfillJob from airflow.models import DagRun, TaskInstance from airflow.operators.subdag_operator import SubDagOperator -from airflow.settings import Session from airflow.utils import timezone +from airflow.utils.db import provide_session from airflow.utils.state import State -from sqlalchemy import or_ - def _create_dagruns(dag, execution_dates, state, run_id_template): """ @@ -53,8 +53,9 @@ def _create_dagruns(dag, execution_dates, state, run_id_template): return drs +@provide_session def set_state(task, execution_date, upstream=False, downstream=False, - future=False, past=False, state=State.SUCCESS, commit=False): + future=False, past=False, state=State.SUCCESS, commit=False, session=None): """ Set the state of a task instance and if needed its relatives. Can set state for future tasks (calculated from execution_date) and retroactively @@ -70,14 +71,11 @@ def set_state(task, execution_date, upstream=False, downstream=False, :param past: Retroactively mark all tasks starting from start_date of the DAG :param state: State to which the tasks need to be set :param commit: Commit tasks to be altered to the database + :param session: database session :return: list of tasks that have been created and updated """ assert timezone.is_localized(execution_date) - # microseconds are supported by the database, but is not handled - # correctly by airflow on e.g. the filesystem and in other places - execution_date = execution_date.replace(microsecond=0) - assert task.dag is not None dag = task.dag @@ -123,7 +121,6 @@ def set_state(task, execution_date, upstream=False, downstream=False, # go through subdagoperators and create dag runs. We will only work # within the scope of the subdag. We wont propagate to the parent dag, # but we will propagate from parent to subdag. 
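+    # (dags acts as a small work queue: it is seeded with the current DAG and consumed
+    #  by the while-loop below; sub_dag_ids collects the dag_ids of any subdags found
+    #  so their task instances can be included in the queries further down)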
- session = Session() dags = [dag] sub_dag_ids = [] while len(dags) > 0: @@ -179,27 +176,80 @@ def set_state(task, execution_date, upstream=False, downstream=False, tis_altered += qry_sub_dag.with_for_update().all() for ti in tis_altered: ti.state = state - session.commit() else: tis_altered = qry_dag.all() if len(sub_dag_ids) > 0: tis_altered += qry_sub_dag.all() - session.expunge_all() - session.close() - return tis_altered -def set_dag_run_state(dag, execution_date, state=State.SUCCESS, commit=False): +@provide_session +def _set_dag_run_state(dag_id, execution_date, state, session=None): """ - Set the state of a dag run and all task instances associated with the dag - run for a specific execution date. + Helper method that set dag run state in the DB. + :param dag_id: dag_id of target dag run + :param execution_date: the execution date from which to start looking + :param state: target state + :param session: database session + """ + DR = DagRun + dr = session.query(DR).filter( + DR.dag_id == dag_id, + DR.execution_date == execution_date + ).one() + dr.state = state + if state == State.RUNNING: + dr.start_date = timezone.utcnow() + dr.end_date = None + else: + dr.end_date = timezone.utcnow() + session.merge(dr) + + +@provide_session +def set_dag_run_state_to_success(dag, execution_date, commit=False, session=None): + """ + Set the dag run for a specific execution date and its task instances + to success. :param dag: the DAG of which to alter state :param execution_date: the execution date from which to start looking - :param state: the state to which the DAG need to be set :param commit: commit DAG and tasks to be altered to the database - :return: list of tasks that have been created and updated + :param session: database session + :return: If commit is true, list of tasks that have been updated, + otherwise list of tasks that will be updated + :raises: AssertionError if dag or execution_date is invalid + """ + res = [] + + if not dag or not execution_date: + return res + + # Mark the dag run to success. + if commit: + _set_dag_run_state(dag.dag_id, execution_date, State.SUCCESS, session) + + # Mark all task instances of the dag run to success. + for task in dag.tasks: + task.dag = dag + new_state = set_state(task=task, execution_date=execution_date, + state=State.SUCCESS, commit=commit) + res.extend(new_state) + + return res + + +@provide_session +def set_dag_run_state_to_failed(dag, execution_date, commit=False, session=None): + """ + Set the dag run for a specific execution date and its running task instances + to failed. + :param dag: the DAG of which to alter state + :param execution_date: the execution date from which to start looking + :param commit: commit DAG and tasks to be altered to the database + :param session: database session + :return: If commit is true, list of tasks that have been updated, + otherwise list of tasks that will be updated :raises: AssertionError if dag or execution_date is invalid """ res = [] @@ -207,18 +257,47 @@ def set_dag_run_state(dag, execution_date, state=State.SUCCESS, commit=False): if not dag or not execution_date: return res - # Mark all task instances in the dag run + # Mark the dag run to failed. + if commit: + _set_dag_run_state(dag.dag_id, execution_date, State.FAILED, session) + + # Mark only RUNNING task instances. 
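+    # (task instances already in a terminal state keep their recorded outcome;
+    #  only instances still in State.RUNNING are flipped to FAILED below)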
+ TI = TaskInstance + task_ids = [task.task_id for task in dag.tasks] + tis = session.query(TI).filter( + TI.dag_id == dag.dag_id, + TI.execution_date == execution_date, + TI.task_id.in_(task_ids)).filter(TI.state == State.RUNNING) + task_ids_of_running_tis = [ti.task_id for ti in tis] for task in dag.tasks: + if task.task_id not in task_ids_of_running_tis: + continue task.dag = dag new_state = set_state(task=task, execution_date=execution_date, - state=state, commit=commit) + state=State.FAILED, commit=commit) res.extend(new_state) - # Mark the dag run + return res + + +@provide_session +def set_dag_run_state_to_running(dag, execution_date, commit=False, session=None): + """ + Set the dag run for a specific execution date to running. + :param dag: the DAG of which to alter state + :param execution_date: the execution date from which to start looking + :param commit: commit DAG and tasks to be altered to the database + :param session: database session + :return: If commit is true, list of tasks that have been updated, + otherwise list of tasks that will be updated + """ + res = [] + if not dag or not execution_date: + return res + + # Mark the dag run to running. if commit: - drs = DagRun.find(dag.dag_id, execution_date=execution_date) - for dr in drs: - dr.dag = dag - dr.update_state() + _set_dag_run_state(dag.dag_id, execution_date, State.RUNNING, session) + # To keep the return type consistent with the other similar functions. return res diff --git a/airflow/api/common/experimental/trigger_dag.py b/airflow/api/common/experimental/trigger_dag.py index 86be6aa544089..3989de4ebd8f0 100644 --- a/airflow/api/common/experimental/trigger_dag.py +++ b/airflow/api/common/experimental/trigger_dag.py @@ -59,7 +59,10 @@ def _trigger_dag( run_conf = None if conf: - run_conf = json.loads(conf) + if type(conf) is dict: + run_conf = conf + else: + run_conf = json.loads(conf) triggers = list() dags_to_trigger = list() diff --git a/airflow/bin/cli.py b/airflow/bin/cli.py index b56e3253276bc..422942f14944d 100644 --- a/airflow/bin/cli.py +++ b/airflow/bin/cli.py @@ -19,7 +19,7 @@ # under the License. 
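One note on the _trigger_dag change above (airflow/api/common/experimental/trigger_dag.py): conf may now arrive either as an already-parsed dict (direct Python callers) or as a JSON string (e.g. `airflow trigger_dag -c`). A small sketch of that normalization; isinstance(conf, dict) is the more idiomatic spelling of the type(conf) is dict check used in the patch.

import json


def normalize_conf(conf):
    """Return a dict for a DagRun conf that may be passed as a dict or as a JSON string."""
    if conf is None:
        return None
    if isinstance(conf, dict):
        return conf
    return json.loads(conf)


# Both call styles produce the same run_conf:
assert normalize_conf({"start": "2019-01-01"}) == normalize_conf('{"start": "2019-01-01"}')

This keeps the experimental REST API (which hands over JSON strings) and in-process Python callers (which hand over dicts) on the same code path.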
from __future__ import print_function -from backports.configparser import NoSectionError +import importlib import logging import os @@ -33,7 +33,7 @@ import reprlib import argparse from builtins import input -from collections import namedtuple + from airflow.utils.timezone import parse as parsedate import json from tabulate import tabulate @@ -48,6 +48,7 @@ import psutil import re from urllib.parse import urlunparse +from typing import Any import airflow from airflow import api @@ -55,13 +56,11 @@ from airflow import configuration as conf from airflow.exceptions import AirflowException, AirflowWebServerTimeout from airflow.executors import GetDefaultExecutor -from airflow.models import (DagModel, DagBag, TaskInstance, - DagPickle, DagRun, Variable, DagStat, - Connection, DAG) - +from airflow.models import DagModel, DagBag, TaskInstance, DagRun, Variable, DAG +from airflow.models.connection import Connection +from airflow.models.dagpickle import DagPickle from airflow.ti_deps.dep_context import (DepContext, SCHEDULER_DEPS) -from airflow.utils import cli as cli_utils -from airflow.utils import db as db_utils +from airflow.utils import cli as cli_utils, db from airflow.utils.net import get_hostname from airflow.utils.log.logging_mixin import (LoggingMixin, redirect_stderr, redirect_stdout) @@ -70,16 +69,20 @@ from airflow.www_rbac.app import create_app as create_app_rbac from airflow.www_rbac.app import cached_appbuilder -from sqlalchemy import func from sqlalchemy.orm import exc api.load_auth() -api_module = import_module(conf.get('cli', 'api_client')) +api_module = import_module(conf.get('cli', 'api_client')) # type: Any api_client = api_module.Client(api_base_url=conf.get('cli', 'endpoint_url'), auth=api.api_auth.client_auth) log = LoggingMixin().log +DAGS_FOLDER = settings.DAGS_FOLDER + +if "BUILDING_AIRFLOW_DOCS" in os.environ: + DAGS_FOLDER = '[AIRFLOW_HOME]/dags' + def sigint_handler(sig, frame): sys.exit(0) @@ -116,24 +119,20 @@ def setup_logging(filename): def setup_locations(process, pid=None, stdout=None, stderr=None, log=None): if not stderr: - stderr = os.path.join(os.path.expanduser(settings.AIRFLOW_HOME), - 'airflow-{}.err'.format(process)) + stderr = os.path.join(settings.AIRFLOW_HOME, 'airflow-{}.err'.format(process)) if not stdout: - stdout = os.path.join(os.path.expanduser(settings.AIRFLOW_HOME), - 'airflow-{}.out'.format(process)) + stdout = os.path.join(settings.AIRFLOW_HOME, 'airflow-{}.out'.format(process)) if not log: - log = os.path.join(os.path.expanduser(settings.AIRFLOW_HOME), - 'airflow-{}.log'.format(process)) + log = os.path.join(settings.AIRFLOW_HOME, 'airflow-{}.log'.format(process)) if not pid: - pid = os.path.join(os.path.expanduser(settings.AIRFLOW_HOME), - 'airflow-{}.pid'.format(process)) + pid = os.path.join(settings.AIRFLOW_HOME, 'airflow-{}.pid'.format(process)) return pid, stdout, stderr, log def process_subdir(subdir): if subdir: - subdir = subdir.replace('DAGS_FOLDER', settings.DAGS_FOLDER) + subdir = subdir.replace('DAGS_FOLDER', DAGS_FOLDER) subdir = os.path.abspath(os.path.expanduser(subdir)) return subdir @@ -215,6 +214,7 @@ def backfill(args, dag=None): verbose=args.verbose, conf=run_conf, rerun_failed_tasks=args.rerun_failed_tasks, + run_backwards=args.run_backwards ) @@ -267,6 +267,7 @@ def _tabulate(pools): tablefmt="fancy_grid") try: + imp = getattr(args, 'import') if args.get is not None: pools = [api_client.get_pool(name=args.get)] elif args.set: @@ -275,6 +276,14 @@ def _tabulate(pools): description=args.set[2])] elif args.delete: pools 
= [api_client.delete_pool(name=args.delete)] + elif imp: + if os.path.exists(imp): + pools = pool_import_helper(imp) + else: + print("Missing pools file.") + pools = api_client.get_pools() + elif args.export: + pools = pool_export_helper(args.export) else: pools = api_client.get_pools() except (AirflowException, IOError) as err: @@ -283,6 +292,43 @@ def _tabulate(pools): log.info(_tabulate(pools=pools)) +def pool_import_helper(filepath): + with open(filepath, 'r') as poolfile: + pl = poolfile.read() + try: + d = json.loads(pl) + except Exception as e: + print("Please check the validity of the json file: " + str(e)) + else: + try: + pools = [] + n = 0 + for k, v in d.items(): + if isinstance(v, dict) and len(v) == 2: + pools.append(api_client.create_pool(name=k, + slots=v["slots"], + description=v["description"])) + n += 1 + else: + pass + except Exception: + pass + finally: + print("{} of {} pool(s) successfully updated.".format(n, len(d))) + return pools + + +def pool_export_helper(filepath): + pool_dict = {} + pools = api_client.get_pools() + for pool in pools: + pool_dict[pool[0]] = {"slots": pool[1], "description": pool[2]} + with open(filepath, 'w') as poolfile: + poolfile.write(json.dumps(pool_dict, sort_keys=True, indent=4)) + print("{} pools successfully exported to {}".format(len(pool_dict), filepath)) + return pools + + @cli_utils.action_logging def variables(args): if args.get: @@ -294,10 +340,8 @@ def variables(args): except ValueError as e: print(e) if args.delete: - session = settings.Session() - session.query(Variable).filter_by(key=args.delete).delete() - session.commit() - session.close() + with db.create_session() as session: + session.query(Variable).filter_by(key=args.delete).delete() if args.set: Variable.set(args.set[0], args.set[1]) # Work around 'import' as a reserved keyword @@ -311,10 +355,10 @@ def variables(args): export_helper(args.export) if not (args.set or args.get or imp or args.export or args.delete): # list all variables - session = settings.Session() - vars = session.query(Variable) - msg = "\n".join(var.key for var in vars) - print(msg) + with db.create_session() as session: + vars = session.query(Variable) + msg = "\n".join(var.key for var in vars) + print(msg) def import_helper(filepath): @@ -341,19 +385,17 @@ def import_helper(filepath): def export_helper(filepath): - session = settings.Session() - qry = session.query(Variable).all() - session.close() - var_dict = {} - d = json.JSONDecoder() - for var in qry: - val = None - try: - val = d.decode(var.val) - except Exception: - val = var.val - var_dict[var.key] = val + with db.create_session() as session: + qry = session.query(Variable).all() + + d = json.JSONDecoder() + for var in qry: + try: + val = d.decode(var.val) + except Exception: + val = var.val + var_dict[var.key] = val with open(filepath, 'w') as varfile: varfile.write(json.dumps(var_dict, sort_keys=True, indent=4)) @@ -373,14 +415,12 @@ def unpause(args, dag=None): def set_is_paused(is_paused, args, dag=None): dag = dag or get_dag(args) - session = settings.Session() - dm = session.query(DagModel).filter( - DagModel.dag_id == dag.dag_id).first() - dm.is_paused = is_paused - session.commit() + with db.create_session() as session: + dm = session.query(DagModel).filter(DagModel.dag_id == dag.dag_id).first() + dm.is_paused = is_paused + session.commit() - msg = "Dag: {}, paused: {}".format(dag, str(dag.is_paused)) - print(msg) + print("Dag: {}, paused: {}".format(dag, str(dag.is_paused))) def _run(args, dag, ti): @@ -406,14 +446,12 @@ def 
_run(args, dag, ti): if args.ship_dag: try: # Running remotely, so pickling the DAG - session = settings.Session() - pickle = DagPickle(dag) - session.add(pickle) - session.commit() - pickle_id = pickle.id - # TODO: This should be written to a log - print('Pickled dag {dag} as pickle_id:{pickle_id}' - .format(**locals())) + with db.create_session() as session: + pickle = DagPickle(dag) + session.add(pickle) + pickle_id = pickle.id + # TODO: This should be written to a log + print('Pickled dag {dag} as pickle_id:{pickle_id}'.format(**locals())) except Exception as e: print('Could not pickle the DAG') print(e) @@ -450,19 +488,7 @@ def run(args, dag=None): if os.path.exists(args.cfg_path): os.remove(args.cfg_path) - # Do not log these properties since some may contain passwords. - # This may also set default values for database properties like - # core.sql_alchemy_pool_size - # core.sql_alchemy_pool_recycle - for section, config in conf_dict.items(): - for option, value in config.items(): - try: - conf.set(section, option, value) - except NoSectionError: - log.error('Section {section} Option {option} ' - 'does not exist in the config!'.format(section=section, - option=option)) - + conf.conf.read_dict(conf_dict, source=args.cfg_path) settings.configure_vars() # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave @@ -474,13 +500,12 @@ def run(args, dag=None): if not args.pickle and not dag: dag = get_dag(args) elif not dag: - session = settings.Session() - log.info('Loading pickle id {args.pickle}'.format(args=args)) - dag_pickle = session.query( - DagPickle).filter(DagPickle.id == args.pickle).first() - if not dag_pickle: - raise AirflowException("Who hid the pickle!? [missing pickle]") - dag = dag_pickle.pickle + with db.create_session() as session: + log.info('Loading pickle id %s', args.pickle) + dag_pickle = session.query(DagPickle).filter(DagPickle.id == args.pickle).first() + if not dag_pickle: + raise AirflowException("Who hid the pickle!? [missing pickle]") + dag = dag_pickle.pickle task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) @@ -551,6 +576,42 @@ def dag_state(args): print(dr[0].state if len(dr) > 0 else None) +@cli_utils.action_logging +def next_execution(args): + """ + Returns the next execution datetime of a DAG at the command line. + >>> airflow next_execution tutorial + 2018-08-31 10:38:00 + """ + dag = get_dag(args) + + if dag.is_paused: + print("[INFO] Please be reminded this DAG is PAUSED now.") + + if dag.latest_execution_date: + next_execution_dttm = dag.following_schedule(dag.latest_execution_date) + + if next_execution_dttm is None: + print("[WARN] No following schedule can be found. " + + "This DAG may have schedule interval '@once' or `None`.") + + print(next_execution_dttm) + else: + print("[WARN] Only applicable when there is execution record found for the DAG.") + print(None) + + +@cli_utils.action_logging +def rotate_fernet_key(args): + session = settings.Session() + for conn in session.query(Connection).filter( + Connection.is_encrypted | Connection.is_extra_encrypted): + conn.rotate_fernet_key() + for var in session.query(Variable).filter(Variable.is_encrypted): + var.rotate_fernet_key() + session.commit() + + @cli_utils.action_logging def list_dags(args): dagbag = DagBag(process_subdir(args.subdir)) @@ -578,6 +639,11 @@ def list_tasks(args, dag=None): @cli_utils.action_logging def test(args, dag=None): + # We want log outout from operators etc to show up here. 
Normally + # airflow.task would redirect to a file, but here we want it to propagate + # up to the normal airflow handler. + logging.getLogger('airflow.task').propagate = True + dag = dag or get_dag(args) task = dag.get_task(task_id=args.task_id) @@ -587,10 +653,20 @@ def test(args, dag=None): task.params.update(passed_in_params) ti = TaskInstance(task, args.execution_date) - if args.dry_run: - ti.dry_run() - else: - ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True) + try: + if args.dry_run: + ti.dry_run() + else: + ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True) + except Exception: + if args.post_mortem: + try: + debugger = importlib.import_module("ipdb") + except ImportError: + debugger = importlib.import_module("pdb") + debugger.post_mortem() + else: + raise @cli_utils.action_logging @@ -629,7 +705,9 @@ def clear(args): only_failed=args.only_failed, only_running=args.only_running, confirm_prompt=not args.no_confirm, - include_subdags=not args.exclude_subdags) + include_subdags=not args.exclude_subdags, + include_parentdag=not args.exclude_parentdag, + ) def get_num_ready_workers_running(gunicorn_master_proc): @@ -783,13 +861,19 @@ def webserver(args): print( "Starting the web server on port {0} and host {1}.".format( args.port, args.hostname)) - app = create_app_rbac(conf) if settings.RBAC else create_app(conf) - app.run(debug=True, port=args.port, host=args.hostname, + if settings.RBAC: + app, _ = create_app_rbac(None, testing=conf.get('core', 'unit_test_mode')) + else: + app = create_app(None, testing=conf.get('core', 'unit_test_mode')) + app.run(debug=True, use_reloader=False if app.config['TESTING'] else True, + port=args.port, host=args.hostname, ssl_context=(ssl_cert, ssl_key) if ssl_cert and ssl_key else None) else: - app = cached_app_rbac(conf) if settings.RBAC else cached_app(conf) + os.environ['SKIP_DAGS_PARSING'] = 'True' + app = cached_app_rbac(None) if settings.RBAC else cached_app(None) pid, stdout, stderr, log_file = setup_locations( "webserver", args.pid, args.stdout, args.stderr, args.log_file) + os.environ.pop('SKIP_DAGS_PARSING') if args.daemon: handle = setup_logging(log_file) stdout = open(stdout, 'w+') @@ -951,17 +1035,27 @@ def worker(args): env = os.environ.copy() env['AIRFLOW_HOME'] = settings.AIRFLOW_HOME + if not settings.validate_session(): + log = LoggingMixin().log + log.error("Worker exiting... database connection precheck failed! ") + sys.exit(1) + # Celery worker from airflow.executors.celery_executor import app as celery_app from celery.bin import worker + autoscale = args.autoscale + if autoscale is None and conf.has_option("celery", "worker_autoscale"): + autoscale = conf.get("celery", "worker_autoscale") worker = worker.worker(app=celery_app) options = { 'optimization': 'fair', 'O': 'fair', 'queues': args.queues, 'concurrency': args.concurrency, + 'autoscale': autoscale, 'hostname': args.celery_hostname, + 'loglevel': conf.get('core', 'LOGGING_LEVEL'), } if args.daemon: @@ -999,17 +1093,16 @@ def worker(args): def initdb(args): # noqa print("DB: " + repr(settings.engine.url)) - db_utils.initdb(settings.RBAC) + db.initdb(settings.RBAC) print("Done.") -@cli_utils.action_logging def resetdb(args): print("DB: " + repr(settings.engine.url)) if args.yes or input("This will drop existing tables " "if they exist. Proceed? 
" "(y/n)").upper() == "Y": - db_utils.resetdb(settings.RBAC) + db.resetdb(settings.RBAC) else: print("Bail.") @@ -1017,19 +1110,7 @@ def resetdb(args): @cli_utils.action_logging def upgradedb(args): # noqa print("DB: " + repr(settings.engine.url)) - db_utils.upgradedb() - - # Populate DagStats table - session = settings.Session() - ds_rows = session.query(DagStat).count() - if not ds_rows: - qry = ( - session.query(DagRun.dag_id, DagRun.state, func.count('*')) - .group_by(DagRun.dag_id, DagRun.state) - ) - for dag_id, state, count in qry: - session.add(DagStat(dag_id=dag_id, state=state, count=count)) - session.commit() + db.upgradedb() @cli_utils.action_logging @@ -1056,20 +1137,20 @@ def connections(args): print(msg) return - session = settings.Session() - conns = session.query(Connection.conn_id, Connection.conn_type, - Connection.host, Connection.port, - Connection.is_encrypted, - Connection.is_extra_encrypted, - Connection.extra).all() - conns = [map(reprlib.repr, conn) for conn in conns] - msg = tabulate(conns, ['Conn Id', 'Conn Type', 'Host', 'Port', - 'Is Encrypted', 'Is Extra Encrypted', 'Extra'], - tablefmt="fancy_grid") - if sys.version_info[0] < 3: - msg = msg.encode('utf-8') - print(msg) - return + with db.create_session() as session: + conns = session.query(Connection.conn_id, Connection.conn_type, + Connection.host, Connection.port, + Connection.is_encrypted, + Connection.is_extra_encrypted, + Connection.extra).all() + conns = [map(reprlib.repr, conn) for conn in conns] + msg = tabulate(conns, ['Conn Id', 'Conn Type', 'Host', 'Port', + 'Is Encrypted', 'Is Extra Encrypted', 'Extra'], + tablefmt="fancy_grid") + if sys.version_info[0] < 3: + msg = msg.encode('utf-8') + print(msg) + return if args.delete: # Check that only the `conn_id` arg was passed to the command @@ -1089,31 +1170,30 @@ def connections(args): 'the --conn_id flag.\n') return - session = settings.Session() - try: - to_delete = (session - .query(Connection) - .filter(Connection.conn_id == args.conn_id) - .one()) - except exc.NoResultFound: - msg = '\n\tDid not find a connection with `conn_id`={conn_id}\n' - msg = msg.format(conn_id=args.conn_id) - print(msg) - return - except exc.MultipleResultsFound: - msg = ('\n\tFound more than one connection with ' + - '`conn_id`={conn_id}\n') - msg = msg.format(conn_id=args.conn_id) - print(msg) + with db.create_session() as session: + try: + to_delete = (session + .query(Connection) + .filter(Connection.conn_id == args.conn_id) + .one()) + except exc.NoResultFound: + msg = '\n\tDid not find a connection with `conn_id`={conn_id}\n' + msg = msg.format(conn_id=args.conn_id) + print(msg) + return + except exc.MultipleResultsFound: + msg = ('\n\tFound more than one connection with ' + + '`conn_id`={conn_id}\n') + msg = msg.format(conn_id=args.conn_id) + print(msg) + return + else: + deleted_conn_id = to_delete.conn_id + session.delete(to_delete) + msg = '\n\tSuccessfully deleted `conn_id`={conn_id}\n' + msg = msg.format(conn_id=deleted_conn_id) + print(msg) return - else: - deleted_conn_id = to_delete.conn_id - session.delete(to_delete) - session.commit() - msg = '\n\tSuccessfully deleted `conn_id`={conn_id}\n' - msg = msg.format(conn_id=deleted_conn_id) - print(msg) - return if args.add: # Check that the conn_id and conn_uri args were passed to the command: @@ -1152,26 +1232,25 @@ def connections(args): if args.conn_extra is not None: new_conn.set_extra(args.conn_extra) - session = settings.Session() - if not (session.query(Connection) - .filter(Connection.conn_id == 
new_conn.conn_id).first()): - session.add(new_conn) - session.commit() - msg = '\n\tSuccessfully added `conn_id`={conn_id} : {uri}\n' - msg = msg.format(conn_id=new_conn.conn_id, - uri=args.conn_uri or - urlunparse((args.conn_type, - '{login}:{password}@{host}:{port}' - .format(login=args.conn_login or '', - password=args.conn_password or '', - host=args.conn_host or '', - port=args.conn_port or ''), - args.conn_schema or '', '', '', ''))) - print(msg) - else: - msg = '\n\tA connection with `conn_id`={conn_id} already exists\n' - msg = msg.format(conn_id=new_conn.conn_id) - print(msg) + with db.create_session() as session: + if not (session.query(Connection) + .filter(Connection.conn_id == new_conn.conn_id).first()): + session.add(new_conn) + msg = '\n\tSuccessfully added `conn_id`={conn_id} : {uri}\n' + msg = msg.format(conn_id=new_conn.conn_id, + uri=args.conn_uri or + urlunparse((args.conn_type, + '{login}:{password}@{host}:{port}' + .format(login=args.conn_login or '', + password=args.conn_password or '', + host=args.conn_host or '', + port=args.conn_port or ''), + args.conn_schema or '', '', '', ''))) + print(msg) + else: + msg = '\n\tA connection with `conn_id`={conn_id} already exists\n' + msg = msg.format(conn_id=new_conn.conn_id) + print(msg) return @@ -1189,6 +1268,10 @@ def flower(args): if args.url_prefix: url_prefix = '--url-prefix=' + args.url_prefix + basic_auth = '' + if args.basic_auth: + basic_auth = '--basic_auth=' + args.basic_auth + flower_conf = '' if args.flower_conf: flower_conf = '--conf=' + args.flower_conf @@ -1210,7 +1293,7 @@ def flower(args): with ctx: os.execvp("flower", ['flower', '-b', - broka, address, port, api, flower_conf, url_prefix]) + broka, address, port, api, flower_conf, url_prefix, basic_auth]) stdout.close() stderr.close() @@ -1219,7 +1302,7 @@ def flower(args): signal.signal(signal.SIGTERM, sigint_handler) os.execvp("flower", ['flower', '-b', - broka, address, port, api, flower_conf, url_prefix]) + broka, address, port, api, flower_conf, url_prefix, basic_auth]) @cli_utils.action_logging @@ -1243,12 +1326,12 @@ def kerberos(args): # noqa ) with ctx: - airflow.security.kerberos.run() + airflow.security.kerberos.run(principal=args.principal, keytab=args.keytab) stdout.close() stderr.close() else: - airflow.security.kerberos.run() + airflow.security.kerberos.run(principal=args.principal, keytab=args.keytab) @cli_utils.action_logging @@ -1280,6 +1363,9 @@ def create_user(args): if password != password_confirmation: raise SystemExit('Passwords did not match!') + if appbuilder.sm.find_user(args.username): + print('{} already exist in the db'.format(args.username)) + return user = appbuilder.sm.add_user(args.username, args.firstname, args.lastname, args.email, role, password) if user: @@ -1288,9 +1374,110 @@ def create_user(args): raise SystemExit('Failed to create user.') -Arg = namedtuple( - 'Arg', ['flags', 'help', 'action', 'default', 'nargs', 'type', 'choices', 'metavar']) -Arg.__new__.__defaults__ = (None, None, None, None, None, None, None) +@cli_utils.action_logging +def delete_user(args): + if not args.username: + raise SystemExit('Required arguments are missing: username') + + appbuilder = cached_appbuilder() + + try: + u = next(u for u in appbuilder.sm.get_all_users() if u.username == args.username) + except StopIteration: + raise SystemExit('{} is not a valid user.'.format(args.username)) + + if appbuilder.sm.del_register_user(u): + print('User {} deleted.'.format(args.username)) + else: + raise SystemExit('Failed to delete user.') + + 
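For reference, the pool_import_helper / pool_export_helper functions added earlier in this cli.py diff round-trip pools through a JSON file keyed by pool name, where each entry must contain exactly a "slots" and a "description" field (anything else is skipped on import). A hedged sketch of producing such a file; the file name and pool values are illustrative only.

import json

pools = {
    "default": {"slots": 128, "description": "default pool"},
    "etl": {"slots": 4, "description": "limit concurrent ETL tasks"},
}

# Writes a file in the shape `airflow pool -i pools.json` expects,
# and in the same format `airflow pool -e pools.json` exports.
with open("pools.json", "w") as poolfile:
    poolfile.write(json.dumps(pools, sort_keys=True, indent=4))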
+@cli_utils.action_logging +def list_users(args): + appbuilder = cached_appbuilder() + users = appbuilder.sm.get_all_users() + fields = ['id', 'username', 'email', 'first_name', 'last_name', 'roles'] + users = [[user.__getattribute__(field) for field in fields] for user in users] + msg = tabulate(users, [field.capitalize().replace('_', ' ') for field in fields], + tablefmt="fancy_grid") + if sys.version_info[0] < 3: + msg = msg.encode('utf-8') + print(msg) + + +@cli_utils.action_logging +def list_dag_runs(args, dag=None): + if dag: + args.dag_id = dag.dag_id + + dagbag = DagBag() + + if args.dag_id not in dagbag.dags: + error_message = "Dag id {} not found".format(args.dag_id) + raise AirflowException(error_message) + + dag_runs = list() + state = args.state.lower() if args.state else None + for run in DagRun.find(dag_id=args.dag_id, + state=state, + no_backfills=args.no_backfill): + dag_runs.append({ + 'id': run.id, + 'run_id': run.run_id, + 'state': run.state, + 'dag_id': run.dag_id, + 'execution_date': run.execution_date.isoformat(), + 'start_date': ((run.start_date or '') and + run.start_date.isoformat()), + }) + if not dag_runs: + print('No dag runs for {dag_id}'.format(dag_id=args.dag_id)) + + s = textwrap.dedent("""\n + {line} + DAG RUNS + {line} + {dag_run_header} + """) + + dag_runs.sort(key=lambda x: x['execution_date'], reverse=True) + dag_run_header = '%-3s | %-20s | %-10s | %-20s | %-20s |' % ('id', + 'run_id', + 'state', + 'execution_date', + 'state_date') + print(s.format(dag_run_header=dag_run_header, + line='-' * 120)) + for dag_run in dag_runs: + record = '%-3s | %-20s | %-10s | %-20s | %-20s |' % (dag_run['id'], + dag_run['run_id'], + dag_run['state'], + dag_run['execution_date'], + dag_run['start_date']) + print(record) + + +@cli_utils.action_logging +def sync_perm(args): # noqa + if settings.RBAC: + appbuilder = cached_appbuilder() + print('Update permission, view-menu for all existing roles') + appbuilder.sm.sync_roles() + else: + print('The sync_perm command only works for rbac UI.') + + +class Arg(object): + def __init__(self, flags=None, help=None, action=None, default=None, nargs=None, + type=None, choices=None, metavar=None): + self.flags = flags + self.help = help + self.action = action + self.default = default + self.nargs = nargs + self.type = type + self.choices = choices + self.metavar = metavar class CLIFactory(object): @@ -1306,8 +1493,10 @@ class CLIFactory(object): "The regex to filter specific task_ids to backfill (optional)"), 'subdir': Arg( ("-sd", "--subdir"), - "File location or directory from which to look for the dag", - default=settings.DAGS_FOLDER), + "File location or directory from which to look for the dag. " + "Defaults to '[AIRFLOW_HOME]/dags' where [AIRFLOW_HOME] is the " + "value you set for 'AIRFLOW_HOME' config you set in 'airflow.cfg' ", + default=DAGS_FOLDER), 'start_date': Arg( ("-s", "--start_date"), "Override start_date YYYY-MM-DD", type=parsedate), @@ -1334,6 +1523,19 @@ class CLIFactory(object): "Do not prompt to confirm reset. 
Use with care!", "store_true", default=False), + 'username': Arg( + ('-u', '--username',), + help='Username of the user', + type=str), + + # list_dag_runs + 'no_backfill': Arg( + ("--no_backfill",), + "filter all the backfill dagruns given the dag id", "store_true"), + 'state': Arg( + ("--state",), + "Only list the dag runs corresponding to the state" + ), # backfill 'mark_success': Arg( @@ -1388,6 +1590,13 @@ class CLIFactory(object): "all the failed tasks for the backfill date range " "instead of throwing exceptions"), "store_true"), + 'run_backwards': Arg( + ("-B", "--run_backwards",), + ( + "if set, the backfill will run tasks from the most " + "recent day first. if there are tasks that depend_on_past " + "this option will throw an exception"), + "store_true"), # list_tasks 'tree': Arg(("-t", "--tree"), "Tree view", "store_true"), @@ -1409,6 +1618,10 @@ class CLIFactory(object): 'exclude_subdags': Arg( ("-x", "--exclude_subdags"), "Exclude subdags", "store_true"), + 'exclude_parentdag': Arg( + ("-xp", "--exclude_parentdag"), + "Exclude ParentDAGS if the task cleared is a part of a SubDAG", + "store_true"), 'dag_regex': Arg( ("-dx", "--dag_regex"), "Search dag_id as regex instead of exact string", "store_true"), @@ -1434,6 +1647,14 @@ class CLIFactory(object): ("-x", "--delete"), metavar="NAME", help="Delete a pool"), + 'pool_import': Arg( + ("-i", "--import"), + metavar="FILEPATH", + help="Import pool from JSON file"), + 'pool_export': Arg( + ("-e", "--export"), + metavar="FILEPATH", + help="Export pool to JSON file"), # variables 'set': Arg( ("-s", "--set"), @@ -1467,8 +1688,7 @@ class CLIFactory(object): help="Delete a variable"), # kerberos 'principal': Arg( - ("principal",), "kerberos principal", - nargs='?', default=conf.get('kerberos', 'principal')), + ("principal",), "kerberos principal", nargs='?'), 'keytab': Arg( ("-kt", "--keytab"), "keytab", nargs='?', default=conf.get('kerberos', 'keytab')), @@ -1616,9 +1836,20 @@ class CLIFactory(object): ("-u", "--url_prefix"), default=conf.get('celery', 'FLOWER_URL_PREFIX'), help="URL prefix for Flower"), + 'flower_basic_auth': Arg( + ("-ba", "--basic_auth"), + default=conf.get('celery', 'FLOWER_BASIC_AUTH'), + help=("Securing Flower with Basic Authentication. " + "Accepts user:password pairs separated by a comma. " + "Example: flower_basic_auth = user1:password1,user2:password2")), 'task_params': Arg( ("-tp", "--task_params"), help="Sends a JSON params dict to the task"), + 'post_mortem': Arg( + ("-pm", "--post_mortem"), + action="store_true", + help="Open debugger on uncaught exception", + ), # connections 'list_connections': Arg( ('-l', '--list'), @@ -1686,10 +1917,6 @@ class CLIFactory(object): ('-e', '--email',), help='Email of the user', type=str), - 'username': Arg( - ('-u', '--username',), - help='Username of the user', - type=str), 'password': Arg( ('-p', '--password',), help='Password of the user', @@ -1699,6 +1926,9 @@ class CLIFactory(object): help='Do not prompt for password. Use random string instead', default=False, action='store_true'), + 'autoscale': Arg( + ('-a', '--autoscale'), + help="Minimum and Maximum number of worker to autoscale"), } subparsers = ( { @@ -1707,7 +1937,7 @@ class CLIFactory(object): "If reset_dag_run option is used," " backfill will first prompt users whether airflow " "should clear all the previous dag_run and task_instances " - "within the backfill date range." + "within the backfill date range. 
" "If rerun_failed_tasks is used, backfill " "will auto re-run the previous failed task instances" " within the backfill date range.", @@ -1716,7 +1946,16 @@ class CLIFactory(object): 'mark_success', 'local', 'donot_pickle', 'bf_ignore_dependencies', 'bf_ignore_first_depends_on_past', 'subdir', 'pool', 'delay_on_limit', 'dry_run', 'verbose', 'conf', - 'reset_dag_run', 'rerun_failed_tasks', + 'reset_dag_run', 'rerun_failed_tasks', 'run_backwards' + ) + }, { + 'func': list_dag_runs, + 'help': "List dag runs given a DAG id. If state option is given, it will only" + "search for all the dagruns with the given state. " + "If no_backfill option is given, it will filter out" + "all backfill dagruns for given dag id.", + 'args': ( + 'dag_id', 'no_backfill', 'state' ) }, { 'func': list_tasks, @@ -1728,7 +1967,7 @@ class CLIFactory(object): 'args': ( 'dag_id', 'task_regex', 'start_date', 'end_date', 'subdir', 'upstream', 'downstream', 'no_confirm', 'only_failed', - 'only_running', 'exclude_subdags', 'dag_regex'), + 'only_running', 'exclude_subdags', 'exclude_parentdag', 'dag_regex'), }, { 'func': pause, 'help': "Pause a DAG", @@ -1748,7 +1987,7 @@ class CLIFactory(object): }, { 'func': pool, 'help': "CRUD operations on pools", - "args": ('pool_set', 'pool_get', 'pool_delete'), + "args": ('pool_set', 'pool_get', 'pool_delete', 'pool_import', 'pool_export'), }, { 'func': variables, 'help': "CRUD operations on variables", @@ -1806,7 +2045,7 @@ class CLIFactory(object): "dependencies or recording its state in the database."), 'args': ( 'dag_id', 'task_id', 'execution_date', 'subdir', 'dry_run', - 'task_params'), + 'task_params', 'post_mortem'), }, { 'func': webserver, 'help': "Start a Airflow webserver instance", @@ -1831,12 +2070,12 @@ class CLIFactory(object): 'func': worker, 'help': "Start a Celery worker node", 'args': ('do_pickle', 'queues', 'concurrency', 'celery_hostname', - 'pid', 'daemon', 'stdout', 'stderr', 'log_file'), + 'pid', 'daemon', 'stdout', 'stderr', 'log_file', 'autoscale'), }, { 'func': flower, 'help': "Start a Celery Flower", 'args': ('flower_hostname', 'flower_port', 'flower_conf', 'flower_url_prefix', - 'broker_api', 'pid', 'daemon', 'stdout', 'stderr', 'log_file'), + 'flower_basic_auth', 'broker_api', 'pid', 'daemon', 'stdout', 'stderr', 'log_file'), }, { 'func': version, 'help': "Show the version", @@ -1848,14 +2087,39 @@ class CLIFactory(object): 'conn_id', 'conn_uri', 'conn_extra') + tuple(alternative_conn_specs), }, { 'func': create_user, - 'help': "Create an admin account", + 'help': "Create an account for the Web UI (FAB-based)", 'args': ('role', 'username', 'email', 'firstname', 'lastname', 'password', 'use_random_password'), + }, { + 'func': delete_user, + 'help': "Delete an account for the Web UI", + 'args': ('username',), + }, { + 'func': list_users, + 'help': "List accounts for the Web UI", + 'args': tuple(), + }, + { + 'func': sync_perm, + 'help': "Update existing role's permissions.", + 'args': tuple(), + }, + { + 'func': next_execution, + 'help': "Get the next execution datetime of a DAG.", + 'args': ('dag_id', 'subdir') + }, + { + 'func': rotate_fernet_key, + 'help': 'Rotate all encrypted connection credentials and variables; see ' + 'https://airflow.readthedocs.io/en/stable/howto/secure-connections.html' + '#rotating-encryption-keys.', + 'args': (), }, ) subparsers_dict = {sp['func'].__name__: sp for sp in subparsers} dag_subparsers = ( - 'list_tasks', 'backfill', 'test', 'run', 'pause', 'unpause') + 'list_tasks', 'backfill', 'test', 'run', 'pause', 'unpause', 
'list_dag_runs') @classmethod def get_parser(cls, dag_parser=False): @@ -1873,8 +2137,8 @@ def get_parser(cls, dag_parser=False): continue arg = cls.args[arg] kwargs = { - f: getattr(arg, f) - for f in arg._fields if f != 'flags' and getattr(arg, f)} + f: v + for f, v in vars(arg).items() if f != 'flags' and v} sp.add_argument(*arg.flags, **kwargs) sp.set_defaults(func=sub['func']) return parser diff --git a/airflow/config_templates/airflow_local_settings.py b/airflow/config_templates/airflow_local_settings.py index 95150ab3bbbef..a25f56d076647 100644 --- a/airflow/config_templates/airflow_local_settings.py +++ b/airflow/config_templates/airflow_local_settings.py @@ -18,8 +18,10 @@ # under the License. import os +from typing import Dict, Any from airflow import configuration as conf +from airflow.utils.file import mkdirs # TODO: Logging format and level should be configured # in this file instead of from airflow.cfg. Currently @@ -38,7 +40,11 @@ PROCESSOR_LOG_FOLDER = conf.get('scheduler', 'CHILD_PROCESS_LOG_DIRECTORY') +DAG_PROCESSOR_MANAGER_LOG_LOCATION = \ + conf.get('core', 'DAG_PROCESSOR_MANAGER_LOG_LOCATION') + FILENAME_TEMPLATE = conf.get('core', 'LOG_FILENAME_TEMPLATE') + PROCESSOR_FILENAME_TEMPLATE = conf.get('core', 'LOG_PROCESSOR_FILENAME_TEMPLATE') # Storage bucket url for remote logging @@ -79,7 +85,7 @@ 'formatter': 'airflow', 'base_log_folder': os.path.expanduser(PROCESSOR_LOG_FOLDER), 'filename_template': PROCESSOR_FILENAME_TEMPLATE, - }, + } }, 'loggers': { 'airflow.processor': { @@ -102,6 +108,26 @@ 'handlers': ['console'], 'level': LOG_LEVEL, } +} # type: Dict[str, Any] + +DEFAULT_DAG_PARSING_LOGGING_CONFIG = { + 'handlers': { + 'processor_manager': { + 'class': 'logging.handlers.RotatingFileHandler', + 'formatter': 'airflow', + 'filename': DAG_PROCESSOR_MANAGER_LOG_LOCATION, + 'mode': 'a', + 'maxBytes': 104857600, # 100MB + 'backupCount': 5 + } + }, + 'loggers': { + 'airflow.processor_manager': { + 'handlers': ['processor_manager'], + 'level': LOG_LEVEL, + 'propagate': False, + } + } } REMOTE_HANDLERS = { @@ -172,11 +198,27 @@ REMOTE_LOGGING = conf.get('core', 'remote_logging') +# Only update the handlers and loggers when CONFIG_PROCESSOR_MANAGER_LOGGER is set. +# This is to avoid exceptions when initializing RotatingFileHandler multiple times +# in multiple processes. +if os.environ.get('CONFIG_PROCESSOR_MANAGER_LOGGER') == 'True': + DEFAULT_LOGGING_CONFIG['handlers'] \ + .update(DEFAULT_DAG_PARSING_LOGGING_CONFIG['handlers']) + DEFAULT_LOGGING_CONFIG['loggers'] \ + .update(DEFAULT_DAG_PARSING_LOGGING_CONFIG['loggers']) + + # Manually create log directory for processor_manager handler as RotatingFileHandler + # will only create file but not the directory. 
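For context, DEFAULT_DAG_PARSING_LOGGING_CONFIG above only becomes active when the CONFIG_PROCESSOR_MANAGER_LOGGER check merges it into DEFAULT_LOGGING_CONFIG, and the log directory is created explicitly just below because RotatingFileHandler creates the file but not its parent directory. A stand-alone, hedged sketch of the same idea with plain logging.config.dictConfig; the path, size and level are examples rather than Airflow's defaults, and os.makedirs(exist_ok=True) is the Python-3 shortcut for what the patch does with airflow.utils.file.mkdirs.

import logging
import logging.config
import os

log_location = os.path.expanduser(
    "~/airflow/logs/dag_processor_manager/dag_processor_manager.log")  # example path

# The handler creates the file but not its parent directory.
os.makedirs(os.path.dirname(log_location), exist_ok=True)

logging.config.dictConfig({
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "airflow": {
            "format": "[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s",
        },
    },
    "handlers": {
        "processor_manager": {
            "class": "logging.handlers.RotatingFileHandler",
            "formatter": "airflow",
            "filename": log_location,
            "mode": "a",
            "maxBytes": 104857600,  # 100MB, matching the config above
            "backupCount": 5,
        },
    },
    "loggers": {
        "airflow.processor_manager": {
            "handlers": ["processor_manager"],
            "level": "INFO",
            "propagate": False,
        },
    },
})

logging.getLogger("airflow.processor_manager").info("processor manager logging configured")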
+ processor_manager_handler_config = DEFAULT_DAG_PARSING_LOGGING_CONFIG['handlers'][ + 'processor_manager'] + directory = os.path.dirname(processor_manager_handler_config['filename']) + mkdirs(directory, 0o755) + if REMOTE_LOGGING and REMOTE_BASE_LOG_FOLDER.startswith('s3://'): - DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['s3']) + DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['s3']) elif REMOTE_LOGGING and REMOTE_BASE_LOG_FOLDER.startswith('gs://'): - DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['gcs']) + DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['gcs']) elif REMOTE_LOGGING and REMOTE_BASE_LOG_FOLDER.startswith('wasb'): - DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['wasb']) + DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['wasb']) elif REMOTE_LOGGING and ELASTICSEARCH_HOST: - DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['elasticsearch']) + DEFAULT_LOGGING_CONFIG['handlers'].update(REMOTE_HANDLERS['elasticsearch']) diff --git a/airflow/config_templates/default_airflow.cfg b/airflow/config_templates/default_airflow.cfg index 2f6130c8975f7..2f990131367b5 100644 --- a/airflow/config_templates/default_airflow.cfg +++ b/airflow/config_templates/default_airflow.cfg @@ -30,9 +30,6 @@ # ----------------------- TEMPLATE BEGINS HERE ----------------------- [core] -# The home folder for airflow, default is ~/airflow -airflow_home = {AIRFLOW_HOME} - # The folder where your airflow pipelines live, most likely a # subfolder in a code repository # This path must be absolute @@ -62,14 +59,13 @@ fab_logging_level = WARN logging_config_class = # Log format -# we need to escape the curly braces by adding an additional curly brace log_format = [%%(asctime)s] {{%%(filename)s:%%(lineno)d}} %%(levelname)s - %%(message)s simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s # Log filename format -# we need to escape the curly braces by adding an additional curly brace log_filename_template = {{{{ ti.dag_id }}}}/{{{{ ti.task_id }}}}/{{{{ ts }}}}/{{{{ try_number }}}}.log log_processor_filename_template = {{{{ filename }}}}.log +dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log # Hostname by providing a path to a callable, which will resolve the hostname hostname_callable = socket:getfqdn @@ -79,7 +75,7 @@ hostname_callable = socket:getfqdn default_timezone = utc # The executor class that airflow should use. Choices include -# SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor +# SequentialExecutor, LocalExecutor, CeleryExecutor, DaskExecutor, KubernetesExecutor executor = SequentialExecutor # The SqlAlchemy connection string to the metadata database. @@ -87,6 +83,9 @@ executor = SequentialExecutor # their website sql_alchemy_conn = sqlite:///{AIRFLOW_HOME}/airflow.db +# The encoding for the databases +sql_engine_encoding = utf-8 + # If SqlAlchemy should pool database connections. sql_alchemy_pool_enabled = True @@ -104,6 +103,10 @@ sql_alchemy_pool_recycle = 1800 # disconnects. Setting this to 0 disables retries. sql_alchemy_reconnect_timeout = 300 +# The schema to use for the metadata database +# SqlAlchemy supports databases with the concept of multiple schemas. +sql_alchemy_schema = + # The amount of parallelism as a setting to the executor. 
This defines # the max number of task instances that should run simultaneously # on this airflow installation @@ -140,7 +143,7 @@ donot_pickle = False dagbag_import_timeout = 30 # The class to use for running task instances in a subprocess -task_runner = BashTaskRunner +task_runner = StandardTaskRunner # If set, tasks without a `run_as_user` argument will be run with this user # Can be used to de-elevate a sudo user running Airflow when executing tasks @@ -173,6 +176,13 @@ killed_task_cleanup_time = 60 # `airflow trigger_dag -c`, the key-value pairs will override the existing ones in params. dag_run_conf_overrides_params = False +# Worker initialisation check to validate Metadata Database connection +worker_precheck = False + +# When discovering DAGs, ignore any files that don't contain the strings `DAG` and `airflow`. +dag_discovery_safe_mode = True + + [cli] # In what way should the cli access the API. The LocalClient will use the # database directly, while the json_client will use the api running on the @@ -258,10 +268,13 @@ access_logfile = - error_logfile = - # Expose the configuration file in the web server +# This is only applicable for the flask-admin based web UI (non FAB-based). +# In the FAB-based web UI with RBAC feature, +# access to configuration is controlled by role permissions. expose_config = False # Set to true to turn on authentication: -# https://airflow.incubator.apache.org/security.html#web-authentication +# https://airflow.apache.org/security.html#web-authentication authenticate = False # Filter the list of dags by owner name (requires authentication to be enabled) @@ -306,6 +319,15 @@ navbar_color = #007A87 # Default dagrun to show in UI default_dag_run_display_number = 25 +# Enable werkzeug `ProxyFix` middleware +enable_proxy_fix = False + +# Set secure flag on session cookie +cookie_secure = False + +# Set samesite policy on session cookie +cookie_samesite = + [email] email_backend = airflow.utils.email.send_email_smtp @@ -338,6 +360,14 @@ celery_app_name = airflow.executors.celery_executor # your worker box and the nature of your tasks worker_concurrency = 16 +# The maximum and minimum concurrency that will be used when starting workers with the +# "airflow worker" command (always keep minimum processes, but grow to maximum if necessary). +# Note the value should be "max_concurrency,min_concurrency" +# Pick these numbers based on resources on worker box and the nature of the task. +# If autoscale option is available, worker_concurrency will be ignored. +# http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale +# worker_autoscale = 16,12 + # When you start an airflow worker, airflow starts a tiny web server # subprocess to serve the workers local log files to the airflow main # web server, who then builds pages and sends them to users. This defines @@ -370,9 +400,18 @@ flower_url_prefix = # This defines the port that Celery Flower runs on flower_port = 5555 +# Securing Flower with Basic Authentication +# Accepts user:password pairs separated by a comma +# Example: flower_basic_auth = user1:password1,user2:password2 +flower_basic_auth = + # Default queue that tasks get assigned to and that worker listen on. default_queue = default +# How many processes CeleryExecutor uses to sync task state. +# 0 means to use max(1, number of cores - 1) processes. 
+sync_parallelism = 0 + # Import path for celery configuration options celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG @@ -425,17 +464,20 @@ scheduler_heartbeat_sec = 5 # -1 indicates to run continuously (see also num_runs) run_duration = -1 -# after how much time a new DAGs should be picked up from the filesystem +# after how much time (seconds) a new DAGs should be picked up from the filesystem min_file_process_interval = 0 -# How many seconds to wait between file-parsing loops to prevent the logs from being spammed. -min_file_parsing_loop_time = 1 - +# How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. dag_dir_list_interval = 300 # How often should stats be printed to the logs print_stats_interval = 30 +# If the last scheduler heartbeat happened more than scheduler_health_check_threshold ago (in seconds), +# scheduler is considered unhealthy. +# This is used by the health check in the "/health" endpoint +scheduler_health_check_threshold = 30 + child_process_log_directory = {AIRFLOW_HOME}/logs/scheduler # Local task jobs periodically heartbeat to the DB. If the job has @@ -475,6 +517,10 @@ max_threads = 2 authenticate = False +# Turn off scheduler use of cron intervals by setting this to False. +# DAGs submitted manually in the web UI or with trigger_dag will still run. +use_job_schedule = True + [ldap] # set this to ldaps://: uri = @@ -489,6 +535,10 @@ basedn = dc=example,dc=com cacert = /etc/ca/ldap_ca.crt search_scope = LEVEL +# This setting allows the use of LDAP servers that either return a +# broken schema, or do not return a schema. +ignore_malformed_schema = False + [mesos] # Mesos master address which MesosExecutor will connect to. master = localhost:5050 @@ -549,28 +599,35 @@ hide_sensitive_variable_fields = True [elasticsearch] elasticsearch_host = -# we need to escape the curly braces by adding an additional curly brace elasticsearch_log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}} elasticsearch_end_of_log_mark = end_of_log [kubernetes] -# The repository and tag of the Kubernetes Image for the Worker to Run +# The repository, tag and imagePullPolicy of the Kubernetes Image for the Worker to Run worker_container_repository = worker_container_tag = +worker_container_image_pull_policy = IfNotPresent # If True (default), worker pods will be deleted upon termination delete_worker_pods = True +# Number of Kubernetes Worker Pod creation calls per scheduler loop +worker_pods_creation_batch_size = 1 + # The Kubernetes namespace where airflow workers should be created. 
Defaults to `default` namespace = default # The name of the Kubernetes ConfigMap Containing the Airflow Configuration (this file) airflow_configmap = +# For docker image already contains DAGs, this is set to `True`, and the worker will search for dags in dags_folder, +# otherwise use git sync or dags volume claim to mount DAGs +dags_in_image = False + # For either git sync or volume mounted DAGs, the worker will look in this subpath for DAGs dags_volume_subpath = -# For DAGs mounted via a volume claim (mutually exclusive with volume claim) +# For DAGs mounted via a volume claim (mutually exclusive with git-sync and host path) dags_volume_claim = # For volume mounted logs, the worker will look in this subpath for logs @@ -579,16 +636,67 @@ logs_volume_subpath = # A shared volume claim for the logs logs_volume_claim = +# For DAGs mounted via a hostPath volume (mutually exclusive with volume claim and git-sync) +# Useful in local environment, discouraged in production +dags_volume_host = + +# A hostPath volume for the logs +# Useful in local environment, discouraged in production +logs_volume_host = + +# A list of configMapsRefs to envFrom. If more than one configMap is +# specified, provide a comma separated list: configmap_a,configmap_b +env_from_configmap_ref = + +# A list of secretRefs to envFrom. If more than one secret is +# specified, provide a comma separated list: secret_a,secret_b +env_from_secret_ref = + # Git credentials and repository for DAGs mounted via Git (mutually exclusive with volume claim) git_repo = git_branch = +git_subpath = +# Use git_user and git_password for user authentication or git_ssh_key_secret_name and git_ssh_key_secret_key +# for SSH authentication git_user = git_password = -git_subpath = +git_sync_root = /git +git_sync_dest = repo +# Mount point of the volume if git-sync is being used. +# i.e. {AIRFLOW_HOME}/dags +git_dags_folder_mount_point = + +# To get Git-sync SSH authentication set up follow this format +# +# airflow-secrets.yaml: +# --- +# apiVersion: v1 +# kind: Secret +# metadata: +# name: airflow-secrets +# data: +# # key needs to be gitSshKey +# gitSshKey: +# --- +# airflow-configmap.yaml: +# apiVersion: v1 +# kind: ConfigMap +# metadata: +# name: airflow-configmap +# data: +# known_hosts: | +# github.com ssh-rsa <...> +# airflow.cfg: | +# ... +# +# git_ssh_key_secret_name = airflow-secrets +# git_ssh_known_hosts_configmap_name = airflow-configmap +git_ssh_key_secret_name = +git_ssh_known_hosts_configmap_name = # For cloning DAGs from git repositories into volumes: https://github.com/kubernetes/git-sync -git_sync_container_repository = gcr.io/google-containers/git-sync-amd64 -git_sync_container_tag = v2.0.5 +git_sync_container_repository = k8s.gcr.io/git-sync +git_sync_container_tag = v3.1.1 git_sync_init_container_name = git-sync-clone # The name of the Kubernetes service account to be associated with airflow workers, if any. @@ -606,21 +714,71 @@ image_pull_secrets = gcp_service_account_keys = # Use the service account kubernetes gives to pods to connect to kubernetes cluster. -# It’s intended for clients that expect to be running inside a pod running on kubernetes. +# It's intended for clients that expect to be running inside a pod running on kubernetes. # It will raise an exception if called from a process not running in a kubernetes environment. in_cluster = True +# When running with in_cluster=False change the default cluster_context or config_file +# options to Kubernetes client. 
Leave blank these to use default behaviour like `kubectl` has. +# cluster_context = +# config_file = + + +# Affinity configuration as a single line formatted JSON object. +# See the affinity model for top-level key names (e.g. `nodeAffinity`, etc.): +# https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#affinity-v1-core +affinity = + +# A list of toleration objects as a single line formatted JSON array +# See: +# https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.12/#toleration-v1-core +tolerations = + +# Worker pods security context options +# See: +# https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ + +# Specifies the uid to run the first process of the worker pods containers as +run_as_user = + +# Specifies a gid to associate with all containers in the worker pods +# if using a git_ssh_key_secret_name use an fs_group +# that allows for the key to be read, e.g. 65533 +fs_group = + +[kubernetes_node_selectors] +# The Key-value pairs to be given to worker pods. +# The worker pods will be scheduled to the nodes of the specified key-value pairs. +# Should be supplied in the format: key = value + +[kubernetes_annotations] +# The Key-value annotations pairs to be given to worker pods. +# Should be supplied in the format: key = value + +[kubernetes_environment_variables] +# The scheduler sets the following environment variables into your workers. You may define as +# many environment variables as needed and the kubernetes launcher will set them in the launched workers. +# Environment variables in this section are defined as follows +# = +# +# For example if you wanted to set an environment variable with value `prod` and key +# `ENVIRONMENT` you would follow the following format: +# ENVIRONMENT = prod +# +# Additionally you may override worker airflow settings with the AIRFLOW__
__ +# formatting as supported by airflow normally. + [kubernetes_secrets] # The scheduler mounts the following secrets into your workers as they are launched by the # scheduler. You may define as many secrets as needed and the kubernetes launcher will parse the # defined secrets and mount them as secret environment variables in the launched workers. # Secrets in this section are defined as follows -# = : +# = = # # For example if you wanted to mount a kubernetes secret key named `postgres_password` from the # kubernetes secret object `airflow-secret` as the environment variable `POSTGRES_PASSWORD` into # your workers you would follow the following format: -# POSTGRES_PASSWORD = airflow-secret:postgres_credentials +# POSTGRES_PASSWORD = airflow-secret=postgres_credentials # # Additionally you may override worker airflow settings with the AIRFLOW__
__ # formatting as supported by airflow normally. diff --git a/airflow/config_templates/default_celery.py b/airflow/config_templates/default_celery.py index d44f2b3448aff..7a9fd25064b8a 100644 --- a/airflow/config_templates/default_celery.py +++ b/airflow/config_templates/default_celery.py @@ -37,7 +37,7 @@ def _broker_supports_visibility_timeout(url): ) if 'visibility_timeout' not in broker_transport_options: if _broker_supports_visibility_timeout(broker_url): - broker_transport_options = {'visibility_timeout': 21600} + broker_transport_options['visibility_timeout'] = 21600 DEFAULT_CELERY_CONFIG = { 'accept_content': ['json', 'pickle'], @@ -55,17 +55,27 @@ def _broker_supports_visibility_timeout(url): celery_ssl_active = False try: celery_ssl_active = configuration.conf.getboolean('celery', 'SSL_ACTIVE') -except AirflowConfigException as e: +except AirflowConfigException: log.warning("Celery Executor will run without SSL") try: if celery_ssl_active: - broker_use_ssl = {'keyfile': configuration.conf.get('celery', 'SSL_KEY'), - 'certfile': configuration.conf.get('celery', 'SSL_CERT'), - 'ca_certs': configuration.conf.get('celery', 'SSL_CACERT'), - 'cert_reqs': ssl.CERT_REQUIRED} + if 'amqp://' in broker_url: + broker_use_ssl = {'keyfile': configuration.conf.get('celery', 'SSL_KEY'), + 'certfile': configuration.conf.get('celery', 'SSL_CERT'), + 'ca_certs': configuration.conf.get('celery', 'SSL_CACERT'), + 'cert_reqs': ssl.CERT_REQUIRED} + elif 'redis://' in broker_url: + broker_use_ssl = {'ssl_keyfile': configuration.conf.get('celery', 'SSL_KEY'), + 'ssl_certfile': configuration.conf.get('celery', 'SSL_CERT'), + 'ssl_ca_certs': configuration.conf.get('celery', 'SSL_CACERT'), + 'ssl_cert_reqs': ssl.CERT_REQUIRED} + else: + raise AirflowException('The broker you configured does not support SSL_ACTIVE to be True. ' + 'Please use RabbitMQ or Redis if you would like to use SSL for broker.') + DEFAULT_CELERY_CONFIG['broker_use_ssl'] = broker_use_ssl -except AirflowConfigException as e: +except AirflowConfigException: raise AirflowException('AirflowConfigException: SSL_ACTIVE is True, ' 'please ensure SSL_KEY, ' 'SSL_CERT and SSL_CACERT are set') @@ -75,6 +85,6 @@ def _broker_supports_visibility_timeout(url): 'SSL and/or have all necessary certs and key ({}).'.format(e)) result_backend = DEFAULT_CELERY_CONFIG['result_backend'] -if 'amqp' in result_backend or 'redis' in result_backend or 'rpc' in result_backend: +if 'amqp://' in result_backend or 'redis://' in result_backend or 'rpc://' in result_backend: log.warning("You have configured a result_backend of %s, it is highly recommended " "to use an alternative result_backend (i.e. 
a database).", result_backend) diff --git a/airflow/config_templates/default_test.cfg b/airflow/config_templates/default_test.cfg index cd4bd32e68fe9..10281f6106e0d 100644 --- a/airflow/config_templates/default_test.cfg +++ b/airflow/config_templates/default_test.cfg @@ -31,7 +31,6 @@ [core] unit_test_mode = True -airflow_home = {AIRFLOW_HOME} dags_folder = {TEST_DAGS_FOLDER} plugins_folder = {TEST_PLUGINS_FOLDER} base_log_folder = {AIRFLOW_HOME}/logs @@ -39,6 +38,7 @@ logging_level = INFO fab_logging_level = WARN log_filename_template = {{{{ ti.dag_id }}}}/{{{{ ti.task_id }}}}/{{{{ ts }}}}/{{{{ try_number }}}}.log log_processor_filename_template = {{{{ filename }}}}.log +dag_processor_manager_log_location = {AIRFLOW_HOME}/logs/dag_processor_manager/dag_processor_manager.log executor = SequentialExecutor sql_alchemy_conn = sqlite:///{AIRFLOW_HOME}/unittests.db load_examples = True @@ -51,6 +51,7 @@ enable_xcom_pickling = False killed_task_cleanup_time = 5 secure_mode = False hostname_callable = socket:getfqdn +worker_precheck = False [cli] api_client = airflow.api.client.local_client @@ -95,6 +96,7 @@ result_backend = db+mysql://airflow:airflow@localhost:3306/airflow flower_host = 0.0.0.0 flower_port = 5555 default_queue = default +sync_parallelism = 0 [mesos] master = localhost:5050 @@ -108,6 +110,7 @@ docker_image_slave = test/docker-airflow [scheduler] job_heartbeat_sec = 1 scheduler_heartbeat_sec = 5 +scheduler_health_check_threshold = 30 authenticate = true max_threads = 2 catchup_by_default = True @@ -122,3 +125,6 @@ hide_sensitive_variable_fields = True elasticsearch_host = elasticsearch_log_id_template = {{dag_id}}-{{task_id}}-{{execution_date}}-{{try_number}} elasticsearch_end_of_log_mark = end_of_log + +[kubernetes] +dags_volume_claim = default diff --git a/airflow/config_templates/default_webserver_config.py b/airflow/config_templates/default_webserver_config.py index ba33d81dec23b..e61c7e1a45cb2 100644 --- a/airflow/config_templates/default_webserver_config.py +++ b/airflow/config_templates/default_webserver_config.py @@ -89,3 +89,13 @@ # { 'name': 'AOL', 'url': 'http://openid.aol.com/' }, # { 'name': 'Flickr', 'url': 'http://www.flickr.com/' }, # { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }] + +# ---------------------------------------------------- +# Theme CONFIG +# ---------------------------------------------------- +# Flask App Builder comes up with a number of predefined themes +# that you can use for Apache Airflow. +# http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes +# Please make sure to remove "navbar_color" configuration from airflow.cfg +# in order to fully utilize the theme. 
(or use that property in conjunction with theme) +# APP_THEME = "bootstrap-theme.css" # default bootstrap diff --git a/airflow/configuration.py b/airflow/configuration.py index 2ee453fd7d93f..ad03b310073bb 100644 --- a/airflow/configuration.py +++ b/airflow/configuration.py @@ -35,7 +35,7 @@ import sys import warnings -from backports.configparser import ConfigParser +from backports.configparser import ConfigParser, _UNSET, NoOptionError from zope.deprecation import deprecated as _deprecated from airflow.exceptions import AirflowConfigException @@ -56,12 +56,9 @@ def generate_fernet_key(): try: from cryptography.fernet import Fernet except ImportError: - pass - try: - key = Fernet.generate_key().decode() - except NameError: - key = "cryptography_not_found_storing_passwords_in_plain_text" - return key + return '' + else: + return Fernet.generate_key().decode() def expand_env_var(env_var): @@ -101,15 +98,20 @@ def run_command(command): return output -_templates_dir = os.path.join(os.path.dirname(__file__), 'config_templates') -with open(os.path.join(_templates_dir, 'default_airflow.cfg')) as f: - DEFAULT_CONFIG = f.read() - if six.PY2: - DEFAULT_CONFIG = DEFAULT_CONFIG.decode('utf-8') -with open(os.path.join(_templates_dir, 'default_test.cfg')) as f: - TEST_CONFIG = f.read() +def _read_default_config_file(file_name): + templates_dir = os.path.join(os.path.dirname(__file__), 'config_templates') + file_path = os.path.join(templates_dir, file_name) if six.PY2: - TEST_CONFIG = TEST_CONFIG.decode('utf-8') + with open(file_path) as f: + config = f.read() + return config.decode('utf-8') + else: + with open(file_path, encoding='utf-8') as f: + return f.read() + + +DEFAULT_CONFIG = _read_default_config_file('default_airflow.cfg') +TEST_CONFIG = _read_default_config_file('default_test.cfg') class AirflowConfigParser(ConfigParser): @@ -121,15 +123,53 @@ class AirflowConfigParser(ConfigParser): ('core', 'sql_alchemy_conn'), ('core', 'fernet_key'), ('celery', 'broker_url'), - ('celery', 'result_backend') + ('celery', 'result_backend'), + # Todo: remove this in Airflow 1.11 + ('celery', 'celery_result_backend'), + ('atlas', 'password'), + ('smtp', 'smtp_password'), + ('ldap', 'bind_password'), + ('kubernetes', 'git_password'), } + # A two-level mapping of (section -> new_name -> old_name). When reading + # new_name, the old_name will be checked to see if it exists. If it does a + # DeprecationWarning will be issued and the old name will be used instead + deprecated_options = { + 'celery': { + # Remove these keys in Airflow 1.11 + 'worker_concurrency': 'celeryd_concurrency', + 'result_backend': 'celery_result_backend', + 'broker_url': 'celery_broker_url', + 'ssl_active': 'celery_ssl_active', + 'ssl_cert': 'celery_ssl_cert', + 'ssl_key': 'celery_ssl_key', + } + } + deprecation_format_string = ( + 'The {old} option in [{section}] has been renamed to {new} - the old ' + 'setting has been used, but please update your config.' + ) + + # A mapping of old default values that we want to change and warn the user + # about. Mapping of section -> setting -> { old, replace, by_version } + deprecated_values = { + 'core': { + 'task_runner': ('BashTaskRunner', 'StandardTaskRunner', '2.0'), + }, + } + deprecation_value_format_string = ( + 'The {name} setting in [{section}] has the old default value of {old!r}. This ' + 'value has been changed to {new!r} in the running config, but please ' + 'update your config before Apache Airflow {version}.' 
+ ) + def __init__(self, default_config=None, *args, **kwargs): super(AirflowConfigParser, self).__init__(*args, **kwargs) - self.defaults = ConfigParser(*args, **kwargs) + self.airflow_defaults = ConfigParser(*args, **kwargs) if default_config is not None: - self.defaults.read_string(default_config) + self.airflow_defaults.read_string(default_config) self.is_validated = False @@ -159,11 +199,30 @@ def _validate(self): "error: attempt at using ldapgroup " "filtering without using the Ldap backend") + for section, replacement in self.deprecated_values.items(): + for name, info in replacement.items(): + old, new, version = info + if self.get(section, name, fallback=None) == old: + # Make sure the env var option is removed, otherwise it + # would be read and used instead of the value we set + env_var = self._env_var_name(section, name) + os.environ.pop(env_var, None) + + self.set(section, name, new) + warnings.warn( + self.deprecation_value_format_string.format(**locals()), + FutureWarning, + ) + self.is_validated = True + @staticmethod + def _env_var_name(section, key): + return 'AIRFLOW__{S}__{K}'.format(S=section.upper(), K=key.upper()) + def _get_env_var_option(self, section, key): # must have format AIRFLOW__{SECTION}__{KEY} (note double underscore) - env_var = 'AIRFLOW__{S}__{K}'.format(S=section.upper(), K=key.upper()) + env_var = self._env_var_name(section, key) if env_var in os.environ: return expand_env_var(os.environ[env_var]) @@ -181,10 +240,17 @@ def get(self, section, key, **kwargs): section = str(section).lower() key = str(key).lower() + deprecated_name = self.deprecated_options.get(section, {}).get(key, None) + # first check environment variables option = self._get_env_var_option(section, key) if option is not None: return option + if deprecated_name: + option = self._get_env_var_option(section, deprecated_name) + if option is not None: + self._warn_deprecate(section, key, deprecated_name) + return option # ...then the config file if super(AirflowConfigParser, self).has_option(section, key): @@ -192,56 +258,74 @@ def get(self, section, key, **kwargs): # separate the config from default config. 
return expand_env_var( super(AirflowConfigParser, self).get(section, key, **kwargs)) + if deprecated_name: + if super(AirflowConfigParser, self).has_option(section, deprecated_name): + self._warn_deprecate(section, key, deprecated_name) + return expand_env_var(super(AirflowConfigParser, self).get( + section, + deprecated_name, + **kwargs + )) # ...then commands option = self._get_cmd_option(section, key) if option: return option + if deprecated_name: + option = self._get_cmd_option(section, deprecated_name) + if option: + self._warn_deprecate(section, key, deprecated_name) + return option # ...then the default config - if self.defaults.has_option(section, key): + if self.airflow_defaults.has_option(section, key) or 'fallback' in kwargs: return expand_env_var( - self.defaults.get(section, key, **kwargs)) + self.airflow_defaults.get(section, key, **kwargs)) else: log.warning( - "section/key [{section}/{key}] not found in config".format(**locals()) + "section/key [%s/%s] not found in config", section, key ) raise AirflowConfigException( "section/key [{section}/{key}] not found " - "in config".format(**locals())) + "in config".format(section=section, key=key)) - def getboolean(self, section, key): - val = str(self.get(section, key)).lower().strip() + def getboolean(self, section, key, **kwargs): + val = str(self.get(section, key, **kwargs)).lower().strip() if '#' in val: val = val.split('#')[0].strip() - if val.lower() in ('t', 'true', '1'): + if val in ('t', 'true', '1'): return True - elif val.lower() in ('f', 'false', '0'): + elif val in ('f', 'false', '0'): return False else: raise AirflowConfigException( 'The value for configuration option "{}:{}" is not a ' 'boolean (received "{}").'.format(section, key, val)) - def getint(self, section, key): - return int(self.get(section, key)) + def getint(self, section, key, **kwargs): + return int(self.get(section, key, **kwargs)) + + def getfloat(self, section, key, **kwargs): + return float(self.get(section, key, **kwargs)) - def getfloat(self, section, key): - return float(self.get(section, key)) + def read(self, filenames, **kwargs): + super(AirflowConfigParser, self).read(filenames, **kwargs) + self._validate() - def read(self, filenames): - super(AirflowConfigParser, self).read(filenames) + def read_dict(self, *args, **kwargs): + super(AirflowConfigParser, self).read_dict(*args, **kwargs) self._validate() def has_option(self, section, option): try: # Using self.get() to avoid reimplementing the priority order # of config variables (env, config, cmd, defaults) - self.get(section, option) + # UNSET to avoid logging a warning about missing values + self.get(section, option, fallback=_UNSET) return True - except AirflowConfigException: + except NoOptionError: return False def remove_option(self, section, option, remove_default=True): @@ -253,24 +337,32 @@ def remove_option(self, section, option, remove_default=True): if super(AirflowConfigParser, self).has_option(section, option): super(AirflowConfigParser, self).remove_option(section, option) - if self.defaults.has_option(section, option) and remove_default: - self.defaults.remove_option(section, option) + if self.airflow_defaults.has_option(section, option) and remove_default: + self.airflow_defaults.remove_option(section, option) def getsection(self, section): """ Returns the section as a dict. Values are converted to int, float, bool as required. 
+ :param section: section from the config - :return: dict + :rtype: dict """ - if section not in self._sections and section not in self.defaults._sections: + if (section not in self._sections and + section not in self.airflow_defaults._sections): return None - _section = copy.deepcopy(self.defaults._sections[section]) + _section = copy.deepcopy(self.airflow_defaults._sections[section]) if section in self._sections: _section.update(copy.deepcopy(self._sections[section])) + section_prefix = 'AIRFLOW__{S}__'.format(S=section.upper()) + for env_var in sorted(os.environ.keys()): + if env_var.startswith(section_prefix): + key = env_var.replace(section_prefix, '').lower() + _section[key] = self._get_env_var_option(section, key) + for key, val in iteritems(_section): try: val = int(val) @@ -285,30 +377,35 @@ def getsection(self, section): _section[key] = val return _section - def as_dict(self, display_source=False, display_sensitive=False): + def as_dict( + self, display_source=False, display_sensitive=False, raw=False): """ Returns the current configuration as an OrderedDict of OrderedDicts. :param display_source: If False, the option value is returned. If True, a tuple of (option_value, source) is returned. Source is either - 'airflow.cfg' or 'default'. + 'airflow.cfg', 'default', 'env var', or 'cmd'. :type display_source: bool :param display_sensitive: If True, the values of options set by env vars and bash commands will be displayed. If False, those options are shown as '< hidden >' :type display_sensitive: bool + :param raw: Should the values be output as interpolated values, or the + "raw" form that can be fed back in to ConfigParser + :type raw: bool """ - cfg = copy.deepcopy(self.defaults._sections) - cfg.update(copy.deepcopy(self._sections)) - - # remove __name__ (affects Python 2 only) - for options in cfg.values(): - options.pop('__name__', None) - - # add source - if display_source: - for section in cfg: - for k, v in cfg[section].items(): - cfg[section][k] = (v, 'airflow config') + cfg = {} + configs = [ + ('default', self.airflow_defaults), + ('airflow.cfg', self), + ] + + for (source_name, config) in configs: + for section in config.sections(): + sect = cfg.setdefault(section, OrderedDict()) + for (k, val) in config.items(section=section, raw=raw): + if display_source: + val = (val, source_name) + sect[k] = val # add env vars and overwrite because they have priority for ev in [ev for ev in os.environ if ev.startswith('AIRFLOW__')]: @@ -316,16 +413,15 @@ def as_dict(self, display_source=False, display_sensitive=False): _, section, key = ev.split('__') opt = self._get_env_var_option(section, key) except ValueError: - opt = None - if opt: - if ( - not display_sensitive and - ev != 'AIRFLOW__CORE__UNIT_TEST_MODE'): - opt = '< hidden >' - if display_source: - opt = (opt, 'env var') - cfg.setdefault(section.lower(), OrderedDict()).update( - {key.lower(): opt}) + continue + if (not display_sensitive and ev != 'AIRFLOW__CORE__UNIT_TEST_MODE'): + opt = '< hidden >' + elif raw: + opt = opt.replace('%', '%%') + if display_source: + opt = (opt, 'env var') + cfg.setdefault(section.lower(), OrderedDict()).update( + {key.lower(): opt}) # add bash commands for (section, key) in self.as_command_stdout: @@ -334,8 +430,11 @@ def as_dict(self, display_source=False, display_sensitive=False): if not display_sensitive: opt = '< hidden >' if display_source: - opt = (opt, 'bash cmd') + opt = (opt, 'cmd') + elif raw: + opt = opt.replace('%', '%%') cfg.setdefault(section, OrderedDict()).update({key: opt}) + 
del cfg[section][key + '_cmd'] return cfg @@ -352,6 +451,17 @@ def load_test_config(self): # then read any "custom" test settings self.read(TEST_CONFIG_FILE) + def _warn_deprecate(self, section, key, deprecated_name): + warnings.warn( + self.deprecation_format_string.format( + old=deprecated_name, + new=key, + section=section, + ), + DeprecationWarning, + stacklevel=3, + ) + def mkdir_p(path): try: @@ -364,23 +474,23 @@ def mkdir_p(path): 'Error creating {}: {}'.format(path, exc.strerror)) -# Setting AIRFLOW_HOME and AIRFLOW_CONFIG from environment variables, using -# "~/airflow" and "~/airflow/airflow.cfg" respectively as defaults. +def get_airflow_home(): + return expand_env_var(os.environ.get('AIRFLOW_HOME', '~/airflow')) -if 'AIRFLOW_HOME' not in os.environ: - AIRFLOW_HOME = expand_env_var('~/airflow') -else: - AIRFLOW_HOME = expand_env_var(os.environ['AIRFLOW_HOME']) +def get_airflow_config(airflow_home): + if 'AIRFLOW_CONFIG' not in os.environ: + return os.path.join(airflow_home, 'airflow.cfg') + return expand_env_var(os.environ['AIRFLOW_CONFIG']) + + +# Setting AIRFLOW_HOME and AIRFLOW_CONFIG from environment variables, using +# "~/airflow" and "$AIRFLOW_HOME/airflow.cfg" respectively as defaults. + +AIRFLOW_HOME = get_airflow_home() +AIRFLOW_CONFIG = get_airflow_config(AIRFLOW_HOME) mkdir_p(AIRFLOW_HOME) -if 'AIRFLOW_CONFIG' not in os.environ: - if os.path.isfile(expand_env_var('~/airflow.cfg')): - AIRFLOW_CONFIG = expand_env_var('~/airflow.cfg') - else: - AIRFLOW_CONFIG = AIRFLOW_HOME + '/airflow.cfg' -else: - AIRFLOW_CONFIG = expand_env_var(os.environ['AIRFLOW_CONFIG']) # Set up dags folder for unit tests # this directory won't exist if users install via pip @@ -449,12 +559,42 @@ def parameterized_config(template): conf.read(AIRFLOW_CONFIG) +if conf.has_option('core', 'AIRFLOW_HOME'): + msg = ( + 'Specifying both AIRFLOW_HOME environment variable and airflow_home ' + 'in the config file is deprecated. Please use only the AIRFLOW_HOME ' + 'environment variable and remove the config file entry.' + ) + if 'AIRFLOW_HOME' in os.environ: + warnings.warn(msg, category=DeprecationWarning) + elif conf.get('core', 'airflow_home') == AIRFLOW_HOME: + warnings.warn( + 'Specifying airflow_home in the config file is deprecated. As you ' + 'have left it at the default value you should remove the setting ' + 'from your airflow.cfg and suffer no change in behaviour.', + category=DeprecationWarning, + ) + else: + AIRFLOW_HOME = conf.get('core', 'airflow_home') + warnings.warn(msg, category=DeprecationWarning) + +# Warn about old config file. We used to read ~/airflow/airflow.cfg even if +# that AIRFLOW_HOME was set to something else +_old_config_file = os.path.expanduser("~/airflow/airflow.cfg") +if _old_config_file != AIRFLOW_CONFIG and os.path.isfile(_old_config_file): + warnings.warn( + 'You have two airflow.cfg files: {old} and {new}. Airflow used to look ' + 'at ~/airflow/airflow.cfg, even when AIRFLOW_HOME was set to a different ' + 'value. 
Airflow will now only read {new}, and you should remove the ' + 'other file'.format(old=_old_config_file, new=AIRFLOW_CONFIG), + category=DeprecationWarning, + ) + -if conf.getboolean('webserver', 'rbac'): - with open(os.path.join(_templates_dir, 'default_webserver_config.py')) as f: - DEFAULT_WEBSERVER_CONFIG = f.read() +WEBSERVER_CONFIG = AIRFLOW_HOME + '/webserver_config.py' - WEBSERVER_CONFIG = AIRFLOW_HOME + '/webserver_config.py' +if conf.getboolean('webserver', 'rbac'): + DEFAULT_WEBSERVER_CONFIG = _read_default_config_file('default_webserver_config.py') if not os.path.isfile(WEBSERVER_CONFIG): log.info('Creating new FAB webserver config file in: %s', WEBSERVER_CONFIG) diff --git a/airflow/contrib/auth/backends/github_enterprise_auth.py b/airflow/contrib/auth/backends/github_enterprise_auth.py index 5196725156393..99c01ed2794ce 100644 --- a/airflow/contrib/auth/backends/github_enterprise_auth.py +++ b/airflow/contrib/auth/backends/github_enterprise_auth.py @@ -1,4 +1,4 @@ -# Copyright 2015 Matthew Pelland (matt@pelland.io) +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -19,18 +19,14 @@ import flask_login # Need to expose these downstream -# pylint: disable=unused-import -from flask_login import (current_user, - logout_user, - login_required, - login_user) -# pylint: enable=unused-import +# flake8: noqa: F401 +from flask_login import current_user, logout_user, login_required, login_user from flask import url_for, redirect, request from flask_oauthlib.client import OAuth -from airflow import models, configuration, settings +from airflow import models, configuration from airflow.configuration import AirflowConfigException from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin @@ -47,14 +43,17 @@ class GHEUser(models.User): def __init__(self, user): self.user = user + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False diff --git a/airflow/contrib/auth/backends/google_auth.py b/airflow/contrib/auth/backends/google_auth.py index d1a35791db888..ddbcb1222f2c0 100644 --- a/airflow/contrib/auth/backends/google_auth.py +++ b/airflow/contrib/auth/backends/google_auth.py @@ -1,4 +1,4 @@ -# Copyright 2016 Ananya Mishra (am747@cornell.edu) +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file @@ -19,18 +19,14 @@ import flask_login # Need to expose these downstream -# pylint: disable=unused-import -from flask_login import (current_user, - logout_user, - login_required, - login_user) -# pylint: enable=unused-import +# flake8: noqa: F401 +from flask_login import current_user, logout_user, login_required, login_user from flask import url_for, redirect, request from flask_oauthlib.client import OAuth -from airflow import models, configuration, settings +from airflow import models, configuration from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin @@ -46,14 +42,17 @@ class GoogleUser(models.User): def __init__(self, user): self.user = user + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False @@ -113,8 +112,7 @@ def login(self, request): log.debug('Redirecting user to Google login') return self.google_oauth.authorize(callback=url_for( 'google_oauth_callback', - _external=True, - _scheme='https'), + _external=True), state=request.args.get('next') or request.referrer or None) def get_google_user_profile_info(self, google_token): diff --git a/airflow/contrib/auth/backends/kerberos_auth.py b/airflow/contrib/auth/backends/kerberos_auth.py index 08be299a197dd..17b8b387c14ef 100644 --- a/airflow/contrib/auth/backends/kerberos_auth.py +++ b/airflow/contrib/auth/backends/kerberos_auth.py @@ -19,10 +19,10 @@ import logging import flask_login +from airflow.exceptions import AirflowConfigException from flask_login import current_user from flask import flash -from wtforms import ( - Form, PasswordField, StringField) +from wtforms import Form, PasswordField, StringField from wtforms.validators import InputRequired # pykerberos should be used as it verifies the KDC, the "kerberos" module does not do so @@ -32,7 +32,6 @@ from flask import url_for, redirect -from airflow import settings from airflow import models from airflow import configuration from airflow.utils.db import provide_session @@ -58,7 +57,13 @@ def authenticate(username, password): utils.get_fqdn() ) realm = configuration.conf.get("kerberos", "default_realm") - user_principal = utils.principal_from_username(username) + + try: + user_realm = configuration.conf.get("security", "default_realm") + except AirflowConfigException: + user_realm = realm + + user_principal = utils.principal_from_username(username, user_realm) try: # this is pykerberos specific, verify = True is needed to prevent KDC spoofing @@ -68,19 +73,23 @@ def authenticate(username, password): raise AuthenticationError() except kerberos.KrbError as e: logging.error( - 'Password validation for principal %s failed %s', user_principal, e) + 'Password validation for user ' + '%s in realm %s failed %s', user_principal, realm, e) raise AuthenticationError(e) return + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False @@ -110,7 +119,7 @@ def load_user(userid, session=None): @provide_session def login(self, request, session=None): - if current_user.is_authenticated(): + if current_user.is_authenticated: flash("You are already logged in") return redirect(url_for('index')) diff --git a/airflow/contrib/auth/backends/ldap_auth.py 
b/airflow/contrib/auth/backends/ldap_auth.py index eefaa1263b250..1639f43aec307 100644 --- a/airflow/contrib/auth/backends/ldap_auth.py +++ b/airflow/contrib/auth/backends/ldap_auth.py @@ -19,13 +19,12 @@ from future.utils import native import flask_login -from flask_login import login_required, current_user, logout_user +from flask_login import login_required, current_user, logout_user # noqa: F401 from flask import flash -from wtforms import ( - Form, PasswordField, StringField) +from wtforms import Form, PasswordField, StringField from wtforms.validators import InputRequired -from ldap3 import Server, Connection, Tls, LEVEL, SUBTREE, BASE +from ldap3 import Server, Connection, Tls, set_config_parameter, LEVEL, SUBTREE import ssl from flask import url_for, redirect @@ -56,16 +55,26 @@ class LdapException(Exception): def get_ldap_connection(dn=None, password=None): - tls_configuration = None - use_ssl = False try: cacert = configuration.conf.get("ldap", "cacert") - tls_configuration = Tls(validate=ssl.CERT_REQUIRED, ca_certs_file=cacert) - use_ssl = True - except: + except AirflowConfigException: pass - server = Server(configuration.conf.get("ldap", "uri"), use_ssl, tls_configuration) + try: + ignore_malformed_schema = configuration.conf.get("ldap", "ignore_malformed_schema") + except AirflowConfigException: + pass + + if ignore_malformed_schema: + set_config_parameter('IGNORE_MALFORMED_SCHEMA', ignore_malformed_schema) + + tls_configuration = Tls(validate=ssl.CERT_REQUIRED, + ca_certs_file=cacert) + + server = Server(configuration.conf.get("ldap", "uri"), + use_ssl=True, + tls=tls_configuration) + conn = Connection(server, native(dn), native(password)) if not conn.bind(): @@ -94,7 +103,7 @@ def groups_user(conn, search_base, user_filter, user_name_att, username): search_filter = "(&({0})({1}={2}))".format(user_filter, user_name_att, username) try: memberof_attr = configuration.conf.get("ldap", "group_member_attr") - except: + except Exception: memberof_attr = "memberOf" res = conn.search(native(search_base), native(search_filter), attributes=[native(memberof_attr)]) @@ -226,7 +235,7 @@ def try_login(username, password): Unable to parse LDAP structure. If you're using Active Directory and not specifying an OU, you must set search_scope=SUBTREE in airflow.cfg. %s - """ % traceback.format_exc()) + """, traceback.format_exc()) raise LdapException( "Could not parse LDAP structure. 
" "Try setting search_scope in airflow.cfg, or check logs" @@ -236,14 +245,17 @@ def try_login(username, password): log.info("Password incorrect for user %s", username) raise AuthenticationError("Invalid username or password") + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False @@ -274,7 +286,7 @@ def load_user(userid, session=None): @provide_session def login(self, request, session=None): - if current_user.is_authenticated(): + if current_user.is_authenticated: flash("You are already logged in") return redirect(url_for('admin.index')) diff --git a/airflow/contrib/auth/backends/password_auth.py b/airflow/contrib/auth/backends/password_auth.py index 879aaa142a4db..9d6a3ccbe6327 100644 --- a/airflow/contrib/auth/backends/password_auth.py +++ b/airflow/contrib/auth/backends/password_auth.py @@ -35,9 +35,8 @@ from sqlalchemy import Column, String from sqlalchemy.ext.hybrid import hybrid_property -from airflow import settings from airflow import models -from airflow.utils.db import provide_session +from airflow.utils.db import provide_session, create_session from airflow.utils.log.logging_mixin import LoggingMixin login_manager = flask_login.LoginManager() @@ -48,6 +47,9 @@ PY3 = version_info[0] == 3 +client_auth = None + + class AuthenticationError(Exception): pass @@ -71,14 +73,17 @@ def password(self, plaintext): def authenticate(self, plaintext): return check_password_hash(self._password, plaintext) + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False @@ -92,8 +97,7 @@ def data_profiling(self): return True def is_superuser(self): - """Access all the things""" - return True + return hasattr(self, 'user') and self.user.is_superuser() @login_manager.user_loader @@ -137,7 +141,7 @@ def authenticate(session, username, password): @provide_session def login(self, request, session=None): - if current_user.is_authenticated(): + if current_user.is_authenticated: flash("You are already logged in") return redirect(url_for('admin.index')) @@ -160,9 +164,6 @@ def login(self, request, session=None): return self.render('airflow/login.html', title="Airflow - Login", form=form) - finally: - session.commit() - session.close() class LoginForm(Form): @@ -196,19 +197,16 @@ def decorated(*args, **kwargs): userpass = ''.join(header.split()[1:]) username, password = base64.b64decode(userpass).decode("utf-8").split(":", 1) - session = settings.Session() - try: - authenticate(session, username, password) + with create_session() as session: + try: + authenticate(session, username, password) - response = function(*args, **kwargs) - response = make_response(response) - return response + response = function(*args, **kwargs) + response = make_response(response) + return response - except AuthenticationError: - return _forbidden() + except AuthenticationError: + return _forbidden() - finally: - session.commit() - session.close() return _unauthorized() return decorated diff --git a/scripts/ci/kadm5.acl b/airflow/contrib/example_dags/__init__.py similarity index 96% rename from scripts/ci/kadm5.acl rename to airflow/contrib/example_dags/__init__.py index 691dce6c2bbdf..114d189da14ab 100644 --- a/scripts/ci/kadm5.acl +++ b/airflow/contrib/example_dags/__init__.py @@ 
-1,3 +1,4 @@ +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -6,13 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -*/admin@TEST.LOCAL * diff --git a/airflow/contrib/example_dags/example_azure_container_instances_operator.py b/airflow/contrib/example_dags/example_azure_container_instances_operator.py new file mode 100644 index 0000000000000..181a30b50e62f --- /dev/null +++ b/airflow/contrib/example_dags/example_azure_container_instances_operator.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow import DAG +from airflow.contrib.operators.azure_container_instances_operator import AzureContainerInstancesOperator +from datetime import datetime, timedelta + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': datetime(2018, 11, 1), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=5), +} + +dag = DAG( + 'aci_example', + default_args=default_args, + schedule_interval=timedelta(1) +) + +t1 = AzureContainerInstancesOperator( + ci_conn_id='azure_container_instances_default', + registry_conn_id=None, + resource_group='resource-group', + name='aci-test-{{ ds }}', + image='hello-world', + region='WestUS2', + environment_variables={}, + volumes=[], + memory_in_gb=4.0, + cpu=1.0, + task_id='start_container', + dag=dag +) diff --git a/airflow/contrib/example_dags/example_azure_cosmosdb_sensor.py b/airflow/contrib/example_dags/example_azure_cosmosdb_sensor.py new file mode 100644 index 0000000000000..dd0b83e811d1f --- /dev/null +++ b/airflow/contrib/example_dags/example_azure_cosmosdb_sensor.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +This is only an example DAG to highlight usage of AzureCosmosDocumentSensor to detect +if a document now exists. + +You can trigger this manually with `airflow trigger_dag example_cosmosdb_sensor`. + +*Note: Make sure that connection `azure_cosmos_default` is properly set before running +this example.* +""" + +from airflow import DAG +from airflow.contrib.sensors.azure_cosmos_sensor import AzureCosmosDocumentSensor +from airflow.contrib.operators.azure_cosmos_operator import AzureCosmosInsertDocumentOperator +from airflow.utils import dates + +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': dates.days_ago(2), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False +} + +dag = DAG('example_azure_cosmosdb_sensor', default_args=default_args) + +dag.doc_md = __doc__ + +t1 = AzureCosmosDocumentSensor( + task_id='check_cosmos_file', + database_name='airflow_example_db', + collection_name='airflow_example_coll', + document_id='airflow_checkid', + azure_cosmos_conn_id='azure_cosmos_default', + dag=dag) + +t2 = AzureCosmosInsertDocumentOperator( + task_id='insert_cosmos_file', + dag=dag, + database_name='airflow_example_db', + collection_name='new-collection', + document={"id": "someuniqueid", "param1": "value1", "param2": "value2"}, + azure_cosmos_conn_id='azure_cosmos_default') + +t1 >> t2 diff --git a/airflow/contrib/example_dags/example_databricks_operator.py b/airflow/contrib/example_dags/example_databricks_operator.py index bc827d465bd3a..79f947ba1c123 100644 --- a/airflow/contrib/example_dags/example_databricks_operator.py +++ b/airflow/contrib/example_dags/example_databricks_operator.py @@ -32,7 +32,7 @@ # the spark jar task will NOT run until the notebook task completes # successfully. # -# The definition of a succesful run is if the run has a result_state of "SUCCESS". +# The definition of a successful run is if the run has a result_state of "SUCCESS". # For more information about the state of a run refer to # https://docs.databricks.com/api/latest/jobs.html#runstate diff --git a/airflow/contrib/example_dags/example_dingding_operator.py b/airflow/contrib/example_dags/example_dingding_operator.py new file mode 100644 index 0000000000000..c6d8cca6fdaad --- /dev/null +++ b/airflow/contrib/example_dags/example_dingding_operator.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +from datetime import timedelta + +import airflow +from airflow.contrib.operators.dingding_operator import DingdingOperator +from airflow.models import DAG + +args = { + 'owner': 'airflow', + 'retries': 3, + 'start_date': airflow.utils.dates.days_ago(2), +} + + +# [START howto_operator_dingding_failure_callback] +def failure_callback(context): + message = 'AIRFLOW TASK FAILURE TIPS:\n' \ + 'DAG: {}\n' \ + 'TASKS: {}\n' \ + 'Reason: {}\n' \ + .format(context['task_instance'].dag_id, + context['task_instance'].task_id, + context['exception']) + return DingdingOperator( + task_id='dingding_success_callback', + dingding_conn_id='dingding_default', + message_type='text', + message=message, + at_all=True, + ).execute(context) + + +args['on_failure_callback'] = failure_callback +# [END howto_operator_dingding_failure_callback] + +dag = DAG( + dag_id='example_dingding_operator', + default_args=args, + schedule_interval='@once', + dagrun_timeout=timedelta(minutes=60), +) + +# [START howto_operator_dingding] +text_msg_remind_none = DingdingOperator( + task_id='text_msg_remind_none', + dingding_conn_id='dingding_default', + message_type='text', + message='Airflow dingding text message remind none', + at_mobiles=None, + at_all=False, + dag=dag, +) +# [END howto_operator_dingding] + +text_msg_remind_specific = DingdingOperator( + task_id='text_msg_remind_specific', + dingding_conn_id='dingding_default', + message_type='text', + message='Airflow dingding text message remind specific users', + at_mobiles=['156XXXXXXXX', '130XXXXXXXX'], + at_all=False, + dag=dag, +) + +text_msg_remind_include_invalid = DingdingOperator( + task_id='text_msg_remind_include_invalid', + dingding_conn_id='dingding_default', + message_type='text', + message='Airflow dingding text message remind users including invalid', + # 123 is invalid user or user not in the group + at_mobiles=['156XXXXXXXX', '123'], + at_all=False, + dag=dag, +) + +# [START howto_operator_dingding_remind_users] +text_msg_remind_all = DingdingOperator( + task_id='text_msg_remind_all', + dingding_conn_id='dingding_default', + message_type='text', + message='Airflow dingding text message remind all users in group', + # list of user phone/email here in the group + # when at_all is specific will cover at_mobiles + at_mobiles=['156XXXXXXXX', '130XXXXXXXX'], + at_all=True, + dag=dag, +) +# [END howto_operator_dingding_remind_users] + +link_msg = DingdingOperator( + task_id='link_msg', + dingding_conn_id='dingding_default', + message_type='link', + message={ + 'title': 'Airflow dingding link message', + 'text': 'Airflow official documentation link', + 'messageUrl': 'http://airflow.apache.org', + 'picURL': 'http://airflow.apache.org/_images/pin_large.png' + }, + dag=dag, +) + +# [START howto_operator_dingding_rich_text] +markdown_msg = DingdingOperator( + task_id='markdown_msg', + dingding_conn_id='dingding_default', + message_type='markdown', + message={ + 'title': 'Airflow dingding markdown message', + 'text': '# Markdown message title\n' + 'content content .. 
\n' + '### sub-title\n' + '![logo](http://airflow.apache.org/_images/pin_large.png)' + }, + at_mobiles=['156XXXXXXXX'], + at_all=False, + dag=dag, +) +# [END howto_operator_dingding_rich_text] + +single_action_card_msg = DingdingOperator( + task_id='single_action_card_msg', + dingding_conn_id='dingding_default', + message_type='actionCard', + message={ + 'title': 'Airflow dingding single actionCard message', + 'text': 'Airflow dingding single actionCard message\n' + '![logo](http://airflow.apache.org/_images/pin_large.png)\n' + 'This is an official logo from the Airflow website.', + 'hideAvatar': '0', + 'btnOrientation': '0', + 'singleTitle': 'read more', + 'singleURL': 'http://airflow.apache.org' + }, + dag=dag, +) + +multi_action_card_msg = DingdingOperator( + task_id='multi_action_card_msg', + dingding_conn_id='dingding_default', + message_type='actionCard', + message={ + 'title': 'Airflow dingding multi actionCard message', + 'text': 'Airflow dingding multi actionCard message\n' + '![logo](http://airflow.apache.org/_images/pin_large.png)\n' + 'Airflow documentation and GitHub', + 'hideAvatar': '0', + 'btnOrientation': '0', + 'btns': [ + { + 'title': 'Airflow Documentation', + 'actionURL': 'http://airflow.apache.org' + }, + { + 'title': 'Airflow GitHub', + 'actionURL': 'https://github.com/apache/airflow' + } + ] + }, + dag=dag, +) + +feed_card_msg = DingdingOperator( + task_id='feed_card_msg', + dingding_conn_id='dingding_default', + message_type='feedCard', + message={ + "links": [ + { + "title": "Airflow DAG feed card", + "messageURL": "https://airflow.readthedocs.io/en/latest/ui.html", + "picURL": "http://airflow.apache.org/_images/dags.png" + }, + { + "title": "Airflow tree feed card", + "messageURL": "https://airflow.readthedocs.io/en/latest/ui.html", + "picURL": "http://airflow.apache.org/_images/tree.png" + }, + { + "title": "Airflow graph feed card", + "messageURL": "https://airflow.readthedocs.io/en/latest/ui.html", + "picURL": "http://airflow.apache.org/_images/graph.png" + } + ] + }, + dag=dag, +) + +msg_failure_callback = DingdingOperator( + task_id='msg_failure_callback', + dingding_conn_id='dingding_default', + message_type='not_support_msg_type', + message="", + dag=dag, +) + +[ + text_msg_remind_none, + text_msg_remind_specific, + text_msg_remind_include_invalid, + text_msg_remind_all +] >> link_msg >> markdown_msg >> [ + single_action_card_msg, + multi_action_card_msg +] >> feed_card_msg >> msg_failure_callback diff --git a/airflow/contrib/example_dags/example_emr_job_flow_manual_steps.py b/airflow/contrib/example_dags/example_emr_job_flow_manual_steps.py index 48a178a2a954c..ba5b923007511 100644 --- a/airflow/contrib/example_dags/example_emr_job_flow_manual_steps.py +++ b/airflow/contrib/example_dags/example_emr_job_flow_manual_steps.py @@ -54,8 +54,7 @@ ] JOB_FLOW_OVERRIDES = { - 'Name': 'PiCalc', - 'KeepJobFlowAliveWhenNoSteps': True + 'Name': 'PiCalc' } dag = DAG( diff --git a/airflow/contrib/example_dags/example_gcp_bigtable_operators.py b/airflow/contrib/example_dags/example_gcp_bigtable_operators.py new file mode 100644 index 0000000000000..aca820a0ecb41 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_bigtable_operators.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# 'License'); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# noinspection LongLine +""" +Example Airflow DAG that creates and performs following operations on Cloud Bigtable: +- creates an Instance +- creates a Table +- updates Cluster +- waits for Table replication completeness +- deletes the Table +- deletes the Instance + +This DAG relies on the following environment variables: + +* GCP_PROJECT_ID - Google Cloud Platform project +* CBT_INSTANCE_ID - desired ID of a Cloud Bigtable instance +* CBT_INSTANCE_DISPLAY_NAME - desired human-readable display name of the Instance +* CBT_INSTANCE_TYPE - type of the Instance, e.g. 1 for DEVELOPMENT + See https://googleapis.github.io/google-cloud-python/latest/bigtable/instance.html#google.cloud.bigtable.instance.Instance # noqa E501 +* CBT_INSTANCE_LABELS - labels to add for the Instance +* CBT_CLUSTER_ID - desired ID of the main Cluster created for the Instance +* CBT_CLUSTER_ZONE - zone in which main Cluster will be created. e.g. europe-west1-b + See available zones: https://cloud.google.com/bigtable/docs/locations +* CBT_CLUSTER_NODES - initial amount of nodes of the Cluster +* CBT_CLUSTER_NODES_UPDATED - amount of nodes for BigtableClusterUpdateOperator +* CBT_CLUSTER_STORAGE_TYPE - storage for the Cluster, e.g. 
1 for SSD + See https://googleapis.github.io/google-cloud-python/latest/bigtable/instance.html#google.cloud.bigtable.instance.Instance.cluster # noqa E501 +* CBT_TABLE_ID - desired ID of the Table +* CBT_POKE_INTERVAL - number of seconds between every attempt of Sensor check + +""" +import json + +from os import getenv + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_bigtable_operator import \ + BigtableInstanceCreateOperator, \ + BigtableInstanceDeleteOperator, \ + BigtableClusterUpdateOperator, \ + BigtableTableCreateOperator, \ + BigtableTableWaitForReplicationSensor, \ + BigtableTableDeleteOperator + +# [START howto_operator_gcp_bigtable_args] +GCP_PROJECT_ID = getenv('GCP_PROJECT_ID', 'example-project') +CBT_INSTANCE_ID = getenv('CBT_INSTANCE_ID', 'some-instance-id') +CBT_INSTANCE_DISPLAY_NAME = getenv('CBT_INSTANCE_DISPLAY_NAME', 'Human-readable name') +CBT_INSTANCE_TYPE = getenv('CBT_INSTANCE_TYPE', '2') +CBT_INSTANCE_LABELS = getenv('CBT_INSTANCE_LABELS', '{}') +CBT_CLUSTER_ID = getenv('CBT_CLUSTER_ID', 'some-cluster-id') +CBT_CLUSTER_ZONE = getenv('CBT_CLUSTER_ZONE', 'europe-west1-b') +CBT_CLUSTER_NODES = getenv('CBT_CLUSTER_NODES', '3') +CBT_CLUSTER_NODES_UPDATED = getenv('CBT_CLUSTER_NODES_UPDATED', '5') +CBT_CLUSTER_STORAGE_TYPE = getenv('CBT_CLUSTER_STORAGE_TYPE', '2') +CBT_TABLE_ID = getenv('CBT_TABLE_ID', 'some-table-id') +CBT_POKE_INTERVAL = getenv('CBT_POKE_INTERVAL', '60') +# [END howto_operator_gcp_bigtable_args] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + +with models.DAG( + 'example_gcp_bigtable_operators', + default_args=default_args, + schedule_interval=None # Override to match your needs +) as dag: + # [START howto_operator_gcp_bigtable_instance_create] + create_instance_task = BigtableInstanceCreateOperator( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + main_cluster_id=CBT_CLUSTER_ID, + main_cluster_zone=CBT_CLUSTER_ZONE, + instance_display_name=CBT_INSTANCE_DISPLAY_NAME, + instance_type=int(CBT_INSTANCE_TYPE), + instance_labels=json.loads(CBT_INSTANCE_LABELS), + cluster_nodes=int(CBT_CLUSTER_NODES), + cluster_storage_type=int(CBT_CLUSTER_STORAGE_TYPE), + task_id='create_instance_task', + ) + create_instance_task2 = BigtableInstanceCreateOperator( + instance_id=CBT_INSTANCE_ID, + main_cluster_id=CBT_CLUSTER_ID, + main_cluster_zone=CBT_CLUSTER_ZONE, + instance_display_name=CBT_INSTANCE_DISPLAY_NAME, + instance_type=int(CBT_INSTANCE_TYPE), + instance_labels=json.loads(CBT_INSTANCE_LABELS), + cluster_nodes=int(CBT_CLUSTER_NODES), + cluster_storage_type=int(CBT_CLUSTER_STORAGE_TYPE), + task_id='create_instance_task2', + ) + create_instance_task >> create_instance_task2 + # [END howto_operator_gcp_bigtable_instance_create] + + # [START howto_operator_gcp_bigtable_cluster_update] + cluster_update_task = BigtableClusterUpdateOperator( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + cluster_id=CBT_CLUSTER_ID, + nodes=int(CBT_CLUSTER_NODES_UPDATED), + task_id='update_cluster_task', + ) + cluster_update_task2 = BigtableClusterUpdateOperator( + instance_id=CBT_INSTANCE_ID, + cluster_id=CBT_CLUSTER_ID, + nodes=int(CBT_CLUSTER_NODES_UPDATED), + task_id='update_cluster_task2', + ) + cluster_update_task >> cluster_update_task2 + # [END howto_operator_gcp_bigtable_cluster_update] + + # [START howto_operator_gcp_bigtable_instance_delete] + delete_instance_task = BigtableInstanceDeleteOperator( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + task_id='delete_instance_task', + ) + 
delete_instance_task2 = BigtableInstanceDeleteOperator( + instance_id=CBT_INSTANCE_ID, + task_id='delete_instance_task2', + ) + # [END howto_operator_gcp_bigtable_instance_delete] + + # [START howto_operator_gcp_bigtable_table_create] + create_table_task = BigtableTableCreateOperator( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + task_id='create_table', + ) + create_table_task2 = BigtableTableCreateOperator( + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + task_id='create_table_task2', + ) + create_table_task >> create_table_task2 + # [END howto_operator_gcp_bigtable_table_create] + + # [START howto_operator_gcp_bigtable_table_wait_for_replication] + wait_for_table_replication_task = BigtableTableWaitForReplicationSensor( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + poke_interval=int(CBT_POKE_INTERVAL), + timeout=180, + task_id='wait_for_table_replication_task', + ) + wait_for_table_replication_task2 = BigtableTableWaitForReplicationSensor( + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + poke_interval=int(CBT_POKE_INTERVAL), + timeout=180, + task_id='wait_for_table_replication_task2', + ) + # [END howto_operator_gcp_bigtable_table_wait_for_replication] + + # [START howto_operator_gcp_bigtable_table_delete] + delete_table_task = BigtableTableDeleteOperator( + project_id=GCP_PROJECT_ID, + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + task_id='delete_table_task', + ) + delete_table_task2 = BigtableTableDeleteOperator( + instance_id=CBT_INSTANCE_ID, + table_id=CBT_TABLE_ID, + task_id='delete_table_task2', + ) + # [END howto_operator_gcp_bigtable_table_delete] + + wait_for_table_replication_task >> delete_table_task + wait_for_table_replication_task2 >> delete_table_task + wait_for_table_replication_task >> delete_table_task2 + wait_for_table_replication_task2 >> delete_table_task2 + create_instance_task \ + >> create_table_task \ + >> cluster_update_task \ + >> delete_table_task + create_instance_task2 \ + >> create_table_task2 \ + >> cluster_update_task2 \ + >> delete_table_task2 + + # Only delete instances after all tables are deleted + [delete_table_task, delete_table_task2] >> \ + delete_instance_task >> delete_instance_task2 diff --git a/airflow/contrib/example_dags/example_gcp_compute.py b/airflow/contrib/example_dags/example_gcp_compute.py new file mode 100644 index 0000000000000..dde0f5975ae6f --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_compute.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that starts, stops and sets the machine type of a Google Compute +Engine instance. 
+ +This DAG relies on the following OS environment variables + +* GCP_PROJECT_ID - Google Cloud Platform project where the Compute Engine instance exists. +* GCE_ZONE - Google Cloud Platform zone where the instance exists. +* GCE_INSTANCE - Name of the Compute Engine instance. +* GCE_SHORT_MACHINE_TYPE_NAME - Machine type resource name to set, e.g. 'n1-standard-1'. + See https://cloud.google.com/compute/docs/machine-types +""" +import os + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_compute_operator import GceInstanceStartOperator, \ + GceInstanceStopOperator, GceSetMachineTypeOperator + +# [START howto_operator_gce_args_common] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCE_ZONE = os.environ.get('GCE_ZONE', 'europe-west1-b') +GCE_INSTANCE = os.environ.get('GCE_INSTANCE', 'testinstance') +# [END howto_operator_gce_args_common] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1), +} + +# [START howto_operator_gce_args_set_machine_type] +GCE_SHORT_MACHINE_TYPE_NAME = os.environ.get('GCE_SHORT_MACHINE_TYPE_NAME', 'n1-standard-1') +SET_MACHINE_TYPE_BODY = { + 'machineType': 'zones/{}/machineTypes/{}'.format(GCE_ZONE, GCE_SHORT_MACHINE_TYPE_NAME) +} +# [END howto_operator_gce_args_set_machine_type] + + +with models.DAG( + 'example_gcp_compute', + default_args=default_args, + schedule_interval=None # Override to match your needs +) as dag: + # [START howto_operator_gce_start] + gce_instance_start = GceInstanceStartOperator( + project_id=GCP_PROJECT_ID, + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + task_id='gcp_compute_start_task' + ) + # [END howto_operator_gce_start] + # Duplicate start for idempotence testing + # [START howto_operator_gce_start_no_project_id] + gce_instance_start2 = GceInstanceStartOperator( + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + task_id='gcp_compute_start_task2' + ) + # [END howto_operator_gce_start_no_project_id] + # [START howto_operator_gce_stop] + gce_instance_stop = GceInstanceStopOperator( + project_id=GCP_PROJECT_ID, + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + task_id='gcp_compute_stop_task' + ) + # [END howto_operator_gce_stop] + # Duplicate stop for idempotence testing + # [START howto_operator_gce_stop_no_project_id] + gce_instance_stop2 = GceInstanceStopOperator( + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + task_id='gcp_compute_stop_task2' + ) + # [END howto_operator_gce_stop_no_project_id] + # [START howto_operator_gce_set_machine_type] + gce_set_machine_type = GceSetMachineTypeOperator( + project_id=GCP_PROJECT_ID, + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + body=SET_MACHINE_TYPE_BODY, + task_id='gcp_compute_set_machine_type' + ) + # [END howto_operator_gce_set_machine_type] + # Duplicate set machine type for idempotence testing + # [START howto_operator_gce_set_machine_type_no_project_id] + gce_set_machine_type2 = GceSetMachineTypeOperator( + zone=GCE_ZONE, + resource_id=GCE_INSTANCE, + body=SET_MACHINE_TYPE_BODY, + task_id='gcp_compute_set_machine_type2' + ) + # [END howto_operator_gce_set_machine_type_no_project_id] + + gce_instance_start >> gce_instance_start2 >> gce_instance_stop >> \ + gce_instance_stop2 >> gce_set_machine_type >> gce_set_machine_type2 diff --git a/airflow/contrib/example_dags/example_gcp_compute_igm.py b/airflow/contrib/example_dags/example_gcp_compute_igm.py new file mode 100644 index 0000000000000..103cbfd590c15 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_compute_igm.py @@ -0,0 +1,144 @@ +# -*- coding: utf-8 -*- +# +# 
Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that uses IGM-type compute operations: +* copy of Instance Template +* update template in Instance Group Manager + +This DAG relies on the following OS environment variables + +* GCP_PROJECT_ID - the Google Cloud Platform project where the Compute Engine instance exists +* GCE_ZONE - the zone where the Compute Engine instance exists + +Variables for copy template operator: +* GCE_TEMPLATE_NAME - name of the template to copy +* GCE_NEW_TEMPLATE_NAME - name of the new template +* GCE_NEW_DESCRIPTION - description added to the template + +Variables for update template in Group Manager: + +* GCE_INSTANCE_GROUP_MANAGER_NAME - name of the Instance Group Manager +* SOURCE_TEMPLATE_URL - url of the template to replace in the Instance Group Manager +* DESTINATION_TEMPLATE_URL - url of the new template to set in the Instance Group Manager +""" + +import os + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_compute_operator import \ + GceInstanceTemplateCopyOperator, GceInstanceGroupManagerUpdateTemplateOperator + +# [START howto_operator_compute_igm_common_args] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCE_ZONE = os.environ.get('GCE_ZONE', 'europe-west1-b') +# [END howto_operator_compute_igm_common_args] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + +# [START howto_operator_compute_template_copy_args] +GCE_TEMPLATE_NAME = os.environ.get('GCE_TEMPLATE_NAME', 'instance-template-test') +GCE_NEW_TEMPLATE_NAME = os.environ.get('GCE_NEW_TEMPLATE_NAME', + 'instance-template-test-new') +GCE_NEW_DESCRIPTION = os.environ.get('GCE_NEW_DESCRIPTION', 'Test new description') +GCE_INSTANCE_TEMPLATE_BODY_UPDATE = { + "name": GCE_NEW_TEMPLATE_NAME, + "description": GCE_NEW_DESCRIPTION, + "properties": { + "machineType": "n1-standard-2" + } +} +# [END howto_operator_compute_template_copy_args] + +# [START howto_operator_compute_igm_update_template_args] +GCE_INSTANCE_GROUP_MANAGER_NAME = os.environ.get('GCE_INSTANCE_GROUP_MANAGER_NAME', + 'instance-group-test') + +SOURCE_TEMPLATE_URL = os.environ.get( + 'SOURCE_TEMPLATE_URL', + "https://www.googleapis.com/compute/beta/projects/" + GCP_PROJECT_ID + + "/global/instanceTemplates/instance-template-test") + +DESTINATION_TEMPLATE_URL = os.environ.get( + 'DESTINATION_TEMPLATE_URL', + "https://www.googleapis.com/compute/beta/projects/" + GCP_PROJECT_ID + + "/global/instanceTemplates/" + GCE_NEW_TEMPLATE_NAME) + +UPDATE_POLICY = { + "type": "OPPORTUNISTIC", + "minimalAction": "RESTART", + "maxSurge": { + "fixed": 1 + }, + "minReadySec": 1800 +} + +# [END howto_operator_compute_igm_update_template_args] + + +with models.DAG( + 'example_gcp_compute_igm', + default_args=default_args, + 
schedule_interval=None # Override to match your needs +) as dag: + # [START howto_operator_gce_igm_copy_template] + gce_instance_template_copy = GceInstanceTemplateCopyOperator( + project_id=GCP_PROJECT_ID, + resource_id=GCE_TEMPLATE_NAME, + body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE, + task_id='gcp_compute_igm_copy_template_task' + ) + # [END howto_operator_gce_igm_copy_template] + # Added to check for idempotence + # [START howto_operator_gce_igm_copy_template_no_project_id] + gce_instance_template_copy2 = GceInstanceTemplateCopyOperator( + resource_id=GCE_TEMPLATE_NAME, + body_patch=GCE_INSTANCE_TEMPLATE_BODY_UPDATE, + task_id='gcp_compute_igm_copy_template_task_2' + ) + # [END howto_operator_gce_igm_copy_template_no_project_id] + # [START howto_operator_gce_igm_update_template] + gce_instance_group_manager_update_template = \ + GceInstanceGroupManagerUpdateTemplateOperator( + project_id=GCP_PROJECT_ID, + resource_id=GCE_INSTANCE_GROUP_MANAGER_NAME, + zone=GCE_ZONE, + source_template=SOURCE_TEMPLATE_URL, + destination_template=DESTINATION_TEMPLATE_URL, + update_policy=UPDATE_POLICY, + task_id='gcp_compute_igm_group_manager_update_template' + ) + # [END howto_operator_gce_igm_update_template] + # Added to check for idempotence (and without UPDATE_POLICY) + # [START howto_operator_gce_igm_update_template_no_project_id] + gce_instance_group_manager_update_template2 = \ + GceInstanceGroupManagerUpdateTemplateOperator( + resource_id=GCE_INSTANCE_GROUP_MANAGER_NAME, + zone=GCE_ZONE, + source_template=SOURCE_TEMPLATE_URL, + destination_template=DESTINATION_TEMPLATE_URL, + task_id='gcp_compute_igm_group_manager_update_template_2' + ) + # [END howto_operator_gce_igm_update_template_no_project_id] + gce_instance_template_copy >> gce_instance_template_copy2 >> \ + gce_instance_group_manager_update_template >> \ + gce_instance_group_manager_update_template2 diff --git a/airflow/contrib/example_dags/example_gcp_function.py b/airflow/contrib/example_dags/example_gcp_function.py new file mode 100644 index 0000000000000..75dcf2ece3816 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_function.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that displays interactions with Google Cloud Functions. +It creates a function and then deletes it. + +This DAG relies on the following OS environment variables +https://airflow.apache.org/concepts.html#variables + +* GCP_PROJECT_ID - Google Cloud Project to use for the Cloud Function. +* GCP_LOCATION - Google Cloud Functions region where the function should be + created. +* GCF_ENTRYPOINT - Name of the executable function in the source code. 
+* and one of the below: + + * GCF_SOURCE_ARCHIVE_URL - Path to the zipped source in Google Cloud Storage + + * GCF_SOURCE_UPLOAD_URL - Generated upload URL for the zipped source and GCF_ZIP_PATH - Local path to + the zipped source archive + + * GCF_SOURCE_REPOSITORY - The URL pointing to the hosted repository where the function + is defined in a supported Cloud Source Repository URL format + https://cloud.google.com/functions/docs/reference/rest/v1/projects.locations.functions#SourceRepository + +""" + +import os + +from airflow import models +from airflow.contrib.operators.gcp_function_operator \ + import GcfFunctionDeployOperator, GcfFunctionDeleteOperator +from airflow.utils import dates + +# [START howto_operator_gcf_common_variables] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCP_LOCATION = os.environ.get('GCP_LOCATION', 'europe-west1') +GCF_SHORT_FUNCTION_NAME = os.environ.get('GCF_SHORT_FUNCTION_NAME', 'hello').\ + replace("-", "_") # make sure there are no dashes in function name (!) +FUNCTION_NAME = 'projects/{}/locations/{}/functions/{}'.format(GCP_PROJECT_ID, + GCP_LOCATION, + GCF_SHORT_FUNCTION_NAME) +# [END howto_operator_gcf_common_variables] +# [START howto_operator_gcf_deploy_variables] +GCF_SOURCE_ARCHIVE_URL = os.environ.get('GCF_SOURCE_ARCHIVE_URL', '') +GCF_SOURCE_UPLOAD_URL = os.environ.get('GCF_SOURCE_UPLOAD_URL', '') +GCF_SOURCE_REPOSITORY = os.environ.get( + 'GCF_SOURCE_REPOSITORY', + 'https://source.developers.google.com/' + 'projects/{}/repos/hello-world/moveable-aliases/master'.format(GCP_PROJECT_ID)) +GCF_ZIP_PATH = os.environ.get('GCF_ZIP_PATH', '') +GCF_ENTRYPOINT = os.environ.get('GCF_ENTRYPOINT', 'helloWorld') +GCF_RUNTIME = 'nodejs6' +GCP_VALIDATE_BODY = os.environ.get('GCP_VALIDATE_BODY', True) +# [END howto_operator_gcf_deploy_variables] + +# [START howto_operator_gcf_deploy_body] +body = { + "name": FUNCTION_NAME, + "entryPoint": GCF_ENTRYPOINT, + "runtime": GCF_RUNTIME, + "httpsTrigger": {} +} +# [END howto_operator_gcf_deploy_body] + +# [START howto_operator_gcf_default_args] +default_args = { + 'start_date': dates.days_ago(1) +} +# [END howto_operator_gcf_default_args] + +# [START howto_operator_gcf_deploy_variants] +if GCF_SOURCE_ARCHIVE_URL: + body['sourceArchiveUrl'] = GCF_SOURCE_ARCHIVE_URL +elif GCF_SOURCE_REPOSITORY: + body['sourceRepository'] = { + 'url': GCF_SOURCE_REPOSITORY + } +elif GCF_ZIP_PATH: + body['sourceUploadUrl'] = '' + default_args['zip_path'] = GCF_ZIP_PATH +elif GCF_SOURCE_UPLOAD_URL: + body['sourceUploadUrl'] = GCF_SOURCE_UPLOAD_URL +else: + raise Exception("Please provide one of the source_code parameters") +# [END howto_operator_gcf_deploy_variants] + + +with models.DAG( + 'example_gcp_function', + default_args=default_args, + schedule_interval=None # Override to match your needs +) as dag: + # [START howto_operator_gcf_deploy] + deploy_task = GcfFunctionDeployOperator( + task_id="gcf_deploy_task", + project_id=GCP_PROJECT_ID, + location=GCP_LOCATION, + body=body, + validate_body=GCP_VALIDATE_BODY + ) + # [END howto_operator_gcf_deploy] + # [START howto_operator_gcf_deploy_no_project_id] + deploy2_task = GcfFunctionDeployOperator( + task_id="gcf_deploy2_task", + location=GCP_LOCATION, + body=body, + validate_body=GCP_VALIDATE_BODY + ) + # [END howto_operator_gcf_deploy_no_project_id] + # [START howto_operator_gcf_delete] + delete_task = GcfFunctionDeleteOperator( + task_id="gcf_delete_task", + name=FUNCTION_NAME + ) + # [END howto_operator_gcf_delete] + deploy_task >> deploy2_task >> delete_task 
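The deploy variants in example_gcp_function.py above select exactly one source for the Cloud Function body at module import time, before GcfFunctionDeployOperator is instantiated. The snippet below is a minimal standalone sketch of that same selection logic, shown only to make the branching easier to follow: the helper name resolve_gcf_source is illustrative and not part of the patch, the environment variables and body keys are taken from the example above, and the non-empty default that the example gives GCF_SOURCE_REPOSITORY is omitted here for brevity.

import os

def resolve_gcf_source(body, default_args):
    """Apply exactly one source variant to the Cloud Function deploy body.

    Mirrors the branching in example_gcp_function.py: a GCS archive URL, a
    Cloud Source Repository, a local zip archive to be uploaded, or a
    pre-generated upload URL. Raises if none of them is configured.
    """
    source_archive_url = os.environ.get('GCF_SOURCE_ARCHIVE_URL', '')
    source_repository = os.environ.get('GCF_SOURCE_REPOSITORY', '')
    zip_path = os.environ.get('GCF_ZIP_PATH', '')
    source_upload_url = os.environ.get('GCF_SOURCE_UPLOAD_URL', '')

    if source_archive_url:
        body['sourceArchiveUrl'] = source_archive_url
    elif source_repository:
        body['sourceRepository'] = {'url': source_repository}
    elif zip_path:
        # The example pairs an empty 'sourceUploadUrl' with a local 'zip_path'
        # passed through default_args, leaving the upload to the operator.
        body['sourceUploadUrl'] = ''
        default_args['zip_path'] = zip_path
    elif source_upload_url:
        body['sourceUploadUrl'] = source_upload_url
    else:
        raise Exception("Please provide one of the source_code parameters")
    return body, default_args

Because the example DAG runs the equivalent branching at the top of the module, the chosen variant is already baked into the deploy body by the time the deploy, deploy-without-project-id and delete tasks are created.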
diff --git a/airflow/contrib/example_dags/example_gcp_natural_language.py b/airflow/contrib/example_dags/example_gcp_natural_language.py new file mode 100644 index 0000000000000..48ace4030bf17 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_natural_language.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG for Google Cloud Natural Language service +""" + + +from google.cloud.language_v1.proto.language_service_pb2 import Document + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_natural_language_operator import ( + CloudLanguageAnalyzeEntitiesOperator, + CloudLanguageAnalyzeEntitySentimentOperator, + CloudLanguageAnalyzeSentimentOperator, + CloudLanguageClassifyTextOperator, +) +from airflow.operators.bash_operator import BashOperator + +# [START howto_operator_gcp_natural_language_document_text] +TEXT = """ +Airflow is a platform to programmatically author, schedule and monitor workflows. + +Use Airflow to author workflows as Directed Acyclic Graphs (DAGs) of tasks. The Airflow scheduler executes + your tasks on an array of workers while following the specified dependencies. Rich command line utilities + make performing complex surgeries on DAGs a snap. The rich user interface makes it easy to visualize + pipelines running in production, monitor progress, and troubleshoot issues when needed. 
+""" +document = Document(content=TEXT, type="PLAIN_TEXT") +# [END howto_operator_gcp_natural_language_document_text] + +# [START howto_operator_gcp_natural_language_document_gcs] +GCS_CONTENT_URI = "gs://my-text-bucket/sentiment-me.txt" +document_gcs = Document(gcs_content_uri=GCS_CONTENT_URI, type="PLAIN_TEXT") +# [END howto_operator_gcp_natural_language_document_gcs] + + +default_args = {"start_date": airflow.utils.dates.days_ago(1)} + +with models.DAG( + "example_gcp_natural_language", + default_args=default_args, + schedule_interval=None, # Override to match your needs +) as dag: + + # [START howto_operator_gcp_natural_language_analyze_entities] + analyze_entities = CloudLanguageAnalyzeEntitiesOperator(document=document, task_id="analyze_entities") + # [END howto_operator_gcp_natural_language_analyze_entities] + + # [START howto_operator_gcp_natural_language_analyze_entities_result] + analyze_entities_result = BashOperator( + bash_command="echo \"{{ task_instance.xcom_pull('analyze_entities') }}\"", + task_id="analyze_entities_result", + ) + # [END howto_operator_gcp_natural_language_analyze_entities_result] + + # [START howto_operator_gcp_natural_language_analyze_entity_sentiment] + analyze_entity_sentiment = CloudLanguageAnalyzeEntitySentimentOperator( + document=document, task_id="analyze_entity_sentiment" + ) + # [END howto_operator_gcp_natural_language_analyze_entity_sentiment] + + # [START howto_operator_gcp_natural_language_analyze_entity_sentiment_result] + analyze_entity_sentiment_result = BashOperator( + bash_command="echo \"{{ task_instance.xcom_pull('analyze_entity_sentiment') }}\"", + task_id="analyze_entity_sentiment_result", + ) + # [END howto_operator_gcp_natural_language_analyze_entity_sentiment_result] + + # [START howto_operator_gcp_natural_language_analyze_sentiment] + analyze_sentiment = CloudLanguageAnalyzeSentimentOperator(document=document, task_id="analyze_sentiment") + # [END howto_operator_gcp_natural_language_analyze_sentiment] + + # [START howto_operator_gcp_natural_language_analyze_sentiment_result] + analyze_sentiment_result = BashOperator( + bash_command="echo \"{{ task_instance.xcom_pull('analyze_sentiment') }}\"", + task_id="analyze_sentiment_result", + ) + # [END howto_operator_gcp_natural_language_analyze_sentiment_result] + + # [START howto_operator_gcp_natural_language_analyze_classify_text] + analyze_classify_text = CloudLanguageClassifyTextOperator( + document=document, task_id="analyze_classify_text" + ) + # [END howto_operator_gcp_natural_language_analyze_classify_text] + + # [START howto_operator_gcp_natural_language_analyze_classify_text_result] + analyze_classify_text_result = BashOperator( + bash_command="echo \"{{ task_instance.xcom_pull('analyze_classify_text') }}\"", + task_id="analyze_classify_text_result", + ) + # [END howto_operator_gcp_natural_language_analyze_classify_text_result] + + analyze_entities >> analyze_entities_result + analyze_entity_sentiment >> analyze_entity_sentiment_result + analyze_sentiment >> analyze_sentiment_result + analyze_classify_text >> analyze_classify_text_result diff --git a/airflow/contrib/example_dags/example_gcp_spanner.py b/airflow/contrib/example_dags/example_gcp_spanner.py new file mode 100644 index 0000000000000..4d77b3abf096b --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_spanner.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that creates, updates, queries and deletes a Cloud Spanner instance. + +This DAG relies on the following environment variables +* GCP_PROJECT_ID - Google Cloud Platform project for the Cloud Spanner instance. +* GCP_SPANNER_INSTANCE_ID - Cloud Spanner instance ID. +* GCP_SPANNER_DATABASE_ID - Cloud Spanner database ID. +* GCP_SPANNER_CONFIG_NAME - The name of the instance's configuration. Values are of the + form ``projects//instanceConfigs/``. See also: + https://cloud.google.com/spanner/docs/reference/rest/v1/projects.instanceConfigs#InstanceConfig + https://cloud.google.com/spanner/docs/reference/rest/v1/projects.instanceConfigs/list#google.spanner.admin.instance.v1.InstanceAdmin.ListInstanceConfigs +* GCP_SPANNER_NODE_COUNT - Number of nodes allocated to the instance. +* GCP_SPANNER_DISPLAY_NAME - The descriptive name for this instance as it appears in UIs. + Must be unique per project and between 4 and 30 characters in length. +""" + +import os + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_spanner_operator import \ + CloudSpannerInstanceDeployOperator, \ + CloudSpannerInstanceDatabaseQueryOperator, \ + CloudSpannerInstanceDeleteOperator, \ + CloudSpannerInstanceDatabaseDeployOperator, \ + CloudSpannerInstanceDatabaseUpdateOperator, \ + CloudSpannerInstanceDatabaseDeleteOperator + +# [START howto_operator_spanner_arguments] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCP_SPANNER_INSTANCE_ID = os.environ.get('GCP_SPANNER_INSTANCE_ID', 'testinstance') +GCP_SPANNER_DATABASE_ID = os.environ.get('GCP_SPANNER_DATABASE_ID', 'testdatabase') +GCP_SPANNER_CONFIG_NAME = os.environ.get('GCP_SPANNER_CONFIG_NAME', + 'projects/example-project/instanceConfigs/eur3') +GCP_SPANNER_NODE_COUNT = os.environ.get('GCP_SPANNER_NODE_COUNT', '1') +GCP_SPANNER_DISPLAY_NAME = os.environ.get('GCP_SPANNER_DISPLAY_NAME', 'Test Instance') +# OPERATION_ID should be unique per operation +OPERATION_ID = 'unique_operation_id' +# [END howto_operator_spanner_arguments] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + +with models.DAG( + 'example_gcp_spanner', + default_args=default_args, + schedule_interval=None # Override to match your needs +) as dag: + # Create + # [START howto_operator_spanner_deploy] + spanner_instance_create_task = CloudSpannerInstanceDeployOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + configuration_name=GCP_SPANNER_CONFIG_NAME, + node_count=int(GCP_SPANNER_NODE_COUNT), + display_name=GCP_SPANNER_DISPLAY_NAME, + task_id='spanner_instance_create_task' + ) + spanner_instance_update_task = CloudSpannerInstanceDeployOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + configuration_name=GCP_SPANNER_CONFIG_NAME, + node_count=int(GCP_SPANNER_NODE_COUNT) + 1, + 
display_name=GCP_SPANNER_DISPLAY_NAME + '_updated', + task_id='spanner_instance_update_task' + ) + # [END howto_operator_spanner_deploy] + + # [START howto_operator_spanner_database_deploy] + spanner_database_deploy_task = CloudSpannerInstanceDatabaseDeployOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + ddl_statements=[ + "CREATE TABLE my_table1 (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + "CREATE TABLE my_table2 (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + ], + task_id='spanner_database_deploy_task' + ) + spanner_database_deploy_task2 = CloudSpannerInstanceDatabaseDeployOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + ddl_statements=[ + "CREATE TABLE my_table1 (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + "CREATE TABLE my_table2 (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + ], + task_id='spanner_database_deploy_task2' + ) + # [END howto_operator_spanner_database_deploy] + + # [START howto_operator_spanner_database_update] + spanner_database_update_task = CloudSpannerInstanceDatabaseUpdateOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + ddl_statements=[ + "CREATE TABLE my_table3 (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + ], + task_id='spanner_database_update_task' + ) + # [END howto_operator_spanner_database_update] + + # [START howto_operator_spanner_database_update_idempotent] + spanner_database_update_idempotent1_task = CloudSpannerInstanceDatabaseUpdateOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + operation_id=OPERATION_ID, + ddl_statements=[ + "CREATE TABLE my_table_unique (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + ], + task_id='spanner_database_update_idempotent1_task' + ) + spanner_database_update_idempotent2_task = CloudSpannerInstanceDatabaseUpdateOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + operation_id=OPERATION_ID, + ddl_statements=[ + "CREATE TABLE my_table_unique (id INT64, name STRING(MAX)) PRIMARY KEY (id)", + ], + task_id='spanner_database_update_idempotent2_task' + ) + # [END howto_operator_spanner_database_update_idempotent] + + # [START howto_operator_spanner_query] + spanner_instance_query_task = CloudSpannerInstanceDatabaseQueryOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + query=["DELETE FROM my_table2 WHERE true"], + task_id='spanner_instance_query_task' + ) + spanner_instance_query_task2 = CloudSpannerInstanceDatabaseQueryOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + query=["DELETE FROM my_table2 WHERE true"], + task_id='spanner_instance_query_task2' + ) + # [END howto_operator_spanner_query] + + # [START howto_operator_spanner_database_delete] + spanner_database_delete_task = CloudSpannerInstanceDatabaseDeleteOperator( + project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + task_id='spanner_database_delete_task' + ) + spanner_database_delete_task2 = CloudSpannerInstanceDatabaseDeleteOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + database_id=GCP_SPANNER_DATABASE_ID, + task_id='spanner_database_delete_task2' + ) + # [END howto_operator_spanner_database_delete] + + # [START howto_operator_spanner_delete] + spanner_instance_delete_task = CloudSpannerInstanceDeleteOperator( 
+ project_id=GCP_PROJECT_ID, + instance_id=GCP_SPANNER_INSTANCE_ID, + task_id='spanner_instance_delete_task' + ) + spanner_instance_delete_task2 = CloudSpannerInstanceDeleteOperator( + instance_id=GCP_SPANNER_INSTANCE_ID, + task_id='spanner_instance_delete_task2' + ) + # [END howto_operator_spanner_delete] + + spanner_instance_create_task \ + >> spanner_instance_update_task \ + >> spanner_database_deploy_task \ + >> spanner_database_deploy_task2 \ + >> spanner_database_update_task \ + >> spanner_database_update_idempotent1_task \ + >> spanner_database_update_idempotent2_task \ + >> spanner_instance_query_task \ + >> spanner_instance_query_task2 \ + >> spanner_database_delete_task \ + >> spanner_database_delete_task2 \ + >> spanner_instance_delete_task \ + >> spanner_instance_delete_task2 diff --git a/airflow/contrib/example_dags/example_gcp_spanner.sql b/airflow/contrib/example_dags/example_gcp_spanner.sql new file mode 100644 index 0000000000000..5d5f238022a52 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_spanner.sql @@ -0,0 +1,3 @@ +INSERT my_table2 (id, name) VALUES (7, 'Seven'); +INSERT my_table2 (id, name) + VALUES (8, 'Eight'); diff --git a/airflow/contrib/example_dags/example_gcp_sql.py b/airflow/contrib/example_dags/example_gcp_sql.py new file mode 100644 index 0000000000000..6c357f4b07e2f --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_sql.py @@ -0,0 +1,423 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that creates, patches and deletes a Cloud SQL instance, and also +creates, patches and deletes a database inside the instance, in Google Cloud Platform. + +This DAG relies on the following OS environment variables +https://airflow.apache.org/concepts.html#variables +* GCP_PROJECT_ID - Google Cloud Platform project for the Cloud SQL instance. +* INSTANCE_NAME - Name of the Cloud SQL instance. +* DB_NAME - Name of the database inside a Cloud SQL instance. 
+""" + +import os + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_sql_operator import CloudSqlInstanceCreateOperator, \ + CloudSqlInstancePatchOperator, CloudSqlInstanceDeleteOperator, \ + CloudSqlInstanceDatabaseCreateOperator, CloudSqlInstanceDatabasePatchOperator, \ + CloudSqlInstanceDatabaseDeleteOperator, CloudSqlInstanceExportOperator, \ + CloudSqlInstanceImportOperator +from airflow.contrib.operators.gcs_acl_operator import \ + GoogleCloudStorageBucketCreateAclEntryOperator, \ + GoogleCloudStorageObjectCreateAclEntryOperator + +from six.moves.urllib.parse import urlsplit + +# [START howto_operator_cloudsql_arguments] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +INSTANCE_NAME = os.environ.get('GCSQL_MYSQL_INSTANCE_NAME', 'test-mysql') +INSTANCE_NAME2 = os.environ.get('GCSQL_MYSQL_INSTANCE_NAME2', 'test-mysql2') +DB_NAME = os.environ.get('GCSQL_MYSQL_DATABASE_NAME', 'testdb') +# [END howto_operator_cloudsql_arguments] + +# [START howto_operator_cloudsql_export_import_arguments] +EXPORT_URI = os.environ.get('GCSQL_MYSQL_EXPORT_URI', 'gs://bucketName/fileName') +IMPORT_URI = os.environ.get('GCSQL_MYSQL_IMPORT_URI', 'gs://bucketName/fileName') +# [END howto_operator_cloudsql_export_import_arguments] + +# Bodies below represent Cloud SQL instance resources: +# https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances + +# [START howto_operator_cloudsql_create_arguments] +FAILOVER_REPLICA_NAME = INSTANCE_NAME + "-failover-replica" +READ_REPLICA_NAME = INSTANCE_NAME + "-read-replica" +# [END howto_operator_cloudsql_create_arguments] + +# [START howto_operator_cloudsql_create_body] +body = { + "name": INSTANCE_NAME, + "settings": { + "tier": "db-n1-standard-1", + "backupConfiguration": { + "binaryLogEnabled": True, + "enabled": True, + "startTime": "05:00" + }, + "activationPolicy": "ALWAYS", + "dataDiskSizeGb": 30, + "dataDiskType": "PD_SSD", + "databaseFlags": [], + "ipConfiguration": { + "ipv4Enabled": True, + "requireSsl": True, + }, + "locationPreference": { + "zone": "europe-west4-a" + }, + "maintenanceWindow": { + "hour": 5, + "day": 7, + "updateTrack": "canary" + }, + "pricingPlan": "PER_USE", + "replicationType": "ASYNCHRONOUS", + "storageAutoResize": True, + "storageAutoResizeLimit": 0, + "userLabels": { + "my-key": "my-value" + } + }, + "failoverReplica": { + "name": FAILOVER_REPLICA_NAME + }, + "databaseVersion": "MYSQL_5_7", + "region": "europe-west4", +} +# [END howto_operator_cloudsql_create_body] + +body2 = { + "name": INSTANCE_NAME2, + "settings": { + "tier": "db-n1-standard-1", + }, + "databaseVersion": "MYSQL_5_7", + "region": "europe-west4", +} + +# [START howto_operator_cloudsql_create_replica] +read_replica_body = { + "name": READ_REPLICA_NAME, + "settings": { + "tier": "db-n1-standard-1", + }, + "databaseVersion": "MYSQL_5_7", + "region": "europe-west4", + "masterInstanceName": INSTANCE_NAME, +} +# [END howto_operator_cloudsql_create_replica] + + +# [START howto_operator_cloudsql_patch_body] +patch_body = { + "name": INSTANCE_NAME, + "settings": { + "dataDiskSizeGb": 35, + "maintenanceWindow": { + "hour": 3, + "day": 6, + "updateTrack": "canary" + }, + "userLabels": { + "my-key-patch": "my-value-patch" + } + } +} +# [END howto_operator_cloudsql_patch_body] +# [START howto_operator_cloudsql_export_body] +export_body = { + "exportContext": { + "fileType": "sql", + "uri": EXPORT_URI, + "sqlExportOptions": { + "schemaOnly": False + } + } +} +# [END howto_operator_cloudsql_export_body] +# [START 
howto_operator_cloudsql_import_body] +import_body = { + "importContext": { + "fileType": "sql", + "uri": IMPORT_URI + } +} +# [END howto_operator_cloudsql_import_body] +# [START howto_operator_cloudsql_db_create_body] +db_create_body = { + "instance": INSTANCE_NAME, + "name": DB_NAME, + "project": GCP_PROJECT_ID +} +# [END howto_operator_cloudsql_db_create_body] +# [START howto_operator_cloudsql_db_patch_body] +db_patch_body = { + "charset": "utf16", + "collation": "utf16_general_ci" +} +# [END howto_operator_cloudsql_db_patch_body] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + +with models.DAG( + 'example_gcp_sql', + default_args=default_args, + schedule_interval=None # Override to match your needs +) as dag: + + def next_dep(task, prev): + prev >> task + return task + + # ############################################## # + # ### INSTANCES SET UP ######################### # + # ############################################## # + + # [START howto_operator_cloudsql_create] + sql_instance_create_task = CloudSqlInstanceCreateOperator( + project_id=GCP_PROJECT_ID, + body=body, + instance=INSTANCE_NAME, + task_id='sql_instance_create_task' + ) + # [END howto_operator_cloudsql_create] + prev_task = sql_instance_create_task + + sql_instance_create_2_task = CloudSqlInstanceCreateOperator( + project_id=GCP_PROJECT_ID, + body=body2, + instance=INSTANCE_NAME2, + task_id='sql_instance_create_task2' + ) + # [END howto_operator_cloudsql_create] + + prev_task = sql_instance_create_task + prev_task = next_dep(sql_instance_create_2_task, prev_task) + + sql_instance_read_replica_create = CloudSqlInstanceCreateOperator( + project_id=GCP_PROJECT_ID, + body=read_replica_body, + instance=INSTANCE_NAME2, + task_id='sql_instance_read_replica_create' + ) + prev_task = next_dep(sql_instance_read_replica_create, prev_task) + + # ############################################## # + # ### MODIFYING INSTANCE AND ITS DATABASE ###### # + # ############################################## # + + # [START howto_operator_cloudsql_patch] + sql_instance_patch_task = CloudSqlInstancePatchOperator( + project_id=GCP_PROJECT_ID, + body=patch_body, + instance=INSTANCE_NAME, + task_id='sql_instance_patch_task' + ) + + sql_instance_patch_task2 = CloudSqlInstancePatchOperator( + body=patch_body, + instance=INSTANCE_NAME, + task_id='sql_instance_patch_task2' + ) + # [END howto_operator_cloudsql_patch] + prev_task = next_dep(sql_instance_patch_task, prev_task) + prev_task = next_dep(sql_instance_patch_task2, prev_task) + + # [START howto_operator_cloudsql_db_create] + sql_db_create_task = CloudSqlInstanceDatabaseCreateOperator( + project_id=GCP_PROJECT_ID, + body=db_create_body, + instance=INSTANCE_NAME, + task_id='sql_db_create_task' + ) + sql_db_create_task2 = CloudSqlInstanceDatabaseCreateOperator( + body=db_create_body, + instance=INSTANCE_NAME, + task_id='sql_db_create_task2' + ) + # [END howto_operator_cloudsql_db_create] + prev_task = next_dep(sql_db_create_task, prev_task) + prev_task = next_dep(sql_db_create_task2, prev_task) + + # [START howto_operator_cloudsql_db_patch] + sql_db_patch_task = CloudSqlInstanceDatabasePatchOperator( + project_id=GCP_PROJECT_ID, + body=db_patch_body, + instance=INSTANCE_NAME, + database=DB_NAME, + task_id='sql_db_patch_task' + ) + sql_db_patch_task2 = CloudSqlInstanceDatabasePatchOperator( + body=db_patch_body, + instance=INSTANCE_NAME, + database=DB_NAME, + task_id='sql_db_patch_task2' + ) + # [END howto_operator_cloudsql_db_patch] + prev_task = next_dep(sql_db_patch_task, 
prev_task) + prev_task = next_dep(sql_db_patch_task2, prev_task) + + # ############################################## # + # ### EXPORTING SQL FROM INSTANCE 1 ############ # + # ############################################## # + export_url_split = urlsplit(EXPORT_URI) + + # For export to work we need to add the Cloud SQL instance's Service Account + # write access to the destination GCS bucket. + # [START howto_operator_cloudsql_export_gcs_permissions] + sql_gcp_add_bucket_permission_task = GoogleCloudStorageBucketCreateAclEntryOperator( + entity="user-{{ task_instance.xcom_pull(" + "'sql_instance_create_task', key='service_account_email') " + "}}", + role="WRITER", + bucket=export_url_split[1], # netloc (bucket) + task_id='sql_gcp_add_bucket_permission_task' + ) + # [END howto_operator_cloudsql_export_gcs_permissions] + prev_task = next_dep(sql_gcp_add_bucket_permission_task, prev_task) + + # [START howto_operator_cloudsql_export] + sql_export_task = CloudSqlInstanceExportOperator( + project_id=GCP_PROJECT_ID, + body=export_body, + instance=INSTANCE_NAME, + task_id='sql_export_task' + ) + sql_export_task2 = CloudSqlInstanceExportOperator( + body=export_body, + instance=INSTANCE_NAME, + task_id='sql_export_task2' + ) + # [END howto_operator_cloudsql_export] + prev_task = next_dep(sql_export_task, prev_task) + prev_task = next_dep(sql_export_task2, prev_task) + + # ############################################## # + # ### IMPORTING SQL TO INSTANCE 2 ############## # + # ############################################## # + import_url_split = urlsplit(IMPORT_URI) + + # For import to work we need to add the Cloud SQL instance's Service Account + # read access to the target GCS object. + # [START howto_operator_cloudsql_import_gcs_permissions] + sql_gcp_add_object_permission_task = GoogleCloudStorageObjectCreateAclEntryOperator( + entity="user-{{ task_instance.xcom_pull(" + "'sql_instance_create_task2', key='service_account_email')" + " }}", + role="READER", + bucket=import_url_split[1], # netloc (bucket) + object_name=import_url_split[2][1:], # path (strip first '/') + task_id='sql_gcp_add_object_permission_task', + ) + prev_task = next_dep(sql_gcp_add_object_permission_task, prev_task) + + # For import to work we also need to add the Cloud SQL instance's Service Account + # write access to the whole bucket!. 
+ sql_gcp_add_bucket_permission_2_task = GoogleCloudStorageBucketCreateAclEntryOperator( + entity="user-{{ task_instance.xcom_pull(" + "'sql_instance_create_task2', key='service_account_email') " + "}}", + role="WRITER", + bucket=import_url_split[1], # netloc + task_id='sql_gcp_add_bucket_permission_2_task', + ) + # [END howto_operator_cloudsql_import_gcs_permissions] + prev_task = next_dep(sql_gcp_add_bucket_permission_2_task, prev_task) + + # [START howto_operator_cloudsql_import] + sql_import_task = CloudSqlInstanceImportOperator( + project_id=GCP_PROJECT_ID, + body=import_body, + instance=INSTANCE_NAME2, + task_id='sql_import_task' + ) + sql_import_task2 = CloudSqlInstanceImportOperator( + body=import_body, + instance=INSTANCE_NAME2, + task_id='sql_import_task2' + ) + # [END howto_operator_cloudsql_import] + prev_task = next_dep(sql_import_task, prev_task) + prev_task = next_dep(sql_import_task2, prev_task) + + # ############################################## # + # ### DELETING A DATABASE FROM AN INSTANCE ##### # + # ############################################## # + + # [START howto_operator_cloudsql_db_delete] + sql_db_delete_task = CloudSqlInstanceDatabaseDeleteOperator( + project_id=GCP_PROJECT_ID, + instance=INSTANCE_NAME, + database=DB_NAME, + task_id='sql_db_delete_task' + ) + sql_db_delete_task2 = CloudSqlInstanceDatabaseDeleteOperator( + instance=INSTANCE_NAME, + database=DB_NAME, + task_id='sql_db_delete_task2' + ) + # [END howto_operator_cloudsql_db_delete] + prev_task = next_dep(sql_db_delete_task, prev_task) + prev_task = next_dep(sql_db_delete_task2, prev_task) + + # ############################################## # + # ### INSTANCES TEAR DOWN ###################### # + # ############################################## # + + # [START howto_operator_cloudsql_replicas_delete] + sql_instance_failover_replica_delete_task = CloudSqlInstanceDeleteOperator( + project_id=GCP_PROJECT_ID, + instance=FAILOVER_REPLICA_NAME, + task_id='sql_instance_failover_replica_delete_task' + ) + + sql_instance_read_replica_delete_task = CloudSqlInstanceDeleteOperator( + project_id=GCP_PROJECT_ID, + instance=READ_REPLICA_NAME, + task_id='sql_instance_read_replica_delete_task' + ) + # [END howto_operator_cloudsql_replicas_delete] + + prev_task = next_dep(sql_instance_failover_replica_delete_task, prev_task) + prev_task = next_dep(sql_instance_read_replica_delete_task, prev_task) + + # [START howto_operator_cloudsql_delete] + sql_instance_delete_task = CloudSqlInstanceDeleteOperator( + project_id=GCP_PROJECT_ID, + instance=INSTANCE_NAME, + task_id='sql_instance_delete_task' + ) + sql_instance_delete_task2 = CloudSqlInstanceDeleteOperator( + instance=INSTANCE_NAME2, + task_id='sql_instance_delete_task2' + ) + # [END howto_operator_cloudsql_delete] + prev_task = next_dep(sql_instance_delete_task, prev_task) + + sql_instance_delete_2_task = CloudSqlInstanceDeleteOperator( + project_id=GCP_PROJECT_ID, + instance=INSTANCE_NAME2, + task_id='sql_instance_delete_2_task' + ) + prev_task = next_dep(sql_instance_delete_2_task, prev_task) diff --git a/airflow/contrib/example_dags/example_gcp_sql_query.py b/airflow/contrib/example_dags/example_gcp_sql_query.py new file mode 100644 index 0000000000000..f4fc5f392650c --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_sql_query.py @@ -0,0 +1,294 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that performs query in a Cloud SQL instance. + +This DAG relies on the following OS environment variables + +* GCP_PROJECT_ID - Google Cloud Platform project for the Cloud SQL instance +* GCP_REGION - Google Cloud region where the database is created +* +* GCSQL_POSTGRES_INSTANCE_NAME - Name of the postgres Cloud SQL instance +* GCSQL_POSTGRES_USER - Name of the postgres database user +* GCSQL_POSTGRES_PASSWORD - Password of the postgres database user +* GCSQL_POSTGRES_PUBLIC_IP - Public IP of the Postgres database +* GCSQL_POSTGRES_PUBLIC_PORT - Port of the postgres database +* +* GCSQL_MYSQL_INSTANCE_NAME - Name of the postgres Cloud SQL instance +* GCSQL_MYSQL_USER - Name of the mysql database user +* GCSQL_MYSQL_PASSWORD - Password of the mysql database user +* GCSQL_MYSQL_PUBLIC_IP - Public IP of the mysql database +* GCSQL_MYSQL_PUBLIC_PORT - Port of the mysql database +""" + +import os +import subprocess +from os.path import expanduser + +from six.moves.urllib.parse import quote_plus + +import airflow +from airflow import models +from airflow.contrib.operators.gcp_sql_operator import CloudSqlQueryOperator + +# [START howto_operator_cloudsql_query_arguments] + +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCP_REGION = os.environ.get('GCP_REGION', 'europe-west-1b') + +GCSQL_POSTGRES_INSTANCE_NAME_QUERY = os.environ.get( + 'GCSQL_POSTGRES_INSTANCE_NAME_QUERY', + 'testpostgres') +GCSQL_POSTGRES_DATABASE_NAME = os.environ.get('GCSQL_POSTGRES_DATABASE_NAME', + 'postgresdb') +GCSQL_POSTGRES_USER = os.environ.get('GCSQL_POSTGRES_USER', 'postgres_user') +GCSQL_POSTGRES_PASSWORD = os.environ.get('GCSQL_POSTGRES_PASSWORD', 'password') +GCSQL_POSTGRES_PUBLIC_IP = os.environ.get('GCSQL_POSTGRES_PUBLIC_IP', '0.0.0.0') +GCSQL_POSTGRES_PUBLIC_PORT = os.environ.get('GCSQL_POSTGRES_PUBLIC_PORT', 5432) +GCSQL_POSTGRES_CLIENT_CERT_FILE = os.environ.get('GCSQL_POSTGRES_CLIENT_CERT_FILE', + ".key/postgres-client-cert.pem") +GCSQL_POSTGRES_CLIENT_KEY_FILE = os.environ.get('GCSQL_POSTGRES_CLIENT_KEY_FILE', + ".key/postgres-client-key.pem") +GCSQL_POSTGRES_SERVER_CA_FILE = os.environ.get('GCSQL_POSTGRES_SERVER_CA_FILE', + ".key/postgres-server-ca.pem") + +GCSQL_MYSQL_INSTANCE_NAME_QUERY = os.environ.get('GCSQL_MYSQL_INSTANCE_NAME_QUERY', + 'testmysql') +GCSQL_MYSQL_DATABASE_NAME = os.environ.get('GCSQL_MYSQL_DATABASE_NAME', 'mysqldb') +GCSQL_MYSQL_USER = os.environ.get('GCSQL_MYSQL_USER', 'mysql_user') +GCSQL_MYSQL_PASSWORD = os.environ.get('GCSQL_MYSQL_PASSWORD', 'password') +GCSQL_MYSQL_PUBLIC_IP = os.environ.get('GCSQL_MYSQL_PUBLIC_IP', '0.0.0.0') +GCSQL_MYSQL_PUBLIC_PORT = os.environ.get('GCSQL_MYSQL_PUBLIC_PORT', 3306) +GCSQL_MYSQL_CLIENT_CERT_FILE = os.environ.get('GCSQL_MYSQL_CLIENT_CERT_FILE', + ".key/mysql-client-cert.pem") +GCSQL_MYSQL_CLIENT_KEY_FILE = 
os.environ.get('GCSQL_MYSQL_CLIENT_KEY_FILE', + ".key/mysql-client-key.pem") +GCSQL_MYSQL_SERVER_CA_FILE = os.environ.get('GCSQL_MYSQL_SERVER_CA_FILE', + ".key/mysql-server-ca.pem") + +SQL = [ + 'CREATE TABLE IF NOT EXISTS TABLE_TEST (I INTEGER)', + 'CREATE TABLE IF NOT EXISTS TABLE_TEST (I INTEGER)', # shows warnings logged + 'INSERT INTO TABLE_TEST VALUES (0)', + 'CREATE TABLE IF NOT EXISTS TABLE_TEST2 (I INTEGER)', + 'DROP TABLE TABLE_TEST', + 'DROP TABLE TABLE_TEST2', +] + +# [END howto_operator_cloudsql_query_arguments] +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + + +# [START howto_operator_cloudsql_query_connections] + +HOME_DIR = expanduser("~") + + +def get_absolute_path(path): + if path.startswith("/"): + return path + else: + return os.path.join(HOME_DIR, path) + + +postgres_kwargs = dict( + user=quote_plus(GCSQL_POSTGRES_USER), + password=quote_plus(GCSQL_POSTGRES_PASSWORD), + public_port=GCSQL_POSTGRES_PUBLIC_PORT, + public_ip=quote_plus(GCSQL_POSTGRES_PUBLIC_IP), + project_id=quote_plus(GCP_PROJECT_ID), + location=quote_plus(GCP_REGION), + instance=quote_plus(GCSQL_POSTGRES_INSTANCE_NAME_QUERY), + database=quote_plus(GCSQL_POSTGRES_DATABASE_NAME), + client_cert_file=quote_plus(get_absolute_path(GCSQL_POSTGRES_CLIENT_CERT_FILE)), + client_key_file=quote_plus(get_absolute_path(GCSQL_POSTGRES_CLIENT_KEY_FILE)), + server_ca_file=quote_plus(get_absolute_path(GCSQL_POSTGRES_SERVER_CA_FILE)) +) + +# The connections below are created using one of the standard approaches - via environment +# variables named AIRFLOW_CONN_* . The connections can also be created in the database +# of AIRFLOW (using command line or UI). + +# Postgres: connect via proxy over TCP +os.environ['AIRFLOW_CONN_PROXY_POSTGRES_TCP'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=postgres&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=True&" \ + "sql_proxy_use_tcp=True".format(**postgres_kwargs) + +# Postgres: connect via proxy over UNIX socket (specific proxy version) +os.environ['AIRFLOW_CONN_PROXY_POSTGRES_SOCKET'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=postgres&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=True&" \ + "sql_proxy_version=v1.13&" \ + "sql_proxy_use_tcp=False".format(**postgres_kwargs) + +# Postgres: connect directly via TCP (non-SSL) +os.environ['AIRFLOW_CONN_PUBLIC_POSTGRES_TCP'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=postgres&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=False&" \ + "use_ssl=False".format(**postgres_kwargs) + +# Postgres: connect directly via TCP (SSL) +os.environ['AIRFLOW_CONN_PUBLIC_POSTGRES_TCP_SSL'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" 
\ + "database_type=postgres&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=False&" \ + "use_ssl=True&" \ + "sslcert={client_cert_file}&" \ + "sslkey={client_key_file}&" \ + "sslrootcert={server_ca_file}"\ + .format(**postgres_kwargs) + +mysql_kwargs = dict( + user=quote_plus(GCSQL_MYSQL_USER), + password=quote_plus(GCSQL_MYSQL_PASSWORD), + public_port=GCSQL_MYSQL_PUBLIC_PORT, + public_ip=quote_plus(GCSQL_MYSQL_PUBLIC_IP), + project_id=quote_plus(GCP_PROJECT_ID), + location=quote_plus(GCP_REGION), + instance=quote_plus(GCSQL_MYSQL_INSTANCE_NAME_QUERY), + database=quote_plus(GCSQL_MYSQL_DATABASE_NAME), + client_cert_file=quote_plus(get_absolute_path(GCSQL_MYSQL_CLIENT_CERT_FILE)), + client_key_file=quote_plus(get_absolute_path(GCSQL_MYSQL_CLIENT_KEY_FILE)), + server_ca_file=quote_plus(get_absolute_path(GCSQL_MYSQL_SERVER_CA_FILE)) +) + +# MySQL: connect via proxy over TCP (specific proxy version) +os.environ['AIRFLOW_CONN_PROXY_MYSQL_TCP'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=mysql&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=True&" \ + "sql_proxy_version=v1.13&" \ + "sql_proxy_use_tcp=True".format(**mysql_kwargs) + +# MySQL: connect via proxy over UNIX socket using pre-downloaded Cloud Sql Proxy binary +try: + sql_proxy_binary_path = subprocess.check_output( + ['which', 'cloud_sql_proxy']).decode('utf-8').rstrip() +except subprocess.CalledProcessError: + sql_proxy_binary_path = "/tmp/anyhow_download_cloud_sql_proxy" + +os.environ['AIRFLOW_CONN_PROXY_MYSQL_SOCKET'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=mysql&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=True&" \ + "sql_proxy_binary_path={sql_proxy_binary_path}&" \ + "sql_proxy_use_tcp=False".format( + sql_proxy_binary_path=quote_plus(sql_proxy_binary_path), **mysql_kwargs) + +# MySQL: connect directly via TCP (non-SSL) +os.environ['AIRFLOW_CONN_PUBLIC_MYSQL_TCP'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=mysql&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=False&" \ + "use_ssl=False".format(**mysql_kwargs) + +# MySQL: connect directly via TCP (SSL) and with fixed Cloud Sql Proxy binary path +os.environ['AIRFLOW_CONN_PUBLIC_MYSQL_TCP_SSL'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" \ + "database_type=mysql&" \ + "project_id={project_id}&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=False&" \ + "use_ssl=True&" \ + "sslcert={client_cert_file}&" \ + "sslkey={client_key_file}&" \ + "sslrootcert={server_ca_file}".format(**mysql_kwargs) + +# Special case: MySQL: connect directly via TCP (SSL) and with fixed Cloud Sql +# Proxy binary path AND with missing project_id + +os.environ['AIRFLOW_CONN_PUBLIC_MYSQL_TCP_SSL_NO_PROJECT_ID'] = \ + "gcpcloudsql://{user}:{password}@{public_ip}:{public_port}/{database}?" 
\ + "database_type=mysql&" \ + "location={location}&" \ + "instance={instance}&" \ + "use_proxy=False&" \ + "use_ssl=True&" \ + "sslcert={client_cert_file}&" \ + "sslkey={client_key_file}&" \ + "sslrootcert={server_ca_file}".format(**mysql_kwargs) + + +# [END howto_operator_cloudsql_query_connections] + +# [START howto_operator_cloudsql_query_operators] + +connection_names = [ + "proxy_postgres_tcp", + "proxy_postgres_socket", + "public_postgres_tcp", + "public_postgres_tcp_ssl", + "proxy_mysql_tcp", + "proxy_mysql_socket", + "public_mysql_tcp", + "public_mysql_tcp_ssl", + "public_mysql_tcp_ssl_no_project_id" +] + +tasks = [] + + +with models.DAG( + dag_id='example_gcp_sql_query', + default_args=default_args, + schedule_interval=None +) as dag: + prev_task = None + + for connection_name in connection_names: + task = CloudSqlQueryOperator( + gcp_cloudsql_conn_id=connection_name, + task_id="example_gcp_sql_task_" + connection_name, + sql=SQL + ) + tasks.append(task) + if prev_task: + prev_task >> task + prev_task = task + +# [END howto_operator_cloudsql_query_operators] diff --git a/airflow/contrib/example_dags/example_gcp_transfer.py b/airflow/contrib/example_dags/example_gcp_transfer.py new file mode 100644 index 0000000000000..a21bdbcc18a4d --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_transfer.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that demonstrates interactions with Google Cloud Transfer. + + +This DAG relies on the following OS environment variables + +* GCP_PROJECT_ID - Google Cloud Project to use for the Google Cloud Transfer Service. +* GCP_DESCRIPTION - Description of transfer job +* GCP_TRANSFER_SOURCE_AWS_BUCKET - Amazon Web Services Storage bucket from which files are copied. + .. warning:: + You need to provide a large enough set of data so that operations do not execute too quickly. + Otherwise, DAG will fail. +* GCP_TRANSFER_FIRST_TARGET_BUCKET - Google Cloud Storage bucket to which files are copied from AWS. 
+ It is also a source bucket in next step +* GCP_TRANSFER_SECOND_TARGET_BUCKET - Google Cloud Storage bucket bucket to which files are copied +* WAIT_FOR_OPERATION_POKE_INTERVAL - interval of what to check the status of the operation + A smaller value than the default value accelerates the system test and ensures its correct execution with + smaller quantities of files in the source bucket + Look at documentation of :class:`~airflow.operators.sensors.BaseSensorOperator` for more information + +""" +import os +from datetime import datetime, timedelta +from typing import Any, Dict + +from airflow import models +from airflow.contrib.hooks.gcp_transfer_hook import ( + GcpTransferOperationStatus, + GcpTransferJobsStatus, + TRANSFER_OPTIONS, + PROJECT_ID, + BUCKET_NAME, + GCS_DATA_SINK, + STATUS, + DESCRIPTION, + GCS_DATA_SOURCE, + START_TIME_OF_DAY, + SCHEDULE_END_DATE, + SCHEDULE_START_DATE, + SCHEDULE, + AWS_S3_DATA_SOURCE, + TRANSFER_SPEC, + FILTER_PROJECT_ID, + FILTER_JOB_NAMES, + TRANSFER_JOB, + TRANSFER_JOB_FIELD_MASK, + ALREADY_EXISTING_IN_SINK, +) +from airflow.contrib.operators.gcp_transfer_operator import ( + GcpTransferServiceJobCreateOperator, + GcpTransferServiceJobDeleteOperator, + GcpTransferServiceJobUpdateOperator, + GcpTransferServiceOperationsListOperator, + GcpTransferServiceOperationGetOperator, + GcpTransferServiceOperationPauseOperator, + GcpTransferServiceOperationResumeOperator, + GcpTransferServiceOperationCancelOperator, +) +from airflow.contrib.sensors.gcp_transfer_sensor import GCPTransferServiceWaitForJobStatusSensor +from airflow.utils.dates import days_ago + +# [START howto_operator_gcp_transfer_common_variables] +GCP_PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project') +GCP_DESCRIPTION = os.environ.get('GCP_DESCRIPTION', 'description') +GCP_TRANSFER_TARGET_BUCKET = os.environ.get('GCP_TRANSFER_TARGET_BUCKET') +WAIT_FOR_OPERATION_POKE_INTERVAL = int(os.environ.get('WAIT_FOR_OPERATION_POKE_INTERVAL', 5)) + +GCP_TRANSFER_SOURCE_AWS_BUCKET = os.environ.get('GCP_TRANSFER_SOURCE_AWS_BUCKET') +GCP_TRANSFER_FIRST_TARGET_BUCKET = os.environ.get( + 'GCP_TRANSFER_FIRST_TARGET_BUCKET', 'gcp-transfer-first-target' +) +GCP_TRANSFER_SECOND_TARGET_BUCKET = os.environ.get( + 'GCP_TRANSFER_SECOND_TARGET_BUCKET', 'gcp-transfer-second-target' +) +# [END howto_operator_gcp_transfer_common_variables] + +# [START howto_operator_gcp_transfer_create_job_body_aws] +aws_to_gcs_transfer_body = { + DESCRIPTION: GCP_DESCRIPTION, + STATUS: GcpTransferJobsStatus.ENABLED, + PROJECT_ID: GCP_PROJECT_ID, + SCHEDULE: { + SCHEDULE_START_DATE: datetime(2015, 1, 1).date(), + SCHEDULE_END_DATE: datetime(2030, 1, 1).date(), + START_TIME_OF_DAY: (datetime.utcnow() + timedelta(minutes=2)).time(), + }, + TRANSFER_SPEC: { + AWS_S3_DATA_SOURCE: {BUCKET_NAME: GCP_TRANSFER_SOURCE_AWS_BUCKET}, + GCS_DATA_SINK: {BUCKET_NAME: GCP_TRANSFER_FIRST_TARGET_BUCKET}, + TRANSFER_OPTIONS: {ALREADY_EXISTING_IN_SINK: True}, + }, +} +# [END howto_operator_gcp_transfer_create_job_body_aws] + +# [START howto_operator_gcp_transfer_create_job_body_gcp] +gcs_to_gcs_transfer_body = { + DESCRIPTION: GCP_DESCRIPTION, + STATUS: GcpTransferJobsStatus.ENABLED, + PROJECT_ID: GCP_PROJECT_ID, + SCHEDULE: { + SCHEDULE_START_DATE: datetime(2015, 1, 1).date(), + SCHEDULE_END_DATE: datetime(2030, 1, 1).date(), + START_TIME_OF_DAY: (datetime.utcnow() + timedelta(minutes=2)).time(), + }, + TRANSFER_SPEC: { + GCS_DATA_SOURCE: {BUCKET_NAME: GCP_TRANSFER_FIRST_TARGET_BUCKET}, + GCS_DATA_SINK: {BUCKET_NAME: 
GCP_TRANSFER_SECOND_TARGET_BUCKET}, + TRANSFER_OPTIONS: {ALREADY_EXISTING_IN_SINK: True}, + }, +} # type: Dict[str, Any] +# [END howto_operator_gcp_transfer_create_job_body_gcp] + +# [START howto_operator_gcp_transfer_update_job_body] +update_body = { + PROJECT_ID: GCP_PROJECT_ID, + TRANSFER_JOB: {DESCRIPTION: "{}_updated".format(GCP_DESCRIPTION)}, + TRANSFER_JOB_FIELD_MASK: "description", +} +# [END howto_operator_gcp_transfer_update_job_body] + +list_filter_dict = {FILTER_PROJECT_ID: GCP_PROJECT_ID, FILTER_JOB_NAMES: []} + +# [START howto_operator_gcp_transfer_default_args] +default_args = {'start_date': days_ago(1)} +# [END howto_operator_gcp_transfer_default_args] + +with models.DAG( + 'example_gcp_transfer', default_args=default_args, schedule_interval=None # Override to match your needs +) as dag: + + # [START howto_operator_gcp_transfer_create_job] + create_transfer_job_from_aws = GcpTransferServiceJobCreateOperator( + task_id="create_transfer_job_from_aws", body=aws_to_gcs_transfer_body + ) + # [END howto_operator_gcp_transfer_create_job] + + wait_for_operation_to_start = GCPTransferServiceWaitForJobStatusSensor( + task_id="wait_for_operation_to_start", + job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}", + project_id=GCP_PROJECT_ID, + expected_statuses={GcpTransferOperationStatus.IN_PROGRESS}, + poke_interval=WAIT_FOR_OPERATION_POKE_INTERVAL, + ) + + # [START howto_operator_gcp_transfer_pause_operation] + pause_operation = GcpTransferServiceOperationPauseOperator( + task_id="pause_operation", + operation_name="{{task_instance.xcom_pull('wait_for_operation_to_start', " + "key='sensed_operations')[0]['name']}}", + ) + # [END howto_operator_gcp_transfer_pause_operation] + + # [START howto_operator_gcp_transfer_update_job] + update_job = GcpTransferServiceJobUpdateOperator( + task_id="update_job", + job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}", + body=update_body, + ) + # [END howto_operator_gcp_transfer_update_job] + + # [START howto_operator_gcp_transfer_list_operations] + list_operations = GcpTransferServiceOperationsListOperator( + task_id="list_operations", + filter={ + FILTER_PROJECT_ID: GCP_PROJECT_ID, + FILTER_JOB_NAMES: ["{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}"], + }, + ) + # [END howto_operator_gcp_transfer_list_operations] + + # [START howto_operator_gcp_transfer_get_operation] + get_operation = GcpTransferServiceOperationGetOperator( + task_id="get_operation", operation_name="{{task_instance.xcom_pull('list_operations')[0]['name']}}" + ) + # [END howto_operator_gcp_transfer_get_operation] + + # [START howto_operator_gcp_transfer_resume_operation] + resume_operation = GcpTransferServiceOperationResumeOperator( + task_id="resume_operation", operation_name="{{task_instance.xcom_pull('get_operation')['name']}}" + ) + # [END howto_operator_gcp_transfer_resume_operation] + + # [START howto_operator_gcp_transfer_wait_operation] + wait_for_operation_to_end = GCPTransferServiceWaitForJobStatusSensor( + task_id="wait_for_operation_to_end", + job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}", + project_id=GCP_PROJECT_ID, + expected_statuses={GcpTransferOperationStatus.SUCCESS}, + poke_interval=WAIT_FOR_OPERATION_POKE_INTERVAL, + ) + # [END howto_operator_gcp_transfer_wait_operation] + + job_time = datetime.utcnow() + timedelta(minutes=2) + + gcs_to_gcs_transfer_body['schedule']['startTimeOfDay'] = (datetime.utcnow() + timedelta(minutes=2)).time() + + 
create_transfer_job_from_gcp = GcpTransferServiceJobCreateOperator( + task_id="create_transfer_job_from_gcp", body=gcs_to_gcs_transfer_body + ) + + wait_for_second_operation_to_start = GCPTransferServiceWaitForJobStatusSensor( + task_id="wait_for_second_operation_to_start", + job_name="{{ task_instance.xcom_pull('create_transfer_job_from_gcp')['name'] }}", + project_id=GCP_PROJECT_ID, + expected_statuses={GcpTransferOperationStatus.IN_PROGRESS}, + poke_interval=WAIT_FOR_OPERATION_POKE_INTERVAL, + ) + + # [START howto_operator_gcp_transfer_cancel_operation] + cancel_operation = GcpTransferServiceOperationCancelOperator( + task_id="cancel_operation", + operation_name="{{task_instance.xcom_pull(" + "'wait_for_second_operation_to_start', key='sensed_operations')[0]['name']}}", + ) + # [END howto_operator_gcp_transfer_cancel_operation] + + # [START howto_operator_gcp_transfer_delete_job] + delete_transfer_from_aws_job = GcpTransferServiceJobDeleteOperator( + task_id="delete_transfer_from_aws_job", + job_name="{{task_instance.xcom_pull('create_transfer_job_from_aws')['name']}}", + project_id=GCP_PROJECT_ID, + ) + # [END howto_operator_gcp_transfer_delete_job] + + delete_transfer_from_gcp_job = GcpTransferServiceJobDeleteOperator( + task_id="delete_transfer_from_gcp_job", + job_name="{{task_instance.xcom_pull('create_transfer_job_from_gcp')['name']}}", + project_id=GCP_PROJECT_ID, + ) + + create_transfer_job_from_aws >> wait_for_operation_to_start >> pause_operation >> \ + list_operations >> get_operation >> resume_operation >> wait_for_operation_to_end >> \ + create_transfer_job_from_gcp >> wait_for_second_operation_to_start >> cancel_operation >> \ + delete_transfer_from_aws_job >> delete_transfer_from_gcp_job diff --git a/airflow/contrib/example_dags/example_gcp_translate.py b/airflow/contrib/example_dags/example_gcp_translate.py new file mode 100644 index 0000000000000..bc7030cf1a084 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_translate.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that translates text in Google Cloud Translate +service in the Google Cloud Platform. 
+ +""" +import airflow +from airflow import models + +from airflow.contrib.operators.gcp_translate_operator import CloudTranslateTextOperator +from airflow.operators.bash_operator import BashOperator + +default_args = {'start_date': airflow.utils.dates.days_ago(1)} + +with models.DAG( + 'example_gcp_translate', default_args=default_args, schedule_interval=None # Override to match your needs +) as dag: + # [START howto_operator_translate_text] + product_set_create = CloudTranslateTextOperator( + task_id='translate', + values=['zażółć gęślą jaźń'], + target_language='en', + format_='text', + source_language=None, + model='base', + ) + # [END howto_operator_translate_text] + # [START howto_operator_translate_access] + translation_access = BashOperator( + task_id='access', + bash_command="echo '{{ task_instance.xcom_pull(\"translate\")[0] }}'" + ) + product_set_create >> translation_access + # [END howto_operator_translate_access] diff --git a/airflow/contrib/example_dags/example_gcp_vision.py b/airflow/contrib/example_dags/example_gcp_vision.py new file mode 100644 index 0000000000000..2a1facffcef78 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcp_vision.py @@ -0,0 +1,415 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that creates, gets, updates and deletes Products and Product Sets in the Google Cloud +Vision service in the Google Cloud Platform. + +This DAG relies on the following OS environment variables + +* GCP_VISION_LOCATION - Zone where the instance exists. +* GCP_VISION_PRODUCT_SET_ID - Product Set ID. +* GCP_VISION_PRODUCT_ID - Product ID. +* GCP_VISION_REFERENCE_IMAGE_ID - Reference Image ID. +* GCP_VISION_REFERENCE_IMAGE_URL - A link to the bucket that contains the reference image. +* GCP_VISION_ANNOTATE_IMAGE_URL - A link to the bucket that contains the file to be annotated. 
+ +""" +import os + +# [START howto_operator_vision_retry_import] +from google.api_core.retry import Retry + +# [END howto_operator_vision_retry_import] +# [START howto_operator_vision_product_set_import] +from google.cloud.vision_v1.types import ProductSet +# [END howto_operator_vision_product_set_import] +# [START howto_operator_vision_product_import] +from google.cloud.vision_v1.types import Product +# [END howto_operator_vision_product_import] +# [START howto_operator_vision_reference_image_import] +from google.cloud.vision_v1.types import ReferenceImage +# [END howto_operator_vision_reference_image_import] +# [START howto_operator_vision_enums_import] +from google.cloud.vision import enums +# [END howto_operator_vision_enums_import] + +import airflow +from airflow import models +from airflow.operators.bash_operator import BashOperator + +from airflow.contrib.operators.gcp_vision_operator import ( + CloudVisionProductSetCreateOperator, + CloudVisionProductSetGetOperator, + CloudVisionProductSetUpdateOperator, + CloudVisionProductSetDeleteOperator, + CloudVisionProductCreateOperator, + CloudVisionProductGetOperator, + CloudVisionProductUpdateOperator, + CloudVisionProductDeleteOperator, + CloudVisionReferenceImageCreateOperator, + CloudVisionAddProductToProductSetOperator, + CloudVisionRemoveProductFromProductSetOperator, + CloudVisionAnnotateImageOperator, +) + +default_args = {'start_date': airflow.utils.dates.days_ago(1)} + +# [START howto_operator_vision_args_common] +GCP_VISION_LOCATION = os.environ.get('GCP_VISION_LOCATION', 'europe-west1') +# [END howto_operator_vision_args_common] + +# [START howto_operator_vision_product_set_explicit_id] +GCP_VISION_PRODUCT_SET_ID = os.environ.get('GCP_VISION_PRODUCT_SET_ID', 'product_set_explicit_id') +# [END howto_operator_vision_product_set_explicit_id] + +# [START howto_operator_vision_product_explicit_id] +GCP_VISION_PRODUCT_ID = os.environ.get('GCP_VISION_PRODUCT_ID', 'product_explicit_id') +# [END howto_operator_vision_product_explicit_id] + +# [START howto_operator_vision_reference_image_args] +GCP_VISION_REFERENCE_IMAGE_ID = os.environ.get('GCP_VISION_REFERENCE_IMAGE_ID', 'reference_image_explicit_id') +GCP_VISION_REFERENCE_IMAGE_URL = os.environ.get('GCP_VISION_REFERENCE_IMAGE_URL', 'gs://bucket/image1.jpg') +# [END howto_operator_vision_reference_image_args] + +# [START howto_operator_vision_annotate_image_url] +GCP_VISION_ANNOTATE_IMAGE_URL = os.environ.get('GCP_VISION_ANNOTATE_IMAGE_URL', 'gs://bucket/image2.jpg') +# [END howto_operator_vision_annotate_image_url] + +# [START howto_operator_vision_product_set] +product_set = ProductSet(display_name='My Product Set') +# [END howto_operator_vision_product_set] + +# [START howto_operator_vision_product] +product = Product(display_name='My Product 1', product_category='toys') +# [END howto_operator_vision_product] + +# [START howto_operator_vision_reference_image] +reference_image = ReferenceImage(uri=GCP_VISION_REFERENCE_IMAGE_URL) +# [END howto_operator_vision_reference_image] + +# [START howto_operator_vision_annotate_image_request] +annotate_image_request = { + 'image': {'source': {'image_uri': GCP_VISION_ANNOTATE_IMAGE_URL}}, + 'features': [{'type': enums.Feature.Type.LOGO_DETECTION}], +} +# [END howto_operator_vision_annotate_image_request] + +with models.DAG( + 'example_gcp_vision_autogenerated_id', default_args=default_args, schedule_interval=None +) as dag_autogenerated_id: + # ################################## # + # ### Autogenerated IDs examples ### # + # 
################################## # + + # [START howto_operator_vision_product_set_create] + product_set_create = CloudVisionProductSetCreateOperator( + location=GCP_VISION_LOCATION, + product_set=product_set, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_set_create', + ) + # [END howto_operator_vision_product_set_create] + + # [START howto_operator_vision_product_set_get] + product_set_get = CloudVisionProductSetGetOperator( + location=GCP_VISION_LOCATION, + product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", + task_id='product_set_get', + ) + # [END howto_operator_vision_product_set_get] + + # [START howto_operator_vision_product_set_update] + product_set_update = CloudVisionProductSetUpdateOperator( + location=GCP_VISION_LOCATION, + product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", + product_set=ProductSet(display_name='My Product Set 2'), + task_id='product_set_update', + ) + # [END howto_operator_vision_product_set_update] + + # [START howto_operator_vision_product_set_delete] + product_set_delete = CloudVisionProductSetDeleteOperator( + location=GCP_VISION_LOCATION, + product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", + task_id='product_set_delete', + ) + # [END howto_operator_vision_product_set_delete] + + # [START howto_operator_vision_product_create] + product_create = CloudVisionProductCreateOperator( + location=GCP_VISION_LOCATION, + product=product, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_create', + ) + # [END howto_operator_vision_product_create] + + # [START howto_operator_vision_product_get] + product_get = CloudVisionProductGetOperator( + location=GCP_VISION_LOCATION, + product_id="{{ task_instance.xcom_pull('product_create') }}", + task_id='product_get', + ) + # [END howto_operator_vision_product_get] + + # [START howto_operator_vision_product_update] + product_update = CloudVisionProductUpdateOperator( + location=GCP_VISION_LOCATION, + product_id="{{ task_instance.xcom_pull('product_create') }}", + product=Product(display_name='My Product 2', description='My updated description'), + task_id='product_update', + ) + # [END howto_operator_vision_product_update] + + # [START howto_operator_vision_product_delete] + product_delete = CloudVisionProductDeleteOperator( + location=GCP_VISION_LOCATION, + product_id="{{ task_instance.xcom_pull('product_create') }}", + task_id='product_delete', + ) + # [END howto_operator_vision_product_delete] + + # [START howto_operator_vision_reference_image_create] + reference_image_create = CloudVisionReferenceImageCreateOperator( + location=GCP_VISION_LOCATION, + reference_image=reference_image, + product_id="{{ task_instance.xcom_pull('product_create') }}", + reference_image_id=GCP_VISION_REFERENCE_IMAGE_ID, + retry=Retry(maximum=10.0), + timeout=5, + task_id='reference_image_create', + ) + # [END howto_operator_vision_reference_image_create] + + # [START howto_operator_vision_add_product_to_product_set] + add_product_to_product_set = CloudVisionAddProductToProductSetOperator( + location=GCP_VISION_LOCATION, + product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", + product_id="{{ task_instance.xcom_pull('product_create') }}", + retry=Retry(maximum=10.0), + timeout=5, + task_id='add_product_to_product_set', + ) + # [END howto_operator_vision_add_product_to_product_set] + + # [START howto_operator_vision_remove_product_from_product_set] + remove_product_from_product_set = CloudVisionRemoveProductFromProductSetOperator( + 
location=GCP_VISION_LOCATION, + product_set_id="{{ task_instance.xcom_pull('product_set_create') }}", + product_id="{{ task_instance.xcom_pull('product_create') }}", + retry=Retry(maximum=10.0), + timeout=5, + task_id='remove_product_from_product_set', + ) + # [END howto_operator_vision_remove_product_from_product_set] + + # Product path + product_create >> product_get >> product_update >> product_delete + + # ProductSet path + product_set_create >> product_set_get >> product_set_update >> product_set_delete + + # ReferenceImage path + product_create >> reference_image_create >> product_delete + + # Product/ProductSet path + product_create >> add_product_to_product_set + product_set_create >> add_product_to_product_set + add_product_to_product_set >> remove_product_from_product_set + remove_product_from_product_set >> product_delete + remove_product_from_product_set >> product_set_delete + +with models.DAG( + 'example_gcp_vision_explicit_id', default_args=default_args, schedule_interval=None +) as dag_explicit_id: + # ############################# # + # ### Explicit IDs examples ### # + # ############################# # + + # [START howto_operator_vision_product_set_create_2] + product_set_create_2 = CloudVisionProductSetCreateOperator( + product_set_id=GCP_VISION_PRODUCT_SET_ID, + location=GCP_VISION_LOCATION, + product_set=product_set, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_set_create_2', + ) + # [END howto_operator_vision_product_set_create_2] + + # Second 'create' task with the same product_set_id to demonstrate idempotence + product_set_create_2_idempotence = CloudVisionProductSetCreateOperator( + product_set_id=GCP_VISION_PRODUCT_SET_ID, + location=GCP_VISION_LOCATION, + product_set=product_set, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_set_create_2_idempotence', + ) + + # [START howto_operator_vision_product_set_get_2] + product_set_get_2 = CloudVisionProductSetGetOperator( + location=GCP_VISION_LOCATION, product_set_id=GCP_VISION_PRODUCT_SET_ID, task_id='product_set_get_2' + ) + # [END howto_operator_vision_product_set_get_2] + + # [START howto_operator_vision_product_set_update_2] + product_set_update_2 = CloudVisionProductSetUpdateOperator( + location=GCP_VISION_LOCATION, + product_set_id=GCP_VISION_PRODUCT_SET_ID, + product_set=ProductSet(display_name='My Product Set 2'), + task_id='product_set_update_2', + ) + # [END howto_operator_vision_product_set_update_2] + + # [START howto_operator_vision_product_set_delete_2] + product_set_delete_2 = CloudVisionProductSetDeleteOperator( + location=GCP_VISION_LOCATION, product_set_id=GCP_VISION_PRODUCT_SET_ID, task_id='product_set_delete_2' + ) + # [END howto_operator_vision_product_set_delete_2] + + # [START howto_operator_vision_product_create_2] + product_create_2 = CloudVisionProductCreateOperator( + product_id=GCP_VISION_PRODUCT_ID, + location=GCP_VISION_LOCATION, + product=product, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_create_2', + ) + # [END howto_operator_vision_product_create_2] + + # Second 'create' task with the same product_id to demonstrate idempotence + product_create_2_idempotence = CloudVisionProductCreateOperator( + product_id=GCP_VISION_PRODUCT_ID, + location=GCP_VISION_LOCATION, + product=product, + retry=Retry(maximum=10.0), + timeout=5, + task_id='product_create_2_idempotence', + ) + + # [START howto_operator_vision_product_get_2] + product_get_2 = CloudVisionProductGetOperator( + location=GCP_VISION_LOCATION, product_id=GCP_VISION_PRODUCT_ID, 
task_id='product_get_2' + ) + # [END howto_operator_vision_product_get_2] + + # [START howto_operator_vision_product_update_2] + product_update_2 = CloudVisionProductUpdateOperator( + location=GCP_VISION_LOCATION, + product_id=GCP_VISION_PRODUCT_ID, + product=Product(display_name='My Product 2', description='My updated description'), + task_id='product_update_2', + ) + # [END howto_operator_vision_product_update_2] + + # [START howto_operator_vision_product_delete_2] + product_delete_2 = CloudVisionProductDeleteOperator( + location=GCP_VISION_LOCATION, product_id=GCP_VISION_PRODUCT_ID, task_id='product_delete_2' + ) + # [END howto_operator_vision_product_delete_2] + + # [START howto_operator_vision_reference_image_create_2] + reference_image_create_2 = CloudVisionReferenceImageCreateOperator( + location=GCP_VISION_LOCATION, + reference_image=reference_image, + product_id=GCP_VISION_PRODUCT_ID, + reference_image_id=GCP_VISION_REFERENCE_IMAGE_ID, + retry=Retry(maximum=10.0), + timeout=5, + task_id='reference_image_create_2', + ) + # [END howto_operator_vision_reference_image_create_2] + + # Second 'create' task with the same product_id to demonstrate idempotence + reference_image_create_2_idempotence = CloudVisionReferenceImageCreateOperator( + location=GCP_VISION_LOCATION, + reference_image=reference_image, + product_id=GCP_VISION_PRODUCT_ID, + reference_image_id=GCP_VISION_REFERENCE_IMAGE_ID, + retry=Retry(maximum=10.0), + timeout=5, + task_id='reference_image_create_2_idempotence', + ) + + # [START howto_operator_vision_add_product_to_product_set_2] + add_product_to_product_set_2 = CloudVisionAddProductToProductSetOperator( + location=GCP_VISION_LOCATION, + product_set_id=GCP_VISION_PRODUCT_SET_ID, + product_id=GCP_VISION_PRODUCT_ID, + retry=Retry(maximum=10.0), + timeout=5, + task_id='add_product_to_product_set_2', + ) + # [END howto_operator_vision_add_product_to_product_set_2] + + # [START howto_operator_vision_remove_product_from_product_set_2] + remove_product_from_product_set_2 = CloudVisionRemoveProductFromProductSetOperator( + location=GCP_VISION_LOCATION, + product_set_id=GCP_VISION_PRODUCT_SET_ID, + product_id=GCP_VISION_PRODUCT_ID, + retry=Retry(maximum=10.0), + timeout=5, + task_id='remove_product_from_product_set_2', + ) + # [END howto_operator_vision_remove_product_from_product_set_2] + + # Product path + product_create_2 >> product_create_2_idempotence >> product_get_2 >> product_update_2 >> product_delete_2 + + # ProductSet path + product_set_create_2 >> product_set_get_2 >> product_set_update_2 >> product_set_delete_2 + product_set_create_2 >> product_set_create_2_idempotence >> product_set_delete_2 + + # ReferenceImage path + product_create_2 >> reference_image_create_2 >> reference_image_create_2_idempotence >> product_delete_2 + + # Product/ProductSet path + add_product_to_product_set_2 >> remove_product_from_product_set_2 + product_set_create_2 >> add_product_to_product_set_2 + product_create_2 >> add_product_to_product_set_2 + remove_product_from_product_set_2 >> product_set_delete_2 + remove_product_from_product_set_2 >> product_delete_2 + +with models.DAG( + 'example_gcp_vision_annotate_image', default_args=default_args, schedule_interval=None +) as dag_annotate_image: + # ############################## # + # ### Annotate image example ### # + # ############################## # + + # [START howto_operator_vision_annotate_image] + annotate_image = CloudVisionAnnotateImageOperator( + request=annotate_image_request, retry=Retry(maximum=10.0), timeout=5, 
task_id='annotate_image' + ) + # [END howto_operator_vision_annotate_image] + + # [START howto_operator_vision_annotate_image_result] + annotate_image_result = BashOperator( + bash_command="echo {{ task_instance.xcom_pull('annotate_image')" + "['logoAnnotations'][0]['description'] }}", + task_id='annotate_image_result', + ) + # [END howto_operator_vision_annotate_image_result] + + annotate_image >> annotate_image_result diff --git a/airflow/contrib/example_dags/example_gcs_acl.py b/airflow/contrib/example_dags/example_gcs_acl.py new file mode 100644 index 0000000000000..7247199a4f656 --- /dev/null +++ b/airflow/contrib/example_dags/example_gcs_acl.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Example Airflow DAG that creates a new ACL entry on the specified bucket and object. + +This DAG relies on the following OS environment variables + +* GCS_ACL_BUCKET - Name of a bucket. +* GCS_ACL_OBJECT - Name of the object. For information about how to URL encode object + names to be path safe, see: + https://cloud.google.com/storage/docs/json_api/#encoding +* GCS_ACL_ENTITY - The entity holding the permission. +* GCS_ACL_BUCKET_ROLE - The access permission for the entity for the bucket. +* GCS_ACL_OBJECT_ROLE - The access permission for the entity for the object. 
+""" +import os + +import airflow +from airflow import models +from airflow.contrib.operators.gcs_acl_operator import \ + GoogleCloudStorageBucketCreateAclEntryOperator, \ + GoogleCloudStorageObjectCreateAclEntryOperator + +# [START howto_operator_gcs_acl_args_common] +GCS_ACL_BUCKET = os.environ.get('GCS_ACL_BUCKET', 'example-bucket') +GCS_ACL_OBJECT = os.environ.get('GCS_ACL_OBJECT', 'example-object') +GCS_ACL_ENTITY = os.environ.get('GCS_ACL_ENTITY', 'example-entity') +GCS_ACL_BUCKET_ROLE = os.environ.get('GCS_ACL_BUCKET_ROLE', 'example-bucket-role') +GCS_ACL_OBJECT_ROLE = os.environ.get('GCS_ACL_OBJECT_ROLE', 'example-object-role') +# [END howto_operator_gcs_acl_args_common] + +default_args = { + 'start_date': airflow.utils.dates.days_ago(1) +} + +with models.DAG( + 'example_gcs_acl', + default_args=default_args, + schedule_interval=None # Change to match your use case +) as dag: + # [START howto_operator_gcs_bucket_create_acl_entry_task] + gcs_bucket_create_acl_entry_task = GoogleCloudStorageBucketCreateAclEntryOperator( + bucket=GCS_ACL_BUCKET, + entity=GCS_ACL_ENTITY, + role=GCS_ACL_BUCKET_ROLE, + task_id="gcs_bucket_create_acl_entry_task" + ) + # [END howto_operator_gcs_bucket_create_acl_entry_task] + # [START howto_operator_gcs_object_create_acl_entry_task] + gcs_object_create_acl_entry_task = GoogleCloudStorageObjectCreateAclEntryOperator( + bucket=GCS_ACL_BUCKET, + object_name=GCS_ACL_OBJECT, + entity=GCS_ACL_ENTITY, + role=GCS_ACL_OBJECT_ROLE, + task_id="gcs_object_create_acl_entry_task" + ) + # [END howto_operator_gcs_object_create_acl_entry_task] + + gcs_bucket_create_acl_entry_task >> gcs_object_create_acl_entry_task diff --git a/airflow/contrib/example_dags/example_gcs_to_bq_operator.py b/airflow/contrib/example_dags/example_gcs_to_bq_operator.py new file mode 100644 index 0000000000000..b76d05eec138e --- /dev/null +++ b/airflow/contrib/example_dags/example_gcs_to_bq_operator.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
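Like the other example DAGs in this change, the GCS ACL example reads every identifier from an OS environment variable with a harmless placeholder default, so the file always parses and real values are supplied at deploy time. A small sketch of that pattern; the variable names are reused from above, while the check itself is purely illustrative:

```python
import os

# Placeholder defaults keep the DAG importable even without any configuration.
GCS_ACL_BUCKET = os.environ.get('GCS_ACL_BUCKET', 'example-bucket')
GCS_ACL_ENTITY = os.environ.get('GCS_ACL_ENTITY', 'example-entity')

if GCS_ACL_BUCKET == 'example-bucket':
    print('Placeholder bucket in use; export GCS_ACL_BUCKET before triggering the DAG.')
else:
    print('ACL entry will be created on bucket %s for entity %s' % (GCS_ACL_BUCKET, GCS_ACL_ENTITY))
```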
+ +from typing import Any + +import airflow +from airflow import models +from airflow.operators import bash_operator + +gcs_to_bq = None # type: Any +try: + from airflow.contrib.operators import gcs_to_bq +except ImportError: + pass + + +if gcs_to_bq is not None: + args = { + 'owner': 'airflow', + 'start_date': airflow.utils.dates.days_ago(2) + } + + dag = models.DAG( + dag_id='example_gcs_to_bq_operator', default_args=args, + schedule_interval=None) + + create_test_dataset = bash_operator.BashOperator( + task_id='create_airflow_test_dataset', + bash_command='bq mk airflow_test', + dag=dag) + + # [START howto_operator_gcs_to_bq] + load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator( + task_id='gcs_to_bq_example', + bucket='cloud-samples-data', + source_objects=['bigquery/us-states/us-states.csv'], + destination_project_dataset_table='airflow_test.gcs_to_bq_table', + schema_fields=[ + {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'}, + {'name': 'post_abbr', 'type': 'STRING', 'mode': 'NULLABLE'}, + ], + write_disposition='WRITE_TRUNCATE', + dag=dag) + # [END howto_operator_gcs_to_bq] + + delete_test_dataset = bash_operator.BashOperator( + task_id='delete_airflow_test_dataset', + bash_command='bq rm -rf airflow_test', + dag=dag) + + create_test_dataset >> load_csv >> delete_test_dataset diff --git a/airflow/contrib/example_dags/example_kubernetes_annotation.py b/airflow/contrib/example_dags/example_kubernetes_annotation.py new file mode 100644 index 0000000000000..058baf69901f7 --- /dev/null +++ b/airflow/contrib/example_dags/example_kubernetes_annotation.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import print_function +import airflow +from airflow.operators.python_operator import PythonOperator +from airflow.models import DAG + +args = { + 'owner': 'airflow', + 'start_date': airflow.utils.dates.days_ago(2) +} + +dag = DAG( + dag_id='example_kubernetes_annotation', default_args=args, + schedule_interval=None +) + + +def print_stuff(): + print("annotated!") + + +# You can use annotations on your kubernetes pods! 
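The gcs_to_bq example above guards an optional contrib import so that a bare Airflow install can still parse the file. A minimal sketch of that guard pattern, using a stand-in standard-library module rather than the real gcs_to_bq import:

```python
from typing import Any

optional_mod = None  # type: Any
try:
    import csv as optional_mod  # stand-in for an optional extra such as airflow.contrib.operators.gcs_to_bq
except ImportError:
    pass

if optional_mod is not None:
    # Only define the DAG and its operators when the optional dependency is importable.
    print('optional dependency available:', optional_mod.__name__)
```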
+start_task = PythonOperator( + task_id="start_task", python_callable=print_stuff, dag=dag, + executor_config={ + "KubernetesExecutor": { + "annotations": {"test": "annotation"} + } + } +) diff --git a/airflow/example_dags/example_kubernetes_executor.py b/airflow/contrib/example_dags/example_kubernetes_executor.py similarity index 69% rename from airflow/example_dags/example_kubernetes_executor.py rename to airflow/contrib/example_dags/example_kubernetes_executor.py index 1d9bb7304318b..d03e255ab3287 100644 --- a/airflow/example_dags/example_kubernetes_executor.py +++ b/airflow/contrib/example_dags/example_kubernetes_executor.py @@ -32,6 +32,31 @@ schedule_interval=None ) +affinity = { + 'podAntiAffinity': { + 'requiredDuringSchedulingIgnoredDuringExecution': [ + { + 'topologyKey': 'kubernetes.io/hostname', + 'labelSelector': { + 'matchExpressions': [ + { + 'key': 'app', + 'operator': 'In', + 'values': ['airflow'] + } + ] + } + } + ] + } +} + +tolerations = [{ + 'key': 'dedicated', + 'operator': 'Equal', + 'value': 'airflow' +}] + def print_stuff(): print("stuff!") @@ -59,11 +84,14 @@ def use_zip_binary(): executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}} ) -# Limit resources on this operator/task +# Limit resources on this operator/task with node affinity & tolerations three_task = PythonOperator( task_id="three_task", python_callable=print_stuff, dag=dag, executor_config={ - "KubernetesExecutor": {"request_memory": "128Mi", "limit_memory": "128Mi"}} + "KubernetesExecutor": {"request_memory": "128Mi", + "limit_memory": "128Mi", + "tolerations": tolerations, + "affinity": affinity}} ) start_task.set_downstream([one_task, two_task, three_task]) diff --git a/airflow/contrib/example_dags/example_kubernetes_executor_config.py b/airflow/contrib/example_dags/example_kubernetes_executor_config.py new file mode 100644 index 0000000000000..1316b21803921 --- /dev/null +++ b/airflow/contrib/example_dags/example_kubernetes_executor_config.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from __future__ import print_function +import airflow +from airflow.operators.python_operator import PythonOperator +from libs.helper import print_stuff +from airflow.models import DAG +import os + +args = { + 'owner': 'airflow', + 'start_date': airflow.utils.dates.days_ago(2) +} + +dag = DAG( + dag_id='example_kubernetes_executor_config', default_args=args, + schedule_interval=None +) + + +def test_volume_mount(): + with open('/foo/volume_mount_test.txt', 'w') as foo: + foo.write('Hello') + + rc = os.system("cat /foo/volume_mount_test.txt") + assert rc == 0 + + +# You can use annotations on your kubernetes pods! 
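The `affinity` and `tolerations` structures shown above are raw Kubernetes API fields. They can be attached per task through `executor_config` (as in `three_task`), or configured globally in `airflow.cfg`, where the executor parses them back from JSON (see the `json.loads` calls added to `KubeConfig` later in this change). A short sketch of that round trip; the config value is purely illustrative:

```python
import json

tolerations = [{'key': 'dedicated', 'operator': 'Equal', 'value': 'airflow'}]

# What would go into airflow.cfg under [kubernetes] as `tolerations = ...`
cfg_value = json.dumps(tolerations)

# What KubeConfig does with the option at executor start-up
parsed = json.loads(cfg_value)
assert parsed == tolerations
print(cfg_value)
```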
+start_task = PythonOperator( + task_id="start_task", python_callable=print_stuff, dag=dag, + executor_config={ + "KubernetesExecutor": { + "annotations": {"test": "annotation"} + } + } +) + +# You can mount volume or secret to the worker pod +second_task = PythonOperator( + task_id="four_task", python_callable=test_volume_mount, dag=dag, + executor_config={ + "KubernetesExecutor": { + "volumes": [ + { + "name": "example-kubernetes-test-volume", + "hostPath": {"path": "/tmp/"}, + }, + ], + "volume_mounts": [ + { + "mountPath": "/foo/", + "name": "example-kubernetes-test-volume", + }, + ] + } + } +) + +# Test that we can run tasks as a normal user +third_task = PythonOperator( + task_id="non_root_task", python_callable=print_stuff, dag=dag, + executor_config={ + "KubernetesExecutor": { + "securityContext": { + "runAsUser": 1000 + } + } + } +) + +start_task.set_downstream(second_task) +second_task.set_downstream(third_task) diff --git a/airflow/example_dags/example_kubernetes_operator.py b/airflow/contrib/example_dags/example_kubernetes_operator.py similarity index 84% rename from airflow/example_dags/example_kubernetes_operator.py rename to airflow/contrib/example_dags/example_kubernetes_operator.py index 92d73c5d3369d..4977890bc68f3 100644 --- a/airflow/example_dags/example_kubernetes_operator.py +++ b/airflow/contrib/example_dags/example_kubernetes_operator.py @@ -25,7 +25,7 @@ try: # Kubernetes is optional, so not available in vanilla Airflow - # pip install airflow[kubernetes] + # pip install 'apache-airflow[kubernetes]' from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator args = { @@ -38,6 +38,14 @@ default_args=args, schedule_interval=None) + tolerations = [ + { + 'key': "key", + 'operator': 'Equal', + 'value': 'value' + } + ] + k = KubernetesPodOperator( namespace='default', image="ubuntu:16.04", @@ -48,9 +56,12 @@ in_cluster=False, task_id="task", get_logs=True, - dag=dag) + dag=dag, + is_delete_operator_pod=False, + tolerations=tolerations + ) except ImportError as e: log.warn("Could not import KubernetesPodOperator: " + str(e)) log.warn("Install kubernetes dependencies with: " - " pip install airflow['kubernetes']") + " pip install 'apache-airflow[kubernetes]'") diff --git a/airflow/contrib/example_dags/example_qubole_operator.py b/airflow/contrib/example_dags/example_qubole_operator.py index 826a50af99cd9..5f77d09ba1442 100644 --- a/airflow/contrib/example_dags/example_qubole_operator.py +++ b/airflow/contrib/example_dags/example_qubole_operator.py @@ -65,7 +65,7 @@ def compare_result(ds, **kwargs): fetch_logs=True, # If `fetch_logs`=true, will fetch qubole command logs and concatenate # them into corresponding airflow task logs - tags='aiflow_example_run', + tags='airflow_example_run', # To attach tags to qubole command, auto attach 3 tags - dag_id, task_id, run_id qubole_conn_id='qubole_default', # Connection id to submit commands inside QDS, if not set "qubole_default" is used @@ -220,7 +220,7 @@ def main(args: Array[String]) { program=prog, language='scala', arguments='--class SparkPi', - tags='aiflow_example_run', + tags='airflow_example_run', dag=dag) t11.set_upstream(branching) diff --git a/airflow/contrib/example_dags/example_twitter_README.md b/airflow/contrib/example_dags/example_twitter_README.md index 319eac39f6953..7563b816656b6 100644 --- a/airflow/contrib/example_dags/example_twitter_README.md +++ b/airflow/contrib/example_dags/example_twitter_README.md @@ -1,3 +1,22 @@ + + # Example Twitter DAG ***Introduction:*** This example dag 
depicts a typical ETL process and is a perfect use case automation scenario for Airflow. Please note that the main scripts associated with the tasks are returning None. The purpose of this DAG is to demonstrate how to write a functional DAG within Airflow. @@ -31,6 +50,6 @@ CREATE TABLE toTwitter_A(id BIGINT, id_str STRING ``` When you review the code for the DAG, you will notice that these tasks are generated using for loop. These two for loops could be combined into one loop. However, in most cases, you will be running different analysis on your incoming incoming and outgoing tweets, and hence they are kept separated in this example. Final step is a running the broker script, brokerapi.py, which will run queries in Hive and store the summarized data to MySQL in our case. To connect to Hive, pyhs2 library is extremely useful and easy to use. To insert data into MySQL from Python, sqlalchemy is also a good one to use. -I hope you find this tutorial useful. If you have question feel free to ask me on [Twitter](https://twitter.com/EkhtiarSyed) or via the live Airflow chatroom room in [Gitter](https://gitter.im/airbnb/airflow).

+I hope you find this tutorial useful. If you have questions, feel free to ask me on [Twitter](https://twitter.com/EkhtiarSyed) or via the live Airflow chat room in [Gitter](https://gitter.im/apache/airflow).
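The README's final step loads the Hive summary into MySQL and recommends sqlalchemy for the insert. A hedged sketch of what that step could look like; the connection string, table, and column names are illustrative and not taken from the example DAG:

```python
from sqlalchemy import create_engine, text

# Assumed DSN and schema, for illustration only.
engine = create_engine('mysql+pymysql://user:password@localhost/twitter_summary')
rows = [{'channel': 'toTwitter_A', 'tweet_count': 42}]

with engine.begin() as conn:
    # Parameterized executemany-style insert of the summarized rows.
    conn.execute(
        text('INSERT INTO daily_summary (channel, tweet_count) VALUES (:channel, :tweet_count)'),
        rows,
    )
```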

-Ekhtiar Syed Last Update: 8-April-2016 diff --git a/airflow/contrib/example_dags/example_winrm_operator.py b/airflow/contrib/example_dags/example_winrm_operator.py index 195bf5d98d03c..83e1844ebb042 100644 --- a/airflow/contrib/example_dags/example_winrm_operator.py +++ b/airflow/contrib/example_dags/example_winrm_operator.py @@ -31,7 +31,7 @@ from airflow.models import DAG from datetime import timedelta -from airflow.contrib.hooks import WinRMHook +from airflow.contrib.hooks.winrm_hook import WinRMHook from airflow.contrib.operators.winrm_operator import WinRMOperator diff --git a/init.sh b/airflow/contrib/example_dags/libs/__init__.py similarity index 92% rename from init.sh rename to airflow/contrib/example_dags/libs/__init__.py index 6f4adcad06752..114d189da14ab 100644 --- a/init.sh +++ b/airflow/contrib/example_dags/libs/__init__.py @@ -1,5 +1,4 @@ -#!/usr/bin/env bash - +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -8,14 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -source $AIRFLOW_HOME/env/bin/activate diff --git a/run_tox.sh b/airflow/contrib/example_dags/libs/helper.py old mode 100755 new mode 100644 similarity index 91% rename from run_tox.sh rename to airflow/contrib/example_dags/libs/helper.py index b4f204d649a32..d7b62e65c7c88 --- a/run_tox.sh +++ b/airflow/contrib/example_dags/libs/helper.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -6,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -16,6 +17,6 @@ # specific language governing permissions and limitations # under the License. -set -o verbose -python setup.py test --tox-args="-v -e $TOX_ENV" +def print_stuff(): + print("annotated!") diff --git a/airflow/contrib/executors/kubernetes_executor.py b/airflow/contrib/executors/kubernetes_executor.py index 65053bd56b2a2..ebd22076e078c 100644 --- a/airflow/contrib/executors/kubernetes_executor.py +++ b/airflow/contrib/executors/kubernetes_executor.py @@ -16,6 +16,9 @@ # under the License. 
import base64 +import hashlib +import re +import json import multiprocessing from queue import Queue from dateutil import parser @@ -29,8 +32,10 @@ from airflow.contrib.kubernetes.worker_configuration import WorkerConfiguration from airflow.executors.base_executor import BaseExecutor from airflow.executors import Executors -from airflow.models import TaskInstance, KubeResourceVersion, KubeWorkerIdentifier +from airflow.models import TaskInstance +from airflow.models.kubernetes import KubeResourceVersion, KubeWorkerIdentifier from airflow.utils.state import State +from airflow.utils.db import provide_session, create_session from airflow import configuration, settings from airflow.exceptions import AirflowConfigException, AirflowException from airflow.utils.log.logging_mixin import LoggingMixin @@ -39,7 +44,8 @@ class KubernetesExecutorConfig: def __init__(self, image=None, image_pull_policy=None, request_memory=None, request_cpu=None, limit_memory=None, limit_cpu=None, - gcp_service_account_key=None): + gcp_service_account_key=None, node_selectors=None, affinity=None, + annotations=None, volumes=None, volume_mounts=None, tolerations=None): self.image = image self.image_pull_policy = image_pull_policy self.request_memory = request_memory @@ -47,13 +53,23 @@ def __init__(self, image=None, image_pull_policy=None, request_memory=None, self.limit_memory = limit_memory self.limit_cpu = limit_cpu self.gcp_service_account_key = gcp_service_account_key + self.node_selectors = node_selectors + self.affinity = affinity + self.annotations = annotations + self.volumes = volumes + self.volume_mounts = volume_mounts + self.tolerations = tolerations def __repr__(self): return "{}(image={}, image_pull_policy={}, request_memory={}, request_cpu={}, " \ - "limit_memory={}, limit_cpu={}, gcp_service_account_key={})" \ + "limit_memory={}, limit_cpu={}, gcp_service_account_key={}, " \ + "node_selectors={}, affinity={}, annotations={}, volumes={}, " \ + "volume_mounts={}, tolerations={})" \ .format(KubernetesExecutorConfig.__name__, self.image, self.image_pull_policy, self.request_memory, self.request_cpu, self.limit_memory, - self.limit_cpu, self.gcp_service_account_key) + self.limit_cpu, self.gcp_service_account_key, self.node_selectors, + self.affinity, self.annotations, self.volumes, self.volume_mounts, + self.tolerations) @staticmethod def from_dict(obj): @@ -73,7 +89,13 @@ def from_dict(obj): request_cpu=namespaced.get('request_cpu', None), limit_memory=namespaced.get('limit_memory', None), limit_cpu=namespaced.get('limit_cpu', None), - gcp_service_account_key=namespaced.get('gcp_service_account_key', None) + gcp_service_account_key=namespaced.get('gcp_service_account_key', None), + node_selectors=namespaced.get('node_selectors', None), + affinity=namespaced.get('affinity', None), + annotations=namespaced.get('annotations', {}), + volumes=namespaced.get('volumes', []), + volume_mounts=namespaced.get('volume_mounts', []), + tolerations=namespaced.get('tolerations', None), ) def as_dict(self): @@ -84,7 +106,13 @@ def as_dict(self): 'request_cpu': self.request_cpu, 'limit_memory': self.limit_memory, 'limit_cpu': self.limit_cpu, - 'gcp_service_account_key': self.gcp_service_account_key + 'gcp_service_account_key': self.gcp_service_account_key, + 'node_selectors': self.node_selectors, + 'affinity': self.affinity, + 'annotations': self.annotations, + 'volumes': self.volumes, + 'volume_mounts': self.volume_mounts, + 'tolerations': self.tolerations, } @@ -96,7 +124,12 @@ def __init__(self): configuration_dict = 
configuration.as_dict(display_sensitive=True) self.core_configuration = configuration_dict['core'] self.kube_secrets = configuration_dict.get('kubernetes_secrets', {}) - self.airflow_home = configuration.get(self.core_section, 'airflow_home') + self.kube_env_vars = configuration_dict.get('kubernetes_environment_variables', {}) + self.env_from_configmap_ref = configuration.get(self.kubernetes_section, + 'env_from_configmap_ref') + self.env_from_secret_ref = configuration.get(self.kubernetes_section, + 'env_from_secret_ref') + self.airflow_home = settings.AIRFLOW_HOME self.dags_folder = configuration.get(self.core_section, 'dags_folder') self.parallelism = configuration.getint(self.core_section, 'PARALLELISM') self.worker_container_repository = configuration.get( @@ -108,13 +141,24 @@ def __init__(self): self.kube_image_pull_policy = configuration.get( self.kubernetes_section, "worker_container_image_pull_policy" ) + self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {}) + self.kube_annotations = configuration_dict.get('kubernetes_annotations', {}) self.delete_worker_pods = conf.getboolean( self.kubernetes_section, 'delete_worker_pods') - + self.worker_pods_creation_batch_size = conf.getint( + self.kubernetes_section, 'worker_pods_creation_batch_size') self.worker_service_account_name = conf.get( self.kubernetes_section, 'worker_service_account_name') self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets') + # NOTE: user can build the dags into the docker image directly, + # this will set to True if so + self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image') + + # Run as user for pod security context + self.worker_run_as_user = conf.get(self.kubernetes_section, 'run_as_user') + self.worker_fs_group = conf.get(self.kubernetes_section, 'fs_group') + # NOTE: `git_repo` and `git_branch` must be specified together as a pair # The http URL of the git repository to clone from self.git_repo = conf.get(self.kubernetes_section, 'git_repo') @@ -122,11 +166,22 @@ def __init__(self): self.git_branch = conf.get(self.kubernetes_section, 'git_branch') # Optionally, the directory in the git repository containing the dags self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath') - - # Optionally a user may supply a `git_user` and `git_password` for private - # repositories + # Optionally, the root directory for git operations + self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root') + # Optionally, the name at which to publish the checked-out files under --root + self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest') + # Optionally, if git_dags_folder_mount_point is set the worker will use + # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder + self.git_dags_folder_mount_point = conf.get(self.kubernetes_section, + 'git_dags_folder_mount_point') + + # Optionally a user may supply a (`git_user` AND `git_password`) OR + # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories self.git_user = conf.get(self.kubernetes_section, 'git_user') self.git_password = conf.get(self.kubernetes_section, 'git_password') + self.git_ssh_key_secret_name = conf.get(self.kubernetes_section, 'git_ssh_key_secret_name') + self.git_ssh_known_hosts_configmap_name = conf.get(self.kubernetes_section, + 'git_ssh_known_hosts_configmap_name') # NOTE: The user may optionally use a volume claim to mount a PV containing # DAGs directly @@ -145,6 +200,12 @@ def 
__init__(self): self.logs_volume_subpath = conf.get( self.kubernetes_section, 'logs_volume_subpath') + # Optionally, hostPath volume containing DAGs + self.dags_volume_host = conf.get(self.kubernetes_section, 'dags_volume_host') + + # Optionally, write logs to a hostPath Volume + self.logs_volume_host = conf.get(self.kubernetes_section, 'logs_volume_host') + # This prop may optionally be set for PV Claims and is used to write logs self.base_log_folder = configuration.get(self.core_section, 'base_log_folder') @@ -179,13 +240,41 @@ def __init__(self): # configmap self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap') + affinity_json = conf.get(self.kubernetes_section, 'affinity') + if affinity_json: + self.kube_affinity = json.loads(affinity_json) + else: + self.kube_affinity = None + + tolerations_json = conf.get(self.kubernetes_section, 'tolerations') + if tolerations_json: + self.kube_tolerations = json.loads(tolerations_json) + else: + self.kube_tolerations = None + self._validate() def _validate(self): - if not self.dags_volume_claim and (not self.git_repo or not self.git_branch): + # TODO: use XOR for dags_volume_claim and git_dags_folder_mount_point + if not self.dags_volume_claim \ + and not self.dags_volume_host \ + and not self.dags_in_image \ + and (not self.git_repo or not self.git_branch or not self.git_dags_folder_mount_point): raise AirflowConfigException( 'In kubernetes mode the following must be set in the `kubernetes` ' - 'config section: `dags_volume_claim` or `git_repo and git_branch`') + 'config section: `dags_volume_claim` ' + 'or `dags_volume_host` ' + 'or `dags_in_image` ' + 'or `git_repo and git_branch and git_dags_folder_mount_point`') + if self.git_repo \ + and (self.git_user or self.git_password) \ + and self.git_ssh_key_secret_name: + raise AirflowConfigException( + 'In kubernetes mode, using `git_repo` to pull the DAGs: ' + 'for private repositories, either `git_user` and `git_password` ' + 'must be set for authentication through user credentials; ' + 'or `git_ssh_key_secret_name` must be set for authentication ' + 'through ssh key, but not both') class KubernetesJobWatcher(multiprocessing.Process, LoggingMixin, object): @@ -275,8 +364,7 @@ def process_status(self, pod_id, status, labels, resource_version): class AirflowKubernetesScheduler(LoggingMixin): - def __init__(self, kube_config, task_queue, result_queue, session, - kube_client, worker_uuid): + def __init__(self, kube_config, task_queue, result_queue, kube_client, worker_uuid): self.log.debug("Creating Kubernetes executor") self.kube_config = kube_config self.task_queue = task_queue @@ -287,12 +375,11 @@ def __init__(self, kube_config, task_queue, result_queue, session, self.launcher = PodLauncher(kube_client=self.kube_client) self.worker_configuration = WorkerConfiguration(kube_config=self.kube_config) self.watcher_queue = multiprocessing.Queue() - self._session = session self.worker_uuid = worker_uuid self.kube_watcher = self._make_kube_watcher() def _make_kube_watcher(self): - resource_version = KubeResourceVersion.get_current_resource_version(self._session) + resource_version = KubeResourceVersion.get_current_resource_version() watcher = KubernetesJobWatcher(self.namespace, self.watcher_queue, resource_version, self.worker_uuid) watcher.start() @@ -317,13 +404,15 @@ def run_next(self, next_job): """ self.log.info('Kubernetes job is %s', str(next_job)) key, command, kube_executor_config = next_job - dag_id, task_id, execution_date = key + dag_id, task_id, execution_date, 
try_number = key self.log.debug("Kubernetes running for command %s", command) self.log.debug("Kubernetes launching image %s", self.kube_config.kube_image) pod = self.worker_configuration.make_pod( namespace=self.namespace, worker_uuid=self.worker_uuid, pod_id=self._create_pod_id(dag_id, task_id), - dag_id=dag_id, task_id=task_id, + dag_id=self._make_safe_label_value(dag_id), + task_id=self._make_safe_label_value(task_id), + try_number=try_number, execution_date=self._datetime_to_label_safe_datestring(execution_date), airflow_command=command, kube_executor_config=kube_executor_config ) @@ -382,7 +471,7 @@ def _strip_unsafe_kubernetes_special_chars(string): @staticmethod def _make_safe_pod_id(safe_dag_id, safe_task_id, safe_uuid): - """ + r""" Kubernetes pod names must be <= 253 chars and must pass the following regex for validation "^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*$" @@ -400,6 +489,27 @@ def _make_safe_pod_id(safe_dag_id, safe_task_id, safe_uuid): return safe_pod_id + @staticmethod + def _make_safe_label_value(string): + """ + Valid label values must be 63 characters or less and must be empty or begin and + end with an alphanumeric character ([a-z0-9A-Z]) with dashes (-), underscores (_), + dots (.), and alphanumerics between. + + If the label value is then greater than 63 chars once made safe, or differs in any + way from the original value sent to this function, then we need to truncate to + 53chars, and append it with a unique hash. + """ + MAX_LABEL_LEN = 63 + + safe_label = re.sub(r'^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$', '', string) + + if len(safe_label) > MAX_LABEL_LEN or string != safe_label: + safe_hash = hashlib.md5(string.encode()).hexdigest()[:9] + safe_label = safe_label[:MAX_LABEL_LEN - len(safe_hash) - 1] + "-" + safe_hash + + return safe_label + @staticmethod def _create_pod_id(dag_id, task_id): safe_dag_id = AirflowKubernetesScheduler._strip_unsafe_kubernetes_special_chars( @@ -418,7 +528,7 @@ def _label_safe_datestring_to_datetime(string): "_", let's replace ":" with "_" - :param string: string + :param string: str :return: datetime.datetime object """ return parser.parse(string.replace('_plus_', '+').replace("_", ":")) @@ -435,30 +545,65 @@ def _datetime_to_label_safe_datestring(datetime_obj): return datetime_obj.isoformat().replace(":", "_").replace('+', '_plus_') def _labels_to_key(self, labels): + try_num = 1 + try: + try_num = int(labels.get('try_number', '1')) + except ValueError: + self.log.warn("could not get try_number as an int: %s", labels.get('try_number', '1')) + try: - return ( - labels['dag_id'], labels['task_id'], - self._label_safe_datestring_to_datetime(labels['execution_date'])) + dag_id = labels['dag_id'] + task_id = labels['task_id'] + ex_time = self._label_safe_datestring_to_datetime(labels['execution_date']) except Exception as e: self.log.warn( - 'Error while converting labels to key; labels: %s; exception: %s', + 'Error while retrieving labels; labels: %s; exception: %s', labels, e ) return None + with create_session() as session: + tasks = ( + session + .query(TaskInstance) + .filter_by(execution_date=ex_time).all() + ) + self.log.info( + 'Checking %s task instances.', + len(tasks) + ) + for task in tasks: + if ( + self._make_safe_label_value(task.dag_id) == dag_id and + self._make_safe_label_value(task.task_id) == task_id and + task.execution_date == ex_time + ): + self.log.info( + 'Found matching task %s-%s (%s) with current state of %s', + task.dag_id, task.task_id, task.execution_date, task.state + ) 
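`_make_safe_label_value` above is what keeps `dag_id`/`task_id` values usable as Kubernetes label values. A self-contained sketch that mirrors the method body (copied rather than imported, so it runs without Airflow) and shows the truncate-and-hash behavior:

```python
import hashlib
import re

MAX_LABEL_LEN = 63

def make_safe_label_value(string):
    # Strip invalid leading/trailing characters and anything outside [a-zA-Z0-9_-.].
    safe_label = re.sub(r'^[^a-z0-9A-Z]*|[^a-zA-Z0-9_\-\.]|[^a-z0-9A-Z]*$', '', string)
    # If anything changed, or the result is too long, truncate to 53 chars and append "-" + 9-char md5.
    if len(safe_label) > MAX_LABEL_LEN or string != safe_label:
        safe_hash = hashlib.md5(string.encode()).hexdigest()[:9]
        safe_label = safe_label[:MAX_LABEL_LEN - len(safe_hash) - 1] + "-" + safe_hash
    return safe_label

print(make_safe_label_value('my_dag'))           # already valid: returned unchanged
print(make_safe_label_value('a' * 80))           # too long: truncated to 53 chars + "-" + md5 prefix
print(make_safe_label_value('dag.with:colons'))  # ':' stripped, so a hash suffix is appended
```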
+ dag_id = task.dag_id + task_id = task.task_id + return (dag_id, task_id, ex_time, try_num) + self.log.warn( + 'Failed to find and match task details to a pod; labels: %s', + labels + ) + return None + class KubernetesExecutor(BaseExecutor, LoggingMixin): def __init__(self): self.kube_config = KubeConfig() self.task_queue = None - self._session = None self.result_queue = None self.kube_scheduler = None self.kube_client = None self.worker_uuid = None super(KubernetesExecutor, self).__init__(parallelism=self.kube_config.parallelism) - def clear_not_launched_queued_tasks(self): + @provide_session + def clear_not_launched_queued_tasks(self, session=None): """ If the airflow scheduler restarts with pending "Queued" tasks, the tasks may or may not @@ -474,18 +619,25 @@ def clear_not_launched_queued_tasks(self): proper support for State.LAUNCHED """ - queued_tasks = self._session.query( - TaskInstance).filter(TaskInstance.state == State.QUEUED).all() + queued_tasks = session\ + .query(TaskInstance)\ + .filter(TaskInstance.state == State.QUEUED).all() self.log.info( 'When executor started up, found %s queued task instances', len(queued_tasks) ) for task in queued_tasks: - dict_string = "dag_id={},task_id={},execution_date={},airflow-worker={}" \ - .format(task.dag_id, task.task_id, - AirflowKubernetesScheduler._datetime_to_label_safe_datestring( - task.execution_date), self.worker_uuid) + dict_string = ( + "dag_id={},task_id={},execution_date={},airflow-worker={}".format( + AirflowKubernetesScheduler._make_safe_label_value(task.dag_id), + AirflowKubernetesScheduler._make_safe_label_value(task.task_id), + AirflowKubernetesScheduler._datetime_to_label_safe_datestring( + task.execution_date + ), + self.worker_uuid + ) + ) kwargs = dict(label_selector=dict_string) pod_list = self.kube_client.list_namespaced_pod( self.kube_config.kube_namespace, **kwargs) @@ -494,14 +646,12 @@ def clear_not_launched_queued_tasks(self): 'TaskInstance: %s found in queued state but was not launched, ' 'rescheduling', task ) - self._session.query(TaskInstance).filter( + session.query(TaskInstance).filter( TaskInstance.dag_id == task.dag_id, TaskInstance.task_id == task.task_id, TaskInstance.execution_date == task.execution_date ).update({TaskInstance.state: State.NONE}) - self._session.commit() - def _inject_secrets(self): def _create_or_update_secret(secret_name, secret_path): try: @@ -538,20 +688,18 @@ def _create_or_update_secret(secret_name, secret_path): def start(self): self.log.info('Start Kubernetes executor') - self._session = settings.Session() - self.worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid( - self._session) + self.worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid() self.log.debug('Start with worker_uuid: %s', self.worker_uuid) # always need to reset resource version since we don't know # when we last started, note for behavior below # https://github.com/kubernetes-client/python/blob/master/kubernetes/docs # /CoreV1Api.md#list_namespaced_pod - KubeResourceVersion.reset_resource_version(self._session) + KubeResourceVersion.reset_resource_version() self.task_queue = Queue() self.result_queue = Queue() self.kube_client = get_kube_client() self.kube_scheduler = AirflowKubernetesScheduler( - self.kube_config, self.task_queue, self.result_queue, self._session, + self.kube_config, self.task_queue, self.result_queue, self.kube_client, self.worker_uuid ) self._inject_secrets() @@ -567,9 +715,9 @@ def execute_async(self, key, command, queue=None, 
executor_config=None): def sync(self): if self.running: - self.log.info('self.running: %s', self.running) + self.log.debug('self.running: %s', self.running) if self.queued_tasks: - self.log.info('self.queued: %s', self.queued_tasks) + self.log.debug('self.queued: %s', self.queued_tasks) self.kube_scheduler.sync() last_resource_version = None @@ -578,14 +726,24 @@ def sync(self): key, state, pod_id, resource_version = results last_resource_version = resource_version self.log.info('Changing state of %s to %s', results, state) - self._change_state(key, state, pod_id) + try: + self._change_state(key, state, pod_id) + except Exception as e: + self.log.exception('Exception: %s when attempting ' + + 'to change state of %s to %s, re-queueing.', e, results, state) + self.result_queue.put(results) - KubeResourceVersion.checkpoint_resource_version( - last_resource_version, session=self._session) + KubeResourceVersion.checkpoint_resource_version(last_resource_version) - if not self.task_queue.empty(): - key, command, kube_executor_config = self.task_queue.get() - self.kube_scheduler.run_next((key, command, kube_executor_config)) + for i in range(min((self.kube_config.worker_pods_creation_batch_size, self.task_queue.qsize()))): + task = self.task_queue.get() + + try: + self.kube_scheduler.run_next(task) + except ApiException: + self.log.exception('ApiException when attempting ' + + 'to run task, re-queueing.') + self.task_queue.put(task) def _change_state(self, key, state, pod_id): if state != State.RUNNING: @@ -597,16 +755,16 @@ def _change_state(self, key, state, pod_id): self.log.debug('Could not find key: %s', str(key)) pass self.event_buffer[key] = state - (dag_id, task_id, ex_time) = key - item = self._session.query(TaskInstance).filter_by( - dag_id=dag_id, - task_id=task_id, - execution_date=ex_time - ).one() - if state: - item.state = state - self._session.add(item) - self._session.commit() + (dag_id, task_id, ex_time, try_number) = key + with create_session() as session: + item = session.query(TaskInstance).filter_by( + dag_id=dag_id, + task_id=task_id, + execution_date=ex_time + ).one() + if state: + item.state = state + session.add(item) def end(self): self.log.info('Shutting down Kubernetes executor') diff --git a/airflow/contrib/executors/mesos_executor.py b/airflow/contrib/executors/mesos_executor.py index ff974ffc3c080..f101348acc1a1 100644 --- a/airflow/contrib/executors/mesos_executor.py +++ b/airflow/contrib/executors/mesos_executor.py @@ -19,10 +19,6 @@ from future import standard_library -from airflow.utils.log.logging_mixin import LoggingMixin -from airflow.www.utils import LoginMixin - - from builtins import str from queue import Queue @@ -49,7 +45,7 @@ def get_framework_name(): # AirflowMesosScheduler, implements Mesos Scheduler interface # To schedule airflow jobs on mesos -class AirflowMesosScheduler(mesos.interface.Scheduler, LoggingMixin): +class AirflowMesosScheduler(mesos.interface.Scheduler): """ Airflow Mesos scheduler implements mesos scheduler interface to schedule airflow tasks on mesos. @@ -80,7 +76,7 @@ def registered(self, driver, frameworkId, masterInfo): if configuration.conf.getboolean('mesos', 'CHECKPOINT') and \ configuration.conf.get('mesos', 'FAILOVER_TIMEOUT'): # Import here to work around a circular import error - from airflow.models import Connection + from airflow.models.connection import Connection # Update the Framework ID in the database. 
session = Session() @@ -162,7 +158,7 @@ def resourceOffers(self, driver, offers): command = mesos_pb2.CommandInfo() command.shell = True - command.value = cmd + command.value = " ".join(cmd) task.command.MergeFrom(command) # If docker image for airflow is specified in config then pull that @@ -213,7 +209,7 @@ def statusUpdate(self, driver, update): self.task_queue.task_done() -class MesosExecutor(BaseExecutor, LoginMixin): +class MesosExecutor(BaseExecutor): """ MesosExecutor allows distributing the execution of task instances to multiple mesos workers. @@ -253,7 +249,7 @@ def start(self): if configuration.conf.get('mesos', 'FAILOVER_TIMEOUT'): # Import here to work around a circular import error - from airflow.models import Connection + from airflow.models.connection import Connection # Query the database to get the ID of the Mesos Framework, if available. conn_id = FRAMEWORK_CONNID_PREFIX + framework.name diff --git a/airflow/contrib/hooks/aws_athena_hook.py b/airflow/contrib/hooks/aws_athena_hook.py new file mode 100644 index 0000000000000..f11ff23c515f4 --- /dev/null +++ b/airflow/contrib/hooks/aws_athena_hook.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from time import sleep +from airflow.contrib.hooks.aws_hook import AwsHook + + +class AWSAthenaHook(AwsHook): + """ + Interact with AWS Athena to run, poll queries and return query results + + :param aws_conn_id: aws connection to use. 
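# [Editor's note: illustrative sketch only, not part of this patch.]
# Minimal use of the AWSAthenaHook introduced here; the connection id, database
# name and S3 output location below are hypothetical placeholders.
from airflow.contrib.hooks.aws_athena_hook import AWSAthenaHook

athena = AWSAthenaHook(aws_conn_id='aws_default', sleep_time=10)
athena.get_conn()  # run_query() reuses the cached boto3 client, so build it first
query_id = athena.run_query(
    query='SELECT 1',
    query_context={'Database': 'example_db'},
    result_configuration={'OutputLocation': 's3://example-bucket/athena-results/'},
)
if athena.poll_query_status(query_id, max_tries=10) in AWSAthenaHook.SUCCESS_STATES:
    results = athena.get_query_results(query_id)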
+ :type aws_conn_id: str + :param sleep_time: Time to wait between two consecutive call to check query status on athena + :type sleep_time: int + """ + + INTERMEDIATE_STATES = ('QUEUED', 'RUNNING',) + FAILURE_STATES = ('FAILED', 'CANCELLED',) + SUCCESS_STATES = ('SUCCEEDED',) + + def __init__(self, aws_conn_id='aws_default', sleep_time=30, *args, **kwargs): + super(AWSAthenaHook, self).__init__(aws_conn_id, **kwargs) + self.sleep_time = sleep_time + self.conn = None + + def get_conn(self): + """ + check if aws conn exists already or create one and return it + + :return: boto3 session + """ + if not self.conn: + self.conn = self.get_client_type('athena') + return self.conn + + def run_query(self, query, query_context, result_configuration, client_request_token=None): + """ + Run Presto query on athena with provided config and return submitted query_execution_id + + :param query: Presto query to run + :type query: str + :param query_context: Context in which query need to be run + :type query_context: dict + :param result_configuration: Dict with path to store results in and config related to encryption + :type result_configuration: dict + :param client_request_token: Unique token created by user to avoid multiple executions of same query + :type client_request_token: str + :return: str + """ + response = self.conn.start_query_execution(QueryString=query, + ClientRequestToken=client_request_token, + QueryExecutionContext=query_context, + ResultConfiguration=result_configuration) + query_execution_id = response['QueryExecutionId'] + return query_execution_id + + def check_query_status(self, query_execution_id): + """ + Fetch the status of submitted athena query. Returns None or one of valid query states. + + :param query_execution_id: Id of submitted athena query + :type query_execution_id: str + :return: str + """ + response = self.conn.get_query_execution(QueryExecutionId=query_execution_id) + state = None + try: + state = response['QueryExecution']['Status']['State'] + except Exception as ex: + self.log.error('Exception while getting query state', ex) + finally: + return state + + def get_query_results(self, query_execution_id): + """ + Fetch submitted athena query results. returns none if query is in intermediate state or + failed/cancelled state else dict of query output + + :param query_execution_id: Id of submitted athena query + :type query_execution_id: str + :return: dict + """ + query_state = self.check_query_status(query_execution_id) + if query_state is None: + self.log.error('Invalid Query state') + return None + elif query_state in self.INTERMEDIATE_STATES or query_state in self.FAILURE_STATES: + self.log.error('Query is in {state} state. Cannot fetch results'.format(state=query_state)) + return None + return self.conn.get_query_results(QueryExecutionId=query_execution_id) + + def poll_query_status(self, query_execution_id, max_tries=None): + """ + Poll the status of submitted athena query until query state reaches final state. + Returns one of the final states + + :param query_execution_id: Id of submitted athena query + :type query_execution_id: str + :param max_tries: Number of times to poll for query state before function exits + :type max_tries: int + :return: str + """ + try_number = 1 + final_query_state = None # Query state when query reaches final state or max_tries reached + while True: + query_state = self.check_query_status(query_execution_id) + if query_state is None: + self.log.info('Trial {try_number}: Invalid query state. 
Retrying again'.format( + try_number=try_number)) + elif query_state in self.INTERMEDIATE_STATES: + self.log.info('Trial {try_number}: Query is still in an intermediate state - {state}' + .format(try_number=try_number, state=query_state)) + else: + self.log.info('Trial {try_number}: Query execution completed. Final state is {state}' + .format(try_number=try_number, state=query_state)) + final_query_state = query_state + break + if max_tries and try_number >= max_tries: # Break loop if max_tries reached + final_query_state = query_state + break + try_number += 1 + sleep(self.sleep_time) + return final_query_state + + def stop_query(self, query_execution_id): + """ + Cancel the submitted athena query + + :param query_execution_id: Id of submitted athena query + :type query_execution_id: str + :return: dict + """ + return self.conn.stop_query_execution(QueryExecutionId=query_execution_id) diff --git a/airflow/contrib/hooks/aws_glue_catalog_hook.py b/airflow/contrib/hooks/aws_glue_catalog_hook.py new file mode 100644 index 0000000000000..4b7fba533f920 --- /dev/null +++ b/airflow/contrib/hooks/aws_glue_catalog_hook.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from airflow.contrib.hooks.aws_hook import AwsHook + + +class AwsGlueCatalogHook(AwsHook): + """ + Interact with AWS Glue Catalog + + :param aws_conn_id: ID of the Airflow connection where + credentials and extra configuration are stored + :type aws_conn_id: str + :param region_name: aws region name (example: us-east-1) + :type region_name: str + """ + + def __init__(self, + aws_conn_id='aws_default', + region_name=None, + *args, + **kwargs): + self.region_name = region_name + super(AwsGlueCatalogHook, self).__init__(aws_conn_id=aws_conn_id, *args, **kwargs) + + def get_conn(self): + """ + Returns glue connection object. + """ + self.conn = self.get_client_type('glue', self.region_name) + return self.conn + + def get_partitions(self, + database_name, + table_name, + expression='', + page_size=None, + max_items=None): + """ + Retrieves the partition values for a table. + + :param database_name: The name of the catalog database where the partitions reside. + :type database_name: str + :param table_name: The name of the partitions' table. + :type table_name: str + :param expression: An expression filtering the partitions to be returned. + Please see official AWS documentation for further information. 
+ https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-partitions.html#aws-glue-api-catalog-partitions-GetPartitions + :type expression: str + :param page_size: pagination size + :type page_size: int + :param max_items: maximum items to return + :type max_items: int + :return: set of partition values where each value is a tuple since + a partition may be composed of multiple columns. For example: + ``{('2018-01-01','1'), ('2018-01-01','2')}`` + """ + config = { + 'PageSize': page_size, + 'MaxItems': max_items, + } + + paginator = self.get_conn().get_paginator('get_partitions') + response = paginator.paginate( + DatabaseName=database_name, + TableName=table_name, + Expression=expression, + PaginationConfig=config + ) + + partitions = set() + for page in response: + for p in page['Partitions']: + partitions.add(tuple(p['Values'])) + + return partitions + + def check_for_partition(self, database_name, table_name, expression): + """ + Checks whether a partition exists + + :param database_name: Name of hive database (schema) @table belongs to + :type database_name: str + :param table_name: Name of hive table @partition belongs to + :type table_name: str + :expression: Expression that matches the partitions to check for + (eg `a = 'b' AND c = 'd'`) + :type expression: str + :rtype: bool + + >>> hook = AwsGlueCatalogHook() + >>> t = 'static_babynames_partitioned' + >>> hook.check_for_partition('airflow', t, "ds='2015-01-01'") + True + """ + partitions = self.get_partitions(database_name, table_name, expression, max_items=1) + + if partitions: + return True + else: + return False + + def get_table(self, database_name, table_name): + """ + Get the information of the table + + :param database_name: Name of hive database (schema) @table belongs to + :type database_name: str + :param table_name: Name of hive table + :type table_name: str + :rtype: dict + + >>> hook = AwsGlueCatalogHook() + >>> r = hook.get_table('db', 'table_foo') + >>> r['Name'] = 'table_foo' + """ + + result = self.get_conn().get_table(DatabaseName=database_name, Name=table_name) + + return result['Table'] + + def get_table_location(self, database_name, table_name): + """ + Get the physical location of the table + + :param database_name: Name of hive database (schema) @table belongs to + :type database_name: str + :param table_name: Name of hive table + :type table_name: str + :return: str + """ + + table = self.get_table(database_name, table_name) + + return table['StorageDescriptor']['Location'] diff --git a/airflow/contrib/hooks/aws_hook.py b/airflow/contrib/hooks/aws_hook.py index c712d2de7910b..9d4a73e1c0c6b 100644 --- a/airflow/contrib/hooks/aws_hook.py +++ b/airflow/contrib/hooks/aws_hook.py @@ -17,7 +17,6 @@ # specific language governing permissions and limitations # under the License. - import boto3 import configparser import logging @@ -72,7 +71,7 @@ def _parse_s3_config(config_file_name, config_format='boto', profile=None): try: access_key = config.get(cred_section, key_id_option) secret_key = config.get(cred_section, secret_key_option) - except: + except Exception: logging.warning("Option Error in parsing s3 config file") raise return access_key, secret_key @@ -84,8 +83,9 @@ class AwsHook(BaseHook): This class is a thin wrapper around the boto3 python library. 
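# [Editor's note: illustrative sketch only, not part of this patch.]
# Typical use of the AwsGlueCatalogHook added above, reusing the hypothetical
# database/table names from its own docstring examples.
from airflow.contrib.hooks.aws_glue_catalog_hook import AwsGlueCatalogHook

glue = AwsGlueCatalogHook(aws_conn_id='aws_default', region_name='us-east-1')
if glue.check_for_partition('airflow', 'static_babynames_partitioned', "ds='2015-01-01'"):
    location = glue.get_table_location('airflow', 'static_babynames_partitioned')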
""" - def __init__(self, aws_conn_id='aws_default'): + def __init__(self, aws_conn_id='aws_default', verify=None): self.aws_conn_id = aws_conn_id + self.verify = verify def _get_credentials(self, region_name): aws_access_key_id = None @@ -96,32 +96,36 @@ def _get_credentials(self, region_name): if self.aws_conn_id: try: connection_object = self.get_connection(self.aws_conn_id) + extra_config = connection_object.extra_dejson if connection_object.login: aws_access_key_id = connection_object.login aws_secret_access_key = connection_object.password - elif 'aws_secret_access_key' in connection_object.extra_dejson: - aws_access_key_id = connection_object.extra_dejson[ + elif 'aws_secret_access_key' in extra_config: + aws_access_key_id = extra_config[ 'aws_access_key_id'] - aws_secret_access_key = connection_object.extra_dejson[ + aws_secret_access_key = extra_config[ 'aws_secret_access_key'] - elif 's3_config_file' in connection_object.extra_dejson: + elif 's3_config_file' in extra_config: aws_access_key_id, aws_secret_access_key = \ _parse_s3_config( - connection_object.extra_dejson['s3_config_file'], - connection_object.extra_dejson.get('s3_config_format')) + extra_config['s3_config_file'], + extra_config.get('s3_config_format'), + extra_config.get('profile')) if region_name is None: - region_name = connection_object.extra_dejson.get('region_name') + region_name = extra_config.get('region_name') - role_arn = connection_object.extra_dejson.get('role_arn') - aws_account_id = connection_object.extra_dejson.get('aws_account_id') - aws_iam_role = connection_object.extra_dejson.get('aws_iam_role') + role_arn = extra_config.get('role_arn') + external_id = extra_config.get('external_id') + aws_account_id = extra_config.get('aws_account_id') + aws_iam_role = extra_config.get('aws_iam_role') if role_arn is None and aws_account_id is not None and \ aws_iam_role is not None: - role_arn = "arn:aws:iam::" + aws_account_id + ":role/" + aws_iam_role + role_arn = "arn:aws:iam::{}:role/{}" \ + .format(aws_account_id, aws_iam_role) if role_arn is not None: sts_session = boto3.session.Session( @@ -130,14 +134,23 @@ def _get_credentials(self, region_name): region_name=region_name) sts_client = sts_session.client('sts') - sts_response = sts_client.assume_role( - RoleArn=role_arn, - RoleSessionName='Airflow_' + self.aws_conn_id) - aws_access_key_id = sts_response['Credentials']['AccessKeyId'] - aws_secret_access_key = sts_response['Credentials']['SecretAccessKey'] - aws_session_token = sts_response['Credentials']['SessionToken'] - endpoint_url = connection_object.extra_dejson.get('host') + if external_id is None: + sts_response = sts_client.assume_role( + RoleArn=role_arn, + RoleSessionName='Airflow_' + self.aws_conn_id) + else: + sts_response = sts_client.assume_role( + RoleArn=role_arn, + RoleSessionName='Airflow_' + self.aws_conn_id, + ExternalId=external_id) + + credentials = sts_response['Credentials'] + aws_access_key_id = credentials['AccessKeyId'] + aws_secret_access_key = credentials['SecretAccessKey'] + aws_session_token = credentials['SessionToken'] + + endpoint_url = extra_config.get('host') except AirflowException: # No connection found: fallback on boto3 credential strategy @@ -150,15 +163,17 @@ def _get_credentials(self, region_name): aws_session_token=aws_session_token, region_name=region_name), endpoint_url - def get_client_type(self, client_type, region_name=None): + def get_client_type(self, client_type, region_name=None, config=None): session, endpoint_url = self._get_credentials(region_name) - 
return session.client(client_type, endpoint_url=endpoint_url) + return session.client(client_type, endpoint_url=endpoint_url, + config=config, verify=self.verify) - def get_resource_type(self, resource_type, region_name=None): + def get_resource_type(self, resource_type, region_name=None, config=None): session, endpoint_url = self._get_credentials(region_name) - return session.resource(resource_type, endpoint_url=endpoint_url) + return session.resource(resource_type, endpoint_url=endpoint_url, + config=config, verify=self.verify) def get_session(self, region_name=None): """Get the underlying boto3.session.""" @@ -168,10 +183,23 @@ def get_session(self, region_name=None): def get_credentials(self, region_name=None): """Get the underlying `botocore.Credentials` object. - This contains the attributes: access_key, secret_key and token. + This contains the following authentication attributes: access_key, secret_key and token. """ session, _ = self._get_credentials(region_name) - # Credentials are refreshable, so accessing your access key / secret key - # separately can lead to a race condition. + # Credentials are refreshable, so accessing your access key and + # secret key separately can lead to a race condition. # See https://stackoverflow.com/a/36291428/8283373 return session.get_credentials().get_frozen_credentials() + + def expand_role(self, role): + """ + If the IAM role is a role name, get the Amazon Resource Name (ARN) for the role. + If IAM role is already an IAM role ARN, no change is made. + + :param role: IAM role name or ARN + :return: IAM role ARN + """ + if '/' in role: + return role + else: + return self.get_client_type('iam').get_role(RoleName=role)['Role']['Arn'] diff --git a/airflow/contrib/hooks/aws_sns_hook.py b/airflow/contrib/hooks/aws_sns_hook.py new file mode 100644 index 0000000000000..4308b493ce3e3 --- /dev/null +++ b/airflow/contrib/hooks/aws_sns_hook.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json + +from airflow.contrib.hooks.aws_hook import AwsHook + + +class AwsSnsHook(AwsHook): + """ + Interact with Amazon Simple Notification Service. + """ + + def __init__(self, *args, **kwargs): + super(AwsSnsHook, self).__init__(*args, **kwargs) + + def get_conn(self): + """ + Get an SNS connection + """ + self.conn = self.get_client_type('sns') + return self.conn + + def publish_to_target(self, target_arn, message): + """ + Publish a message to a topic or an endpoint. 
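# [Editor's note: illustrative sketch only, not part of this patch.]
# Publishing through the new AwsSnsHook; the topic ARN is a hypothetical placeholder.
from airflow.contrib.hooks.aws_sns_hook import AwsSnsHook

sns = AwsSnsHook(aws_conn_id='aws_default')
sns.publish_to_target(
    target_arn='arn:aws:sns:us-east-1:123456789012:example-topic',
    message='DAG run finished',
)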
+ + :param target_arn: either a TopicArn or an EndpointArn + :type target_arn: str + :param message: the default message you want to send + :param message: str + """ + + conn = self.get_conn() + + messages = { + 'default': message + } + + return conn.publish( + TargetArn=target_arn, + Message=json.dumps(messages), + MessageStructure='json' + ) diff --git a/airflow/contrib/hooks/azure_container_instance_hook.py b/airflow/contrib/hooks/azure_container_instance_hook.py new file mode 100644 index 0000000000000..3f29c6de4af5a --- /dev/null +++ b/airflow/contrib/hooks/azure_container_instance_hook.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os + +from airflow.hooks.base_hook import BaseHook +from airflow.exceptions import AirflowException + +from azure.common.client_factory import get_client_from_auth_file +from azure.common.credentials import ServicePrincipalCredentials + +from azure.mgmt.containerinstance import ContainerInstanceManagementClient + + +class AzureContainerInstanceHook(BaseHook): + """ + A hook to communicate with Azure Container Instances. + + This hook requires a service principal in order to work. + After creating this service principal + (Azure Active Directory/App Registrations), you need to fill in the + client_id (Application ID) as login, the generated password as password, + and tenantId and subscriptionId in the extra's field as a json. 
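# [Editor's note: illustrative sketch only, not part of this patch.]
# Shape of the Airflow connection this hook expects: the service principal's
# client id as login, its secret as password, and the remaining identifiers in
# the JSON "extra" field (all values below are hypothetical).
import json

extra = json.dumps({
    'tenantId': '00000000-0000-0000-0000-000000000000',
    'subscriptionId': '11111111-1111-1111-1111-111111111111',
})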
+ + :param conn_id: connection id of a service principal which will be used + to start the container instance + :type conn_id: str + """ + + def __init__(self, conn_id='azure_default'): + self.conn_id = conn_id + self.connection = self.get_conn() + + def get_conn(self): + conn = self.get_connection(self.conn_id) + key_path = conn.extra_dejson.get('key_path', False) + if key_path: + if key_path.endswith('.json'): + self.log.info('Getting connection using a JSON key file.') + return get_client_from_auth_file(ContainerInstanceManagementClient, + key_path) + else: + raise AirflowException('Unrecognised extension for key file.') + + if os.environ.get('AZURE_AUTH_LOCATION'): + key_path = os.environ.get('AZURE_AUTH_LOCATION') + if key_path.endswith('.json'): + self.log.info('Getting connection using a JSON key file.') + return get_client_from_auth_file(ContainerInstanceManagementClient, + key_path) + else: + raise AirflowException('Unrecognised extension for key file.') + + credentials = ServicePrincipalCredentials( + client_id=conn.login, + secret=conn.password, + tenant=conn.extra_dejson['tenantId'] + ) + + subscription_id = conn.extra_dejson['subscriptionId'] + return ContainerInstanceManagementClient(credentials, str(subscription_id)) + + def create_or_update(self, resource_group, name, container_group): + """ + Create a new container group + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + :param container_group: the properties of the container group + :type container_group: azure.mgmt.containerinstance.models.ContainerGroup + """ + self.connection.container_groups.create_or_update(resource_group, + name, + container_group) + + def get_state_exitcode_details(self, resource_group, name): + """ + Get the state and exitcode of a container group + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + :return: A tuple with the state, exitcode, and details. + If the exitcode is unknown 0 is returned. 
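# [Editor's note: illustrative sketch only, not part of this patch.]
# Polling a hypothetical container group and tailing its logs with the helpers
# defined on this hook.
from airflow.contrib.hooks.azure_container_instance_hook import AzureContainerInstanceHook

aci = AzureContainerInstanceHook(conn_id='azure_default')
state, exit_code, details = aci.get_state_exitcode_details('example-rg', 'example-group')
for line in aci.get_logs('example-rg', 'example-group', tail=50):
    print(line, end='')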
+ :rtype: tuple(state,exitcode,details) + """ + current_state = self._get_instance_view(resource_group, name).current_state + return (current_state.state, + current_state.exit_code, + current_state.detail_status) + + def _get_instance_view(self, resource_group, name): + response = self.connection.container_groups.get(resource_group, + name, + raw=False) + return response.containers[0].instance_view + + def get_messages(self, resource_group, name): + """ + Get the messages of a container group + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + :return: A list of the event messages + :rtype: list[str] + """ + instance_view = self._get_instance_view(resource_group, name) + + return [event.message for event in instance_view.events] + + def get_logs(self, resource_group, name, tail=1000): + """ + Get the tail from logs of a container group + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + :param tail: the size of the tail + :type tail: int + :return: A list of log messages + :rtype: list[str] + """ + logs = self.connection.container.list_logs(resource_group, name, name, tail=tail) + return logs.content.splitlines(True) + + def delete(self, resource_group, name): + """ + Delete a container group + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + """ + self.connection.container_groups.delete(resource_group, name) + + def exists(self, resource_group, name): + """ + Test if a container group exists + + :param resource_group: the name of the resource group + :type resource_group: str + :param name: the name of the container group + :type name: str + """ + for container in self.connection.container_groups.list_by_resource_group(resource_group): + if container.name == name: + return True + return False diff --git a/airflow/contrib/hooks/azure_container_registry_hook.py b/airflow/contrib/hooks/azure_container_registry_hook.py new file mode 100644 index 0000000000000..af38c1a94380e --- /dev/null +++ b/airflow/contrib/hooks/azure_container_registry_hook.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.base_hook import BaseHook +from azure.mgmt.containerinstance.models import ImageRegistryCredential + + +class AzureContainerRegistryHook(BaseHook): + """ + A hook to communicate with a Azure Container Registry. 
+ + :param conn_id: connection id of a service principal which will be used + to start the container instance + :type conn_id: str + """ + + def __init__(self, conn_id='azure_registry'): + self.conn_id = conn_id + self.connection = self.get_conn() + + def get_conn(self): + conn = self.get_connection(self.conn_id) + return ImageRegistryCredential(server=conn.host, username=conn.login, password=conn.password) diff --git a/airflow/contrib/hooks/azure_container_volume_hook.py b/airflow/contrib/hooks/azure_container_volume_hook.py new file mode 100644 index 0000000000000..c81c5190776c6 --- /dev/null +++ b/airflow/contrib/hooks/azure_container_volume_hook.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.base_hook import BaseHook +from azure.mgmt.containerinstance.models import (Volume, + AzureFileVolume) + + +class AzureContainerVolumeHook(BaseHook): + """ + A hook which wraps an Azure Volume. + + :param wasb_conn_id: connection id of a Azure storage account of + which file shares should be mounted + :type wasb_conn_id: str + """ + + def __init__(self, wasb_conn_id='wasb_default'): + self.conn_id = wasb_conn_id + + def get_storagekey(self): + conn = self.get_connection(self.conn_id) + service_options = conn.extra_dejson + + if 'connection_string' in service_options: + for keyvalue in service_options['connection_string'].split(";"): + key, value = keyvalue.split("=", 1) + if key == "AccountKey": + return value + return conn.password + + def get_file_volume(self, mount_name, share_name, + storage_account_name, read_only=False): + return Volume(name=mount_name, + azure_file=AzureFileVolume(share_name=share_name, + storage_account_name=storage_account_name, + read_only=read_only, + storage_account_key=self.get_storagekey())) diff --git a/airflow/contrib/hooks/azure_cosmos_hook.py b/airflow/contrib/hooks/azure_cosmos_hook.py new file mode 100644 index 0000000000000..01b4007b0308f --- /dev/null +++ b/airflow/contrib/hooks/azure_cosmos_hook.py @@ -0,0 +1,287 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +import azure.cosmos.cosmos_client as cosmos_client +from azure.cosmos.errors import HTTPFailure +import uuid + +from airflow.exceptions import AirflowBadRequest +from airflow.hooks.base_hook import BaseHook + + +class AzureCosmosDBHook(BaseHook): + """ + Interacts with Azure CosmosDB. + + login should be the endpoint uri, password should be the master key + optionally, you can use the following extras to default these values + {"database_name": "", "collection_name": "COLLECTION_NAME"}. + + :param azure_cosmos_conn_id: Reference to the Azure CosmosDB connection. + :type azure_cosmos_conn_id: str + """ + + def __init__(self, azure_cosmos_conn_id='azure_cosmos_default'): + self.conn_id = azure_cosmos_conn_id + self.connection = self.get_connection(self.conn_id) + self.extras = self.connection.extra_dejson + + self.endpoint_uri = self.connection.login + self.master_key = self.connection.password + self.default_database_name = self.extras.get('database_name') + self.default_collection_name = self.extras.get('collection_name') + self.cosmos_client = None + + def get_conn(self): + """ + Return a cosmos db client. + """ + if self.cosmos_client is not None: + return self.cosmos_client + + # Initialize the Python Azure Cosmos DB client + self.cosmos_client = cosmos_client.CosmosClient(self.endpoint_uri, {'masterKey': self.master_key}) + + return self.cosmos_client + + def __get_database_name(self, database_name=None): + db_name = database_name + if db_name is None: + db_name = self.default_database_name + + if db_name is None: + raise AirflowBadRequest("Database name must be specified") + + return db_name + + def __get_collection_name(self, collection_name=None): + coll_name = collection_name + if coll_name is None: + coll_name = self.default_collection_name + + if coll_name is None: + raise AirflowBadRequest("Collection name must be specified") + + return coll_name + + def does_collection_exist(self, collection_name, database_name=None): + """ + Checks if a collection exists in CosmosDB. + """ + if collection_name is None: + raise AirflowBadRequest("Collection name cannot be None.") + + existing_container = list(self.get_conn().QueryContainers( + get_database_link(self.__get_database_name(database_name)), { + "query": "SELECT * FROM r WHERE r.id=@id", + "parameters": [ + {"name": "@id", "value": collection_name} + ] + })) + if len(existing_container) == 0: + return False + + return True + + def create_collection(self, collection_name, database_name=None): + """ + Creates a new collection in the CosmosDB database. + """ + if collection_name is None: + raise AirflowBadRequest("Collection name cannot be None.") + + # We need to check to see if this container already exists so we don't try + # to create it twice + existing_container = list(self.get_conn().QueryContainers( + get_database_link(self.__get_database_name(database_name)), { + "query": "SELECT * FROM r WHERE r.id=@id", + "parameters": [ + {"name": "@id", "value": collection_name} + ] + })) + + # Only create if we did not find it already existing + if len(existing_container) == 0: + self.get_conn().CreateContainer( + get_database_link(self.__get_database_name(database_name)), + {"id": collection_name}) + + def does_database_exist(self, database_name): + """ + Checks if a database exists in CosmosDB. 
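# [Editor's note: illustrative sketch only, not part of this patch.]
# End-to-end use of the new AzureCosmosDBHook with hypothetical names; the
# database/collection defaults may also come from the connection's "extra" JSON.
from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook

cosmos = AzureCosmosDBHook(azure_cosmos_conn_id='azure_cosmos_default')
if not cosmos.does_database_exist('example_db'):
    cosmos.create_database('example_db')
cosmos.create_collection('events', database_name='example_db')
cosmos.upsert_document({'event': 'run_finished'},
                       database_name='example_db', collection_name='events')
docs = cosmos.get_documents("SELECT * FROM c WHERE c.event = 'run_finished'",
                            database_name='example_db', collection_name='events')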
+ """ + if database_name is None: + raise AirflowBadRequest("Database name cannot be None.") + + existing_database = list(self.get_conn().QueryDatabases({ + "query": "SELECT * FROM r WHERE r.id=@id", + "parameters": [ + {"name": "@id", "value": database_name} + ] + })) + if len(existing_database) == 0: + return False + + return True + + def create_database(self, database_name): + """ + Creates a new database in CosmosDB. + """ + if database_name is None: + raise AirflowBadRequest("Database name cannot be None.") + + # We need to check to see if this database already exists so we don't try + # to create it twice + existing_database = list(self.get_conn().QueryDatabases({ + "query": "SELECT * FROM r WHERE r.id=@id", + "parameters": [ + {"name": "@id", "value": database_name} + ] + })) + + # Only create if we did not find it already existing + if len(existing_database) == 0: + self.get_conn().CreateDatabase({"id": database_name}) + + def delete_database(self, database_name): + """ + Deletes an existing database in CosmosDB. + """ + if database_name is None: + raise AirflowBadRequest("Database name cannot be None.") + + self.get_conn().DeleteDatabase(get_database_link(database_name)) + + def delete_collection(self, collection_name, database_name=None): + """ + Deletes an existing collection in the CosmosDB database. + """ + if collection_name is None: + raise AirflowBadRequest("Collection name cannot be None.") + + self.get_conn().DeleteContainer( + get_collection_link(self.__get_database_name(database_name), collection_name)) + + def upsert_document(self, document, database_name=None, collection_name=None, document_id=None): + """ + Inserts a new document (or updates an existing one) into an existing + collection in the CosmosDB database. + """ + # Assign unique ID if one isn't provided + if document_id is None: + document_id = str(uuid.uuid4()) + + if document is None: + raise AirflowBadRequest("You cannot insert a None document") + + # Add document id if isn't found + if 'id' in document: + if document['id'] is None: + document['id'] = document_id + else: + document['id'] = document_id + + created_document = self.get_conn().CreateItem( + get_collection_link( + self.__get_database_name(database_name), + self.__get_collection_name(collection_name)), + document) + + return created_document + + def insert_documents(self, documents, database_name=None, collection_name=None): + """ + Insert a list of new documents into an existing collection in the CosmosDB database. + """ + if documents is None: + raise AirflowBadRequest("You cannot insert empty documents") + + created_documents = [] + for single_document in documents: + created_documents.append( + self.get_conn().CreateItem( + get_collection_link( + self.__get_database_name(database_name), + self.__get_collection_name(collection_name)), + single_document)) + + return created_documents + + def delete_document(self, document_id, database_name=None, collection_name=None): + """ + Delete an existing document out of a collection in the CosmosDB database. + """ + if document_id is None: + raise AirflowBadRequest("Cannot delete a document without an id") + + self.get_conn().DeleteItem( + get_document_link( + self.__get_database_name(database_name), + self.__get_collection_name(collection_name), + document_id)) + + def get_document(self, document_id, database_name=None, collection_name=None): + """ + Get a document from an existing collection in the CosmosDB database. 
+ """ + if document_id is None: + raise AirflowBadRequest("Cannot get a document without an id") + + try: + return self.get_conn().ReadItem( + get_document_link( + self.__get_database_name(database_name), + self.__get_collection_name(collection_name), + document_id)) + except HTTPFailure: + return None + + def get_documents(self, sql_string, database_name=None, collection_name=None, partition_key=None): + """ + Get a list of documents from an existing collection in the CosmosDB database via SQL query. + """ + if sql_string is None: + raise AirflowBadRequest("SQL query string cannot be None") + + # Query them in SQL + query = {'query': sql_string} + + try: + result_iterable = self.get_conn().QueryItems( + get_collection_link( + self.__get_database_name(database_name), + self.__get_collection_name(collection_name)), + query, + partition_key) + + return list(result_iterable) + except HTTPFailure: + return None + + +def get_database_link(database_id): + return "dbs/" + database_id + + +def get_collection_link(database_id, collection_id): + return get_database_link(database_id) + "/colls/" + collection_id + + +def get_document_link(database_id, collection_id, document_id): + return get_collection_link(database_id, collection_id) + "/docs/" + document_id diff --git a/airflow/contrib/hooks/azure_data_lake_hook.py b/airflow/contrib/hooks/azure_data_lake_hook.py index 1a02d78f27b72..049fadbb78274 100644 --- a/airflow/contrib/hooks/azure_data_lake_hook.py +++ b/airflow/contrib/hooks/azure_data_lake_hook.py @@ -59,7 +59,7 @@ def check_for_file(self, file_path): :param file_path: Path and name of the file. :type file_path: str :return: True if the file exists, False otherwise. - :rtype bool + :rtype: bool """ try: files = self.connection.glob(file_path, details=False, invalidate_cache=True) @@ -77,7 +77,7 @@ def upload_file(self, local_path, remote_path, nthreads=64, overwrite=True, are not supported. :type local_path: str :param remote_path: Remote path to upload to; if multiple files, this is the - dircetory root to write within. + directory root to write within. :type remote_path: str :param nthreads: Number of threads to use. If None, uses the number of cores. :type nthreads: int diff --git a/airflow/contrib/hooks/azure_fileshare_hook.py b/airflow/contrib/hooks/azure_fileshare_hook.py index edabc17293606..8afa1540d78b4 100644 --- a/airflow/contrib/hooks/azure_fileshare_hook.py +++ b/airflow/contrib/hooks/azure_fileshare_hook.py @@ -56,7 +56,7 @@ def check_for_directory(self, share_name, directory_name, **kwargs): `FileService.exists()` takes. :type kwargs: object :return: True if the file exists, False otherwise. - :rtype bool + :rtype: bool """ return self.connection.exists(share_name, directory_name, **kwargs) @@ -75,7 +75,7 @@ def check_for_file(self, share_name, directory_name, file_name, **kwargs): `FileService.exists()` takes. :type kwargs: object :return: True if the file exists, False otherwise. - :rtype bool + :rtype: bool """ return self.connection.exists(share_name, directory_name, file_name, **kwargs) @@ -92,7 +92,7 @@ def list_directories_and_files(self, share_name, directory_name=None, **kwargs): `FileService.list_directories_and_files()` takes. 
:type kwargs: object :return: A list of files and directories - :rtype list + :rtype: list """ return self.connection.list_directories_and_files(share_name, directory_name, @@ -100,7 +100,7 @@ def list_directories_and_files(self, share_name, directory_name=None, **kwargs): def create_directory(self, share_name, directory_name, **kwargs): """ - Create a new direcotry on a Azure File Share. + Create a new directory on a Azure File Share. :param share_name: Name of the share. :type share_name: str @@ -110,7 +110,7 @@ def create_directory(self, share_name, directory_name, **kwargs): `FileService.create_directory()` takes. :type kwargs: object :return: A list of files and directories - :rtype list + :rtype: list """ return self.connection.create_directory(share_name, directory_name, **kwargs) diff --git a/airflow/contrib/hooks/bigquery_hook.py b/airflow/contrib/hooks/bigquery_hook.py index b452f51c22c69..5744e3b1756a4 100644 --- a/airflow/contrib/hooks/bigquery_hook.py +++ b/airflow/contrib/hooks/bigquery_hook.py @@ -23,7 +23,10 @@ """ import time +import six from builtins import range +from copy import deepcopy +from six import iteritems from past.builtins import basestring @@ -31,8 +34,8 @@ from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from airflow.hooks.dbapi_hook import DbApiHook from airflow.utils.log.logging_mixin import LoggingMixin -from apiclient.discovery import HttpError, build -from googleapiclient import errors +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError from pandas_gbq.gbq import \ _check_google_client_version as gbq_check_google_client_version from pandas_gbq import read_gbq @@ -41,7 +44,7 @@ from pandas_gbq.gbq import GbqConnector -class BigQueryHook(GoogleCloudBaseHook, DbApiHook, LoggingMixin): +class BigQueryHook(GoogleCloudBaseHook, DbApiHook): """ Interact with BigQuery. This hook uses the Google Cloud Platform connection. @@ -51,10 +54,12 @@ class BigQueryHook(GoogleCloudBaseHook, DbApiHook, LoggingMixin): def __init__(self, bigquery_conn_id='bigquery_default', delegate_to=None, - use_legacy_sql=True): + use_legacy_sql=True, + location=None): super(BigQueryHook, self).__init__( gcp_conn_id=bigquery_conn_id, delegate_to=delegate_to) self.use_legacy_sql = use_legacy_sql + self.location = location def get_conn(self): """ @@ -65,7 +70,9 @@ def get_conn(self): return BigQueryConnection( service=service, project_id=project, - use_legacy_sql=self.use_legacy_sql) + use_legacy_sql=self.use_legacy_sql, + location=self.location, + ) def get_service(self): """ @@ -93,13 +100,13 @@ def get_pandas_df(self, sql, parameters=None, dialect=None): https://github.com/pydata/pandas/issues/6900 :param sql: The BigQuery SQL to execute. - :type sql: string + :type sql: str :param parameters: The parameters to render the SQL query with (not used, leave to override superclass method) :type parameters: mapping or iterable :param dialect: Dialect of BigQuery SQL – legacy SQL or standard SQL defaults to use `self.use_legacy_sql` if not specified - :type dialect: string in {'legacy', 'standard'} + :type dialect: str in {'legacy', 'standard'} """ if dialect is None: dialect = 'legacy' if self.use_legacy_sql else 'standard' @@ -116,12 +123,12 @@ def table_exists(self, project_id, dataset_id, table_id): :param project_id: The Google cloud project in which to look for the table. The connection supplied to the hook must provide access to the specified project. 
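# [Editor's note: illustrative sketch only, not part of this patch.]
# The hook now accepts a `location`, forwarded to the connection and cursor;
# project, dataset and table names below are hypothetical.
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

bq = BigQueryHook(bigquery_conn_id='bigquery_default', use_legacy_sql=False, location='EU')
if bq.table_exists('example-project', 'example_dataset', 'example_table'):
    df = bq.get_pandas_df(
        'SELECT COUNT(*) AS n FROM `example-project.example_dataset.example_table`')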
- :type project_id: string + :type project_id: str :param dataset_id: The name of the dataset in which to look for the table. - :type dataset_id: string + :type dataset_id: str :param table_id: The name of the table to check the existence of. - :type table_id: string + :type table_id: str """ service = self.get_service() try: @@ -129,7 +136,7 @@ def table_exists(self, project_id, dataset_id, table_id): projectId=project_id, datasetId=dataset_id, tableId=table_id).execute() return True - except errors.HttpError as e: + except HttpError as e: if e.resp['status'] == '404': return False raise @@ -195,21 +202,34 @@ class BigQueryBaseCursor(LoggingMixin): PEP 249 cursor isn't needed. """ - def __init__(self, service, project_id, use_legacy_sql=True): + def __init__(self, + service, + project_id, + use_legacy_sql=True, + api_resource_configs=None, + location=None): + self.service = service self.project_id = project_id self.use_legacy_sql = use_legacy_sql + if api_resource_configs: + _validate_value("api_resource_configs", api_resource_configs, dict) + self.api_resource_configs = api_resource_configs \ + if api_resource_configs else {} self.running_job_id = None + self.location = location def create_empty_table(self, project_id, dataset_id, table_id, schema_fields=None, - time_partitioning={} - ): + time_partitioning=None, + labels=None, + view=None): """ Creates a new, empty table in the dataset. + To create a view, which is defined by a SQL query, parse a dictionary to 'view' kwarg :param project_id: The project to create the table into. :type project_id: str @@ -218,23 +238,37 @@ def create_empty_table(self, :param table_id: The Name of the table to be created. :type table_id: str :param schema_fields: If set, the schema field list as defined here: - https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema + :type schema_fields: list + :param labels: a dictionary containing labels for the table, passed to BigQuery + :type labels: dict **Example**: :: schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}] - :type schema_fields: list :param time_partitioning: configure optional time partitioning fields i.e. partition by field, type and expiration as per API specifications. .. seealso:: - https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning :type time_partitioning: dict + :param view: [Optional] A dictionary containing definition for the view. 
+ If set, it will create a view instead of a table: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#view + :type view: dict - :return: + **Example**: :: + + view = { + "query": "SELECT * FROM `test-project-id.test_dataset_id.test_table_prefix*` LIMIT 1000", + "useLegacySql": False + } + + :return: None """ + project_id = project_id if project_id is not None else self.project_id table_resource = { @@ -249,6 +283,12 @@ def create_empty_table(self, if time_partitioning: table_resource['timePartitioning'] = time_partitioning + if labels: + table_resource['labels'] = labels + + if view: + table_resource['view'] = view + self.log.info('Creating Table %s:%s.%s', project_id, dataset_id, table_id) @@ -280,7 +320,8 @@ def create_external_table(self, quote_character=None, allow_quoted_newlines=False, allow_jagged_rows=False, - src_fmt_configs={} + src_fmt_configs=None, + labels=None ): """ Creates a new external table in the dataset with the data in Google @@ -291,11 +332,11 @@ def create_external_table(self, for more details about these parameters. :param external_project_dataset_table: - The dotted (.|:).($) BigQuery + The dotted ``(.|:).
($)`` BigQuery table name to create external table. - If is not included, project will be the + If ```` is not included, project will be the project defined in the connection json. - :type external_project_dataset_table: string + :type external_project_dataset_table: str :param schema_fields: The schema field list as defined here: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#resource :type schema_fields: list @@ -304,7 +345,7 @@ def create_external_table(self, per-object name can be used. :type source_uris: list :param source_format: File format to export. - :type source_format: string + :type source_format: str :param autodetect: Try to detect schema and format options automatically. Any option specified explicitly will be honored. :type autodetect: bool @@ -312,8 +353,8 @@ def create_external_table(self, Possible values include GZIP and NONE. The default value is NONE. This setting is ignored for Google Cloud Bigtable, - Google Cloud Datastore backups and Avro formats. - :type compression: string + Google Cloud Datastore backups and Avro formats. + :type compression: str :param ignore_unknown_values: [Optional] Indicates if BigQuery should allow extra values that are not represented in the table schema. If true, the extra values are ignored. If false, records with extra columns @@ -326,13 +367,13 @@ def create_external_table(self, :param skip_leading_rows: Number of rows to skip when loading from a CSV. :type skip_leading_rows: int :param field_delimiter: The delimiter to use when loading from a CSV. - :type field_delimiter: string + :type field_delimiter: str :param quote_character: The value that is used to quote data sections in a CSV file. - :type quote_character: string + :type quote_character: str :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false). - :type allow_quoted_newlines: boolean + :type allow_quoted_newlines: bool :param allow_jagged_rows: Accept rows that are missing trailing optional columns. The missing values are treated as nulls. If false, records with missing trailing columns are treated as bad records, and if there are too many bad @@ -341,8 +382,12 @@ def create_external_table(self, :type allow_jagged_rows: bool :param src_fmt_configs: configure optional fields specific to the source format :type src_fmt_configs: dict + :param labels: a dictionary containing labels for the table, passed to BigQuery + :type labels: dict """ + if src_fmt_configs is None: + src_fmt_configs = {} project_id, dataset_id, external_table_id = \ _split_tablename(table_input=external_project_dataset_table, default_project_id=self.project_id, @@ -439,6 +484,9 @@ def create_external_table(self, table_resource['externalDataConfiguration'][src_fmt_to_param_mapping[ source_format]] = src_fmt_configs + if labels: + table_resource['labels'] = labels + try: self.service.tables().insert( projectId=project_id, @@ -454,22 +502,134 @@ def create_external_table(self, 'BigQuery job failed. Error was: {}'.format(err.content) ) + def patch_table(self, + dataset_id, + table_id, + project_id=None, + description=None, + expiration_time=None, + external_data_configuration=None, + friendly_name=None, + labels=None, + schema=None, + time_partitioning=None, + view=None, + require_partition_filter=None): + """ + Patch information in an existing table. + It only updates fileds that are provided in the request object. 
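# [Editor's note: illustrative sketch only, not part of this patch.]
# Patching table metadata with the new cursor method, assuming the hook
# connection's DB-API cursor exposes it; identifiers and labels are hypothetical.
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

cursor = BigQueryHook(bigquery_conn_id='bigquery_default').get_conn().cursor()
cursor.patch_table(
    dataset_id='example_dataset',
    table_id='example_table',
    description='Nightly aggregates',
    labels={'team': 'data-eng'},
)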
+ + Reference: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables/patch + + :param dataset_id: The dataset containing the table to be patched. + :type dataset_id: str + :param table_id: The Name of the table to be patched. + :type table_id: str + :param project_id: The project containing the table to be patched. + :type project_id: str + :param description: [Optional] A user-friendly description of this table. + :type description: str + :param expiration_time: [Optional] The time when this table expires, + in milliseconds since the epoch. + :type expiration_time: int + :param external_data_configuration: [Optional] A dictionary containing + properties of a table stored outside of BigQuery. + :type external_data_configuration: dict + :param friendly_name: [Optional] A descriptive name for this table. + :type friendly_name: str + :param labels: [Optional] A dictionary containing labels associated with this table. + :type labels: dict + :param schema: [Optional] If set, the schema field list as defined here: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema + The supported schema modifications and unsupported schema modification are listed here: + https://cloud.google.com/bigquery/docs/managing-table-schemas + **Example**: :: + + schema=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}] + + :type schema: list + :param time_partitioning: [Optional] A dictionary containing time-based partitioning + definition for the table. + :type time_partitioning: dict + :param view: [Optional] A dictionary containing definition for the view. + If set, it will patch a view instead of a table: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#view + **Example**: :: + + view = { + "query": "SELECT * FROM `test-project-id.test_dataset_id.test_table_prefix*` LIMIT 500", + "useLegacySql": False + } + + :type view: dict + :param require_partition_filter: [Optional] If true, queries over the this table require a + partition filter. If false, queries over the table + :type require_partition_filter: bool + + """ + + project_id = project_id if project_id is not None else self.project_id + + table_resource = {} + + if description is not None: + table_resource['description'] = description + if expiration_time is not None: + table_resource['expirationTime'] = expiration_time + if external_data_configuration: + table_resource['externalDataConfiguration'] = external_data_configuration + if friendly_name is not None: + table_resource['friendlyName'] = friendly_name + if labels: + table_resource['labels'] = labels + if schema: + table_resource['schema'] = {'fields': schema} + if time_partitioning: + table_resource['timePartitioning'] = time_partitioning + if view: + table_resource['view'] = view + if require_partition_filter is not None: + table_resource['requirePartitionFilter'] = require_partition_filter + + self.log.info('Patching Table %s:%s.%s', + project_id, dataset_id, table_id) + + try: + self.service.tables().patch( + projectId=project_id, + datasetId=dataset_id, + tableId=table_id, + body=table_resource).execute() + + self.log.info('Table patched successfully: %s:%s.%s', + project_id, dataset_id, table_id) + + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. 
Error was: {}'.format(err.content) + ) + def run_query(self, bql=None, sql=None, - destination_dataset_table=False, + destination_dataset_table=None, write_disposition='WRITE_EMPTY', allow_large_results=False, - flatten_results=False, - udf_config=False, + flatten_results=None, + udf_config=None, use_legacy_sql=None, maximum_billing_tier=None, maximum_bytes_billed=None, create_disposition='CREATE_IF_NEEDED', query_params=None, + labels=None, schema_update_options=(), priority='INTERACTIVE', - time_partitioning={}): + time_partitioning=None, + api_resource_configs=None, + cluster_fields=None, + location=None): """ Executes a BigQuery SQL query. Optionally persists results in a BigQuery table. See here: @@ -480,31 +640,38 @@ def run_query(self, :param bql: (Deprecated. Use `sql` parameter instead) The BigQuery SQL to execute. - :type bql: string + :type bql: str :param sql: The BigQuery SQL to execute. - :type sql: string - :param destination_dataset_table: The dotted .
+ :type sql: str + :param destination_dataset_table: The dotted ``.
`` BigQuery table to save the query results. - :type destination_dataset_table: string + :type destination_dataset_table: str :param write_disposition: What to do if the table already exists in BigQuery. - :type write_disposition: string + :type write_disposition: str :param allow_large_results: Whether to allow large results. - :type allow_large_results: boolean + :type allow_large_results: bool :param flatten_results: If true and query uses legacy SQL dialect, flattens all nested and repeated fields in the query results. ``allowLargeResults`` must be true if this is set to false. For standard SQL queries, this flag is ignored and results are never flattened. - :type flatten_results: boolean + :type flatten_results: bool :param udf_config: The User Defined Function configuration for the query. See https://cloud.google.com/bigquery/user-defined-functions for details. + :type udf_config: list :param use_legacy_sql: Whether to use legacy SQL (true) or standard SQL (false). If `None`, defaults to `self.use_legacy_sql`. - :type use_legacy_sql: boolean - :type udf_config: list + :type use_legacy_sql: bool + :param api_resource_configs: a dictionary that contain params + 'configuration' applied for Google BigQuery Jobs API: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs + for example, {'query': {'useQueryCache': False}}. You could use it + if you need to provide some params that are not supported by the + BigQueryHook like args. + :type api_resource_configs: dict :param maximum_billing_tier: Positive integer that serves as a multiplier of the basic price. - :type maximum_billing_tier: integer + :type maximum_billing_tier: int :param maximum_bytes_billed: Limits the bytes billed for this job. Queries that will have bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be @@ -512,28 +679,55 @@ def run_query(self, :type maximum_bytes_billed: float :param create_disposition: Specifies whether the job is allowed to create new tables. - :type create_disposition: string - :param query_params a dictionary containing query parameter types and + :type create_disposition: str + :param query_params: a list of dictionary containing query parameter types and values, passed to BigQuery - :type query_params: dict - :param schema_update_options: Allows the schema of the desitination + :type query_params: list + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict + :param schema_update_options: Allows the schema of the destination table to be updated as a side effect of the query job. :type schema_update_options: tuple :param priority: Specifies a priority for the query. Possible values include INTERACTIVE and BATCH. The default value is INTERACTIVE. - :type priority: string + :type priority: str :param time_partitioning: configure optional time partitioning fields i.e. - partition by field, type and - expiration as per API specifications. Note that 'field' is not available in - conjunction with dataset.table$partition. + partition by field, type and expiration as per API specifications. :type time_partitioning: dict - + :param cluster_fields: Request that the result of this query be stored sorted + by one or more columns. This is only available in combination with + time_partitioning. The order of columns given determines the sort order. + :type cluster_fields: list[str] + :param location: The geographic location of the job. Required except for + US and EU. 
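# [Editor's note: illustrative sketch only, not part of this patch.]
# run_query() now also accepts `api_resource_configs`, `cluster_fields` and
# `location`; a hypothetical call mixing explicit arguments with the docstring's
# useQueryCache override passed through the raw jobs-API configuration.
from airflow.contrib.hooks.bigquery_hook import BigQueryHook

cursor = BigQueryHook(bigquery_conn_id='bigquery_default',
                      use_legacy_sql=False).get_conn().cursor()
cursor.run_query(
    sql='SELECT * FROM `example-project.example_dataset.example_table`',
    destination_dataset_table='example_dataset.example_results',
    write_disposition='WRITE_TRUNCATE',
    labels={'team': 'data-eng'},
    api_resource_configs={'query': {'useQueryCache': False}},
    location='EU',
)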
See details at + https://cloud.google.com/bigquery/docs/locations#specifying_your_location + :type location: str """ - # TODO remove `bql` in Airflow 2.0 - Jira: [AIRFLOW-2513] + if time_partitioning is None: + time_partitioning = {} + + if location: + self.location = location + + if not api_resource_configs: + api_resource_configs = self.api_resource_configs + else: + _validate_value('api_resource_configs', + api_resource_configs, dict) + configuration = deepcopy(api_resource_configs) + if 'query' not in configuration: + configuration['query'] = {} + + else: + _validate_value("api_resource_configs['query']", + configuration['query'], dict) + sql = bql if sql is None else sql + # TODO remove `bql` in Airflow 2.0 - Jira: [AIRFLOW-2513] if bql: import warnings warnings.warn('Deprecated parameter `bql` used in ' @@ -544,88 +738,113 @@ def run_query(self, 'Airflow.', category=DeprecationWarning) - if sql is None: - raise TypeError('`BigQueryBaseCursor.run_query` missing 1 required ' - 'positional argument: `sql`') + if sql is None and not configuration['query'].get('query', None): + raise TypeError('`BigQueryBaseCursor.run_query` ' + 'missing 1 required positional argument: `sql`') # BigQuery also allows you to define how you want a table's schema to change # as a side effect of a query job # for more details: # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query.schemaUpdateOptions + allowed_schema_update_options = [ 'ALLOW_FIELD_ADDITION', "ALLOW_FIELD_RELAXATION" ] - if not set(allowed_schema_update_options).issuperset( - set(schema_update_options)): - raise ValueError( - "{0} contains invalid schema update options. " - "Please only use one or more of the following options: {1}" - .format(schema_update_options, allowed_schema_update_options)) - if use_legacy_sql is None: - use_legacy_sql = self.use_legacy_sql + if not set(allowed_schema_update_options + ).issuperset(set(schema_update_options)): + raise ValueError("{0} contains invalid schema update options. " + "Please only use one or more of the following " + "options: {1}" + .format(schema_update_options, + allowed_schema_update_options)) - configuration = { - 'query': { - 'query': sql, - 'useLegacySql': use_legacy_sql, - 'maximumBillingTier': maximum_billing_tier, - 'maximumBytesBilled': maximum_bytes_billed, - 'priority': priority - } - } + if schema_update_options: + if write_disposition not in ["WRITE_APPEND", "WRITE_TRUNCATE"]: + raise ValueError("schema_update_options is only " + "allowed if write_disposition is " + "'WRITE_APPEND' or 'WRITE_TRUNCATE'.") if destination_dataset_table: - assert '.' in destination_dataset_table, ( - 'Expected destination_dataset_table in the format of ' - '.
. Got: {}').format(destination_dataset_table) destination_project, destination_dataset, destination_table = \ _split_tablename(table_input=destination_dataset_table, default_project_id=self.project_id) - configuration['query'].update({ - 'allowLargeResults': allow_large_results, - 'flattenResults': flatten_results, - 'writeDisposition': write_disposition, - 'createDisposition': create_disposition, - 'destinationTable': { - 'projectId': destination_project, - 'datasetId': destination_dataset, - 'tableId': destination_table, - } - }) - if udf_config: - assert isinstance(udf_config, list) - configuration['query'].update({ - 'userDefinedFunctionResources': udf_config - }) - if query_params: - if self.use_legacy_sql: - raise ValueError("Query paramaters are not allowed when using " - "legacy SQL") - else: - configuration['query']['queryParameters'] = query_params + destination_dataset_table = { + 'projectId': destination_project, + 'datasetId': destination_dataset, + 'tableId': destination_table, + } - time_partitioning = _cleanse_time_partitioning( - destination_dataset_table, - time_partitioning - ) - if time_partitioning: - configuration['query'].update({ - 'timePartitioning': time_partitioning - }) + if cluster_fields: + cluster_fields = {'fields': cluster_fields} + + query_param_list = [ + (sql, 'query', None, six.string_types), + (priority, 'priority', 'INTERACTIVE', six.string_types), + (use_legacy_sql, 'useLegacySql', self.use_legacy_sql, bool), + (query_params, 'queryParameters', None, list), + (udf_config, 'userDefinedFunctionResources', None, list), + (maximum_billing_tier, 'maximumBillingTier', None, int), + (maximum_bytes_billed, 'maximumBytesBilled', None, float), + (time_partitioning, 'timePartitioning', {}, dict), + (schema_update_options, 'schemaUpdateOptions', None, tuple), + (destination_dataset_table, 'destinationTable', None, dict), + (cluster_fields, 'clustering', None, dict), + ] - if schema_update_options: - if write_disposition not in ["WRITE_APPEND", "WRITE_TRUNCATE"]: - raise ValueError("schema_update_options is only " - "allowed if write_disposition is " - "'WRITE_APPEND' or 'WRITE_TRUNCATE'.") - else: - self.log.info( - "Adding experimental " - "'schemaUpdateOptions': {0}".format(schema_update_options)) - configuration['query'][ - 'schemaUpdateOptions'] = schema_update_options + for param_tuple in query_param_list: + + param, param_name, param_default, param_type = param_tuple + + if param_name not in configuration['query'] and param in [None, {}, ()]: + if param_name == 'timePartitioning': + param_default = _cleanse_time_partitioning( + destination_dataset_table, time_partitioning) + param = param_default + + if param not in [None, {}, ()]: + _api_resource_configs_duplication_check( + param_name, param, configuration['query']) + + configuration['query'][param_name] = param + + # check valid type of provided param, + # it last step because we can get param from 2 sources, + # and first of all need to find it + + _validate_value(param_name, configuration['query'][param_name], + param_type) + + if param_name == 'schemaUpdateOptions' and param: + self.log.info("Adding experimental 'schemaUpdateOptions': " + "%s", schema_update_options) + + if param_name == 'destinationTable': + for key in ['projectId', 'datasetId', 'tableId']: + if key not in configuration['query']['destinationTable']: + raise ValueError( + "Not correct 'destinationTable' in " + "api_resource_configs. 
'destinationTable' " + "must be a dict with {'projectId':'', " + "'datasetId':'', 'tableId':''}") + + configuration['query'].update({ + 'allowLargeResults': allow_large_results, + 'flattenResults': flatten_results, + 'writeDisposition': write_disposition, + 'createDisposition': create_disposition, + }) + + if 'useLegacySql' in configuration['query'] and configuration['query']['useLegacySql'] and\ + 'queryParameters' in configuration['query']: + raise ValueError("Query parameters are not allowed " + "when using legacy SQL") + + if labels: + _api_resource_configs_duplication_check( + 'labels', labels, configuration) + configuration['labels'] = labels return self.run_with_configuration(configuration) @@ -636,7 +855,8 @@ def run_extract( # noqa compression='NONE', export_format='CSV', field_delimiter=',', - print_header=True): + print_header=True, + labels=None): """ Executes a BigQuery extract command to copy data from BigQuery to Google Cloud Storage. See here: @@ -645,22 +865,25 @@ def run_extract( # noqa For more details about these parameters. - :param source_project_dataset_table: The dotted .
+ :param source_project_dataset_table: The dotted ``<dataset>.<table>
`` BigQuery table to use as the source data. - :type source_project_dataset_table: string + :type source_project_dataset_table: str :param destination_cloud_storage_uris: The destination Google Cloud Storage URI (e.g. gs://some-bucket/some-file.txt). Follows convention defined here: https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple :type destination_cloud_storage_uris: list :param compression: Type of compression to use. - :type compression: string + :type compression: str :param export_format: File format to export. - :type export_format: string + :type export_format: str :param field_delimiter: The delimiter to use when extracting to a CSV. - :type field_delimiter: string + :type field_delimiter: str :param print_header: Whether to print a header for a CSV file extract. - :type print_header: boolean + :type print_header: bool + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict """ source_project, source_dataset, source_table = \ @@ -681,6 +904,9 @@ def run_extract( # noqa } } + if labels: + configuration['labels'] = labels + if export_format == 'CSV': # Only set fieldDelimiter and printHeader fields if using CSV. # Google does not like it if you set these fields for other export @@ -694,7 +920,8 @@ def run_copy(self, source_project_dataset_tables, destination_project_dataset_table, write_disposition='WRITE_EMPTY', - create_disposition='CREATE_IF_NEEDED'): + create_disposition='CREATE_IF_NEEDED', + labels=None): """ Executes a BigQuery copy command to copy data from one BigQuery table to another. See here: @@ -704,19 +931,22 @@ def run_copy(self, For more details about these parameters. :param source_project_dataset_tables: One or more dotted - (project:|project.).
+ ``(project:|project.)<dataset>.<table>
`` BigQuery tables to use as the source data. Use a list if there are multiple source tables. - If is not included, project will be the project defined + If ```` is not included, project will be the project defined in the connection json. :type source_project_dataset_tables: list|string :param destination_project_dataset_table: The destination BigQuery - table. Format is: (project:|project.).
- :type destination_project_dataset_table: string + table. Format is: ``(project:|project.)<dataset>.<table>
`` + :type destination_project_dataset_table: str :param write_disposition: The write disposition if the table already exists. - :type write_disposition: string + :type write_disposition: str :param create_disposition: The create disposition if the table doesn't exist. - :type create_disposition: string + :type create_disposition: str + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict """ source_project_dataset_tables = ([ source_project_dataset_tables @@ -754,12 +984,15 @@ def run_copy(self, } } + if labels: + configuration['labels'] = labels + return self.run_with_configuration(configuration) def run_load(self, destination_project_dataset_table, - schema_fields, source_uris, + schema_fields=None, source_format='CSV', create_disposition='CREATE_IF_NEEDED', skip_leading_rows=0, @@ -771,8 +1004,10 @@ def run_load(self, allow_quoted_newlines=False, allow_jagged_rows=False, schema_update_options=(), - src_fmt_configs={}, - time_partitioning={}): + src_fmt_configs=None, + time_partitioning=None, + cluster_fields=None, + autodetect=False): """ Executes a BigQuery load command to load data from Google Cloud Storage to BigQuery. See here: @@ -782,35 +1017,39 @@ def run_load(self, For more details about these parameters. :param destination_project_dataset_table: - The dotted (.|:).
($<partition>) BigQuery - table to load data into. If <project> is not included, project will be the + The dotted ``(<project>.|<project>:)<dataset>.<table>
($)`` BigQuery + table to load data into. If ```` is not included, project will be the project defined in the connection json. If a partition is specified the operator will automatically append the data, create a new partition or create a new DAY partitioned table. - :type destination_project_dataset_table: string + :type destination_project_dataset_table: str :param schema_fields: The schema field list as defined here: https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load + Required if autodetect=False; optional if autodetect=True. :type schema_fields: list + :param autodetect: Attempt to autodetect the schema for CSV and JSON + source files. + :type autodetect: bool :param source_uris: The source Google Cloud Storage URI (e.g. gs://some-bucket/some-file.txt). A single wild per-object name can be used. :type source_uris: list :param source_format: File format to export. - :type source_format: string + :type source_format: str :param create_disposition: The create disposition if the table doesn't exist. - :type create_disposition: string + :type create_disposition: str :param skip_leading_rows: Number of rows to skip when loading from a CSV. :type skip_leading_rows: int :param write_disposition: The write disposition if the table already exists. - :type write_disposition: string + :type write_disposition: str :param field_delimiter: The delimiter to use when loading from a CSV. - :type field_delimiter: string + :type field_delimiter: str :param max_bad_records: The maximum number of bad records that BigQuery can ignore when running the job. :type max_bad_records: int :param quote_character: The value that is used to quote data sections in a CSV file. - :type quote_character: string + :type quote_character: str :param ignore_unknown_values: [Optional] Indicates if BigQuery should allow extra values that are not represented in the table schema. If true, the extra values are ignored. If false, records with extra columns @@ -819,23 +1058,25 @@ def run_load(self, :type ignore_unknown_values: bool :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false). - :type allow_quoted_newlines: boolean + :type allow_quoted_newlines: bool :param allow_jagged_rows: Accept rows that are missing trailing optional columns. The missing values are treated as nulls. If false, records with missing trailing columns are treated as bad records, and if there are too many bad records, an invalid error is returned in the job result. Only applicable when soure_format is CSV. :type allow_jagged_rows: bool - :param schema_update_options: Allows the schema of the desitination + :param schema_update_options: Allows the schema of the destination table to be updated as a side effect of the load job. :type schema_update_options: tuple :param src_fmt_configs: configure optional fields specific to the source format :type src_fmt_configs: dict :param time_partitioning: configure optional time partitioning fields i.e. - partition by field, type and - expiration as per API specifications. Note that 'field' is not available in - conjunction with dataset.table$partition. + partition by field, type and expiration as per API specifications. :type time_partitioning: dict + :param cluster_fields: Request that the result of this load be stored sorted + by one or more columns. This is only available in combination with + time_partitioning. The order of columns given determines the sort order. 
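A usage sketch of ``run_load`` with the new ``autodetect`` and ``cluster_fields`` parameters (the connection id, table name, and GCS URI below are illustrative placeholders, not part of this change)::

    from airflow.contrib.hooks.bigquery_hook import BigQueryHook

    hook = BigQueryHook(bigquery_conn_id='bigquery_default')
    cursor = hook.get_conn().cursor()

    cursor.run_load(
        destination_project_dataset_table='my-project.my_dataset.events',
        source_uris=['gs://my-bucket/events/*.json'],
        source_format='NEWLINE_DELIMITED_JSON',
        autodetect=True,  # schema_fields can now be omitted when autodetect is on
        write_disposition='WRITE_APPEND',
        time_partitioning={'type': 'DAY', 'field': 'event_date'},
        cluster_fields=['customer_id'],  # only valid together with time_partitioning
    )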
+ :type cluster_fields: list[str] """ # bigquery only allows certain source formats @@ -843,6 +1084,14 @@ def run_load(self, # if it's not, we raise a ValueError # Refer to this link for more details: # https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.query.tableDefinitions.(key).sourceFormat + + if schema_fields is None and not autodetect: + raise ValueError( + 'You must either pass a schema or autodetect=True.') + + if src_fmt_configs is None: + src_fmt_configs = {} + source_format = source_format.upper() allowed_formats = [ "CSV", "NEWLINE_DELIMITED_JSON", "AVRO", "GOOGLE_SHEETS", @@ -863,7 +1112,7 @@ def run_load(self, if not set(allowed_schema_update_options).issuperset( set(schema_update_options)): raise ValueError( - "{0} contains invalid schema update options. " + "{0} contains invalid schema update options." "Please only use one or more of the following options: {1}" .format(schema_update_options, allowed_schema_update_options)) @@ -874,6 +1123,7 @@ def run_load(self, configuration = { 'load': { + 'autodetect': autodetect, 'createDisposition': create_disposition, 'destinationTable': { 'projectId': destination_project, @@ -896,6 +1146,9 @@ def run_load(self, 'timePartitioning': time_partitioning }) + if cluster_fields: + configuration['load'].update({'clustering': {'fields': cluster_fields}}) + if schema_fields: configuration['load']['schema'] = {'fields': schema_fields} @@ -906,8 +1159,9 @@ def run_load(self, "'WRITE_APPEND' or 'WRITE_TRUNCATE'.") else: self.log.info( - "Adding experimental " - "'schemaUpdateOptions': {0}".format(schema_update_options)) + "Adding experimental 'schemaUpdateOptions': %s", + schema_update_options + ) configuration['load'][ 'schemaUpdateOptions'] = schema_update_options @@ -936,7 +1190,7 @@ def run_load(self, 'DATASTORE_BACKUP': ['projectionFields'], 'NEWLINE_DELIMITED_JSON': ['autodetect', 'ignoreUnknownValues'], 'PARQUET': ['autodetect', 'ignoreUnknownValues'], - 'AVRO': [], + 'AVRO': ['useAvroLogicalTypes'], } valid_configs = src_fmt_to_configs_mapping[source_format] src_fmt_configs = { @@ -974,12 +1228,18 @@ def run_with_configuration(self, configuration): # Wait for query to finish. keep_polling_job = True - while (keep_polling_job): + while keep_polling_job: try: - job = jobs.get( - projectId=self.project_id, - jobId=self.running_job_id).execute() - if (job['status']['state'] == 'DONE'): + if self.location: + job = jobs.get( + projectId=self.project_id, + jobId=self.running_job_id, + location=self.location).execute() + else: + job = jobs.get( + projectId=self.project_id, + jobId=self.running_job_id).execute() + if job['status']['state'] == 'DONE': keep_polling_job = False # Check if job had errors. if 'errorResult' in job['status']: @@ -999,16 +1259,22 @@ def run_with_configuration(self, configuration): time.sleep(5) else: raise Exception( - 'BigQuery job status check failed. Final error was: %s', - err.resp.status) + 'BigQuery job status check failed. Final error was: {}'. 
+ format(err.resp.status)) return self.running_job_id def poll_job_complete(self, job_id): jobs = self.service.jobs() try: - job = jobs.get(projectId=self.project_id, jobId=job_id).execute() - if (job['status']['state'] == 'DONE'): + if self.location: + job = jobs.get(projectId=self.project_id, + jobId=job_id, + location=self.location).execute() + else: + job = jobs.get(projectId=self.project_id, + jobId=job_id).execute() + if job['status']['state'] == 'DONE': return True except HttpError as err: if err.resp.status in [500, 503]: @@ -1017,8 +1283,8 @@ def poll_job_complete(self, job_id): err.resp.status, job_id) else: raise Exception( - 'BigQuery job status check failed. Final error was: %s', - err.resp.status) + 'BigQuery job status check failed. Final error was: {}'. + format(err.resp.status)) return False def cancel_query(self): @@ -1030,9 +1296,15 @@ def cancel_query(self): not self.poll_job_complete(self.running_job_id)): self.log.info('Attempting to cancel job : %s, %s', self.project_id, self.running_job_id) - jobs.cancel( - projectId=self.project_id, - jobId=self.running_job_id).execute() + if self.location: + jobs.cancel( + projectId=self.project_id, + jobId=self.running_job_id, + location=self.location).execute() + else: + jobs.cancel( + projectId=self.project_id, + jobId=self.running_job_id).execute() else: self.log.info('No running BigQuery jobs to cancel.') return @@ -1042,13 +1314,13 @@ def cancel_query(self): polling_attempts = 0 job_complete = False - while (polling_attempts < max_polling_attempts and not job_complete): + while polling_attempts < max_polling_attempts and not job_complete: polling_attempts = polling_attempts + 1 job_complete = self.poll_job_complete(self.running_job_id) - if (job_complete): + if job_complete: self.log.info('Job successfully canceled: %s, %s', self.project_id, self.running_job_id) - elif (polling_attempts == max_polling_attempts): + elif polling_attempts == max_polling_attempts: self.log.info( "Stopping polling due to timeout. Job with id %s " "has not completed cancel and may or may not finish.", @@ -1112,18 +1384,14 @@ def run_table_delete(self, deletion_dataset_table, is set to True. :param deletion_dataset_table: A dotted - (.|:).
that indicates which table - will be deleted. + ``(<project>.|<project>:)<dataset>.<table>
`` that indicates which table + will be deleted. :type deletion_dataset_table: str :param ignore_if_missing: if True, then return success even if the - requested table does not exist. - :type ignore_if_missing: boolean + requested table does not exist. + :type ignore_if_missing: bool :return: """ - - assert '.' in deletion_dataset_table, ( - 'Expected deletion_dataset_table in the format of ' - '.
. Got: {}').format(deletion_dataset_table) deletion_project, deletion_dataset, deletion_table = \ _split_tablename(table_input=deletion_dataset_table, default_project_id=self.project_id) @@ -1155,7 +1423,7 @@ def run_table_upsert(self, dataset_id, table_resource, project_id=None): https://cloud.google.com/bigquery/docs/reference/v2/tables#resource :type table_resource: dict :param project_id: the project to upsert the table into. If None, - project will be self.project_id. + project will be self.project_id. :return: """ # check to see if the table exists @@ -1209,10 +1477,10 @@ def run_grant_dataset_view_access(self, :param view_table: the table of the view :type view_table: str :param source_project: the project of the source dataset. If None, - self.project_id will be used. + self.project_id will be used. :type source_project: str :param view_project: the project that the view is in. If None, - self.project_id will be used. + self.project_id will be used. :type view_project: str :return: the datasets resource of the source dataset. """ @@ -1251,10 +1519,257 @@ def run_grant_dataset_view_access(self, # if view is already in access, do nothing. self.log.info( 'Table %s:%s.%s already has authorized view access to %s:%s dataset.', - view_project, view_dataset, view_table, source_project, - source_dataset) + view_project, view_dataset, view_table, source_project, source_dataset) return source_dataset_resource + def create_empty_dataset(self, dataset_id="", project_id="", + dataset_reference=None): + """ + Create a new empty dataset: + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/insert + + :param project_id: The name of the project where we want to create + an empty a dataset. Don't need to provide, if projectId in dataset_reference. + :type project_id: str + :param dataset_id: The id of dataset. Don't need to provide, + if datasetId in dataset_reference. + :type dataset_id: str + :param dataset_reference: Dataset reference that could be provided + with request body. More info: + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource + :type dataset_reference: dict + """ + + if dataset_reference: + _validate_value('dataset_reference', dataset_reference, dict) + else: + dataset_reference = {} + + if "datasetReference" not in dataset_reference: + dataset_reference["datasetReference"] = {} + + if not dataset_reference["datasetReference"].get("datasetId") and not dataset_id: + raise ValueError( + "{} not provided datasetId. Impossible to create dataset") + + dataset_required_params = [(dataset_id, "datasetId", ""), + (project_id, "projectId", self.project_id)] + for param_tuple in dataset_required_params: + param, param_name, param_default = param_tuple + if param_name not in dataset_reference['datasetReference']: + if param_default and not param: + self.log.info( + "%s was not specified. 
Will be used default value %s.", + param_name, param_default + ) + param = param_default + dataset_reference['datasetReference'].update( + {param_name: param}) + elif param: + _api_resource_configs_duplication_check( + param_name, param, + dataset_reference['datasetReference'], 'dataset_reference') + + dataset_id = dataset_reference.get("datasetReference").get("datasetId") + dataset_project_id = dataset_reference.get("datasetReference").get( + "projectId") + + self.log.info('Creating Dataset: %s in project: %s ', dataset_id, + dataset_project_id) + + try: + self.service.datasets().insert( + projectId=dataset_project_id, + body=dataset_reference).execute() + self.log.info('Dataset created successfully: In project %s ' + 'Dataset %s', dataset_project_id, dataset_id) + + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. Error was: {}'.format(err.content) + ) + + def delete_dataset(self, project_id, dataset_id): + """ + Delete a dataset of Big query in your project. + :param project_id: The name of the project where we have the dataset . + :type project_id: str + :param dataset_id: The dataset to be delete. + :type dataset_id: str + :return: + """ + project_id = project_id if project_id is not None else self.project_id + self.log.info('Deleting from project: %s Dataset:%s', + project_id, dataset_id) + + try: + self.service.datasets().delete( + projectId=project_id, + datasetId=dataset_id).execute() + self.log.info('Dataset deleted successfully: In project %s ' + 'Dataset %s', project_id, dataset_id) + + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. Error was: {}'.format(err.content) + ) + + def get_dataset(self, dataset_id, project_id=None): + """ + Method returns dataset_resource if dataset exist + and raised 404 error if dataset does not exist + + :param dataset_id: The BigQuery Dataset ID + :type dataset_id: str + :param project_id: The GCP Project ID + :type project_id: str + :return: dataset_resource + + .. seealso:: + For more information, see Dataset Resource content: + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource + """ + + if not dataset_id or not isinstance(dataset_id, str): + raise ValueError("dataset_id argument must be provided and has " + "a type 'str'. You provided: {}".format(dataset_id)) + + dataset_project_id = project_id if project_id else self.project_id + + try: + dataset_resource = self.service.datasets().get( + datasetId=dataset_id, projectId=dataset_project_id).execute() + self.log.info("Dataset Resource: %s", dataset_resource) + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. Error was: {}'.format(err.content)) + + return dataset_resource + + def get_datasets_list(self, project_id=None): + """ + Method returns full list of BigQuery datasets in the current project + + .. 
seealso:: + For more information, see: + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/list + + :param project_id: Google Cloud Project for which you + try to get all datasets + :type project_id: str + :return: datasets_list + + Example of returned datasets_list: :: + + { + "kind":"bigquery#dataset", + "location":"US", + "id":"your-project:dataset_2_test", + "datasetReference":{ + "projectId":"your-project", + "datasetId":"dataset_2_test" + } + }, + { + "kind":"bigquery#dataset", + "location":"US", + "id":"your-project:dataset_1_test", + "datasetReference":{ + "projectId":"your-project", + "datasetId":"dataset_1_test" + } + } + ] + """ + dataset_project_id = project_id if project_id else self.project_id + + try: + datasets_list = self.service.datasets().list( + projectId=dataset_project_id).execute()['datasets'] + self.log.info("Datasets List: %s", datasets_list) + + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. Error was: {}'.format(err.content)) + + return datasets_list + + def insert_all(self, project_id, dataset_id, table_id, + rows, ignore_unknown_values=False, + skip_invalid_rows=False, fail_on_error=False): + """ + Method to stream data into BigQuery one record at a time without needing + to run a load job + + .. seealso:: + For more information, see: + https://cloud.google.com/bigquery/docs/reference/rest/v2/tabledata/insertAll + + :param project_id: The name of the project where we have the table + :type project_id: str + :param dataset_id: The name of the dataset where we have the table + :type dataset_id: str + :param table_id: The name of the table + :type table_id: str + :param rows: the rows to insert + :type rows: list + + **Example or rows**: + rows=[{"json": {"a_key": "a_value_0"}}, {"json": {"a_key": "a_value_1"}}] + + :param ignore_unknown_values: [Optional] Accept rows that contain values + that do not match the schema. The unknown values are ignored. + The default value is false, which treats unknown values as errors. + :type ignore_unknown_values: bool + :param skip_invalid_rows: [Optional] Insert all valid rows of a request, + even if invalid rows exist. The default value is false, which causes + the entire request to fail if any invalid rows exist. + :type skip_invalid_rows: bool + :param fail_on_error: [Optional] Force the task to fail if any errors occur. + The default value is false, which indicates the task should not fail + even if any insertion errors occur. + :type fail_on_error: bool + """ + + dataset_project_id = project_id if project_id else self.project_id + + body = { + "rows": rows, + "ignoreUnknownValues": ignore_unknown_values, + "kind": "bigquery#tableDataInsertAllRequest", + "skipInvalidRows": skip_invalid_rows, + } + + try: + self.log.info( + 'Inserting %s row(s) into Table %s:%s.%s', + len(rows), dataset_project_id, dataset_id, table_id + ) + + resp = self.service.tabledata().insertAll( + projectId=dataset_project_id, datasetId=dataset_id, + tableId=table_id, body=body + ).execute() + + if 'insertErrors' not in resp: + self.log.info( + 'All row(s) inserted successfully: %s:%s.%s', + dataset_project_id, dataset_id, table_id + ) + else: + error_msg = '{} insert error(s) occured: {}:{}.{}. Details: {}'.format( + len(resp['insertErrors']), + dataset_project_id, dataset_id, table_id, resp['insertErrors']) + if fail_on_error: + raise AirflowException( + 'BigQuery job failed. 
Error was: {}'.format(error_msg) + ) + self.log.info(error_msg) + except HttpError as err: + raise AirflowException( + 'BigQuery job failed. Error was: {}'.format(err.content) + ) + class BigQueryCursor(BigQueryBaseCursor): """ @@ -1265,11 +1780,13 @@ class BigQueryCursor(BigQueryBaseCursor): https://github.com/dropbox/PyHive/blob/master/pyhive/common.py """ - def __init__(self, service, project_id, use_legacy_sql=True): + def __init__(self, service, project_id, use_legacy_sql=True, location=None): super(BigQueryCursor, self).__init__( service=service, project_id=project_id, - use_legacy_sql=use_legacy_sql) + use_legacy_sql=use_legacy_sql, + location=location, + ) self.buffersize = None self.page_token = None self.job_id = None @@ -1295,7 +1812,7 @@ def execute(self, operation, parameters=None): Executes a BigQuery query, and returns the job ID. :param operation: The query to execute. - :type operation: string + :type operation: str :param parameters: Parameters to substitute into the query. :type parameters: dict """ @@ -1308,7 +1825,7 @@ def executemany(self, operation, seq_of_parameters): Execute a BigQuery query multiple times with different parameters. :param operation: The query to execute. - :type operation: string + :type operation: str :param seq_of_parameters: List of dictionary parameters to substitute into the query. :type seq_of_parameters: list @@ -1423,7 +1940,7 @@ def _bind_parameters(operation, parameters): """ Helper method that binds parameters to a SQL query. """ # inspired by MySQL Python Connector (conversion.py) string_parameters = {} - for (name, value) in parameters.iteritems(): + for (name, value) in iteritems(parameters): if value is None: string_parameters[name] = 'NULL' elif isinstance(value, basestring): @@ -1456,14 +1973,23 @@ def _bq_cast(string_field, bq_type): elif bq_type == 'FLOAT' or bq_type == 'TIMESTAMP': return float(string_field) elif bq_type == 'BOOLEAN': - assert string_field in set(['true', 'false']) + if string_field not in ['true', 'false']: + raise ValueError("{} must have value 'true' or 'false'".format( + string_field)) return string_field == 'true' else: return string_field def _split_tablename(table_input, default_project_id, var_name=None): - assert default_project_id is not None, "INTERNAL: No default project is specified" + + if '.' not in table_input: + raise ValueError( + 'Expected target table name in the format of ' + '.
. Got: {}'.format(table_input)) + + if not default_project_id: + raise ValueError("INTERNAL: No default project is specified") def var_print(var_name): if var_name is None: @@ -1475,7 +2001,6 @@ def var_print(var_name): raise Exception(('{var}Use either : or . to specify project ' 'got {input}').format( var=var_print(var_name), input=table_input)) - cmpt = table_input.rsplit(':', 1) project_id = None rest = table_input @@ -1493,8 +2018,10 @@ def var_print(var_name): cmpt = rest.split('.') if len(cmpt) == 3: - assert project_id is None, ("{var}Use either : or . to specify project" - ).format(var=var_print(var_name)) + if project_id: + raise ValueError( + "{var}Use either : or . to specify project".format( + var=var_print(var_name))) project_id = cmpt[0] dataset_id = cmpt[1] table_id = cmpt[2] @@ -1510,11 +2037,10 @@ def var_print(var_name): if project_id is None: if var_name is not None: log = LoggingMixin().log - log.info('Project not included in {var}: {input}; ' - 'using project "{project}"'.format( - var=var_name, - input=table_input, - project=default_project_id)) + log.info( + 'Project not included in %s: %s; using project "%s"', + var_name, table_input, default_project_id + ) project_id = default_project_id return project_id, dataset_id, table_id @@ -1522,13 +2048,30 @@ def var_print(var_name): def _cleanse_time_partitioning(destination_dataset_table, time_partitioning_in): # if it is a partitioned table ($ is in the table name) add partition load option + + if time_partitioning_in is None: + time_partitioning_in = {} + time_partitioning_out = {} if destination_dataset_table and '$' in destination_dataset_table: - assert not time_partitioning_in.get('field'), ( - "Cannot specify field partition and partition name " - "(dataset.table$partition) at the same time" - ) time_partitioning_out['type'] = 'DAY' - time_partitioning_out.update(time_partitioning_in) return time_partitioning_out + + +def _validate_value(key, value, expected_type): + """ function to check expected type and raise + error if type is not correct """ + if not isinstance(value, expected_type): + raise TypeError("{} argument must have a type {} not {}".format( + key, expected_type, type(value))) + + +def _api_resource_configs_duplication_check(key, value, config_dict, + config_dict_name='api_resource_configs'): + if key in config_dict and value != config_dict[key]: + raise ValueError("Values of {param_name} param are duplicated. " + "{dict_name} contained {param_name} param " + "in `query` config and {param_name} was also provided " + "with arg to run_query() method. Please remove duplicates." + .format(param_name=key, dict_name=config_dict_name)) diff --git a/airflow/contrib/hooks/cassandra_hook.py b/airflow/contrib/hooks/cassandra_hook.py index 704ba0d8d03e2..08def3a5b775c 100644 --- a/airflow/contrib/hooks/cassandra_hook.py +++ b/airflow/contrib/hooks/cassandra_hook.py @@ -36,14 +36,17 @@ class CassandraHook(BaseHook, LoggingMixin): Port can be specified in the port field of the connection. If SSL is enabled in Cassandra, pass in a dict in the extra field as kwargs for - ``ssl.wrap_socket()``. For example: - { - 'ssl_options' : { - 'ca_certs' : PATH_TO_CA_CERTS - } + ``ssl.wrap_socket()``. For example:: + + { + 'ssl_options' : { + 'ca_certs' : PATH_TO_CA_CERTS } + } + + Default load balancing policy is RoundRobinPolicy. To specify a different + LB policy:: - Default load balancing policy is RoundRobinPolicy. 
To specify a different LB policy: - DCAwareRoundRobinPolicy { 'load_balancing_policy': 'DCAwareRoundRobinPolicy', @@ -158,22 +161,37 @@ def get_lb_policy(policy_name, policy_args): child_policy_args) return TokenAwarePolicy(child_policy) + def table_exists(self, table): + """ + Checks if a table exists in Cassandra + + :param table: Target Cassandra table. + Use dot notation to target a specific keyspace. + :type table: str + """ + keyspace = self.keyspace + if '.' in table: + keyspace, table = table.split('.', 1) + cluster_metadata = self.get_conn().cluster.metadata + return (keyspace in cluster_metadata.keyspaces and + table in cluster_metadata.keyspaces[keyspace].tables) + def record_exists(self, table, keys): """ Checks if a record exists in Cassandra :param table: Target Cassandra table. Use dot notation to target a specific keyspace. - :type table: string + :type table: str :param keys: The keys and their values to check the existence. :type keys: dict """ - keyspace = None + keyspace = self.keyspace if '.' in table: keyspace, table = table.split('.', 1) ks = " AND ".join("{}=%({})s".format(key, key) for key in keys.keys()) cql = "SELECT * FROM {keyspace}.{table} WHERE {keys}".format( - keyspace=(keyspace or self.keyspace), table=table, keys=ks) + keyspace=keyspace, table=table, keys=ks) try: rs = self.get_conn().execute(cql, keys) diff --git a/airflow/contrib/hooks/databricks_hook.py b/airflow/contrib/hooks/databricks_hook.py index 1443ff4740b94..a3cd31617a4c0 100644 --- a/airflow/contrib/hooks/databricks_hook.py +++ b/airflow/contrib/hooks/databricks_hook.py @@ -24,22 +24,21 @@ from airflow.hooks.base_hook import BaseHook from requests import exceptions as requests_exceptions from requests.auth import AuthBase +from time import sleep +from six.moves.urllib import parse as urlparse -from airflow.utils.log.logging_mixin import LoggingMixin - -try: - from urllib import parse as urlparse -except ImportError: - import urlparse - +RESTART_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/restart") +START_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/start") +TERMINATE_CLUSTER_ENDPOINT = ("POST", "api/2.0/clusters/delete") +RUN_NOW_ENDPOINT = ('POST', 'api/2.0/jobs/run-now') SUBMIT_RUN_ENDPOINT = ('POST', 'api/2.0/jobs/runs/submit') GET_RUN_ENDPOINT = ('GET', 'api/2.0/jobs/runs/get') CANCEL_RUN_ENDPOINT = ('POST', 'api/2.0/jobs/runs/cancel') USER_AGENT_HEADER = {'user-agent': 'airflow-{v}'.format(v=__version__)} -class DatabricksHook(BaseHook, LoggingMixin): +class DatabricksHook(BaseHook): """ Interact with Databricks. """ @@ -47,38 +46,47 @@ def __init__( self, databricks_conn_id='databricks_default', timeout_seconds=180, - retry_limit=3): + retry_limit=3, + retry_delay=1.0): """ :param databricks_conn_id: The name of the databricks connection to use. - :type databricks_conn_id: string + :type databricks_conn_id: str :param timeout_seconds: The amount of time in seconds the requests library will wait before timing-out. :type timeout_seconds: int :param retry_limit: The number of times to retry the connection in case of service outages. :type retry_limit: int + :param retry_delay: The number of seconds to wait between retries (it + might be a floating point number). 
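A sketch of constructing the hook with the new ``retry_delay`` parameter (connection id and values are illustrative)::

    from airflow.contrib.hooks.databricks_hook import DatabricksHook

    # Transient failures (connection errors, timeouts, HTTP 5xx) are retried up
    # to retry_limit times, sleeping retry_delay seconds between attempts.
    hook = DatabricksHook(
        databricks_conn_id='databricks_default',
        timeout_seconds=180,
        retry_limit=5,
        retry_delay=2.0,
    )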
+ :type retry_delay: float """ self.databricks_conn_id = databricks_conn_id self.databricks_conn = self.get_connection(databricks_conn_id) self.timeout_seconds = timeout_seconds - assert retry_limit >= 1, 'Retry limit must be greater than equal to 1' + if retry_limit < 1: + raise ValueError('Retry limit must be greater than equal to 1') self.retry_limit = retry_limit + self.retry_delay = retry_delay - def _parse_host(self, host): + @staticmethod + def _parse_host(host): """ The purpose of this function is to be robust to improper connections settings provided by users, specifically in the host field. - For example -- when users supply ``https://xx.cloud.databricks.com`` as the - host, we must strip out the protocol to get the host. - >>> h = DatabricksHook() - >>> assert h._parse_host('https://xx.cloud.databricks.com') == \ - 'xx.cloud.databricks.com' + host, we must strip out the protocol to get the host.:: + + h = DatabricksHook() + assert h._parse_host('https://xx.cloud.databricks.com') == \ + 'xx.cloud.databricks.com' In the case where users supply the correct ``xx.cloud.databricks.com`` as the - host, this function is a no-op. - >>> assert h._parse_host('xx.cloud.databricks.com') == 'xx.cloud.databricks.com' + host, this function is a no-op.:: + + assert h._parse_host('xx.cloud.databricks.com') == 'xx.cloud.databricks.com' + """ urlparse_host = urlparse.urlparse(host).hostname if urlparse_host: @@ -91,8 +99,9 @@ def _parse_host(self, host): def _do_api_call(self, endpoint_info, json): """ Utility function to perform an API call with retries + :param endpoint_info: Tuple of method and endpoint - :type endpoint_info: (string, string) + :type endpoint_info: tuple[string, string] :param json: Parameters for this API call. :type json: dict :return: If the api call returns a OK status code, @@ -117,7 +126,8 @@ def _do_api_call(self, endpoint_info, json): else: raise AirflowException('Unexpected HTTP Method: ' + method) - for attempt_num in range(1, self.retry_limit + 1): + attempt_num = 1 + while True: try: response = request_func( url, @@ -125,21 +135,41 @@ def _do_api_call(self, endpoint_info, json): auth=auth, headers=USER_AGENT_HEADER, timeout=self.timeout_seconds) - if response.status_code == requests.codes.ok: - return response.json() - else: + response.raise_for_status() + return response.json() + except requests_exceptions.RequestException as e: + if not _retryable_error(e): # In this case, the user probably made a mistake. # Don't retry. raise AirflowException('Response: {0}, Status Code: {1}'.format( - response.content, response.status_code)) - except (requests_exceptions.ConnectionError, - requests_exceptions.Timeout) as e: - self.log.error( - 'Attempt %s API Request to Databricks failed with reason: %s', - attempt_num, e - ) - raise AirflowException(('API requests to Databricks failed {} times. ' + - 'Giving up.').format(self.retry_limit)) + e.response.content, e.response.status_code)) + + self._log_request_error(attempt_num, e) + + if attempt_num == self.retry_limit: + raise AirflowException(('API requests to Databricks failed {} times. ' + + 'Giving up.').format(self.retry_limit)) + + attempt_num += 1 + sleep(self.retry_delay) + + def _log_request_error(self, attempt_num, error): + self.log.error( + 'Attempt %s API Request to Databricks failed with reason: %s', + attempt_num, error + ) + + def run_now(self, json): + """ + Utility function to call the ``api/2.0/jobs/run-now`` endpoint. + + :param json: The data used in the body of the request to the ``run-now`` endpoint. 
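A sketch of calling the new ``run_now`` method; the payload fields follow the Databricks ``jobs/run-now`` API, and the job id and parameters shown are hypothetical::

    from airflow.contrib.hooks.databricks_hook import DatabricksHook

    hook = DatabricksHook(databricks_conn_id='databricks_default')
    run_id = hook.run_now({
        'job_id': 42,
        'notebook_params': {'dry-run': 'true'},
    })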
+ :type json: dict + :return: the run_id as a string + :rtype: str + """ + response = self._do_api_call(RUN_NOW_ENDPOINT, json) + return response['run_id'] def submit_run(self, json): """ @@ -148,7 +178,7 @@ def submit_run(self, json): :param json: The data used in the body of the request to the ``submit`` endpoint. :type json: dict :return: the run_id as a string - :rtype: string + :rtype: str """ response = self._do_api_call(SUBMIT_RUN_ENDPOINT, json) return response['run_id'] @@ -172,6 +202,21 @@ def cancel_run(self, run_id): json = {'run_id': run_id} self._do_api_call(CANCEL_RUN_ENDPOINT, json) + def restart_cluster(self, json): + self._do_api_call(RESTART_CLUSTER_ENDPOINT, json) + + def start_cluster(self, json): + self._do_api_call(START_CLUSTER_ENDPOINT, json) + + def terminate_cluster(self, json): + self._do_api_call(TERMINATE_CLUSTER_ENDPOINT, json) + + +def _retryable_error(exception): + return isinstance(exception, requests_exceptions.ConnectionError) \ + or isinstance(exception, requests_exceptions.Timeout) \ + or exception.response is not None and exception.response.status_code >= 500 + RUN_LIFE_CYCLE_STATES = [ 'PENDING', diff --git a/airflow/contrib/hooks/datadog_hook.py b/airflow/contrib/hooks/datadog_hook.py index 3dfeb781aefe0..50209dd112e06 100644 --- a/airflow/contrib/hooks/datadog_hook.py +++ b/airflow/contrib/hooks/datadog_hook.py @@ -35,7 +35,7 @@ class DatadogHook(BaseHook, LoggingMixin): Airflow runs. :param datadog_conn_id: The connection to datadog, containing metadata for api keys. - :param datadog_conn_id: string + :param datadog_conn_id: str """ def __init__(self, datadog_conn_id='datadog_default'): conn = self.get_connection(datadog_conn_id) @@ -50,38 +50,37 @@ def __init__(self, datadog_conn_id='datadog_default'): if self.api_key is None: raise AirflowException("api_key must be specified in the " "Datadog connection details") - if self.app_key is None: - raise AirflowException("app_key must be specified in the " - "Datadog connection details") self.log.info("Setting up api keys for Datadog") - options = { - 'api_key': self.api_key, - 'app_key': self.app_key - } - initialize(**options) + initialize(api_key=self.api_key, app_key=self.app_key) def validate_response(self, response): if response['status'] != 'ok': self.log.error("Datadog returned: %s", response) raise AirflowException("Error status received from Datadog") - def send_metric(self, metric_name, datapoint, tags=None): + def send_metric(self, metric_name, datapoint, tags=None, type_=None, interval=None): """ Sends a single datapoint metric to DataDog :param metric_name: The name of the metric - :type metric_name: string + :type metric_name: str :param datapoint: A single integer or float related to the metric - :type datapoint: integer or float + :type datapoint: int or float :param tags: A list of tags associated with the metric :type tags: list + :param type_: Type of your metric: gauge, rate, or count + :type type_: str + :param interval: If the type of the metric is rate or count, define the corresponding interval + :type interval: int """ response = api.Metric.send( metric=metric_name, points=datapoint, host=self.host, - tags=tags) + tags=tags, + type=type_, + interval=interval) self.validate_response(response) return response @@ -95,7 +94,7 @@ def query_metric(self, function applied to it and returns the results. :param query: The datadog query to execute (see datadog docs) - :type query: string + :type query: str :param from_seconds_ago: How many seconds ago to start querying for. 
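A sketch of sending a rate metric with the new ``type_`` and ``interval`` arguments (metric name and tags are illustrative; a ``datadog_default`` connection with an ``api_key`` is assumed)::

    from airflow.contrib.hooks.datadog_hook import DatadogHook

    hook = DatadogHook(datadog_conn_id='datadog_default')
    hook.send_metric(
        metric_name='my_app.requests',
        datapoint=10,
        tags=['env:dev'],
        type_='rate',   # gauge, rate, or count
        interval=30,    # interval in seconds, used with rate/count metrics
    )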
:type from_seconds_ago: int :param to_seconds_ago: Up to how many seconds ago to query for. @@ -111,31 +110,48 @@ def query_metric(self, self.validate_response(response) return response - def post_event(self, title, text, tags=None, alert_type=None, aggregation_key=None): + def post_event(self, title, text, aggregation_key=None, alert_type=None, date_happened=None, + handle=None, priority=None, related_event_id=None, tags=None, device_name=None): """ Posts an event to datadog (processing finished, potentially alerts, other issues) Think about this as a means to maintain persistence of alerts, rather than alerting itself. :param title: The title of the event - :type title: string + :type title: str :param text: The body of the event (more information) - :type text: string - :param tags: List of string tags to apply to the event - :type tags: list + :type text: str + :param aggregation_key: Key that can be used to aggregate this event in a stream + :type aggregation_key: str :param alert_type: The alert type for the event, one of ["error", "warning", "info", "success"] - :type alert_type: string - :param aggregation_key: Key that can be used to aggregate this event in a stream - :type aggregation_key: string + :type alert_type: str + :param date_happened: POSIX timestamp of the event; defaults to now + :type date_happened: int + :handle: User to post the event as; defaults to owner of the application key used + to submit. + :param handle: str + :param priority: Priority to post the event as. ("normal" or "low", defaults to "normal") + :type priority: str + :param related_event_id: Post event as a child of the given event + :type related_event_id: id + :param tags: List of tags to apply to the event + :type tags: list[str] + :param device_name: device_name to post the event with + :type device_name: list """ response = api.Event.create( title=title, text=text, - host=self.host, - tags=tags, - alert_type=alert_type, aggregation_key=aggregation_key, + alert_type=alert_type, + date_happened=date_happened, + handle=handle, + priority=priority, + related_event_id=related_event_id, + tags=tags, + host=self.host, + device_name=device_name, source_type_name=self.source_type_name) self.validate_response(response) diff --git a/airflow/contrib/hooks/datastore_hook.py b/airflow/contrib/hooks/datastore_hook.py index 5e54cf2a65384..308809e1413b1 100644 --- a/airflow/contrib/hooks/datastore_hook.py +++ b/airflow/contrib/hooks/datastore_hook.py @@ -19,7 +19,7 @@ # import time -from apiclient.discovery import build +from googleapiclient.discovery import build from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook @@ -41,7 +41,7 @@ def __init__(self, def get_conn(self, version='v1'): """ - Returns a Google Cloud Storage service object. + Returns a Google Cloud Datastore service object. 
""" http_authorized = self._authorize() return build( @@ -172,7 +172,7 @@ def export_to_storage_bucket(self, bucket, namespace=None, """ Export entities from Cloud Datastore to Cloud Storage for backup """ - output_uri_prefix = 'gs://' + ('/').join(filter(None, [bucket, namespace])) + output_uri_prefix = 'gs://' + '/'.join(filter(None, [bucket, namespace])) if not entity_filter: entity_filter = {} if not labels: @@ -191,7 +191,7 @@ def import_from_storage_bucket(self, bucket, file, """ Import a backup from Cloud Storage to Cloud Datastore """ - input_url = 'gs://' + ('/').join(filter(None, [bucket, namespace, file])) + input_url = 'gs://' + '/'.join(filter(None, [bucket, namespace, file])) if not entity_filter: entity_filter = {} if not labels: diff --git a/airflow/contrib/hooks/dingding_hook.py b/airflow/contrib/hooks/dingding_hook.py new file mode 100644 index 0000000000000..1da373a8fb940 --- /dev/null +++ b/airflow/contrib/hooks/dingding_hook.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json + +import requests + +from airflow import AirflowException +from airflow.hooks.http_hook import HttpHook + + +class DingdingHook(HttpHook): + """ + This hook allows you send Dingding message using Dingding custom bot. + Get Dingding token from conn_id.password. And prefer set domain to + conn_id.host, if not will use default ``https://oapi.dingtalk.com``. + + For more detail message in + `Dingding custom bot `_ + + :param dingding_conn_id: The name of the Dingding connection to use + :type dingding_conn_id: str + :param message_type: Message type you want to send to Dingding, support five type so far + including text, link, markdown, actionCard, feedCard + :type message_type: str + :param message: The message send to Dingding chat group + :type message: str or dict + :param at_mobiles: Remind specific users with this message + :type at_mobiles: list[str] + :param at_all: Remind all people in group or not. If True, will overwrite ``at_mobiles`` + :type at_all: bool + """ + + def __init__(self, + dingding_conn_id='dingding_default', + message_type='text', + message=None, + at_mobiles=None, + at_all=False, + *args, + **kwargs + ): + super(DingdingHook, self).__init__(http_conn_id=dingding_conn_id, *args, **kwargs) + self.message_type = message_type + self.message = message + self.at_mobiles = at_mobiles + self.at_all = at_all + + def _get_endpoint(self): + """ + Get Dingding endpoint for sending message. 
+ """ + conn = self.get_connection(self.http_conn_id) + token = conn.password + if not token: + raise AirflowException('Dingding token is requests but get nothing, ' + 'check you conn_id configuration.') + return 'robot/send?access_token={}'.format(token) + + def _build_message(self): + """ + Build different type of Dingding message + As most commonly used type, text message just need post message content + rather than a dict like ``{'content': 'message'}`` + """ + if self.message_type in ['text', 'markdown']: + data = { + 'msgtype': self.message_type, + self.message_type: { + 'content': self.message + } if self.message_type == 'text' else self.message, + 'at': { + 'atMobiles': self.at_mobiles, + 'isAtAll': self.at_all + } + } + else: + data = { + 'msgtype': self.message_type, + self.message_type: self.message + } + return json.dumps(data) + + def get_conn(self, headers=None): + """ + Overwrite HttpHook get_conn because just need base_url and headers and + not don't need generic params + :param headers: additional headers to be passed through as a dictionary + :type headers: dict + """ + conn = self.get_connection(self.http_conn_id) + self.base_url = conn.host if conn.host else 'https://oapi.dingtalk.com' + session = requests.Session() + if headers: + session.headers.update(headers) + return session + + def send(self): + """ + Send Dingding message + """ + support_type = ['text', 'link', 'markdown', 'actionCard', 'feedCard'] + if self.message_type not in support_type: + raise ValueError('DingdingWebhookHook only support {} ' + 'so far, but receive {}'.format(support_type, self.message_type)) + + data = self._build_message() + self.log.info('Sending Dingding type %s message %s', self.message_type, data) + resp = self.run(endpoint=self._get_endpoint(), + data=data, + headers={'Content-Type': 'application/json'}) + + # Dingding success send message will with errcode equal to 0 + if int(resp.json().get('errcode')) != 0: + raise AirflowException('Send Dingding message failed, receive error ' + 'message %s', resp.text) + self.log.info('Success Send Dingding message') diff --git a/airflow/contrib/hooks/emr_hook.py b/airflow/contrib/hooks/emr_hook.py index 6cd92c6d85874..7571d0988f697 100644 --- a/airflow/contrib/hooks/emr_hook.py +++ b/airflow/contrib/hooks/emr_hook.py @@ -23,16 +23,17 @@ class EmrHook(AwsHook): """ - Interact with AWS EMR. emr_conn_id is only neccessary for using the + Interact with AWS EMR. emr_conn_id is only necessary for using the create_job_flow method. 
""" - def __init__(self, emr_conn_id=None, *args, **kwargs): + def __init__(self, emr_conn_id=None, region_name=None, *args, **kwargs): self.emr_conn_id = emr_conn_id + self.region_name = region_name super(EmrHook, self).__init__(*args, **kwargs) def get_conn(self): - self.conn = self.get_client_type('emr') + self.conn = self.get_client_type('emr', self.region_name) return self.conn def create_job_flow(self, job_flow_overrides): @@ -51,19 +52,6 @@ def create_job_flow(self, job_flow_overrides): config = emr_conn.extra_dejson.copy() config.update(job_flow_overrides) - response = self.get_conn().run_job_flow( - Name=config.get('Name'), - LogUri=config.get('LogUri'), - ReleaseLabel=config.get('ReleaseLabel'), - Instances=config.get('Instances'), - Steps=config.get('Steps', []), - BootstrapActions=config.get('BootstrapActions', []), - Applications=config.get('Applications'), - Configurations=config.get('Configurations', []), - VisibleToAllUsers=config.get('VisibleToAllUsers'), - JobFlowRole=config.get('JobFlowRole'), - ServiceRole=config.get('ServiceRole'), - Tags=config.get('Tags') - ) + response = self.get_conn().run_job_flow(**config) return response diff --git a/airflow/contrib/hooks/fs_hook.py b/airflow/contrib/hooks/fs_hook.py index 6832f20c225c1..1aa528b6205dc 100644 --- a/airflow/contrib/hooks/fs_hook.py +++ b/airflow/contrib/hooks/fs_hook.py @@ -30,7 +30,7 @@ class FSHook(BaseHook): example: Conn Id: fs_test Conn Type: File (path) - Host, Shchema, Login, Password, Port: empty + Host, Schema, Login, Password, Port: empty Extra: {"path": "/tmp"} """ diff --git a/airflow/contrib/hooks/ftp_hook.py b/airflow/contrib/hooks/ftp_hook.py index 8beefb372916c..a12ece9bc9420 100644 --- a/airflow/contrib/hooks/ftp_hook.py +++ b/airflow/contrib/hooks/ftp_hook.py @@ -24,8 +24,6 @@ from airflow.hooks.base_hook import BaseHook from past.builtins import basestring -from airflow.utils.log.logging_mixin import LoggingMixin - def mlsd(conn, path="", facts=None): """ @@ -60,7 +58,7 @@ def mlsd(conn, path="", facts=None): yield (name, entry) -class FTPHook(BaseHook, LoggingMixin): +class FTPHook(BaseHook): """ Interact with FTP. @@ -148,7 +146,11 @@ def delete_directory(self, path): conn = self.get_conn() conn.rmd(path) - def retrieve_file(self, remote_full_path, local_full_path_or_buffer): + def retrieve_file( + self, + remote_full_path, + local_full_path_or_buffer, + callback=None): """ Transfers the remote file to a local location. @@ -161,23 +163,60 @@ def retrieve_file(self, remote_full_path, local_full_path_or_buffer): :param local_full_path_or_buffer: full path to the local file or a file-like buffer :type local_full_path_or_buffer: str or file-like buffer + :param callback: callback which is called each time a block of data + is read. if you do not use a callback, these blocks will be written + to the file or buffer passed in. if you do pass in a callback, note + that writing to a file or buffer will need to be handled inside the + callback. 
+ [default: output_handle.write()] + :type callback: callable + + :Example:: + + hook = FTPHook(ftp_conn_id='my_conn') + + remote_path = '/path/to/remote/file' + local_path = '/path/to/local/file' + + # with a custom callback (in this case displaying progress on each read) + def print_progress(percent_progress): + self.log.info('Percent Downloaded: %s%%' % percent_progress) + + total_downloaded = 0 + total_file_size = hook.get_size(remote_path) + output_handle = open(local_path, 'wb') + def write_to_file_with_progress(data): + total_downloaded += len(data) + output_handle.write(data) + percent_progress = (total_downloaded / total_file_size) * 100 + print_progress(percent_progress) + hook.retrieve_file(remote_path, None, callback=write_to_file_with_progress) + + # without a custom callback data is written to the local_path + hook.retrieve_file(remote_path, local_path) """ conn = self.get_conn() is_path = isinstance(local_full_path_or_buffer, basestring) - if is_path: - output_handle = open(local_full_path_or_buffer, 'wb') + # without a callback, default to writing to a user-provided file or + # file-like buffer + if not callback: + if is_path: + output_handle = open(local_full_path_or_buffer, 'wb') + else: + output_handle = local_full_path_or_buffer + callback = output_handle.write else: - output_handle = local_full_path_or_buffer + output_handle = None remote_path, remote_file_name = os.path.split(remote_full_path) conn.cwd(remote_path) self.log.info('Retrieving file from FTP: %s', remote_full_path) - conn.retrbinary('RETR %s' % remote_file_name, output_handle.write) + conn.retrbinary('RETR %s' % remote_file_name, callback) self.log.info('Finished retrieving file from FTP: %s', remote_full_path) - if is_path: + if is_path and output_handle: output_handle.close() def store_file(self, remote_full_path, local_full_path_or_buffer): @@ -230,6 +269,12 @@ def rename(self, from_name, to_name): return conn.rename(from_name, to_name) def get_mod_time(self, path): + """ + Returns a datetime object representing the last time the file was modified + + :param path: remote file path + :type path: string + """ conn = self.get_conn() ftp_mdtm = conn.sendcmd('MDTM ' + path) time_val = ftp_mdtm[4:] @@ -239,6 +284,16 @@ def get_mod_time(self, path): except ValueError: return datetime.datetime.strptime(time_val, '%Y%m%d%H%M%S') + def get_size(self, path): + """ + Returns the size of a file (in bytes) + + :param path: remote file path + :type path: string + """ + conn = self.get_conn() + return conn.size(path) + class FTPSHook(FTPHook): diff --git a/airflow/contrib/hooks/gcp_api_base_hook.py b/airflow/contrib/hooks/gcp_api_base_hook.py index 053494743f064..db8f7de05e246 100644 --- a/airflow/contrib/hooks/gcp_api_base_hook.py +++ b/airflow/contrib/hooks/gcp_api_base_hook.py @@ -18,51 +18,62 @@ # under the License. # import json +import functools import httplib2 import google.auth import google_auth_httplib2 import google.oauth2.service_account +import os +import tempfile + +from google.api_core.exceptions import GoogleAPICallError, AlreadyExists, RetryError +from googleapiclient.errors import HttpError from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin _DEFAULT_SCOPES = ('https://www.googleapis.com/auth/cloud-platform',) +# The name of the environment variable that Google Authentication library uses +# to get service account key location. 
Read more: +# https://cloud.google.com/docs/authentication/getting-started#setting_the_environment_variable +_G_APP_CRED_ENV_VAR = "GOOGLE_APPLICATION_CREDENTIALS" -class GoogleCloudBaseHook(BaseHook, LoggingMixin): +class GoogleCloudBaseHook(BaseHook): """ A base hook for Google cloud-related hooks. Google cloud has a shared REST API client that is built in the same way no matter which service you use. This class helps construct and authorize the credentials needed to then - call apiclient.discovery.build() to actually discover and build a client + call googleapiclient.discovery.build() to actually discover and build a client for a Google cloud service. The class also contains some miscellaneous helper functions. All hook derived from this base hook use the 'Google Cloud Platform' connection - type. Two ways of authentication are supported: + type. Three ways of authentication are supported: Default credentials: Only the 'Project Id' is required. You'll need to have set up default credentials, such as by the ``GOOGLE_APPLICATION_DEFAULT`` environment variable or from the metadata server on Google Compute Engine. - JSON key file: Specify 'Project Id', 'Key Path' and 'Scope'. + JSON key file: Specify 'Project Id', 'Keyfile Path' and 'Scope'. Legacy P12 key files are not supported. + + JSON data provided in the UI: Specify 'Keyfile JSON'. """ def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None): """ :param gcp_conn_id: The connection ID to use when fetching connection info. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ self.gcp_conn_id = gcp_conn_id self.delegate_to = delegate_to @@ -75,7 +86,7 @@ def _get_credentials(self): key_path = self._get_field('key_path', False) keyfile_dict = self._get_field('keyfile_dict', False) scope = self._get_field('scope', None) - if scope is not None: + if scope: scopes = [s.strip() for s in scope.split(',')] else: scopes = _DEFAULT_SCOPES @@ -87,7 +98,7 @@ def _get_credentials(self): elif key_path: # Get credentials from a JSON file. if key_path.endswith('.json'): - self.log.info('Getting connection using a JSON key file.') + self.log.debug('Getting connection using JSON key file %s' % key_path) credentials = ( google.oauth2.service_account.Credentials.from_service_account_file( key_path, scopes=scopes) @@ -142,7 +153,7 @@ def _get_field(self, f, default=None): key_path, etc. They get formatted as shown below. """ long_f = 'extra__google_cloud_platform__{}'.format(f) - if long_f in self.extras: + if hasattr(self, 'extras') and long_f in self.extras: return self.extras[long_f] else: return default @@ -150,3 +161,99 @@ def _get_field(self, f, default=None): @property def project_id(self): return self._get_field('project') + + @staticmethod + def catch_http_exception(func): + """ + Function decorator that intercepts HTTP Errors and raises AirflowException + with more informative message. 
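The credential and field lookups above read everything from the connection's extras under a long prefix, and the comma-separated scope string is split into the list handed to the credentials object. A small sketch of that lookup, assuming a hypothetical extras dict:

_DEFAULT_SCOPES = ('https://www.googleapis.com/auth/cloud-platform',)

# Hypothetical extras as stored on a 'google_cloud_platform' connection.
extras = {
    'extra__google_cloud_platform__project': 'my-project',
    'extra__google_cloud_platform__scope': ('https://www.googleapis.com/auth/cloud-platform, '
                                            'https://www.googleapis.com/auth/devstorage.read_only'),
}

def get_field(extras, f, default=None):
    # Same lookup the hook performs: field names carry the connection-type prefix.
    return extras.get('extra__google_cloud_platform__{}'.format(f), default)

scope = get_field(extras, 'scope')
scopes = [s.strip() for s in scope.split(',')] if scope else list(_DEFAULT_SCOPES)
print(scopes)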
+ """ + + @functools.wraps(func) + def wrapper_decorator(self, *args, **kwargs): + try: + return func(self, *args, **kwargs) + except GoogleAPICallError as e: + if isinstance(e, AlreadyExists): + raise e + else: + self.log.error('The request failed:\n%s', str(e)) + raise AirflowException(e) + except RetryError as e: + self.log.error('The request failed due to a retryable error and retry attempts failed.') + raise AirflowException(e) + except ValueError as e: + self.log.error('The request failed, the parameters are invalid.') + raise AirflowException(e) + except HttpError as e: + self.log.error('The request failed:\n%s', str(e)) + raise AirflowException(e) + + return wrapper_decorator + + @staticmethod + def fallback_to_default_project_id(func): + """ + Decorator that provides fallback for Google Cloud Platform project id. If + the project is None it will be replaced with the project_id from the + service account the Hook is authenticated with. Project id can be specified + either via project_id kwarg or via first parameter in positional args. + + :param func: function to wrap + :return: result of the function call + """ + @functools.wraps(func) + def inner_wrapper(self, *args, **kwargs): + if len(args) > 0: + raise AirflowException( + "You must use keyword arguments in this methods rather than" + " positional") + if 'project_id' in kwargs: + kwargs['project_id'] = self._get_project_id(kwargs['project_id']) + else: + kwargs['project_id'] = self._get_project_id(None) + if not kwargs['project_id']: + raise AirflowException("The project id must be passed either as " + "keyword project_id parameter or as project_id extra " + "in GCP connection definition. Both are not set!") + return func(self, *args, **kwargs) + return inner_wrapper + + def _get_project_id(self, project_id): + """ + In case project_id is None, overrides it with default project_id from + the service account that is authorized. + + :param project_id: project id to + :type project_id: str + :return: the project_id specified or default project id if project_id is None + """ + return project_id if project_id else self.project_id + + class _Decorators(object): + """A private inner class for keeping all decorator methods.""" + + @staticmethod + def provide_gcp_credential_file(func): + """ + Function decorator that provides a GOOGLE_APPLICATION_CREDENTIALS + environment variable, pointing to file path of a JSON file of service + account key. + """ + @functools.wraps(func) + def wrapper(self, *args, **kwargs): + with tempfile.NamedTemporaryFile(mode='w+t') as conf_file: + key_path = self._get_field('key_path', False) + keyfile_dict = self._get_field('keyfile_dict', False) + if key_path: + if key_path.endswith('.p12'): + raise AirflowException( + 'Legacy P12 key file are not supported, ' + 'use a JSON key file.') + os.environ[_G_APP_CRED_ENV_VAR] = key_path + elif keyfile_dict: + conf_file.write(keyfile_dict) + conf_file.flush() + os.environ[_G_APP_CRED_ENV_VAR] = conf_file.name + return func(self, *args, **kwargs) + return wrapper diff --git a/airflow/contrib/hooks/gcp_bigtable_hook.py b/airflow/contrib/hooks/gcp_bigtable_hook.py new file mode 100644 index 0000000000000..d50a95deb46c7 --- /dev/null +++ b/airflow/contrib/hooks/gcp_bigtable_hook.py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from google.cloud.bigtable import Client +from google.cloud.bigtable.cluster import Cluster +from google.cloud.bigtable.instance import Instance +from google.cloud.bigtable.table import Table +from google.cloud.bigtable_admin_v2 import enums +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + + +class BigtableHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Bigtable APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. + """ + + _client = None + + def __init__(self, + gcp_conn_id='google_cloud_default', + delegate_to=None): + super(BigtableHook, self).__init__(gcp_conn_id, delegate_to) + + def _get_client(self, project_id): + if not self._client: + self._client = Client(project=project_id, credentials=self._get_credentials(), + admin=True) + return self._client + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_instance(self, instance_id, project_id=None): + """ + Retrieves and returns the specified Cloud Bigtable instance if it exists. + Otherwise, returns None. + + :param instance_id: The ID of the Cloud Bigtable instance. + :type instance_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + BigTable exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + """ + + instance = self._get_client(project_id=project_id).instance(instance_id) + if not instance.exists(): + return None + return instance + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_instance(self, instance_id, project_id=None): + """ + Deletes the specified Cloud Bigtable instance. + Raises google.api_core.exceptions.NotFound if the Cloud Bigtable instance does + not exist. + + :param project_id: Optional, Google Cloud Platform project ID where the + BigTable exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :param instance_id: The ID of the Cloud Bigtable instance. + :type instance_id: str + """ + instance = self.get_instance(instance_id=instance_id, project_id=project_id) + if instance: + instance.delete() + else: + self.log.info("The instance '%s' does not exist in project '%s'. Exiting", instance_id, + project_id) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_instance(self, + instance_id, + main_cluster_id, + main_cluster_zone, + project_id=None, + replica_cluster_id=None, + replica_cluster_zone=None, + instance_display_name=None, + instance_type=enums.Instance.Type.TYPE_UNSPECIFIED, + instance_labels=None, + cluster_nodes=None, + cluster_storage_type=enums.StorageType.STORAGE_TYPE_UNSPECIFIED, + timeout=None): + """ + Creates new instance. + + :type instance_id: str + :param instance_id: The ID for the new instance. + :type main_cluster_id: str + :param main_cluster_id: The ID for main cluster for the new instance. 
+ :type main_cluster_zone: str + :param main_cluster_zone: The zone for main cluster. + See https://cloud.google.com/bigtable/docs/locations for more details. + :type project_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + BigTable exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type replica_cluster_id: str + :param replica_cluster_id: (optional) The ID for replica cluster for the new + instance. + :type replica_cluster_zone: str + :param replica_cluster_zone: (optional) The zone for replica cluster. + :type instance_type: enums.Instance.Type + :param instance_type: (optional) The type of the instance. + :type instance_display_name: str + :param instance_display_name: (optional) Human-readable name of the instance. + Defaults to ``instance_id``. + :type instance_labels: dict + :param instance_labels: (optional) Dictionary of labels to associate with the + instance. + :type cluster_nodes: int + :param cluster_nodes: (optional) Number of nodes for cluster. + :type cluster_storage_type: enums.StorageType + :param cluster_storage_type: (optional) The type of storage. + :type timeout: int + :param timeout: (optional) timeout (in seconds) for instance creation. + If None is not specified, Operator will wait indefinitely. + """ + cluster_storage_type = enums.StorageType(cluster_storage_type) + instance_type = enums.Instance.Type(instance_type) + + instance = Instance( + instance_id, + self._get_client(project_id=project_id), + instance_display_name, + instance_type, + instance_labels, + ) + + clusters = [ + instance.cluster( + main_cluster_id, + main_cluster_zone, + cluster_nodes, + cluster_storage_type + ) + ] + if replica_cluster_id and replica_cluster_zone: + clusters.append(instance.cluster( + replica_cluster_id, + replica_cluster_zone, + cluster_nodes, + cluster_storage_type + )) + operation = instance.create( + clusters=clusters + ) + operation.result(timeout) + return instance + + @staticmethod + def create_table(instance, + table_id, + initial_split_keys=None, + column_families=None): + """ + Creates the specified Cloud Bigtable table. + Raises ``google.api_core.exceptions.AlreadyExists`` if the table exists. + + :type instance: Instance + :param instance: The Cloud Bigtable instance that owns the table. + :type table_id: str + :param table_id: The ID of the table to create in Cloud Bigtable. + :type initial_split_keys: list + :param initial_split_keys: (Optional) A list of row keys in bytes to use to + initially split the table. + :type column_families: dict + :param column_families: (Optional) A map of columns to create. The key is the + column_id str, and the value is a + :class:`google.cloud.bigtable.column_family.GarbageCollectionRule`. + """ + if column_families is None: + column_families = {} + if initial_split_keys is None: + initial_split_keys = [] + table = Table(table_id, instance) + table.create(initial_split_keys, column_families) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_table(self, instance_id, table_id, project_id=None): + """ + Deletes the specified table in Cloud Bigtable. + Raises google.api_core.exceptions.NotFound if the table does not exist. + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance. + :type table_id: str + :param table_id: The ID of the table in Cloud Bigtable. + :type project_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + BigTable exists. 
If set to None or missing, + the default project_id from the GCP connection is used. + """ + table = self.get_instance(instance_id=instance_id, project_id=project_id).table(table_id=table_id) + table.delete() + + @staticmethod + def update_cluster(instance, cluster_id, nodes): + """ + Updates number of nodes in the specified Cloud Bigtable cluster. + Raises google.api_core.exceptions.NotFound if the cluster does not exist. + + :type instance: Instance + :param instance: The Cloud Bigtable instance that owns the cluster. + :type cluster_id: str + :param cluster_id: The ID of the cluster. + :type nodes: int + :param nodes: The desired number of nodes. + """ + cluster = Cluster(cluster_id, instance) + cluster.serve_nodes = nodes + cluster.update() + + @staticmethod + def get_column_families_for_table(instance, table_id): + """ + Fetches Column Families for the specified table in Cloud Bigtable. + + :type instance: Instance + :param instance: The Cloud Bigtable instance that owns the table. + :type table_id: str + :param table_id: The ID of the table in Cloud Bigtable to fetch Column Families + from. + """ + + table = Table(table_id, instance) + return table.list_column_families() + + @staticmethod + def get_cluster_states_for_table(instance, table_id): + """ + Fetches Cluster States for the specified table in Cloud Bigtable. + Raises google.api_core.exceptions.NotFound if the table does not exist. + + :type instance: Instance + :param instance: The Cloud Bigtable instance that owns the table. + :type table_id: str + :param table_id: The ID of the table in Cloud Bigtable to fetch Cluster States + from. + """ + + table = Table(table_id, instance) + return table.get_cluster_states() diff --git a/airflow/contrib/hooks/gcp_compute_hook.py b/airflow/contrib/hooks/gcp_compute_hook.py new file mode 100644 index 0000000000000..02e0f56bf7697 --- /dev/null +++ b/airflow/contrib/hooks/gcp_compute_hook.py @@ -0,0 +1,335 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import time +from googleapiclient.discovery import build + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +# Number of retries - used by googleapiclient method calls to perform retries +# For requests that are "retriable" +NUM_RETRIES = 5 + +# Time to sleep between active checks of the operation results +TIME_TO_SLEEP_IN_SECONDS = 1 + + +class GceOperationStatus: + PENDING = "PENDING" + RUNNING = "RUNNING" + DONE = "DONE" + + +# noinspection PyAbstractClass +class GceHook(GoogleCloudBaseHook): + """ + Hook for Google Compute Engine APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. 
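Taken together, the Bigtable hook methods above compose into a short provisioning flow. The following is a hedged usage sketch only: the instance, cluster, and table names are placeholders, and valid GCP credentials on the configured connection are required for it to actually run.

from airflow.contrib.hooks.gcp_bigtable_hook import BigtableHook

hook = BigtableHook(gcp_conn_id='google_cloud_default')

# project_id is omitted, so the decorator falls back to the connection's project.
instance = hook.create_instance(
    instance_id='example-instance',      # placeholder
    main_cluster_id='example-cluster',   # placeholder
    main_cluster_zone='europe-west1-b',
    cluster_nodes=3,
    timeout=600,
)

# create_table is a staticmethod operating on the returned Instance object.
BigtableHook.create_table(instance, table_id='example-table')

# Clean up; both methods are keyword-only thanks to the decorator.
hook.delete_table(instance_id='example-instance', table_id='example-table')
hook.delete_instance(instance_id='example-instance')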
+ + """ + _conn = None + + def __init__(self, + api_version='v1', + gcp_conn_id='google_cloud_default', + delegate_to=None): + super(GceHook, self).__init__(gcp_conn_id, delegate_to) + self.api_version = api_version + + def get_conn(self): + """ + Retrieves connection to Google Compute Engine. + + :return: Google Compute Engine services object + :rtype: dict + """ + if not self._conn: + http_authorized = self._authorize() + self._conn = build('compute', self.api_version, + http=http_authorized, cache_discovery=False) + return self._conn + + @GoogleCloudBaseHook.fallback_to_default_project_id + def start_instance(self, zone, resource_id, project_id=None): + """ + Starts an existing instance defined by project_id, zone and resource_id. + Must be called with keyword arguments rather than positional. + + :param zone: Google Cloud Platform zone where the instance exists + :type zone: str + :param resource_id: Name of the Compute Engine instance resource + :type resource_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instances().start( + project=project_id, + zone=zone, + instance=resource_id + ).execute(num_retries=NUM_RETRIES) + try: + operation_name = response["name"] + except KeyError: + raise AirflowException( + "Wrong response '{}' returned - it should contain " + "'name' field".format(response)) + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name, + zone=zone) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def stop_instance(self, zone, resource_id, project_id=None): + """ + Stops an instance defined by project_id, zone and resource_id + Must be called with keyword arguments rather than positional. + + :param zone: Google Cloud Platform zone where the instance exists + :type zone: str + :param resource_id: Name of the Compute Engine instance resource + :type resource_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instances().stop( + project=project_id, + zone=zone, + instance=resource_id + ).execute(num_retries=NUM_RETRIES) + try: + operation_name = response["name"] + except KeyError: + raise AirflowException( + "Wrong response '{}' returned - it should contain " + "'name' field".format(response)) + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name, + zone=zone) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def set_machine_type(self, zone, resource_id, body, project_id=None): + """ + Sets machine type of an instance defined by project_id, zone and resource_id. + Must be called with keyword arguments rather than positional. + + :param zone: Google Cloud Platform zone where the instance exists. + :type zone: str + :param resource_id: Name of the Compute Engine instance resource + :type resource_id: str + :param body: Body required by the Compute Engine setMachineType API, + as described in + https://cloud.google.com/compute/docs/reference/rest/v1/instances/setMachineType + :type body: dict + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. 
If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self._execute_set_machine_type(zone, resource_id, body, project_id) + try: + operation_name = response["name"] + except KeyError: + raise AirflowException( + "Wrong response '{}' returned - it should contain " + "'name' field".format(response)) + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name, + zone=zone) + + def _execute_set_machine_type(self, zone, resource_id, body, project_id): + return self.get_conn().instances().setMachineType( + project=project_id, zone=zone, instance=resource_id, body=body)\ + .execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_instance_template(self, resource_id, project_id=None): + """ + Retrieves instance template by project_id and resource_id. + Must be called with keyword arguments rather than positional. + + :param resource_id: Name of the instance template + :type resource_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: Instance template representation as object according to + https://cloud.google.com/compute/docs/reference/rest/v1/instanceTemplates + :rtype: dict + """ + response = self.get_conn().instanceTemplates().get( + project=project_id, + instanceTemplate=resource_id + ).execute(num_retries=NUM_RETRIES) + return response + + @GoogleCloudBaseHook.fallback_to_default_project_id + def insert_instance_template(self, body, request_id=None, project_id=None): + """ + Inserts instance template using body specified + Must be called with keyword arguments rather than positional. + + :param body: Instance template representation as object according to + https://cloud.google.com/compute/docs/reference/rest/v1/instanceTemplates + :type body: dict + :param request_id: Optional, unique request_id that you might add to achieve + full idempotence (for example when client call times out repeating the request + with the same request id will not create a new instance template again) + It should be in UUID format as defined in RFC 4122 + :type request_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instanceTemplates().insert( + project=project_id, + body=body, + requestId=request_id + ).execute(num_retries=NUM_RETRIES) + try: + operation_name = response["name"] + except KeyError: + raise AirflowException( + "Wrong response '{}' returned - it should contain " + "'name' field".format(response)) + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_instance_group_manager(self, zone, resource_id, project_id=None): + """ + Retrieves Instance Group Manager by project_id, zone and resource_id. + Must be called with keyword arguments rather than positional. 
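The body argument of set_machine_type above is passed straight through to the Compute Engine API, so it takes the shape described in the setMachineType REST reference linked in the docstring. A hedged usage sketch with placeholder names (working GCP credentials are required):

from airflow.contrib.hooks.gcp_compute_hook import GceHook

hook = GceHook(api_version='v1', gcp_conn_id='google_cloud_default')

# The target machine type is addressed relative to the zone, per the REST docs.
body = {'machineType': 'zones/europe-west1-b/machineTypes/n1-standard-2'}

# Keyword arguments only; project_id falls back to the connection's default project.
hook.set_machine_type(
    zone='europe-west1-b',
    resource_id='example-instance',   # placeholder instance name
    body=body,
)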
+ + :param zone: Google Cloud Platform zone where the Instance Group Manager exists + :type zone: str + :param resource_id: Name of the Instance Group Manager + :type resource_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: Instance group manager representation as object according to + https://cloud.google.com/compute/docs/reference/rest/beta/instanceGroupManagers + :rtype: dict + """ + response = self.get_conn().instanceGroupManagers().get( + project=project_id, + zone=zone, + instanceGroupManager=resource_id + ).execute(num_retries=NUM_RETRIES) + return response + + @GoogleCloudBaseHook.fallback_to_default_project_id + def patch_instance_group_manager(self, zone, resource_id, + body, request_id=None, project_id=None): + """ + Patches Instance Group Manager with the specified body. + Must be called with keyword arguments rather than positional. + + :param zone: Google Cloud Platform zone where the Instance Group Manager exists + :type zone: str + :param resource_id: Name of the Instance Group Manager + :type resource_id: str + :param body: Instance Group Manager representation as json-merge-patch object + according to + https://cloud.google.com/compute/docs/reference/rest/beta/instanceTemplates/patch + :type body: dict + :param request_id: Optional, unique request_id that you might add to achieve + full idempotence (for example when client call times out repeating the request + with the same request id will not create a new instance template again). + It should be in UUID format as defined in RFC 4122 + :type request_id: str + :param project_id: Optional, Google Cloud Platform project ID where the + Compute Engine Instance exists. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instanceGroupManagers().patch( + project=project_id, + zone=zone, + instanceGroupManager=resource_id, + body=body, + requestId=request_id + ).execute(num_retries=NUM_RETRIES) + try: + operation_name = response["name"] + except KeyError: + raise AirflowException( + "Wrong response '{}' returned - it should contain " + "'name' field".format(response)) + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name, + zone=zone) + + def _wait_for_operation_to_complete(self, project_id, operation_name, zone=None): + """ + Waits for the named operation to complete - checks status of the async call. 
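The waiting logic used throughout this hook boils down to a poll loop over the Operations API. A standalone sketch of that loop with a fake poller, to make the DONE/error handling explicit (the real hook fetches the operation via zoneOperations()/globalOperations() as shown further below):

import time

class GceOperationStatus:
    PENDING = 'PENDING'
    RUNNING = 'RUNNING'
    DONE = 'DONE'

def wait_for_operation(poll_fn, sleep_seconds=1):
    # poll_fn returns the operation resource as a dict, e.g. the result of
    # service.globalOperations().get(...).execute() in the real hook.
    while True:
        operation = poll_fn()
        if operation.get('status') == GceOperationStatus.DONE:
            error = operation.get('error')
            if error:
                raise RuntimeError('{} {}: {}'.format(
                    operation.get('httpErrorStatusCode'),
                    operation.get('httpErrorMessage'),
                    error.get('errors')))
            return
        time.sleep(sleep_seconds)

# Fake poller that finishes on the second call, for illustration only.
responses = iter([{'status': 'RUNNING'}, {'status': 'DONE'}])
wait_for_operation(lambda: next(responses), sleep_seconds=0)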
+ + :param operation_name: name of the operation + :type operation_name: str + :param zone: optional region of the request (might be None for global operations) + :type zone: str + :return: None + """ + service = self.get_conn() + while True: + if zone is None: + # noinspection PyTypeChecker + operation_response = self._check_global_operation_status( + service, operation_name, project_id) + else: + # noinspection PyTypeChecker + operation_response = self._check_zone_operation_status( + service, operation_name, project_id, zone) + if operation_response.get("status") == GceOperationStatus.DONE: + error = operation_response.get("error") + if error: + code = operation_response.get("httpErrorStatusCode") + msg = operation_response.get("httpErrorMessage") + # Extracting the errors list as string and trimming square braces + error_msg = str(error.get("errors"))[1:-1] + raise AirflowException("{} {}: ".format(code, msg) + error_msg) + # No meaningful info to return from the response in case of success + return + time.sleep(TIME_TO_SLEEP_IN_SECONDS) + + @staticmethod + def _check_zone_operation_status(service, operation_name, project_id, zone): + return service.zoneOperations().get( + project=project_id, zone=zone, operation=operation_name).execute( + num_retries=NUM_RETRIES) + + @staticmethod + def _check_global_operation_status(service, operation_name, project_id): + return service.globalOperations().get( + project=project_id, operation=operation_name).execute( + num_retries=NUM_RETRIES) diff --git a/airflow/contrib/hooks/gcp_container_hook.py b/airflow/contrib/hooks/gcp_container_hook.py index d36d796d764a5..909cc6338377b 100644 --- a/airflow/contrib/hooks/gcp_container_hook.py +++ b/airflow/contrib/hooks/gcp_container_hook.py @@ -21,9 +21,9 @@ import time from airflow import AirflowException, version -from airflow.hooks.base_hook import BaseHook +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook -from google.api_core.exceptions import AlreadyExists +from google.api_core.exceptions import AlreadyExists, NotFound from google.api_core.gapic_v1.method import DEFAULT from google.cloud import container_v1, exceptions from google.cloud.container_v1.gapic.enums import Operation @@ -34,19 +34,30 @@ OPERATIONAL_POLL_INTERVAL = 15 -class GKEClusterHook(BaseHook): +class GKEClusterHook(GoogleCloudBaseHook): - def __init__(self, project_id, location): - self.project_id = project_id + def __init__(self, + gcp_conn_id='google_cloud_default', + delegate_to=None, + location=None): + super(GKEClusterHook, self).__init__( + gcp_conn_id=gcp_conn_id, delegate_to=delegate_to) + self._client = None self.location = location - # Add client library info for better error tracking - client_info = ClientInfo(client_library_version='airflow_v' + version.version) - self.client = container_v1.ClusterManagerClient(client_info=client_info) + def get_client(self): + if self._client is None: + credentials = self._get_credentials() + # Add client library info for better error tracking + client_info = ClientInfo(client_library_version='airflow_v' + version.version) + self._client = container_v1.ClusterManagerClient(credentials=credentials, client_info=client_info) + return self._client - def _dict_to_proto(self, py_dict, proto): + @staticmethod + def _dict_to_proto(py_dict, proto): """ Converts a python dictionary to the proto supplied + :param py_dict: The dictionary to convert :type py_dict: dict :param proto: The proto object to merge with dictionary @@ -58,15 +69,18 @@ def _dict_to_proto(self, py_dict, 
proto): dict_json_str = json.dumps(py_dict) return json_format.Parse(dict_json_str, proto) - def wait_for_operation(self, operation): + def wait_for_operation(self, operation, project_id=None): """ Given an operation, continuously fetches the status from Google Cloud until either completion or an error occurring + :param operation: The Operation to wait for - :type operation: A google.cloud.container_V1.gapic.enums.Operator + :type operation: google.cloud.container_V1.gapic.enums.Operation + :param project_id: Google Cloud Platform project ID + :type project_id: str :return: A new, updated operation fetched from Google Cloud """ - self.log.info("Waiting for OPERATION_NAME %s" % operation.name) + self.log.info("Waiting for OPERATION_NAME %s", operation.name) time.sleep(OPERATIONAL_POLL_INTERVAL) while operation.status != Operation.Status.DONE: if operation.status == Operation.Status.RUNNING or operation.status == \ @@ -76,27 +90,33 @@ def wait_for_operation(self, operation): raise exceptions.GoogleCloudError( "Operation has failed with status: %s" % operation.status) # To update status of operation - operation = self.get_operation(operation.name) + operation = self.get_operation(operation.name, project_id=project_id or self.project_id) return operation - def get_operation(self, operation_name): + def get_operation(self, operation_name, project_id=None): """ Fetches the operation from Google Cloud + :param operation_name: Name of operation to fetch :type operation_name: str + :param project_id: Google Cloud Platform project ID + :type project_id: str :return: The new, updated operation from Google Cloud """ - return self.client.get_operation(project_id=self.project_id, - zone=self.location, - operation_id=operation_name) + return self.get_client().get_operation(project_id=project_id or self.project_id, + zone=self.location, + operation_id=operation_name) - def _append_label(self, cluster_proto, key, val): + @staticmethod + def _append_label(cluster_proto, key, val): """ Append labels to provided Cluster Protobuf - Labels must fit the regex [a-z]([-a-z0-9]*[a-z0-9])? (current airflow version - string follows semantic versioning spec: x.y.z). - :param cluster_proto: The proto to append resource_label airflow version to + Labels must fit the regex ``[a-z]([-a-z0-9]*[a-z0-9])?`` (current + airflow version string follows semantic versioning spec: x.y.z). + + :param cluster_proto: The proto to append resource_label airflow + version to :type cluster_proto: google.cloud.container_v1.types.Cluster :param key: The key label :type key: str @@ -108,7 +128,7 @@ def _append_label(self, cluster_proto, key, val): cluster_proto.resource_labels.update({key: val}) return cluster_proto - def delete_cluster(self, name, retry=DEFAULT, timeout=DEFAULT): + def delete_cluster(self, name, project_id=None, retry=DEFAULT, timeout=DEFAULT): """ Deletes the cluster, including the Kubernetes endpoint and all worker nodes. Firewalls and routes that were configured during @@ -119,6 +139,8 @@ def delete_cluster(self, name, retry=DEFAULT, timeout=DEFAULT): :param name: The name of the cluster to delete :type name: str + :param project_id: Google Cloud Platform project ID + :type project_id: str :param retry: Retry object used to determine when/if to retry requests. If None is specified, requests will not be retried. 
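The _dict_to_proto helper above leans on google.protobuf.json_format.Parse, which fills a protobuf message from a JSON string. A minimal sketch of the same pattern using the generic Struct message, so it runs without the GKE client (the real hook parses into container_v1.types.Cluster instead):

import json
from google.protobuf import json_format
from google.protobuf.struct_pb2 import Struct

def dict_to_proto(py_dict, proto):
    # Serialise the dict to JSON and let protobuf merge it into the message.
    return json_format.Parse(json.dumps(py_dict), proto)

cluster_like = {'name': 'example-cluster', 'initialNodeCount': 3}
msg = dict_to_proto(cluster_like, Struct())
print(json_format.MessageToDict(msg))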
:type retry: google.api_core.retry.Retry @@ -129,30 +151,35 @@ def delete_cluster(self, name, retry=DEFAULT, timeout=DEFAULT): :return: The full url to the delete operation if successful, else None """ - self.log.info("Deleting (project_id={}, zone={}, cluster_id={})".format( - self.project_id, self.location, name)) + self.log.info( + "Deleting (project_id=%s, zone=%s, cluster_id=%s)", self.project_id, self.location, name + ) try: - op = self.client.delete_cluster(project_id=self.project_id, - zone=self.location, - cluster_id=name, - retry=retry, - timeout=timeout) + op = self.get_client().delete_cluster(project_id=project_id or self.project_id, + zone=self.location, + cluster_id=name, + retry=retry, + timeout=timeout) op = self.wait_for_operation(op) # Returns server-defined url for the resource return op.self_link - except exceptions.NotFound as error: - self.log.info('Assuming Success: ' + error.message) + except NotFound as error: + self.log.info('Assuming Success: %s', error.message) - def create_cluster(self, cluster, retry=DEFAULT, timeout=DEFAULT): + def create_cluster(self, cluster, project_id=None, retry=DEFAULT, timeout=DEFAULT): """ Creates a cluster, consisting of the specified number and type of Google Compute Engine instances. - :param cluster: A Cluster protobuf or dict. If dict is provided, it must be of - the same form as the protobuf message google.cloud.container_v1.types.Cluster + :param cluster: A Cluster protobuf or dict. If dict is provided, it must + be of the same form as the protobuf message + :class:`google.cloud.container_v1.types.Cluster` :type cluster: dict or google.cloud.container_v1.types.Cluster - :param retry: A retry object (google.api_core.retry.Retry) used to retry requests. + :param project_id: Google Cloud Platform project ID + :type project_id: str + :param retry: A retry object (``google.api_core.retry.Retry``) used to + retry requests. If None is specified, requests will not be retried. :type retry: google.api_core.retry.Retry :param timeout: The amount of time, in seconds, to wait for the request to @@ -160,7 +187,7 @@ def create_cluster(self, cluster, retry=DEFAULT, timeout=DEFAULT): individual attempt. 
:type timeout: float :return: The full url to the new, or existing, cluster - :raises + :raises: ParseError: On JSON parsing problems when trying to convert dict AirflowException: cluster is not dict type nor Cluster proto type """ @@ -174,28 +201,31 @@ def create_cluster(self, cluster, retry=DEFAULT, timeout=DEFAULT): self._append_label(cluster, 'airflow-version', 'v' + version.version) - self.log.info("Creating (project_id={}, zone={}, cluster_name={})".format( - self.project_id, - self.location, - cluster.name)) + self.log.info( + "Creating (project_id=%s, zone=%s, cluster_name=%s)", + self.project_id, self.location, cluster.name + ) try: - op = self.client.create_cluster(project_id=self.project_id, - zone=self.location, - cluster=cluster, - retry=retry, - timeout=timeout) + op = self.get_client().create_cluster(project_id=project_id or self.project_id, + zone=self.location, + cluster=cluster, + retry=retry, + timeout=timeout) op = self.wait_for_operation(op) return op.target_link except AlreadyExists as error: - self.log.info('Assuming Success: ' + error.message) + self.log.info('Assuming Success: %s', error.message) return self.get_cluster(name=cluster.name).self_link - def get_cluster(self, name, retry=DEFAULT, timeout=DEFAULT): + def get_cluster(self, name, project_id=None, retry=DEFAULT, timeout=DEFAULT): """ Gets details of specified cluster + :param name: The name of the cluster to retrieve :type name: str + :param project_id: Google Cloud Platform project ID + :type project_id: str :param retry: A retry object used to retry requests. If None is specified, requests will not be retried. :type retry: google.api_core.retry.Retry @@ -203,15 +233,15 @@ def get_cluster(self, name, retry=DEFAULT, timeout=DEFAULT): complete. Note that if retry is specified, the timeout applies to each individual attempt. :type timeout: float - :return: A google.cloud.container_v1.types.Cluster instance + :return: google.cloud.container_v1.types.Cluster """ - self.log.info("Fetching cluster (project_id={}, zone={}, cluster_name={})".format( - self.project_id, - self.location, - name)) - - return self.client.get_cluster(project_id=self.project_id, - zone=self.location, - cluster_id=name, - retry=retry, - timeout=timeout).self_link + self.log.info( + "Fetching cluster (project_id=%s, zone=%s, cluster_name=%s)", + project_id or self.project_id, self.location, name + ) + + return self.get_client().get_cluster(project_id=project_id or self.project_id, + zone=self.location, + cluster_id=name, + retry=retry, + timeout=timeout).self_link diff --git a/airflow/contrib/hooks/gcp_dataflow_hook.py b/airflow/contrib/hooks/gcp_dataflow_hook.py index 7abb413a1afcf..44679b3205362 100644 --- a/airflow/contrib/hooks/gcp_dataflow_hook.py +++ b/airflow/contrib/hooks/gcp_dataflow_hook.py @@ -17,12 +17,13 @@ # specific language governing permissions and limitations # under the License. 
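A hedged usage sketch of the reworked GKEClusterHook above, showing that a plain dict is accepted for the cluster spec and converted to the protobuf internally. Cluster name, zone, and field values are placeholders, and real GCP credentials are needed to run it:

from airflow.contrib.hooks.gcp_container_hook import GKEClusterHook

hook = GKEClusterHook(gcp_conn_id='google_cloud_default', location='europe-west1-b')

# A dict in the shape of google.cloud.container_v1.types.Cluster; the hook
# converts it with _dict_to_proto and appends the airflow-version label.
cluster_spec = {
    'name': 'example-cluster',   # placeholder
    'initial_node_count': 1,
}

cluster_url = hook.create_cluster(cluster=cluster_spec)
print(cluster_url)

# get_cluster / delete_cluster use the connection-level project by default.
hook.get_cluster(name='example-cluster')
hook.delete_cluster(name='example-cluster')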
import json +import re import select import subprocess import time import uuid -from apiclient.discovery import build +from googleapiclient.discovery import build from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from airflow.utils.log.logging_mixin import LoggingMixin @@ -33,12 +34,13 @@ class _DataflowJob(LoggingMixin): - def __init__(self, dataflow, project_number, name, location, poll_sleep=10): + def __init__(self, dataflow, project_number, name, location, poll_sleep=10, + job_id=None): self._dataflow = dataflow self._project_number = project_number self._job_name = name self._job_location = location - self._job_id = None + self._job_id = job_id self._job = self._get_job() self._poll_sleep = poll_sleep @@ -46,7 +48,7 @@ def _get_job_id_from_name(self): jobs = self._dataflow.projects().locations().jobs().list( projectId=self._project_number, location=self._job_location - ).execute() + ).execute(num_retries=5) for job in jobs['jobs']: if job['name'] == self._job_name: self._job_id = job['id'] @@ -54,13 +56,15 @@ def _get_job_id_from_name(self): return None def _get_job(self): - if self._job_name: + if self._job_id: + job = self._dataflow.projects().locations().jobs().get( + projectId=self._project_number, + location=self._job_location, + jobId=self._job_id).execute(num_retries=5) + elif self._job_name: job = self._get_job_id_from_name() else: - job = self._dataflow.projects().jobs().get( - projectId=self._project_number, - jobId=self._job_id - ).execute() + raise Exception('Missing both dataflow job ID and name.') if job and 'currentState' in job: self.log.info( @@ -123,36 +127,50 @@ def __init__(self, cmd): def _line(self, fd): if fd == self._proc.stderr.fileno(): - lines = self._proc.stderr.readlines() - for line in lines: + line = b''.join(self._proc.stderr.readlines()) + if line: self.log.warning(line[:-1]) - if lines: - return lines[-1] + return line if fd == self._proc.stdout.fileno(): - line = self._proc.stdout.readline() + line = b''.join(self._proc.stdout.readlines()) + if line: + self.log.info(line[:-1]) return line @staticmethod def _extract_job(line): - if line is not None: - if line.startswith("Submitted job: "): - return line[15:-1] + # Job id info: https://goo.gl/SE29y9. + job_id_pattern = re.compile( + br'.*console.cloud.google.com/dataflow.*/jobs/([a-z|0-9|A-Z|\-|\_]+).*') + matched_job = job_id_pattern.search(line or '') + if matched_job: + return matched_job.group(1).decode() def wait_for_done(self): reads = [self._proc.stderr.fileno(), self._proc.stdout.fileno()] self.log.info("Start waiting for DataFlow process to complete.") - while self._proc.poll() is None: + job_id = None + # Make sure logs are processed regardless whether the subprocess is + # terminated. + process_ends = False + while True: ret = select.select(reads, [], [], 5) if ret is not None: for fd in ret[0]: line = self._line(fd) if line: - self.log.debug(line[:-1]) + job_id = job_id or self._extract_job(line) else: self.log.info("Waiting for DataFlow process to complete.") - if self._proc.returncode is not 0: + if process_ends: + break + if self._proc.poll() is not None: + # Mark process completion but allows its outputs to be consumed. + process_ends = True + if self._proc.returncode != 0: raise Exception("DataFlow failed with return code {}".format( self._proc.returncode)) + return job_id class DataFlowHook(GoogleCloudBaseHook): @@ -166,20 +184,20 @@ def __init__(self, def get_conn(self): """ - Returns a Google Cloud Storage service object. 
+ Returns a Google Cloud Dataflow service object. """ http_authorized = self._authorize() return build( 'dataflow', 'v1b3', http=http_authorized, cache_discovery=False) - def _start_dataflow(self, task_id, variables, name, - command_prefix, label_formatter): + @GoogleCloudBaseHook._Decorators.provide_gcp_credential_file + def _start_dataflow(self, variables, name, command_prefix, label_formatter): variables = self._set_variables(variables) - cmd = command_prefix + self._build_cmd(task_id, variables, - label_formatter) - _Dataflow(cmd).wait_for_done() + cmd = command_prefix + self._build_cmd(variables, label_formatter) + job_id = _Dataflow(cmd).wait_for_done() _DataflowJob(self.get_conn(), variables['project'], name, - variables['region'], self.poll_sleep).wait_for_done() + variables['region'], + self.poll_sleep, job_id).wait_for_done() @staticmethod def _set_variables(variables): @@ -189,12 +207,9 @@ def _set_variables(variables): variables['region'] = DEFAULT_DATAFLOW_LOCATION return variables - def start_java_dataflow(self, task_id, variables, dataflow, job_class=None, + def start_java_dataflow(self, job_name, variables, dataflow, job_class=None, append_job_name=True): - if append_job_name: - name = task_id + "-" + str(uuid.uuid1())[:8] - else: - name = task_id + name = self._build_dataflow_job_name(job_name, append_job_name) variables['jobName'] = name def label_formatter(labels_dict): @@ -202,34 +217,45 @@ def label_formatter(labels_dict): json.dumps(labels_dict).replace(' ', ''))] command_prefix = (["java", "-cp", dataflow, job_class] if job_class else ["java", "-jar", dataflow]) - self._start_dataflow(task_id, variables, name, - command_prefix, label_formatter) + self._start_dataflow(variables, name, command_prefix, label_formatter) - def start_template_dataflow(self, task_id, variables, parameters, dataflow_template, + def start_template_dataflow(self, job_name, variables, parameters, dataflow_template, append_job_name=True): - if append_job_name: - name = task_id + "-" + str(uuid.uuid1())[:8] - else: - name = task_id + variables = self._set_variables(variables) + name = self._build_dataflow_job_name(job_name, append_job_name) self._start_template_dataflow( name, variables, parameters, dataflow_template) - def start_python_dataflow(self, task_id, variables, dataflow, py_options, + def start_python_dataflow(self, job_name, variables, dataflow, py_options, append_job_name=True): - if append_job_name: - name = task_id + "-" + str(uuid.uuid1())[:8] - else: - name = task_id + name = self._build_dataflow_job_name(job_name, append_job_name) variables['job_name'] = name def label_formatter(labels_dict): return ['--labels={}={}'.format(key, value) for key, value in labels_dict.items()] - self._start_dataflow(task_id, variables, name, - ["python"] + py_options + [dataflow], + self._start_dataflow(variables, name, ["python2"] + py_options + [dataflow], label_formatter) - def _build_cmd(self, task_id, variables, label_formatter): + @staticmethod + def _build_dataflow_job_name(job_name, append_job_name=True): + base_job_name = str(job_name).replace('_', '-') + + if not re.match(r"^[a-z]([-a-z0-9]*[a-z0-9])?$", base_job_name): + raise ValueError( + 'Invalid job_name ({}); the name must consist of' + 'only the characters [-a-z0-9], starting with a ' + 'letter and ending with a letter or number '.format(base_job_name)) + + if append_job_name: + safe_job_name = base_job_name + "-" + str(uuid.uuid4())[:8] + else: + safe_job_name = base_job_name + + return safe_job_name + + @staticmethod + def 
_build_cmd(variables, label_formatter): command = ["--runner=DataflowRunner"] if variables is not None: for attr, value in variables.items(): @@ -241,21 +267,25 @@ def _build_cmd(self, task_id, variables, label_formatter): command.append("--" + attr + "=" + value) return command - def _start_template_dataflow(self, name, variables, parameters, dataflow_template): + def _start_template_dataflow(self, name, variables, parameters, + dataflow_template): # Builds RuntimeEnvironment from variables dictionary # https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment environment = {} for key in ['maxWorkers', 'zone', 'serviceAccountEmail', 'tempLocation', - 'bypassTempDirValidation', 'machineType']: + 'bypassTempDirValidation', 'machineType', 'network', 'subnetwork']: if key in variables: environment.update({key: variables[key]}) body = {"jobName": name, "parameters": parameters, "environment": environment} service = self.get_conn() - request = service.projects().templates().launch(projectId=variables['project'], - gcsPath=dataflow_template, - body=body) + request = service.projects().locations().templates().launch( + projectId=variables['project'], + location=variables['region'], + gcsPath=dataflow_template, + body=body + ) response = request.execute() variables = self._set_variables(variables) _DataflowJob(self.get_conn(), variables['project'], name, variables['region'], diff --git a/airflow/contrib/hooks/gcp_dataproc_hook.py b/airflow/contrib/hooks/gcp_dataproc_hook.py index fc15137cbfa91..0f5d6e216b018 100644 --- a/airflow/contrib/hooks/gcp_dataproc_hook.py +++ b/airflow/contrib/hooks/gcp_dataproc_hook.py @@ -20,14 +20,16 @@ import time import uuid -from apiclient.discovery import build +from googleapiclient.discovery import build +from zope.deprecation import deprecation from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from airflow.utils.log.logging_mixin import LoggingMixin class _DataProcJob(LoggingMixin): - def __init__(self, dataproc_api, project_id, job, region='global'): + def __init__(self, dataproc_api, project_id, job, region='global', + job_error_states=None): self.dataproc_api = dataproc_api self.project_id = project_id self.region = region @@ -36,6 +38,7 @@ def __init__(self, dataproc_api, project_id, job, region='global'): region=self.region, body=job).execute() self.job_id = self.job['reference']['jobId'] + self.job_error_states = job_error_states self.log.info( 'DataProc job %s is %s', self.job_id, str(self.job['status']['state']) @@ -48,13 +51,11 @@ def wait_for_done(self): region=self.region, jobId=self.job_id).execute(num_retries=5) if 'ERROR' == self.job['status']['state']: - print(str(self.job)) self.log.error('DataProc job %s has errors', self.job_id) self.log.error(self.job['status']['details']) self.log.debug(str(self.job)) return False if 'CANCELLED' == self.job['status']['state']: - print(str(self.job)) self.log.warning('DataProc job %s is cancelled', self.job_id) if 'details' in self.job['status']: self.log.warning(self.job['status']['details']) @@ -69,10 +70,14 @@ def wait_for_done(self): time.sleep(5) def raise_error(self, message=None): - if 'ERROR' == self.job['status']['state']: - if message is None: - message = "Google DataProc job has error" - raise Exception(message + ": " + str(self.job['status']['details'])) + job_state = self.job['status']['state'] + # We always consider ERROR to be an error state. 
+ if (self.job_error_states and job_state in self.job_error_states) or 'ERROR' == job_state: + ex_message = message or ("Google DataProc job has state: %s" % job_state) + ex_details = (str(self.job['status']['details']) + if 'details' in self.job['status'] + else "No details available") + raise Exception(ex_message + ": " + ex_details) def get(self): return self.job @@ -80,7 +85,7 @@ def get(self): class _DataProcJobBuilder: def __init__(self, project_id, task_id, cluster_name, job_type, properties): - name = task_id + "_" + str(uuid.uuid1())[:8] + name = task_id + "_" + str(uuid.uuid4())[:8] self.job_type = job_type self.job = { "job": { @@ -140,7 +145,7 @@ def set_python_main(self, main): self.job["job"][self.job_type]["mainPythonFileUri"] = main def set_job_name(self, name): - self.job["job"]["reference"]["jobId"] = name + "_" + str(uuid.uuid1())[:8] + self.job["job"]["reference"]["jobId"] = name + "_" + str(uuid.uuid4())[:8] def build(self): return self.job @@ -215,16 +220,26 @@ def get_cluster(self, project_id, region, cluster_name): clusterName=cluster_name ).execute(num_retries=5) - def submit(self, project_id, job, region='global'): - submitted = _DataProcJob(self.get_conn(), project_id, job, region) + def submit(self, project_id, job, region='global', job_error_states=None): + submitted = _DataProcJob(self.get_conn(), project_id, job, region, + job_error_states=job_error_states) if not submitted.wait_for_done(): - submitted.raise_error('DataProcTask has errors') + submitted.raise_error() def create_job_template(self, task_id, cluster_name, job_type, properties): return _DataProcJobBuilder(self.project_id, task_id, cluster_name, job_type, properties) - def await(self, operation): + def wait(self, operation): """Awaits for Google Cloud Dataproc Operation to complete.""" submitted = _DataProcOperation(self.get_conn(), operation) submitted.wait_for_done() + + +setattr( + DataProcHook, + "await", + deprecation.deprecated( + DataProcHook.wait, "renamed to 'wait' for Python3.7 compatibility" + ), +) diff --git a/airflow/contrib/hooks/gcp_function_hook.py b/airflow/contrib/hooks/gcp_function_hook.py new file mode 100644 index 0000000000000..86a82dd7607c7 --- /dev/null +++ b/airflow/contrib/hooks/gcp_function_hook.py @@ -0,0 +1,201 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
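Because await became a reserved word in Python 3.7, the Dataproc hook above renames the method to wait and keeps the old name only as a deprecated alias via zope.deprecation. A short sketch of how callers are expected to adapt (only attribute checks, so it runs without a configured connection):

from airflow.contrib.hooks.gcp_dataproc_hook import DataProcHook

# New, Python 3.7-safe spelling:
assert callable(DataProcHook.wait)

# The old spelling is kept as a deprecated alias. Because `await` is a keyword
# from Python 3.7 onwards it can only be reached dynamically, e.g.:
#     getattr(hook, 'await')(operation)   # emits a DeprecationWarning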
+ +import time +import requests +from googleapiclient.discovery import build + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +# Number of retries - used by googleapiclient method calls to perform retries +# For requests that are "retriable" +NUM_RETRIES = 5 + +# Time to sleep between active checks of the operation results +TIME_TO_SLEEP_IN_SECONDS = 1 + + +# noinspection PyAbstractClass +class GcfHook(GoogleCloudBaseHook): + """ + Hook for the Google Cloud Functions APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. + """ + _conn = None + + def __init__(self, + api_version, + gcp_conn_id='google_cloud_default', + delegate_to=None): + super(GcfHook, self).__init__(gcp_conn_id, delegate_to) + self.api_version = api_version + + @staticmethod + def _full_location(project_id, location): + """ + Retrieve full location of the function in the form of + projects//locations/ + + :param project_id: The Google Cloud Project project_id where the function belongs. + :type project_id: str + :param location: The location where the function is created. + :type location: str + :return: + """ + return 'projects/{}/locations/{}'.format(project_id, location) + + def get_conn(self): + """ + Retrieves the connection to Cloud Functions. + + :return: Google Cloud Function services object. + :rtype: dict + """ + if not self._conn: + http_authorized = self._authorize() + self._conn = build('cloudfunctions', self.api_version, + http=http_authorized, cache_discovery=False) + return self._conn + + def get_function(self, name): + """ + Returns the Cloud Function with the given name. + + :param name: Name of the function. + :type name: str + :return: A Cloud Functions object representing the function. + :rtype: dict + """ + return self.get_conn().projects().locations().functions().get( + name=name).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_new_function(self, location, body, project_id=None): + """ + Creates a new function in Cloud Function in the location specified in the body. + + :param location: The location of the function. + :type location: str + :param body: The body required by the Cloud Functions insert API. + :type body: dict + :param project_id: Optional, Google Cloud Project project_id where the function belongs. + If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().projects().locations().functions().create( + location=self._full_location(project_id, location), + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(operation_name=operation_name) + + def update_function(self, name, body, update_mask): + """ + Updates Cloud Functions according to the specified update mask. + + :param name: The name of the function. + :type name: str + :param body: The body required by the cloud function patch API. + :type body: dict + :param update_mask: The update mask - array of fields that should be patched. 
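Cloud Functions are addressed by fully qualified resource names built from the project and location, which is what _full_location above produces; individual functions extend that parent with a /functions/<name> suffix. A small sketch of the naming, with placeholder values:

def full_location(project_id, location):
    # Same format string the hook uses for the parent resource.
    return 'projects/{}/locations/{}'.format(project_id, location)

parent = full_location('my-project', 'europe-west1')           # placeholder values
function_name = '{}/functions/{}'.format(parent, 'hello-gcf')  # name passed to get_function()

print(parent)         # projects/my-project/locations/europe-west1
print(function_name)  # projects/my-project/locations/europe-west1/functions/hello-gcf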
+ :type update_mask: [str] + :return: None + """ + response = self.get_conn().projects().locations().functions().patch( + updateMask=",".join(update_mask), + name=name, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def upload_function_zip(self, location, zip_path, project_id=None): + """ + Uploads zip file with sources. + + :param location: The location where the function is created. + :type location: str + :param zip_path: The path of the valid .zip file to upload. + :type zip_path: str + :param project_id: Optional, Google Cloud Project project_id where the function belongs. + If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: The upload URL that was returned by generateUploadUrl method. + """ + response = self.get_conn().projects().locations().functions().generateUploadUrl( + parent=self._full_location(project_id, location) + ).execute(num_retries=NUM_RETRIES) + upload_url = response.get('uploadUrl') + with open(zip_path, 'rb') as fp: + requests.put( + url=upload_url, + data=fp, + # Those two headers needs to be specified according to: + # https://cloud.google.com/functions/docs/reference/rest/v1/projects.locations.functions/generateUploadUrl + # nopep8 + headers={ + 'Content-type': 'application/zip', + 'x-goog-content-length-range': '0,104857600', + } + ) + return upload_url + + def delete_function(self, name): + """ + Deletes the specified Cloud Function. + + :param name: The name of the function. + :type name: str + :return: None + """ + response = self.get_conn().projects().locations().functions().delete( + name=name).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(operation_name=operation_name) + + def _wait_for_operation_to_complete(self, operation_name): + """ + Waits for the named operation to complete - checks status of the + asynchronous call. + + :param operation_name: The name of the operation. + :type operation_name: str + :return: The response returned by the operation. + :rtype: dict + :exception: AirflowException in case error is returned. + """ + service = self.get_conn() + while True: + operation_response = service.operations().get( + name=operation_name, + ).execute(num_retries=NUM_RETRIES) + if operation_response.get("done"): + response = operation_response.get("response") + error = operation_response.get("error") + # Note, according to documentation always either response or error is + # set when "done" == True + if error: + raise AirflowException(str(error)) + return response + time.sleep(TIME_TO_SLEEP_IN_SECONDS) diff --git a/airflow/contrib/hooks/gcp_kms_hook.py b/airflow/contrib/hooks/gcp_kms_hook.py new file mode 100644 index 0000000000000..138cc173ed973 --- /dev/null +++ b/airflow/contrib/hooks/gcp_kms_hook.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import base64 + +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +from googleapiclient.discovery import build + + +def _b64encode(s): + """ Base 64 encodes a bytes object to a string """ + return base64.b64encode(s).decode('ascii') + + +def _b64decode(s): + """ Base 64 decodes a string to bytes. """ + return base64.b64decode(s.encode('utf-8')) + + +class GoogleCloudKMSHook(GoogleCloudBaseHook): + """ + Interact with Google Cloud KMS. This hook uses the Google Cloud Platform + connection. + """ + + def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None): + super(GoogleCloudKMSHook, self).__init__(gcp_conn_id, delegate_to=delegate_to) + + def get_conn(self): + """ + Returns a KMS service object. + + :rtype: googleapiclient.discovery.Resource + """ + http_authorized = self._authorize() + return build( + 'cloudkms', 'v1', http=http_authorized, cache_discovery=False) + + def encrypt(self, key_name, plaintext, authenticated_data=None): + """ + Encrypts a plaintext message using Google Cloud KMS. + + :param key_name: The Resource Name for the key (or key version) + to be used for encyption. Of the form + ``projects/*/locations/*/keyRings/*/cryptoKeys/**`` + :type key_name: str + :param plaintext: The message to be encrypted. + :type plaintext: bytes + :param authenticated_data: Optional additional authenticated data that + must also be provided to decrypt the message. + :type authenticated_data: bytes + :return: The base 64 encoded ciphertext of the original message. + :rtype: str + """ + keys = self.get_conn().projects().locations().keyRings().cryptoKeys() + body = {'plaintext': _b64encode(plaintext)} + if authenticated_data: + body['additionalAuthenticatedData'] = _b64encode(authenticated_data) + + request = keys.encrypt(name=key_name, body=body) + response = request.execute() + + ciphertext = response['ciphertext'] + return ciphertext + + def decrypt(self, key_name, ciphertext, authenticated_data=None): + """ + Decrypts a ciphertext message using Google Cloud KMS. + + :param key_name: The Resource Name for the key to be used for decyption. + Of the form ``projects/*/locations/*/keyRings/*/cryptoKeys/**`` + :type key_name: str + :param ciphertext: The message to be decrypted. + :type ciphertext: str + :param authenticated_data: Any additional authenticated data that was + provided when encrypting the message. + :type authenticated_data: bytes + :return: The original message. + :rtype: bytes + """ + keys = self.get_conn().projects().locations().keyRings().cryptoKeys() + body = {'ciphertext': ciphertext} + if authenticated_data: + body['additionalAuthenticatedData'] = _b64encode(authenticated_data) + + request = keys.decrypt(name=key_name, body=body) + response = request.execute() + + plaintext = _b64decode(response['plaintext']) + return plaintext diff --git a/airflow/contrib/hooks/gcp_mlengine_hook.py b/airflow/contrib/hooks/gcp_mlengine_hook.py index 66f392b1564c9..715e82d47d5de 100644 --- a/airflow/contrib/hooks/gcp_mlengine_hook.py +++ b/airflow/contrib/hooks/gcp_mlengine_hook.py @@ -15,8 +15,8 @@ # limitations under the License. 
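For reference, the GoogleCloudKMSHook added above is symmetric: ``encrypt`` takes bytes and returns base64 text, while ``decrypt`` takes that text and returns the original bytes. A minimal round-trip sketch; the key resource name is a placeholder:

from airflow.contrib.hooks.gcp_kms_hook import GoogleCloudKMSHook

kms = GoogleCloudKMSHook(gcp_conn_id='google_cloud_default')
key_name = ('projects/example-project/locations/global/'
            'keyRings/example-ring/cryptoKeys/example-key')

# encrypt() returns a base64-encoded string; decrypt() reverses it to bytes.
ciphertext = kms.encrypt(key_name, b'secret payload',
                         authenticated_data=b'aad-context')
plaintext = kms.decrypt(key_name, ciphertext,
                        authenticated_data=b'aad-context')
assert plaintext == b'secret payload'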
import random import time -from apiclient import errors -from apiclient.discovery import build +from googleapiclient.errors import HttpError +from googleapiclient.discovery import build from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from airflow.utils.log.logging_mixin import LoggingMixin @@ -37,7 +37,7 @@ def _poll_with_exponential_delay(request, max_n, is_done_func, is_error_func): return response else: time.sleep((2**i) + (random.randint(0, 1000) / 1000)) - except errors.HttpError as e: + except HttpError as e: if e.resp.status != 429: log.info('Something went wrong. Not retrying: %s', format(e)) raise @@ -63,10 +63,11 @@ def create_job(self, project_id, job, use_existing_job_fn=None): :param project_id: The Google Cloud project id within which MLEngine job will be launched. - :type project_id: string + :type project_id: str :param job: MLEngine Job object that should be provided to the MLEngine API, such as: :: + { 'jobId': 'my_job_id', 'trainingInput': { @@ -74,6 +75,7 @@ def create_job(self, project_id, job, use_existing_job_fn=None): ... } } + :type job: dict :param use_existing_job_fn: In case that a MLEngine job with the same @@ -96,7 +98,7 @@ def create_job(self, project_id, job, use_existing_job_fn=None): try: request.execute() - except errors.HttpError as e: + except HttpError as e: # 409 means there is an existing job with the same job ID. if e.resp.status == 409: if use_existing_job_fn is not None: @@ -126,14 +128,14 @@ def _get_job(self, project_id, job_id): :rtype: dict Raises: - apiclient.errors.HttpError: if HTTP error is returned from server + googleapiclient.errors.HttpError: if HTTP error is returned from server """ job_name = 'projects/{}/jobs/{}'.format(project_id, job_id) request = self._mlengine.projects().jobs().get(name=job_name) while True: try: return request.execute() - except errors.HttpError as e: + except HttpError as e: if e.resp.status == 429: # polling after 30 seconds when quota failure occurs time.sleep(30) @@ -149,10 +151,11 @@ def _wait_for_job_done(self, project_id, job_id, interval=30): a terminal state. Raises: - apiclient.errors.HttpError: if HTTP error is returned when getting + googleapiclient.errors.HttpError: if HTTP error is returned when getting the job """ - assert interval > 0 + if interval <= 0: + raise ValueError("Interval must be > 0") while True: job = self._get_job(project_id, job_id) if job['state'] in ['SUCCEEDED', 'FAILED', 'CANCELLED']: @@ -192,7 +195,7 @@ def set_default_version(self, project_id, model_name, version_name): response = request.execute() self.log.info('Successfully set version: %s to default', response) return response - except errors.HttpError as e: + except HttpError as e: self.log.error('Something went wrong: %s', e) raise @@ -242,7 +245,9 @@ def create_model(self, project_id, model): """ Create a Model. Blocks until finished. """ - assert model['name'] is not None and model['name'] is not '' + if not model['name']: + raise ValueError("Model name must be provided and " + "could not be an empty string") project = 'projects/{}'.format(project_id) request = self._mlengine.projects().models().create( @@ -253,13 +258,15 @@ def get_model(self, project_id, model_name): """ Gets a Model. Blocks until finished. 
""" - assert model_name is not None and model_name is not '' + if not model_name: + raise ValueError("Model name must be provided and " + "it could not be an empty string") full_model_name = 'projects/{}/models/{}'.format( project_id, model_name) request = self._mlengine.projects().models().get(name=full_model_name) try: return request.execute() - except errors.HttpError as e: + except HttpError as e: if e.resp.status == 404: self.log.error('Model was not found: %s', e) return None diff --git a/airflow/contrib/hooks/gcp_natural_language_hook.py b/airflow/contrib/hooks/gcp_natural_language_hook.py new file mode 100644 index 0000000000000..c5fc935854256 --- /dev/null +++ b/airflow/contrib/hooks/gcp_natural_language_hook.py @@ -0,0 +1,217 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from google.cloud.language_v1 import LanguageServiceClient + + +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + + +# noinspection PyAbstractClass +class CloudNaturalLanguageHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Natural Language Service. + + :param gcp_conn_id: The connection ID to use when fetching connection info. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + """ + + _conn = None + + def __init__(self, gcp_conn_id="google_cloud_default", delegate_to=None): + super(CloudNaturalLanguageHook, self).__init__(gcp_conn_id, delegate_to) + + def get_conn(self): + """ + Retrieves connection to Cloud Natural Language service. + + :return: Cloud Natural Language service object + :rtype: google.cloud.language_v1.LanguageServiceClient + """ + if not self._conn: + self._conn = LanguageServiceClient(credentials=self._get_credentials()) + return self._conn + + @GoogleCloudBaseHook.catch_http_exception + def analyze_entities(self, document, encoding_type=None, retry=None, timeout=None, metadata=None): + """ + Finds named entities in the text along with entity types, + salience, mentions for each entity, and other properties. + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or class google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. 
Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + """ + client = self.get_conn() + + return client.analyze_entities( + document=document, encoding_type=encoding_type, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def analyze_entity_sentiment(self, document, encoding_type=None, retry=None, timeout=None, metadata=None): + """ + Finds entities, similar to AnalyzeEntities in the text and analyzes sentiment associated with each + entity and its mentions. + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or class google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + """ + client = self.get_conn() + + return client.analyze_entity_sentiment( + document=document, encoding_type=encoding_type, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def analyze_sentiment(self, document, encoding_type=None, retry=None, timeout=None, metadata=None): + """ + Analyzes the sentiment of the provided text. + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or class google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + """ + client = self.get_conn() + + return client.analyze_sentiment( + document=document, encoding_type=encoding_type, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def analyze_syntax(self, document, encoding_type=None, retry=None, timeout=None, metadata=None): + """ + Analyzes the syntax of the text and provides sentence boundaries and tokenization along with part + of speech tags, dependency trees, and other properties. + + :param document: Input document. 
+ If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or class google.cloud.language_v1.types.Document# + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeSyntaxResponse + """ + client = self.get_conn() + + return client.analyze_syntax( + document=document, encoding_type=encoding_type, retry=retry, timeout=timeout, metadata=metadata + ) + + @GoogleCloudBaseHook.catch_http_exception + def annotate_text(self, document, features, encoding_type=None, retry=None, timeout=None, metadata=None): + """ + A convenience method that provides all the features that analyzeSentiment, + analyzeEntities, and analyzeSyntax provide in one call. + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or google.cloud.language_v1.types.Document + :param features: The enabled features. + If a dict is provided, it must be of the same form as the protobuf message Features + :type features: dict or google.cloud.language_v1.enums.Features + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnnotateTextResponse + """ + client = self.get_conn() + + return client.annotate_text( + document=document, + features=features, + encoding_type=encoding_type, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + @GoogleCloudBaseHook.catch_http_exception + def classify_text(self, document, retry=None, timeout=None, metadata=None): + """ + Classifies a document into categories. + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or class google.cloud.language_v1.types.Document + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :type retry: google.api_core.retry.Retry + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. 
+ :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + """ + client = self.get_conn() + + return client.classify_text(document=document, retry=retry, timeout=timeout, metadata=metadata) diff --git a/airflow/contrib/hooks/gcp_pubsub_hook.py b/airflow/contrib/hooks/gcp_pubsub_hook.py index 1d55d5d487237..50512d2127463 100644 --- a/airflow/contrib/hooks/gcp_pubsub_hook.py +++ b/airflow/contrib/hooks/gcp_pubsub_hook.py @@ -19,8 +19,8 @@ from uuid import uuid4 -from apiclient.discovery import build -from apiclient import errors +from googleapiclient.discovery import build +from googleapiclient.errors import HttpError from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook @@ -50,7 +50,7 @@ def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None): def get_conn(self): """Returns a Pub/Sub service object. - :rtype: apiclient.discovery.Resource + :rtype: googleapiclient.discovery.Resource """ http_authorized = self._authorize() return build( @@ -60,10 +60,10 @@ def publish(self, project, topic, messages): """Publishes messages to a Pub/Sub topic. :param project: the GCP project ID in which to publish - :type project: string + :type project: str :param topic: the Pub/Sub topic to which to publish; do not include the ``projects/{project}/topics/`` prefix. - :type topic: string + :type topic: str :param messages: messages to publish; if the data field in a message is set, it should already be base64 encoded. :type messages: list of PubSub messages; see @@ -75,7 +75,7 @@ def publish(self, project, topic, messages): topic=full_topic, body=body) try: request.execute() - except errors.HttpError as e: + except HttpError as e: raise PubSubException( 'Error publishing to topic {}'.format(full_topic), e) @@ -84,10 +84,10 @@ def create_topic(self, project, topic, fail_if_exists=False): :param project: the GCP project ID in which to create the topic - :type project: string + :type project: str :param topic: the Pub/Sub topic name to create; do not include the ``projects/{project}/topics/`` prefix. - :type topic: string + :type topic: str :param fail_if_exists: if set, raise an exception if the topic already exists :type fail_if_exists: bool @@ -97,7 +97,7 @@ def create_topic(self, project, topic, fail_if_exists=False): try: service.projects().topics().create( name=full_topic, body={}).execute() - except errors.HttpError as e: + except HttpError as e: # Status code 409 indicates that the topic already exists. if str(e.resp['status']) == '409': message = 'Topic already exists: {}'.format(full_topic) @@ -112,10 +112,10 @@ def delete_topic(self, project, topic, fail_if_not_exists=False): """Deletes a Pub/Sub topic if it exists. :param project: the GCP project ID in which to delete the topic - :type project: string + :type project: str :param topic: the Pub/Sub topic name to delete; do not include the ``projects/{project}/topics/`` prefix. 
- :type topic: string + :type topic: str :param fail_if_not_exists: if set, raise an exception if the topic does not exist :type fail_if_not_exists: bool @@ -124,7 +124,7 @@ def delete_topic(self, project, topic, fail_if_not_exists=False): full_topic = _format_topic(project, topic) try: service.projects().topics().delete(topic=full_topic).execute() - except errors.HttpError as e: + except HttpError as e: # Status code 409 indicates that the topic was not found if str(e.resp['status']) == '404': message = 'Topic does not exist: {}'.format(full_topic) @@ -142,17 +142,17 @@ def create_subscription(self, topic_project, topic, subscription=None, :param topic_project: the GCP project ID of the topic that the subscription will be bound to. - :type topic_project: string + :type topic_project: str :param topic: the Pub/Sub topic name that the subscription will be bound to create; do not include the ``projects/{project}/subscriptions/`` prefix. - :type topic: string + :type topic: str :param subscription: the Pub/Sub subscription name. If empty, a random name will be generated using the uuid module - :type subscription: string + :type subscription: str :param subscription_project: the GCP project ID where the subscription will be created. If unspecified, ``topic_project`` will be used. - :type subscription_project: string + :type subscription_project: str :param ack_deadline_secs: Number of seconds that a subscriber has to acknowledge each message pulled from the subscription :type ack_deadline_secs: int @@ -161,7 +161,7 @@ def create_subscription(self, topic_project, topic, subscription=None, :type fail_if_exists: bool :return: subscription name which will be the system-generated value if the ``subscription`` parameter is not supplied - :rtype: string + :rtype: str """ service = self.get_conn() full_topic = _format_topic(topic_project, topic) @@ -178,7 +178,7 @@ def create_subscription(self, topic_project, topic, subscription=None, try: service.projects().subscriptions().create( name=full_subscription, body=body).execute() - except errors.HttpError as e: + except HttpError as e: # Status code 409 indicates that the subscription already exists. if str(e.resp['status']) == '409': message = 'Subscription already exists: {}'.format( @@ -197,10 +197,10 @@ def delete_subscription(self, project, subscription, """Deletes a Pub/Sub subscription, if it exists. :param project: the GCP project ID where the subscription exists - :type project: string + :type project: str :param subscription: the Pub/Sub subscription name to delete; do not include the ``projects/{project}/subscriptions/`` prefix. - :type subscription: string + :type subscription: str :param fail_if_not_exists: if set, raise an exception if the topic does not exist :type fail_if_not_exists: bool @@ -210,7 +210,7 @@ def delete_subscription(self, project, subscription, try: service.projects().subscriptions().delete( subscription=full_subscription).execute() - except errors.HttpError as e: + except HttpError as e: # Status code 404 indicates that the subscription was not found if str(e.resp['status']) == '404': message = 'Subscription does not exist: {}'.format( @@ -228,10 +228,10 @@ def pull(self, project, subscription, max_messages, """Pulls up to ``max_messages`` messages from Pub/Sub subscription. :param project: the GCP project ID where the subscription exists - :type project: string + :type project: str :param subscription: the Pub/Sub subscription name to pull from; do not include the 'projects/{project}/topics/' prefix. 
- :type subscription: string + :type subscription: str :param max_messages: The maximum number of messages to return from the Pub/Sub API. :type max_messages: int @@ -239,11 +239,10 @@ def pull(self, project, subscription, max_messages, return if no messages are available. Otherwise, the request will block for an undisclosed, but bounded period of time :type return_immediately: bool - :return A list of Pub/Sub ReceivedMessage objects each containing + :return: A list of Pub/Sub ReceivedMessage objects each containing an ``ackId`` property and a ``message`` property, which includes the base64-encoded message content. See - https://cloud.google.com/pubsub/docs/reference/rest/v1/\ - projects.subscriptions/pull#ReceivedMessage + https://cloud.google.com/pubsub/docs/reference/rest/v1/projects.subscriptions/pull#ReceivedMessage """ service = self.get_conn() full_subscription = _format_subscription(project, subscription) @@ -255,7 +254,7 @@ def pull(self, project, subscription, max_messages, response = service.projects().subscriptions().pull( subscription=full_subscription, body=body).execute() return response.get('receivedMessages', []) - except errors.HttpError as e: + except HttpError as e: raise PubSubException( 'Error pulling messages from subscription {}'.format( full_subscription), e) @@ -265,10 +264,10 @@ def acknowledge(self, project, subscription, ack_ids): :param project: the GCP project name or ID in which to create the topic - :type project: string + :type project: str :param subscription: the Pub/Sub subscription name to delete; do not include the 'projects/{project}/topics/' prefix. - :type subscription: string + :type subscription: str :param ack_ids: List of ReceivedMessage ackIds from a previous pull response :type ack_ids: list @@ -279,7 +278,7 @@ def acknowledge(self, project, subscription, ack_ids): service.projects().subscriptions().acknowledge( subscription=full_subscription, body={'ackIds': ack_ids} ).execute() - except errors.HttpError as e: + except HttpError as e: raise PubSubException( 'Error acknowledging {} messages pulled from subscription {}' .format(len(ack_ids), full_subscription), e) diff --git a/airflow/contrib/hooks/gcp_spanner_hook.py b/airflow/contrib/hooks/gcp_spanner_hook.py new file mode 100644 index 0000000000000..6b442c46989f5 --- /dev/null +++ b/airflow/contrib/hooks/gcp_spanner_hook.py @@ -0,0 +1,349 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
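The PubSubHook edits above are mechanical (``apiclient`` becomes ``googleapiclient``, docstring types switch from ``string`` to ``str``), so the public API is unchanged. As a reminder of how the pieces fit together, a typical publish/pull/acknowledge cycle looks roughly like the sketch below; project, topic and subscription names are placeholders, and message data must already be base64-encoded by the caller:

from base64 import b64encode

from airflow.contrib.hooks.gcp_pubsub_hook import PubSubHook

hook = PubSubHook(gcp_conn_id='google_cloud_default')
hook.create_topic('example-project', 'example-topic', fail_if_exists=False)
hook.create_subscription('example-project', 'example-topic',
                         subscription='example-sub')

# The data field of each message must already be base64-encoded.
hook.publish('example-project', 'example-topic',
             [{'data': b64encode(b'hello').decode('ascii')}])

messages = hook.pull('example-project', 'example-sub', max_messages=5,
                     return_immediately=True)
if messages:
    hook.acknowledge('example-project', 'example-sub',
                     [m['ackId'] for m in messages])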
+from google.api_core.exceptions import GoogleAPICallError, AlreadyExists +from google.cloud.spanner_v1.client import Client +from google.longrunning.operations_grpc_pb2 import Operation # noqa: F401 + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + + +class CloudSpannerHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Spanner APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. + """ + _client = None + + def __init__(self, + gcp_conn_id='google_cloud_default', + delegate_to=None): + super(CloudSpannerHook, self).__init__(gcp_conn_id, delegate_to) + + def _get_client(self, project_id): + """ + Provides a client for interacting with the Cloud Spanner API. + + :param project_id: The ID of the GCP project. + :type project_id: str + :return: google.cloud.spanner_v1.client.Client + :rtype: object + """ + if not self._client: + self._client = Client(project=project_id, credentials=self._get_credentials()) + return self._client + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_instance(self, instance_id, project_id=None): + """ + Gets information about a particular instance. + + :param project_id: Optional, The ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :return: google.cloud.spanner_v1.instance.Instance + :rtype: object + """ + instance = self._get_client(project_id=project_id).instance(instance_id=instance_id) + if not instance.exists(): + return None + return instance + + def _apply_to_instance(self, project_id, instance_id, configuration_name, node_count, + display_name, func): + """ + Invokes a method on a given instance by applying a specified Callable. + + :param project_id: The ID of the GCP project that owns the Cloud Spanner + database. + :type project_id: str + :param instance_id: The ID of the instance. + :type instance_id: str + :param configuration_name: Name of the instance configuration defining how the + instance will be created. Required for instances which do not yet exist. + :type configuration_name: str + :param node_count: (Optional) Number of nodes allocated to the instance. + :type node_count: int + :param display_name: (Optional) The display name for the instance in the Cloud + Console UI. (Must be between 4 and 30 characters.) If this value is not set + in the constructor, will fall back to the instance ID. + :type display_name: str + :param func: Method of the instance to be called. + :type func: Callable + """ + # noinspection PyUnresolvedReferences + instance = self._get_client(project_id=project_id).instance( + instance_id=instance_id, configuration_name=configuration_name, + node_count=node_count, display_name=display_name) + try: + operation = func(instance) # type: Operation + except GoogleAPICallError as e: + self.log.error('An error occurred: %s. Exiting.', e.message) + raise e + + if operation: + result = operation.result() + self.log.info(result) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_instance(self, instance_id, configuration_name, node_count, + display_name, project_id=None): + """ + Creates a new Cloud Spanner instance. + + :param instance_id: The ID of the Cloud Spanner instance. 
+ :type instance_id: str + :param configuration_name: The name of the instance configuration defining how the + instance will be created. Possible configuration values can be retrieved via + https://cloud.google.com/spanner/docs/reference/rest/v1/projects.instanceConfigs/list + :type configuration_name: str + :param node_count: (Optional) The number of nodes allocated to the Cloud Spanner + instance. + :type node_count: int + :param display_name: (Optional) The display name for the instance in the GCP + Console. Must be between 4 and 30 characters. If this value is not set in + the constructor, the name falls back to the instance ID. + :type display_name: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + self._apply_to_instance(project_id, instance_id, configuration_name, + node_count, display_name, lambda x: x.create()) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def update_instance(self, instance_id, configuration_name, node_count, + display_name, project_id=None): + """ + Updates an existing Cloud Spanner instance. + + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param configuration_name: The name of the instance configuration defining how the + instance will be created. Possible configuration values can be retrieved via + https://cloud.google.com/spanner/docs/reference/rest/v1/projects.instanceConfigs/list + :type configuration_name: str + :param node_count: (Optional) The number of nodes allocated to the Cloud Spanner + instance. + :type node_count: int + :param display_name: (Optional) The display name for the instance in the GCP + Console. Must be between 4 and 30 characters. If this value is not set in + the constructor, the name falls back to the instance ID. + :type display_name: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + return self._apply_to_instance(project_id, instance_id, configuration_name, + node_count, display_name, lambda x: x.update()) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_instance(self, instance_id, project_id=None): + """ + Deletes an existing Cloud Spanner instance. + + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + + instance = self._get_client(project_id=project_id).instance(instance_id) + try: + instance.delete() + return + except GoogleAPICallError as e: + self.log.error('An error occurred: %s. Exiting.', e.message) + raise e + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_database(self, instance_id, database_id, project_id=None): + """ + Retrieves a database in Cloud Spanner. If the database does not exist + in the specified instance, it returns None. + + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param database_id: The ID of the database in Cloud Spanner. + :type database_id: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. 
If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: Database object or None if database does not exist + :rtype: google.cloud.spanner_v1.database.Database or None + """ + + instance = self._get_client(project_id=project_id).instance( + instance_id=instance_id) + if not instance.exists(): + raise AirflowException("The instance {} does not exist in project {} !". + format(instance_id, project_id)) + database = instance.database(database_id=database_id) + if not database.exists(): + return None + else: + return database + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_database(self, instance_id, database_id, ddl_statements, project_id=None): + """ + Creates a new database in Cloud Spanner. + + :type project_id: str + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param database_id: The ID of the database to create in Cloud Spanner. + :type database_id: str + :param ddl_statements: The string list containing DDL for the new database. + :type ddl_statements: list[str] + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :return: None + """ + + instance = self._get_client(project_id=project_id).instance( + instance_id=instance_id) + if not instance.exists(): + raise AirflowException("The instance {} does not exist in project {} !". + format(instance_id, project_id)) + database = instance.database(database_id=database_id, + ddl_statements=ddl_statements) + try: + operation = database.create() # type: Operation + except GoogleAPICallError as e: + self.log.error('An error occurred: %s. Exiting.', e.message) + raise e + + if operation: + result = operation.result() + self.log.info(result) + return + + @GoogleCloudBaseHook.fallback_to_default_project_id + def update_database(self, instance_id, database_id, ddl_statements, + project_id=None, + operation_id=None): + """ + Updates DDL of a database in Cloud Spanner. + + :type project_id: str + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param database_id: The ID of the database in Cloud Spanner. + :type database_id: str + :param ddl_statements: The string list containing DDL for the new database. + :type ddl_statements: list[str] + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :param operation_id: (Optional) The unique per database operation ID that can be + specified to implement idempotency check. + :type operation_id: str + :return: None + """ + + instance = self._get_client(project_id=project_id).instance( + instance_id=instance_id) + if not instance.exists(): + raise AirflowException("The instance {} does not exist in project {} !". + format(instance_id, project_id)) + database = instance.database(database_id=database_id) + try: + operation = database.update_ddl( + ddl_statements=ddl_statements, operation_id=operation_id) + if operation: + result = operation.result() + self.log.info(result) + return + except AlreadyExists as e: + if e.code == 409 and operation_id in e.message: + self.log.info("Replayed update_ddl message - the operation id %s " + "was already done before.", operation_id) + return + except GoogleAPICallError as e: + self.log.error('An error occurred: %s. 
Exiting.', e.message) + raise e + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_database(self, instance_id, database_id, project_id=None): + """ + Drops a database in Cloud Spanner. + + :type project_id: str + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param database_id: The ID of the database in Cloud Spanner. + :type database_id: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :return: True if everything succeeded + :rtype: bool + """ + + instance = self._get_client(project_id=project_id).\ + instance(instance_id=instance_id) + if not instance.exists(): + raise AirflowException("The instance {} does not exist in project {} !". + format(instance_id, project_id)) + database = instance.database(database_id=database_id) + if not database.exists(): + self.log.info("The database {} is already deleted from instance {}. " + "Exiting.".format(database_id, instance_id)) + return + try: + operation = database.drop() # type: Operation + except GoogleAPICallError as e: + self.log.error('An error occurred: %s. Exiting.', e.message) + raise e + + if operation: + result = operation.result() + self.log.info(result) + return + + @GoogleCloudBaseHook.fallback_to_default_project_id + def execute_dml(self, instance_id, database_id, queries, project_id=None): + """ + Executes an arbitrary DML query (INSERT, UPDATE, DELETE). + + :param instance_id: The ID of the Cloud Spanner instance. + :type instance_id: str + :param database_id: The ID of the database in Cloud Spanner. + :type database_id: str + :param queries: The queries to execute. + :type queries: str + :param project_id: Optional, the ID of the GCP project that owns the Cloud Spanner + database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + """ + self._get_client(project_id=project_id).instance(instance_id=instance_id).\ + database(database_id=database_id).run_in_transaction( + lambda transaction: self._execute_sql_in_transaction(transaction, queries)) + + @staticmethod + def _execute_sql_in_transaction(transaction, queries): + for sql in queries: + transaction.execute_update(sql) diff --git a/airflow/contrib/hooks/gcp_sql_hook.py b/airflow/contrib/hooks/gcp_sql_hook.py new file mode 100644 index 0000000000000..b81fc81cb4766 --- /dev/null +++ b/airflow/contrib/hooks/gcp_sql_hook.py @@ -0,0 +1,1000 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
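A short sketch of how the CloudSpannerHook above chains together, assuming the default ``google_cloud_default`` connection so ``project_id`` can be omitted everywhere; the instance, configuration, database and DDL values are placeholders, and ``execute_dml`` runs its statements inside a single read/write transaction:

from airflow.contrib.hooks.gcp_spanner_hook import CloudSpannerHook

spanner = CloudSpannerHook(gcp_conn_id='google_cloud_default')

spanner.create_instance(
    instance_id='example-instance',
    configuration_name='projects/example-project/instanceConfigs/regional-europe-west1',
    node_count=1,
    display_name='Example instance',
)
spanner.create_database(
    instance_id='example-instance',
    database_id='example-db',
    ddl_statements=['CREATE TABLE users (id INT64 NOT NULL) PRIMARY KEY (id)'],
)
# Each statement in `queries` is executed via transaction.execute_update().
spanner.execute_dml(
    instance_id='example-instance',
    database_id='example-db',
    queries=["INSERT INTO users (id) VALUES (1)"],
)
spanner.delete_database(instance_id='example-instance', database_id='example-db')
spanner.delete_instance(instance_id='example-instance')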
+import errno +import json +import os +import random +import re +import shutil +import string + +import socket +import platform +import subprocess +import time +import uuid +import os.path + +from googleapiclient.errors import HttpError +from subprocess import Popen, PIPE +from six.moves.urllib.parse import quote_plus + +import requests +from googleapiclient.discovery import build + +from airflow import AirflowException, LoggingMixin +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +# Number of retries - used by googleapiclient method calls to perform retries +# For requests that are "retriable" +from airflow.hooks.base_hook import BaseHook +from airflow.hooks.mysql_hook import MySqlHook +from airflow.hooks.postgres_hook import PostgresHook +from airflow.models.connection import Connection +from airflow.utils.db import provide_session + +UNIX_PATH_MAX = 108 + +NUM_RETRIES = 5 + +# Time to sleep between active checks of the operation results +TIME_TO_SLEEP_IN_SECONDS = 1 + + +class CloudSqlOperationStatus: + PENDING = "PENDING" + RUNNING = "RUNNING" + DONE = "DONE" + UNKNOWN = "UNKNOWN" + + +# noinspection PyAbstractClass +class CloudSqlHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud SQL APIs. + + All the methods in the hook where project_id is used must be called with + keyword arguments rather than positional. + """ + _conn = None + + def __init__(self, + api_version, + gcp_conn_id='google_cloud_default', + delegate_to=None): + super(CloudSqlHook, self).__init__(gcp_conn_id, delegate_to) + self.api_version = api_version + + def get_conn(self): + """ + Retrieves connection to Cloud SQL. + + :return: Google Cloud SQL services object. + :rtype: dict + """ + if not self._conn: + http_authorized = self._authorize() + self._conn = build('sqladmin', self.api_version, + http=http_authorized, cache_discovery=False) + return self._conn + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_instance(self, instance, project_id=None): + """ + Retrieves a resource containing information about a Cloud SQL instance. + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: A Cloud SQL instance resource. + :rtype: dict + """ + return self.get_conn().instances().get( + project=project_id, + instance=instance + ).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_instance(self, body, project_id=None): + """ + Creates a new Cloud SQL instance. + + :param body: Body required by the Cloud SQL insert API, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/insert#request-body. + :type body: dict + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instances().insert( + project=project_id, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def patch_instance(self, body, instance, project_id=None): + """ + Updates settings of a Cloud SQL instance. 
+ + Caution: This is not a partial update, so you must include values for + all the settings that you want to retain. + + :param body: Body required by the Cloud SQL patch API, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/patch#request-body. + :type body: dict + :param instance: Cloud SQL instance ID. This does not include the project ID. + :type instance: str + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().instances().patch( + project=project_id, + instance=instance, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_instance(self, instance, project_id=None): + """ + Deletes a Cloud SQL instance. + + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param instance: Cloud SQL instance ID. This does not include the project ID. + :type instance: str + :return: None + """ + response = self.get_conn().instances().delete( + project=project_id, + instance=instance, + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_database(self, instance, database, project_id=None): + """ + Retrieves a database resource from a Cloud SQL instance. + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param database: Name of the database in the instance. + :type database: str + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: A Cloud SQL database resource, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/databases#resource. + :rtype: dict + """ + return self.get_conn().databases().get( + project=project_id, + instance=instance, + database=database + ).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_database(self, instance, body, project_id=None): + """ + Creates a new database inside a Cloud SQL instance. + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/databases/insert#request-body. + :type body: dict + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. 
+ :type project_id: str + :return: None + """ + response = self.get_conn().databases().insert( + project=project_id, + instance=instance, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def patch_database(self, instance, database, body, project_id=None): + """ + Updates a database resource inside a Cloud SQL instance. + + This method supports patch semantics. + See https://cloud.google.com/sql/docs/mysql/admin-api/how-tos/performance#patch. + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param database: Name of the database to be updated in the instance. + :type database: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/databases/insert#request-body. + :type body: dict + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().databases().patch( + project=project_id, + instance=instance, + database=database, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_database(self, instance, database, project_id=None): + """ + Deletes a database from a Cloud SQL instance. + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param database: Name of the database to be deleted in the instance. + :type database: str + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + response = self.get_conn().databases().delete( + project=project_id, + instance=instance, + database=database + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def export_instance(self, instance, body, project_id=None): + """ + Exports data from a Cloud SQL instance to a Cloud Storage bucket as a SQL dump + or CSV file. + + :param instance: Database instance ID of the Cloud SQL instance. This does not include the + project ID. + :type instance: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/export#request-body + :type body: dict + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. 
+ :type project_id: str + :return: None + """ + try: + response = self.get_conn().instances().export( + project=project_id, + instance=instance, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + except HttpError as ex: + raise AirflowException( + 'Exporting instance {} failed: {}'.format(instance, ex.content) + ) + + @GoogleCloudBaseHook.fallback_to_default_project_id + def import_instance(self, instance, body, project_id=None): + """ + Imports data into a Cloud SQL instance from a SQL dump or CSV file in + Cloud Storage. + + :param instance: Database instance ID. This does not include the + project ID. + :type instance: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/export#request-body + :type body: dict + :param project_id: Project ID of the project that contains the instance. If set + to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :return: None + """ + try: + response = self.get_conn().instances().import_( + project=project_id, + instance=instance, + body=body + ).execute(num_retries=NUM_RETRIES) + operation_name = response["name"] + self._wait_for_operation_to_complete(project_id=project_id, + operation_name=operation_name) + except HttpError as ex: + raise AirflowException( + 'Importing instance {} failed: {}'.format(instance, ex.content) + ) + + def _wait_for_operation_to_complete(self, project_id, operation_name): + """ + Waits for the named operation to complete - checks status of the + asynchronous call. + + :param project_id: Project ID of the project that contains the instance. + :type project_id: str + :param operation_name: Name of the operation. + :type operation_name: str + :return: None + """ + service = self.get_conn() + while True: + operation_response = service.operations().get( + project=project_id, + operation=operation_name, + ).execute(num_retries=NUM_RETRIES) + if operation_response.get("status") == CloudSqlOperationStatus.DONE: + error = operation_response.get("error") + if error: + # Extracting the errors list as string and trimming square braces + error_msg = str(error.get("errors"))[1:-1] + raise AirflowException(error_msg) + # No meaningful info to return from the response in case of success + return + time.sleep(TIME_TO_SLEEP_IN_SECONDS) + + +CLOUD_SQL_PROXY_DOWNLOAD_URL = "https://dl.google.com/cloudsql/cloud_sql_proxy.{}.{}" +CLOUD_SQL_PROXY_VERSION_DOWNLOAD_URL = \ + "https://storage.googleapis.com/cloudsql-proxy/{}/cloud_sql_proxy.{}.{}" + +GCP_CREDENTIALS_KEY_PATH = "extra__google_cloud_platform__key_path" +GCP_CREDENTIALS_KEYFILE_DICT = "extra__google_cloud_platform__keyfile_dict" + + +class CloudSqlProxyRunner(LoggingMixin): + """ + Downloads and runs cloud-sql-proxy as subprocess of the Python process. + + The cloud-sql-proxy needs to be downloaded and started before we can connect + to the Google Cloud SQL instance via database connection. It establishes + secure tunnel connection to the database. It authorizes using the + GCP credentials that are passed by the configuration. 
+ + More details about the proxy can be found here: + https://cloud.google.com/sql/docs/mysql/sql-proxy + + """ + + def __init__(self, + path_prefix, + instance_specification, + gcp_conn_id='google_cloud_default', + project_id=None, + sql_proxy_version=None, + sql_proxy_binary_path=None): + """ + Creates the proxy runner class. + + :param path_prefix: Unique path prefix where proxy will be downloaded and + directories created for unix sockets. + :type path_prefix: str + :param instance_specification: Specification of the instance to connect the + proxy to. It should be specified in the form that is described in + https://cloud.google.com/sql/docs/mysql/sql-proxy#multiple-instances in + -instances parameter (typically in the form of ``::`` + for UNIX socket connections and in the form of + ``::=tcp:`` for TCP connections. + :type instance_specification: str + :param gcp_conn_id: Id of Google Cloud Platform connection to use for + authentication + :type gcp_conn_id: str + :param project_id: Optional id of the GCP project to connect to - it overwrites + default project id taken from the GCP connection. + :type project_id: str + :param sql_proxy_version: Specific version of SQL proxy to download + (for example 'v1.13'). By default latest version is downloaded. + :type sql_proxy_version: str + :param sql_proxy_binary_path: If specified, then proxy will be + used from the path specified rather than dynamically generated. This means + that if the binary is not present in that path it will also be downloaded. + :type sql_proxy_binary_path: str + """ + super(CloudSqlProxyRunner, self).__init__() + self.path_prefix = path_prefix + if not self.path_prefix: + raise AirflowException("The path_prefix must not be empty!") + self.sql_proxy_was_downloaded = False + self.sql_proxy_version = sql_proxy_version + self.download_sql_proxy_dir = None + self.sql_proxy_process = None + self.instance_specification = instance_specification + self.project_id = project_id + self.gcp_conn_id = gcp_conn_id + self.command_line_parameters = [] + self.cloud_sql_proxy_socket_directory = self.path_prefix + self.sql_proxy_path = sql_proxy_binary_path if sql_proxy_binary_path \ + else self.path_prefix + "_cloud_sql_proxy" + self.credentials_path = self.path_prefix + "_credentials.json" + self._build_command_line_parameters() + + def _build_command_line_parameters(self): + self.command_line_parameters.extend( + ['-dir', self.cloud_sql_proxy_socket_directory]) + self.command_line_parameters.extend( + ['-instances', self.instance_specification]) + + @staticmethod + def _is_os_64bit(): + return platform.machine().endswith('64') + + def _download_sql_proxy_if_needed(self): + if os.path.isfile(self.sql_proxy_path): + self.log.info("cloud-sql-proxy is already present") + return + system = platform.system().lower() + processor = "amd64" if CloudSqlProxyRunner._is_os_64bit() else "386" + if not self.sql_proxy_version: + download_url = CLOUD_SQL_PROXY_DOWNLOAD_URL.format(system, processor) + else: + download_url = CLOUD_SQL_PROXY_VERSION_DOWNLOAD_URL.format( + self.sql_proxy_version, system, processor) + proxy_path_tmp = self.sql_proxy_path + ".tmp" + self.log.info("Downloading cloud_sql_proxy from %s to %s", + download_url, proxy_path_tmp) + r = requests.get(download_url, allow_redirects=True) + # Downloading to .tmp file first to avoid case where partially downloaded + # binary is used by parallel operator which uses the same fixed binary path + with open(proxy_path_tmp, 'wb') as f: + f.write(r.content) + if r.status_code != 200: + 
raise AirflowException( + "The cloud-sql-proxy could not be downloaded. Status code = {}. " + "Reason = {}".format(r.status_code, r.reason)) + self.log.info("Moving sql_proxy binary from %s to %s", + proxy_path_tmp, self.sql_proxy_path) + shutil.move(proxy_path_tmp, self.sql_proxy_path) + os.chmod(self.sql_proxy_path, 0o744) # Set executable bit + self.sql_proxy_was_downloaded = True + + @provide_session + def _get_credential_parameters(self, session): + connection = session.query(Connection). \ + filter(Connection.conn_id == self.gcp_conn_id).first() + session.expunge_all() + if GCP_CREDENTIALS_KEY_PATH in connection.extra_dejson: + credential_params = [ + '-credential_file', + connection.extra_dejson[GCP_CREDENTIALS_KEY_PATH] + ] + elif GCP_CREDENTIALS_KEYFILE_DICT in connection.extra_dejson: + credential_file_content = json.loads( + connection.extra_dejson[GCP_CREDENTIALS_KEYFILE_DICT]) + self.log.info("Saving credentials to %s", self.credentials_path) + with open(self.credentials_path, "w") as f: + json.dump(credential_file_content, f) + credential_params = [ + '-credential_file', + self.credentials_path + ] + else: + self.log.info( + "The credentials are not supplied by neither key_path nor " + "keyfile_dict of the gcp connection %s. Falling back to " + "default activated account", self.gcp_conn_id) + credential_params = [] + + if not self.instance_specification: + project_id = connection.extra_dejson.get( + 'extra__google_cloud_platform__project') + if self.project_id: + project_id = self.project_id + if not project_id: + raise AirflowException("For forwarding all instances, the project id " + "for GCP should be provided either " + "by project_id extra in the GCP connection or by " + "project_id provided in the operator.") + credential_params.extend(['-projects', project_id]) + return credential_params + + def start_proxy(self): + """ + Starts Cloud SQL Proxy. + + You have to remember to stop the proxy if you started it! + """ + self._download_sql_proxy_if_needed() + if self.sql_proxy_process: + raise AirflowException("The sql proxy is already running: {}".format( + self.sql_proxy_process)) + else: + command_to_run = [self.sql_proxy_path] + command_to_run.extend(self.command_line_parameters) + try: + self.log.info("Creating directory %s", + self.cloud_sql_proxy_socket_directory) + os.makedirs(self.cloud_sql_proxy_socket_directory) + except OSError: + # Needed for python 2 compatibility (exists_ok missing) + pass + command_to_run.extend(self._get_credential_parameters()) + self.log.info("Running the command: `%s`", " ".join(command_to_run)) + self.sql_proxy_process = Popen(command_to_run, + stdin=PIPE, stdout=PIPE, stderr=PIPE) + self.log.info("The pid of cloud_sql_proxy: %s", self.sql_proxy_process.pid) + while True: + line = self.sql_proxy_process.stderr.readline().decode('utf-8') + return_code = self.sql_proxy_process.poll() + if line == '' and return_code is not None: + self.sql_proxy_process = None + raise AirflowException( + "The cloud_sql_proxy finished early with return code {}!".format( + return_code)) + if line != '': + self.log.info(line) + if "googleapi: Error" in line or "invalid instance name:" in line: + self.stop_proxy() + raise AirflowException( + "Error when starting the cloud_sql_proxy {}!".format( + line)) + if "Ready for new connections" in line: + return + + def stop_proxy(self): + """ + Stops running proxy. + + You should stop the proxy after you stop using it. 
+ """ + if not self.sql_proxy_process: + raise AirflowException("The sql proxy is not started yet") + else: + self.log.info("Stopping the cloud_sql_proxy pid: %s", + self.sql_proxy_process.pid) + self.sql_proxy_process.kill() + self.sql_proxy_process = None + # Cleanup! + self.log.info("Removing the socket directory: %s", + self.cloud_sql_proxy_socket_directory) + shutil.rmtree(self.cloud_sql_proxy_socket_directory, ignore_errors=True) + if self.sql_proxy_was_downloaded: + self.log.info("Removing downloaded proxy: %s", self.sql_proxy_path) + # Silently ignore if the file has already been removed (concurrency) + try: + os.remove(self.sql_proxy_path) + except OSError as e: + if not e.errno == errno.ENOENT: + raise + else: + self.log.info("Skipped removing proxy - it was not downloaded: %s", + self.sql_proxy_path) + if os.path.isfile(self.credentials_path): + self.log.info("Removing generated credentials file %s", + self.credentials_path) + # Here file cannot be delete by concurrent task (each task has its own copy) + os.remove(self.credentials_path) + + def get_proxy_version(self): + """ + Returns version of the Cloud SQL Proxy. + """ + self._download_sql_proxy_if_needed() + command_to_run = [self.sql_proxy_path] + command_to_run.extend(['--version']) + command_to_run.extend(self._get_credential_parameters()) + result = subprocess.check_output(command_to_run).decode('utf-8') + pattern = re.compile("^.*[V|v]ersion ([^;]*);.*$") + m = pattern.match(result) + if m: + return m.group(1) + else: + return None + + def get_socket_path(self): + """ + Retrieves UNIX socket path used by Cloud SQL Proxy. + + :return: The dynamically generated path for the socket created by the proxy. + :rtype: str + """ + return self.cloud_sql_proxy_socket_directory + "/" + self.instance_specification + + +CONNECTION_URIS = { + "postgres": { + "proxy": { + "tcp": + "postgresql://{user}:{password}@127.0.0.1:{proxy_port}/{database}", + "socket": + "postgresql://{user}:{password}@{socket_path}/{database}" + }, + "public": { + "ssl": + "postgresql://{user}:{password}@{public_ip}:{public_port}/{database}?" + "sslmode=verify-ca&" + "sslcert={client_cert_file}&" + "sslkey={client_key_file}&" + "sslrootcert={server_ca_file}", + "non-ssl": + "postgresql://{user}:{password}@{public_ip}:{public_port}/{database}" + } + }, + "mysql": { + "proxy": { + "tcp": + "mysql://{user}:{password}@127.0.0.1:{proxy_port}/{database}", + "socket": + "mysql://{user}:{password}@localhost/{database}?" + "unix_socket={socket_path}" + }, + "public": { + "ssl": + "mysql://{user}:{password}@{public_ip}:{public_port}/{database}?" + "ssl={ssl_spec}", + "non-ssl": + "mysql://{user}:{password}@{public_ip}:{public_port}/{database}" + } + } +} + +CLOUD_SQL_VALID_DATABASE_TYPES = ['postgres', 'mysql'] + + +# noinspection PyAbstractClass +class CloudSqlDatabaseHook(BaseHook): + """ + Serves DB connection configuration for Google Cloud SQL (Connections + of *gcpcloudsql://* type). + + The hook is a "meta" one. It does not perform an actual connection. + It is there to retrieve all the parameters configured in gcpcloudsql:// connection, + start/stop Cloud SQL Proxy if needed, dynamically generate Postgres or MySQL + connection in the database and return an actual Postgres or MySQL hook. + The returned Postgres/MySQL hooks are using direct connection or Cloud SQL + Proxy socket/TCP as configured. + + Main parameters of the hook are retrieved from the standard URI components: + + * **user** - User name to authenticate to the database (from login of the URI). 
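
# A minimal sketch of the CloudSqlProxyRunner lifecycle defined above; the path
# prefix and instance specification are hypothetical placeholder values.
from airflow.contrib.hooks.gcp_sql_hook import CloudSqlProxyRunner

runner = CloudSqlProxyRunner(
    path_prefix='/tmp/example_prefix',
    instance_specification='my-project:europe-west1:my-instance',
)
runner.start_proxy()                         # downloads the binary if needed and waits for readiness
try:
    socket_path = runner.get_socket_path()   # socket directory + instance specification
    # ... point a Postgres/MySQL client at socket_path here ...
finally:
    runner.stop_proxy()                      # kills the proxy and removes downloaded/generated files
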
+ * **password** - Password to authenticate to the database (from password of the URI). + * **public_ip** - IP to connect to for public connection (from host of the URI). + * **public_port** - Port to connect to for public connection (from port of the URI). + * **database** - Database to connect to (from schema of the URI). + + Remaining parameters are retrieved from the extras (URI query parameters): + + * **project_id** - Optional, Google Cloud Platform project where the Cloud SQL + instance exists. If missing, default project id passed is used. + * **instance** - Name of the instance of the Cloud SQL database instance. + * **location** - The location of the Cloud SQL instance (for example europe-west1). + * **database_type** - The type of the database instance (MySQL or Postgres). + * **use_proxy** - (default False) Whether SQL proxy should be used to connect to Cloud + SQL DB. + * **use_ssl** - (default False) Whether SSL should be used to connect to Cloud SQL DB. + You cannot use proxy and SSL together. + * **sql_proxy_use_tcp** - (default False) If set to true, TCP is used to connect via + proxy, otherwise UNIX sockets are used. + * **sql_proxy_binary_path** - Optional path to Cloud SQL Proxy binary. If the binary + is not specified or the binary is not present, it is automatically downloaded. + * **sql_proxy_version** - Specific version of the proxy to download (for example + v1.13). If not specified, the latest version is downloaded. + * **sslcert** - Path to client certificate to authenticate when SSL is used. + * **sslkey** - Path to client private key to authenticate when SSL is used. + * **sslrootcert** - Path to server's certificate to authenticate when SSL is used. + + :param gcp_cloudsql_conn_id: URL of the connection + :type gcp_cloudsql_conn_id: str + :param default_gcp_project_id: Default project id used if project_id not specified + in the connection URL + :type default_gcp_project_id: str + """ + _conn = None + + def __init__(self, gcp_cloudsql_conn_id='google_cloud_sql_default', + default_gcp_project_id=None): + super(CloudSqlDatabaseHook, self).__init__(source=None) + self.gcp_cloudsql_conn_id = gcp_cloudsql_conn_id + self.cloudsql_connection = self.get_connection(self.gcp_cloudsql_conn_id) + self.extras = self.cloudsql_connection.extra_dejson + self.project_id = self.extras.get('project_id', default_gcp_project_id) + self.instance = self.extras.get('instance') + self.database = self.cloudsql_connection.schema + self.location = self.extras.get('location') + self.database_type = self.extras.get('database_type') + self.use_proxy = self._get_bool(self.extras.get('use_proxy', 'False')) + self.use_ssl = self._get_bool(self.extras.get('use_ssl', 'False')) + self.sql_proxy_use_tcp = self._get_bool( + self.extras.get('sql_proxy_use_tcp', 'False')) + self.sql_proxy_version = self.extras.get('sql_proxy_version') + self.sql_proxy_binary_path = self.extras.get('sql_proxy_binary_path') + self.user = self.cloudsql_connection.login + self.password = self.cloudsql_connection.password + self.public_ip = self.cloudsql_connection.host + self.public_port = self.cloudsql_connection.port + self.sslcert = self.extras.get('sslcert') + self.sslkey = self.extras.get('sslkey') + self.sslrootcert = self.extras.get('sslrootcert') + # Port and socket path and db_hook are automatically generated + self.sql_proxy_tcp_port = None + self.sql_proxy_unique_path = None + self.db_hook = None + self.reserved_tcp_socket = None + # Generated based on clock + clock sequence. Unique per host (!). 
+ # This is important as different hosts share the database + self.db_conn_id = str(uuid.uuid1()) + self._validate_inputs() + + @staticmethod + def _get_bool(val): + if val == 'False': + return False + return val + + @staticmethod + def _check_ssl_file(file_to_check, name): + if not file_to_check: + raise AirflowException("SSL connections requires {name} to be set". + format(name=name)) + if not os.path.isfile(file_to_check): + raise AirflowException("The {file_to_check} must be a readable file". + format(file_to_check=file_to_check)) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required extra 'project_id' is empty") + if not self.location: + raise AirflowException("The required extra 'location' is empty or None") + if not self.instance: + raise AirflowException("The required extra 'instance' is empty or None") + if self.database_type not in CLOUD_SQL_VALID_DATABASE_TYPES: + raise AirflowException("Invalid database type '{}'. Must be one of {}".format( + self.database_type, CLOUD_SQL_VALID_DATABASE_TYPES + )) + if self.use_proxy and self.use_ssl: + raise AirflowException("Cloud SQL Proxy does not support SSL connections." + " SSL is not needed as Cloud SQL Proxy " + "provides encryption on its own") + + def validate_ssl_certs(self): + if self.use_ssl: + self._check_ssl_file(self.sslcert, "sslcert") + self._check_ssl_file(self.sslkey, "sslkey") + self._check_ssl_file(self.sslrootcert, "sslrootcert") + + def validate_socket_path_length(self): + if self.use_proxy and not self.sql_proxy_use_tcp: + if self.database_type == 'postgres': + suffix = "/.s.PGSQL.5432" + else: + suffix = "" + expected_path = "{}/{}:{}:{}{}".format( + self._generate_unique_path(), + self.project_id, self.instance, + self.database, suffix) + if len(expected_path) > UNIX_PATH_MAX: + self.log.info("Too long (%s) path: %s", len(expected_path), expected_path) + raise AirflowException( + "The UNIX socket path length cannot exceed {} characters " + "on Linux system. Either use shorter instance/database " + "name or switch to TCP connection. " + "The socket path for Cloud SQL proxy is now:" + "{}".format( + UNIX_PATH_MAX, expected_path)) + + @staticmethod + def _generate_unique_path(): + # We are not using mkdtemp here as the path generated with mkdtemp + # can be close to 60 characters and there is a limitation in + # length of socket path to around 100 characters in total. 
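
# Rough illustration of the check performed by validate_socket_path_length above,
# with made-up names; UNIX_PATH_MAX is the module-level limit referenced above.
unique_path = '/tmp/ab12cd34'   # shape of the prefix produced by _generate_unique_path()
expected_path = '{}/{}:{}:{}{}'.format(
    unique_path, 'my-project', 'my-instance', 'mydb', '/.s.PGSQL.5432')
# If len(expected_path) exceeded UNIX_PATH_MAX, the hook would raise AirflowException
# and suggest shorter instance/database names or switching to a TCP connection.
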
+ # We append project/location/instance to it later and postgres + # appends its own prefix, so we chose a shorter "/tmp/[8 random characters]" - + random.seed() + while True: + candidate = "/tmp/" + ''.join( + random.choice(string.ascii_lowercase + string.digits) for _ in range(8)) + if not os.path.exists(candidate): + return candidate + + @staticmethod + def _quote(value): + return quote_plus(value) if value else None + + def _generate_connection_uri(self): + if self.use_proxy: + if self.sql_proxy_use_tcp: + if not self.sql_proxy_tcp_port: + self.reserve_free_tcp_port() + if not self.sql_proxy_unique_path: + self.sql_proxy_unique_path = self._generate_unique_path() + + database_uris = CONNECTION_URIS[self.database_type] + ssl_spec = None + socket_path = None + if self.use_proxy: + proxy_uris = database_uris['proxy'] + if self.sql_proxy_use_tcp: + format_string = proxy_uris['tcp'] + else: + format_string = proxy_uris['socket'] + socket_path = \ + "{sql_proxy_socket_path}/{instance_socket_name}".format( + sql_proxy_socket_path=self.sql_proxy_unique_path, + instance_socket_name=self._get_instance_socket_name() + ) + else: + public_uris = database_uris['public'] + if self.use_ssl: + format_string = public_uris['ssl'] + ssl_spec = { + 'cert': self.sslcert, + 'key': self.sslkey, + 'ca': self.sslrootcert + } + else: + format_string = public_uris['non-ssl'] + if not self.user: + raise AirflowException("The login parameter needs to be set in connection") + if not self.public_ip: + raise AirflowException("The location parameter needs to be set in connection") + if not self.password: + raise AirflowException("The password parameter needs to be set in connection") + if not self.database: + raise AirflowException("The database parameter needs to be set in connection") + + connection_uri = format_string.format( + user=quote_plus(self.user) if self.user else '', + password=quote_plus(self.password) if self.password else '', + database=quote_plus(self.database) if self.database else '', + public_ip=self.public_ip, + public_port=self.public_port, + proxy_port=self.sql_proxy_tcp_port, + socket_path=self._quote(socket_path), + ssl_spec=self._quote(json.dumps(ssl_spec)) if ssl_spec else '', + client_cert_file=self._quote(self.sslcert) if self.sslcert else '', + client_key_file=self._quote(self.sslkey) if self.sslcert else '', + server_ca_file=self._quote(self.sslrootcert if self.sslcert else '') + ) + self.log.info("DB connection URI %s", connection_uri.replace( + quote_plus(self.password) if self.password else 'PASSWORD', 'XXXXXXXXXXXX')) + return connection_uri + + def _get_instance_socket_name(self): + return self.project_id + ":" + self.location + ":" + self.instance + + def _get_sqlproxy_instance_specification(self): + instance_specification = self._get_instance_socket_name() + if self.sql_proxy_use_tcp: + instance_specification += "=tcp:" + str(self.sql_proxy_tcp_port) + return instance_specification + + @provide_session + def create_connection(self, session=None): + """ + Create connection in the Connection table, according to whether it uses + proxy, TCP, UNIX sockets, SSL. Connection ID will be randomly generated. + + :param session: Session of the SQL Alchemy ORM (automatically generated with + decorator). 
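
# Illustrative rendering of the postgres proxy/socket template from CONNECTION_URIS
# above, mirroring what _generate_connection_uri does (including quote_plus-escaping);
# all values are placeholders.
from airflow.contrib.hooks.gcp_sql_hook import CONNECTION_URIS
try:
    from urllib.parse import quote_plus   # Python 3
except ImportError:
    from urllib import quote_plus         # Python 2

template = CONNECTION_URIS['postgres']['proxy']['socket']
socket_path = '/tmp/ab12cd34/my-project:europe-west1:my-instance'
uri = template.format(
    user=quote_plus('my_user'),
    password=quote_plus('my_password'),
    socket_path=quote_plus(socket_path),
    database=quote_plus('mydb'),
)
# -> postgresql://my_user:my_password@%2Ftmp%2Fab12cd34%2Fmy-project%3A...%2Fmydb
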
+ """ + connection = Connection(conn_id=self.db_conn_id) + uri = self._generate_connection_uri() + self.log.info("Creating connection %s", self.db_conn_id) + connection.parse_from_uri(uri) + session.add(connection) + session.commit() + + @provide_session + def retrieve_connection(self, session=None): + """ + Retrieves the dynamically created connection from the Connection table. + + :param session: Session of the SQL Alchemy ORM (automatically generated with + decorator). + """ + self.log.info("Retrieving connection %s", self.db_conn_id) + connections = session.query(Connection).filter( + Connection.conn_id == self.db_conn_id) + if connections.count(): + return connections[0] + return None + + @provide_session + def delete_connection(self, session=None): + """ + Delete the dynamically created connection from the Connection table. + + :param session: Session of the SQL Alchemy ORM (automatically generated with + decorator). + """ + self.log.info("Deleting connection %s", self.db_conn_id) + + connections = session.query(Connection).filter( + Connection.conn_id == self.db_conn_id) + if connections.count(): + connection = connections[0] + session.delete(connection) + session.commit() + else: + self.log.info("Connection was already deleted!") + + def get_sqlproxy_runner(self): + """ + Retrieve Cloud SQL Proxy runner. It is used to manage the proxy + lifecycle per task. + + :return: The Cloud SQL Proxy runner. + :rtype: CloudSqlProxyRunner + """ + if not self.use_proxy: + raise AirflowException("Proxy runner can only be retrieved in case of use_proxy = True") + return CloudSqlProxyRunner( + path_prefix=self.sql_proxy_unique_path, + instance_specification=self._get_sqlproxy_instance_specification(), + project_id=self.project_id, + sql_proxy_version=self.sql_proxy_version, + sql_proxy_binary_path=self.sql_proxy_binary_path + ) + + def get_database_hook(self): + """ + Retrieve database hook. This is the actual Postgres or MySQL database hook + that uses proxy or connects directly to the Google Cloud SQL database. + """ + if self.database_type == 'postgres': + self.db_hook = PostgresHook(postgres_conn_id=self.db_conn_id, + schema=self.database) + else: + self.db_hook = MySqlHook(mysql_conn_id=self.db_conn_id, + schema=self.database) + return self.db_hook + + def cleanup_database_hook(self): + """ + Clean up database hook after it was used. + """ + if self.database_type == 'postgres': + if hasattr(self.db_hook, + 'conn') and self.db_hook.conn and self.db_hook.conn.notices: + for output in self.db_hook.conn.notices: + self.log.info(output) + + def reserve_free_tcp_port(self): + """ + Reserve free TCP port to be used by Cloud SQL Proxy + """ + self.reserved_tcp_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.reserved_tcp_socket.bind(('127.0.0.1', 0)) + self.sql_proxy_tcp_port = self.reserved_tcp_socket.getsockname()[1] + + def free_reserved_port(self): + """ + Free TCP port. Makes it immediately ready to be used by Cloud SQL Proxy. + """ + if self.reserved_tcp_socket: + self.reserved_tcp_socket.close() + self.reserved_tcp_socket = None diff --git a/airflow/contrib/hooks/gcp_transfer_hook.py b/airflow/contrib/hooks/gcp_transfer_hook.py new file mode 100644 index 0000000000000..d51d7747a9644 --- /dev/null +++ b/airflow/contrib/hooks/gcp_transfer_hook.py @@ -0,0 +1,392 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
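
# Sketch of the lifecycle a task might follow with CloudSqlDatabaseHook above; the
# gcpcloudsql:// connection id and the query are placeholders, and error handling is
# reduced to try/finally.
from airflow.contrib.hooks.gcp_sql_hook import CloudSqlDatabaseHook

cloudsql_db_hook = CloudSqlDatabaseHook(gcp_cloudsql_conn_id='my_gcpcloudsql_conn')
cloudsql_db_hook.validate_ssl_certs()
cloudsql_db_hook.create_connection()           # writes the dynamic Connection row
try:
    cloudsql_db_hook.validate_socket_path_length()
    runner = cloudsql_db_hook.get_sqlproxy_runner() if cloudsql_db_hook.use_proxy else None
    if runner:
        cloudsql_db_hook.free_reserved_port()  # release the reserved TCP port just before the proxy binds it
        runner.start_proxy()
    try:
        database_hook = cloudsql_db_hook.get_database_hook()   # PostgresHook or MySqlHook
        database_hook.run('SELECT 1')
    finally:
        if runner:
            runner.stop_proxy()
        cloudsql_db_hook.cleanup_database_hook()
finally:
    cloudsql_db_hook.delete_connection()       # drop the dynamic Connection row
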
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +import time +from copy import deepcopy + +import six +from googleapiclient.discovery import build + +from airflow.exceptions import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + +# Time to sleep between active checks of the operation results +TIME_TO_SLEEP_IN_SECONDS = 10 + + +class GcpTransferJobsStatus: + ENABLED = "ENABLED" + DISABLED = "DISABLED" + DELETED = "DELETED" + + +class GcpTransferOperationStatus: + IN_PROGRESS = "IN_PROGRESS" + PAUSED = "PAUSED" + SUCCESS = "SUCCESS" + FAILED = "FAILED" + ABORTED = "ABORTED" + + +# A list of keywords used to build a request or response +ACCESS_KEY_ID = "accessKeyId" +ALREADY_EXISTING_IN_SINK = "overwriteObjectsAlreadyExistingInSink" +AWS_ACCESS_KEY = "awsAccessKey" +AWS_S3_DATA_SOURCE = 'awsS3DataSource' +BODY = 'body' +BUCKET_NAME = 'bucketName' +DAY = 'day' +DESCRIPTION = "description" +FILTER = 'filter' +FILTER_JOB_NAMES = 'job_names' +FILTER_PROJECT_ID = 'project_id' +GCS_DATA_SINK = 'gcsDataSink' +GCS_DATA_SOURCE = 'gcsDataSource' +HOURS = "hours" +HTTP_DATA_SOURCE = 'httpDataSource' +LIST_URL = 'list_url' +METADATA = 'metadata' +MINUTES = "minutes" +MONTH = 'month' +NAME = 'name' +OBJECT_CONDITIONS = 'object_conditions' +OPERATIONS = 'operations' +PROJECT_ID = 'projectId' +SCHEDULE = 'schedule' +SCHEDULE_END_DATE = 'scheduleEndDate' +SCHEDULE_START_DATE = 'scheduleStartDate' +SECONDS = "seconds" +SECRET_ACCESS_KEY = "secretAccessKey" +START_TIME_OF_DAY = 'startTimeOfDay' +STATUS = "status" +STATUS1 = 'status' +TRANSFER_JOB = 'transfer_job' +TRANSFER_JOB_FIELD_MASK = 'update_transfer_job_field_mask' +TRANSFER_JOBS = 'transferJobs' +TRANSFER_OPERATIONS = 'transferOperations' +TRANSFER_OPTIONS = 'transfer_options' +TRANSFER_SPEC = 'transferSpec' +YEAR = 'year' + +NEGATIVE_STATUSES = {GcpTransferOperationStatus.FAILED, GcpTransferOperationStatus.ABORTED} + +# Number of retries - used by googleapiclient method calls to perform retries +# For requests that are "retriable" +NUM_RETRIES = 5 + + +# noinspection PyAbstractClass +class GCPTransferServiceHook(GoogleCloudBaseHook): + """ + Hook for Google Storage Transfer Service. + """ + + _conn = None + + def __init__(self, api_version='v1', gcp_conn_id='google_cloud_default', delegate_to=None): + super(GCPTransferServiceHook, self).__init__(gcp_conn_id, delegate_to) + self.api_version = api_version + + def get_conn(self): + """ + Retrieves connection to Google Storage Transfer service. 
+ + :return: Google Storage Transfer service object + :rtype: dict + """ + if not self._conn: + http_authorized = self._authorize() + self._conn = build( + 'storagetransfer', self.api_version, http=http_authorized, cache_discovery=False + ) + return self._conn + + @GoogleCloudBaseHook.catch_http_exception + def create_transfer_job(self, body): + """ + Creates a transfer job that runs periodically. + + :param body: (Required) A request body, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/patch#request-body + :type body: dict + :return: transfer job. + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs#TransferJob + :rtype: dict + """ + body = self._inject_project_id(body, BODY, PROJECT_ID) + return self.get_conn().transferJobs().create(body=body).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.fallback_to_default_project_id + @GoogleCloudBaseHook.catch_http_exception + def get_transfer_job(self, job_name, project_id=None): + """ + Gets the latest state of a long-running operation in Google Storage + Transfer Service. + + :param job_name: (Required) Name of the job to be fetched + :type job_name: str + :param project_id: (Optional) the ID of the project that owns the Transfer + Job. If set to None or missing, the default project_id from the GCP + connection is used. + :type project_id: str + :return: Transfer Job + :rtype: dict + """ + return ( + self.get_conn() + .transferJobs() + .get(jobName=job_name, projectId=project_id) + .execute(num_retries=NUM_RETRIES) + ) + + def list_transfer_job(self, filter): + """ + Lists long-running operations in Google Storage Transfer + Service that match the specified filter. + + :param filter: (Required) A request filter, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/list#body.QUERY_PARAMETERS.filter + :type filter: dict + :return: List of Transfer Jobs + :rtype: list[dict] + """ + conn = self.get_conn() + filter = self._inject_project_id(filter, FILTER, FILTER_PROJECT_ID) + request = conn.transferJobs().list(filter=json.dumps(filter)) + jobs = [] + + while request is not None: + response = request.execute(num_retries=NUM_RETRIES) + jobs.extend(response[TRANSFER_JOBS]) + + request = conn.transferJobs().list_next(previous_request=request, previous_response=response) + + return jobs + + @GoogleCloudBaseHook.catch_http_exception + def update_transfer_job(self, job_name, body): + """ + Updates a transfer job that runs periodically. + + :param job_name: (Required) Name of the job to be updated + :type job_name: str + :param body: A request body, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/patch#request-body + :type body: dict + :return: If successful, TransferJob. + :rtype: dict + """ + body = self._inject_project_id(body, BODY, PROJECT_ID) + return ( + self.get_conn().transferJobs().patch(jobName=job_name, body=body).execute(num_retries=NUM_RETRIES) + ) + + @GoogleCloudBaseHook.fallback_to_default_project_id + @GoogleCloudBaseHook.catch_http_exception + def delete_transfer_job(self, job_name, project_id): + """ + Deletes a transfer job. This is a soft delete. After a transfer job is + deleted, the job and all the transfer executions are subject to garbage + collection. Transfer jobs become eligible for garbage collection + 30 days after soft delete. 
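
# Illustrative call to list_transfer_job above; the connection id, project and job
# name are placeholders. 'project_id' may be left out of the filter when the GCP
# connection defines a default project (it is injected by _inject_project_id).
from airflow.contrib.hooks.gcp_transfer_hook import GCPTransferServiceHook

transfer_hook = GCPTransferServiceHook(gcp_conn_id='google_cloud_default')
jobs = transfer_hook.list_transfer_job(
    filter={'project_id': 'my-project', 'job_names': ['transferJobs/1234567890']}
)
for job in jobs:
    print(job['name'], job.get('status'))
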
+ + :param job_name: (Required) Name of the job to be deleted + :type job_name: str + :param project_id: (Optional) the ID of the project that owns the Transfer + Job. If set to None or missing, the default project_id from the GCP + connection is used. + :type project_id: str + :rtype: None + """ + + return ( + self.get_conn() + .transferJobs() + .patch( + jobName=job_name, + body={ + PROJECT_ID: project_id, + TRANSFER_JOB: {STATUS1: GcpTransferJobsStatus.DELETED}, + TRANSFER_JOB_FIELD_MASK: STATUS1, + }, + ) + .execute(num_retries=NUM_RETRIES) + ) + + @GoogleCloudBaseHook.catch_http_exception + def cancel_transfer_operation(self, operation_name): + """ + Cancels an transfer operation in Google Storage Transfer Service. + + :param operation_name: Name of the transfer operation. + :type operation_name: str + :rtype: None + """ + self.get_conn().transferOperations().cancel(name=operation_name).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.catch_http_exception + def get_transfer_operation(self, operation_name): + """ + Gets an transfer operation in Google Storage Transfer Service. + + :param operation_name: (Required) Name of the transfer operation. + :type operation_name: str + :return: transfer operation + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/Operation + :rtype: dict + """ + return self.get_conn().transferOperations().get(name=operation_name).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.catch_http_exception + def list_transfer_operations(self, filter): + """ + Gets an transfer operation in Google Storage Transfer Service. + + :param filter: (Required) A request filter, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/list#body.QUERY_PARAMETERS.filter + With one additional improvement: + + * project_id is optional if you have a project id defined + in the connection + See: :ref:`howto/connection:gcp` + + :type filter: dict + :return: transfer operation + :rtype: list[dict] + """ + conn = self.get_conn() + + filter = self._inject_project_id(filter, FILTER, FILTER_PROJECT_ID) + + operations = [] + + request = conn.transferOperations().list(name=TRANSFER_OPERATIONS, filter=json.dumps(filter)) + + while request is not None: + response = request.execute(num_retries=NUM_RETRIES) + if OPERATIONS in response: + operations.extend(response[OPERATIONS]) + + request = conn.transferOperations().list_next( + previous_request=request, previous_response=response + ) + + return operations + + @GoogleCloudBaseHook.catch_http_exception + def pause_transfer_operation(self, operation_name): + """ + Pauses an transfer operation in Google Storage Transfer Service. + + :param operation_name: (Required) Name of the transfer operation. + :type operation_name: str + :rtype: None + """ + self.get_conn().transferOperations().pause(name=operation_name).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.catch_http_exception + def resume_transfer_operation(self, operation_name): + """ + Resumes an transfer operation in Google Storage Transfer Service. + + :param operation_name: (Required) Name of the transfer operation. + :type operation_name: str + :rtype: None + """ + self.get_conn().transferOperations().resume(name=operation_name).execute(num_retries=NUM_RETRIES) + + @GoogleCloudBaseHook.catch_http_exception + def wait_for_transfer_job(self, job, expected_statuses=(GcpTransferOperationStatus.SUCCESS,), timeout=60): + """ + Waits until the job reaches the expected state. 
+ + :param job: Transfer job + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs#TransferJob + :type job: dict + :param expected_statuses: State that is expected + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferOperations#Status + :type expected_statuses: set[str] + :param timeout: + :type timeout: time in which the operation must end in seconds + :rtype: None + """ + while timeout > 0: + operations = self.list_transfer_operations( + filter={FILTER_PROJECT_ID: job[PROJECT_ID], FILTER_JOB_NAMES: [job[NAME]]} + ) + + if GCPTransferServiceHook.operations_contain_expected_statuses(operations, expected_statuses): + return + time.sleep(TIME_TO_SLEEP_IN_SECONDS) + timeout -= TIME_TO_SLEEP_IN_SECONDS + raise AirflowException("Timeout. The operation could not be completed within the allotted time.") + + def _inject_project_id(self, body, param_name, target_key): + body = deepcopy(body) + body[target_key] = body.get(target_key, self.project_id) + if not body.get(target_key): + raise AirflowException( + "The project id must be passed either as `{}` key in `{}` parameter or as project_id " + "extra in GCP connection definition. Both are not set!".format(target_key, param_name) + ) + return body + + @staticmethod + def operations_contain_expected_statuses(operations, expected_statuses): + """ + Checks whether the operation list has an operation with the + expected status, then returns true + If it encounters operations in FAILED or ABORTED state + throw :class:`airflow.exceptions.AirflowException`. + + :param operations: (Required) List of transfer operations to check. + :type operations: list[dict] + :param expected_statuses: (Required) status that is expected + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferOperations#Status + :type expected_statuses: set[str] + :return: If there is an operation with the expected state + in the operation list, returns true, + :raises: airflow.exceptions.AirflowException If it encounters operations + with a state in the list, + :rtype: bool + """ + expected_statuses = ( + {expected_statuses} if isinstance(expected_statuses, six.string_types) else set(expected_statuses) + ) + if len(operations) == 0: + return False + + current_statuses = {operation[METADATA][STATUS] for operation in operations} + + if len(current_statuses - set(expected_statuses)) != len(current_statuses): + return True + + if len(NEGATIVE_STATUSES - current_statuses) != len(NEGATIVE_STATUSES): + raise AirflowException( + 'An unexpected operation status was encountered. Expected: {}'.format( + ", ".join(expected_statuses) + ) + ) + return False diff --git a/airflow/contrib/hooks/gcp_translate_hook.py b/airflow/contrib/hooks/gcp_translate_hook.py new file mode 100644 index 0000000000000..2d4adbb5b2b03 --- /dev/null +++ b/airflow/contrib/hooks/gcp_translate_hook.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
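
# Sketch: creating a transfer job with the hook above and blocking until it succeeds.
# The body is a hypothetical minimal transferJobs request; bucket, project and date
# values are placeholders.
from airflow.contrib.hooks.gcp_transfer_hook import GCPTransferServiceHook

transfer_hook = GCPTransferServiceHook(gcp_conn_id='google_cloud_default')
job = transfer_hook.create_transfer_job(body={
    'description': 'example GCS to GCS copy',
    'status': 'ENABLED',
    'projectId': 'my-project',   # optional when the connection has a default project
    'schedule': {
        'scheduleStartDate': {'day': 1, 'month': 1, 'year': 2019},
        'scheduleEndDate': {'day': 1, 'month': 1, 'year': 2019},
    },
    'transferSpec': {
        'gcsDataSource': {'bucketName': 'source-bucket'},
        'gcsDataSink': {'bucketName': 'sink-bucket'},
    },
})
# Polls list_transfer_operations every TIME_TO_SLEEP_IN_SECONDS until SUCCESS or timeout.
transfer_hook.wait_for_transfer_job(job, expected_statuses=('SUCCESS',), timeout=600)
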
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from google.cloud.translate_v2 import Client +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook + + +class CloudTranslateHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud translate APIs. + """ + + _client = None + + def __init__(self, gcp_conn_id='google_cloud_default'): + super(CloudTranslateHook, self).__init__(gcp_conn_id) + + def get_conn(self): + """ + Retrieves connection to Cloud Translate + + :return: Google Cloud Translate client object. + :rtype: Client + """ + if not self._client: + self._client = Client(credentials=self._get_credentials()) + return self._client + + def translate( + self, values, target_language, format_=None, source_language=None, model=None + ): + """Translate a string or list of strings. + + See https://cloud.google.com/translate/docs/translating-text + + :type values: str or list + :param values: String or list of strings to translate. + + :type target_language: str + :param target_language: The language to translate results into. This + is required by the API and defaults to + the target language of the current instance. + + :type format_: str + :param format_: (Optional) One of ``text`` or ``html``, to specify + if the input text is plain text or HTML. + + :type source_language: str or None + :param source_language: (Optional) The language of the text to + be translated. + + :type model: str or None + :param model: (Optional) The model used to translate the text, such + as ``'base'`` or ``'nmt'``. + + :rtype: str or list + :returns: A list of dictionaries for each queried value. Each + dictionary typically contains three keys (though not + all will be present in all cases) + + * ``detectedSourceLanguage``: The detected language (as an + ISO 639-1 language code) of the text. + * ``translatedText``: The translation of the text into the + target language. + * ``input``: The corresponding input value. + * ``model``: The model used to translate the text. + + If only a single value is passed, then only a single + dictionary will be returned. + :raises: :class:`~exceptions.ValueError` if the number of + values and translations differ. + """ + client = self.get_conn() + + return client.translate( + values=values, + target_language=target_language, + format_=format_, + source_language=source_language, + model=model, + ) diff --git a/airflow/contrib/hooks/gcp_vision_hook.py b/airflow/contrib/hooks/gcp_vision_hook.py new file mode 100644 index 0000000000000..ede44bf5e5e25 --- /dev/null +++ b/airflow/contrib/hooks/gcp_vision_hook.py @@ -0,0 +1,454 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
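
# Illustrative use of CloudTranslateHook.translate above; the connection id and the
# input string are placeholders.
from airflow.contrib.hooks.gcp_translate_hook import CloudTranslateHook

translate_hook = CloudTranslateHook(gcp_conn_id='google_cloud_default')
result = translate_hook.translate(
    values='Hello world',      # a single string; a list of strings is also accepted
    target_language='pl',
    format_='text',
    source_language=None,      # let the API detect the source language
    model='nmt',
)
# For a single input value a single dict is returned, e.g.:
# {'translatedText': '...', 'detectedSourceLanguage': 'en', 'input': 'Hello world'}
print(result['translatedText'])
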
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from copy import deepcopy + +from google.cloud.vision_v1 import ProductSearchClient, ImageAnnotatorClient +from google.protobuf.json_format import MessageToDict + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook +from airflow.utils.decorators import cached_property + + +class NameDeterminer: + """ + Class used for checking if the entity has the 'name' attribute set. + + * If so, no action is taken. + + * If not, and the name can be constructed from other parameters provided, it is created and filled in + the entity. + + * If both the entity's 'name' attribute is set and the name can be constructed from other parameters + provided: + + * If they are the same - no action is taken + + * if they are different - an exception is thrown. + + """ + + def __init__(self, label, id_label, get_path): + self.label = label + self.id_label = id_label + self.get_path = get_path + + def get_entity_with_name(self, entity, entity_id, location, project_id): + entity = deepcopy(entity) + explicit_name = getattr(entity, 'name') + if location and entity_id: + # Necessary parameters to construct the name are present. Checking for conflict with explicit name + constructed_name = self.get_path(project_id, location, entity_id) + if not explicit_name: + entity.name = constructed_name + return entity + elif explicit_name != constructed_name: + self._raise_ex_different_names(constructed_name, explicit_name) + else: + # Not enough parameters to construct the name. Trying to use the name from Product / ProductSet. + if explicit_name: + return entity + else: + self._raise_ex_unable_to_determine_name() + + def _raise_ex_unable_to_determine_name(self): + raise AirflowException( + "Unable to determine the {label} name. Please either set the name directly in the {label} " + "object or provide the `location` and `{id_label}` parameters.".format( + label=self.label, id_label=self.id_label + ) + ) + + def _raise_ex_different_names(self, constructed_name, explicit_name): + raise AirflowException( + "The {label} name provided in the object ({explicit_name}) is different than the name created " + "from the input parameters ({constructed_name}). Please either: 1) Remove the {label} name, 2) " + "Remove the location and {id_label} parameters, 3) Unify the {label} name and input " + "parameters.".format( + label=self.label, + explicit_name=explicit_name, + constructed_name=constructed_name, + id_label=self.id_label, + ) + ) + + +class CloudVisionHook(GoogleCloudBaseHook): + """ + Hook for Google Cloud Vision APIs. + """ + + _client = None + product_name_determiner = NameDeterminer('Product', 'product_id', ProductSearchClient.product_path) + product_set_name_determiner = NameDeterminer( + 'ProductSet', 'productset_id', ProductSearchClient.product_set_path + ) + + def __init__(self, gcp_conn_id='google_cloud_default', delegate_to=None): + super(CloudVisionHook, self).__init__(gcp_conn_id, delegate_to) + + def get_conn(self): + """ + Retrieves connection to Cloud Vision. + + :return: Google Cloud Vision client object. 
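
# Sketch of how the NameDeterminer above resolves an entity name when only the id,
# location and project are supplied; the Product type import and all ids below are
# assumptions/placeholders.
from google.cloud.vision_v1 import ProductSearchClient
from google.cloud.vision_v1.types import Product
from airflow.contrib.hooks.gcp_vision_hook import CloudVisionHook

product = Product(display_name='example-product')       # no explicit .name set
determiner = CloudVisionHook.product_name_determiner
resolved = determiner.get_entity_with_name(
    product, entity_id='prod-1', location='europe-west1', project_id='my-project')
assert resolved.name == ProductSearchClient.product_path(
    'my-project', 'europe-west1', 'prod-1')
# Setting a conflicting product.name beforehand would raise AirflowException instead.
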
+ :rtype: google.cloud.vision_v1.ProductSearchClient + """ + if not self._client: + self._client = ProductSearchClient(credentials=self._get_credentials()) + return self._client + + @cached_property + def annotator_client(self): + return ImageAnnotatorClient(credentials=self._get_credentials()) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_product_set( + self, + location, + product_set, + project_id=None, + product_set_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetCreateOperator` + """ + client = self.get_conn() + parent = ProductSearchClient.location_path(project_id, location) + self.log.info('Creating a new ProductSet under the parent: %s', parent) + response = client.create_product_set( + parent=parent, + product_set=product_set, + product_set_id=product_set_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + self.log.info('ProductSet created: %s', response.name if response else '') + self.log.debug('ProductSet created:\n%s', response) + + if not product_set_id: + # Product set id was generated by the API + product_set_id = self._get_autogenerated_id(response) + self.log.info('Extracted autogenerated ProductSet ID from the response: %s', product_set_id) + + return product_set_id + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_product_set( + self, location, product_set_id, project_id=None, retry=None, timeout=None, metadata=None + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetGetOperator` + """ + client = self.get_conn() + name = ProductSearchClient.product_set_path(project_id, location, product_set_id) + self.log.info('Retrieving ProductSet: %s', name) + response = client.get_product_set(name=name, retry=retry, timeout=timeout, metadata=metadata) + self.log.info('ProductSet retrieved.') + self.log.debug('ProductSet retrieved:\n%s', response) + return MessageToDict(response) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def update_product_set( + self, + product_set, + location=None, + product_set_id=None, + update_mask=None, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetUpdateOperator` + """ + client = self.get_conn() + product_set = self.product_set_name_determiner.get_entity_with_name( + product_set, product_set_id, location, project_id + ) + self.log.info('Updating ProductSet: %s', product_set.name) + response = client.update_product_set( + product_set=product_set, update_mask=update_mask, retry=retry, timeout=timeout, metadata=metadata + ) + self.log.info('ProductSet updated: %s', response.name if response else '') + self.log.debug('ProductSet updated:\n%s', response) + return MessageToDict(response) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_product_set( + self, location, product_set_id, project_id=None, retry=None, timeout=None, metadata=None + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductSetDeleteOperator` + """ + client = self.get_conn() + name = ProductSearchClient.product_set_path(project_id, location, product_set_id) 
+ self.log.info('Deleting ProductSet: %s', name) + client.delete_product_set(name=name, retry=retry, timeout=timeout, metadata=metadata) + self.log.info('ProductSet with the name [%s] deleted.', name) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def create_product( + self, location, product, project_id=None, product_id=None, retry=None, timeout=None, metadata=None + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductCreateOperator` + """ + client = self.get_conn() + parent = ProductSearchClient.location_path(project_id, location) + self.log.info('Creating a new Product under the parent: %s', parent) + response = client.create_product( + parent=parent, + product=product, + product_id=product_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + self.log.info('Product created: %s', response.name if response else '') + self.log.debug('Product created:\n%s', response) + + if not product_id: + # Product id was generated by the API + product_id = self._get_autogenerated_id(response) + self.log.info('Extracted autogenerated Product ID from the response: %s', product_id) + + return product_id + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def get_product(self, location, product_id, project_id=None, retry=None, timeout=None, metadata=None): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductGetOperator` + """ + client = self.get_conn() + name = ProductSearchClient.product_path(project_id, location, product_id) + self.log.info('Retrieving Product: %s', name) + response = client.get_product(name=name, retry=retry, timeout=timeout, metadata=metadata) + self.log.info('Product retrieved.') + self.log.debug('Product retrieved:\n%s', response) + return MessageToDict(response) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def update_product( + self, + product, + location=None, + product_id=None, + update_mask=None, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductUpdateOperator` + """ + client = self.get_conn() + product = self.product_name_determiner.get_entity_with_name(product, product_id, location, project_id) + self.log.info('Updating ProductSet: %s', product.name) + response = client.update_product( + product=product, update_mask=update_mask, retry=retry, timeout=timeout, metadata=metadata + ) + self.log.info('Product updated: %s', response.name if response else '') + self.log.debug('Product updated:\n%s', response) + return MessageToDict(response) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_product(self, location, product_id, project_id=None, retry=None, timeout=None, metadata=None): + """ + For the documentation see: + :class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionProductDeleteOperator` + """ + client = self.get_conn() + name = ProductSearchClient.product_path(project_id, location, product_id) + self.log.info('Deleting ProductSet: %s', name) + client.delete_product(name=name, retry=retry, timeout=timeout, metadata=metadata) + self.log.info('Product with the name [%s] deleted:', name) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def 
create_reference_image( + self, + location, + product_id, + reference_image, + reference_image_id=None, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :py:class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionReferenceImageCreateOperator` + """ + client = self.get_conn() + self.log.info('Creating ReferenceImage') + parent = ProductSearchClient.product_path(project=project_id, location=location, product=product_id) + + response = client.create_reference_image( + parent=parent, + reference_image=reference_image, + reference_image_id=reference_image_id, + retry=retry, + timeout=timeout, + metadata=metadata, + ) + + self.log.info('ReferenceImage created: %s', response.name if response else '') + self.log.debug('ReferenceImage created:\n%s', response) + + if not reference_image_id: + # Refernece image id was generated by the API + reference_image_id = self._get_autogenerated_id(response) + self.log.info( + 'Extracted autogenerated ReferenceImage ID from the response: %s', reference_image_id + ) + + return reference_image_id + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def delete_reference_image( + self, + location, + product_id, + reference_image_id, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :py:class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionReferenceImageCreateOperator` + """ + client = self.get_conn() + self.log.info('Deleting ReferenceImage') + name = ProductSearchClient.reference_image_path( + project=project_id, location=location, product=product_id, reference_image=reference_image_id + ) + response = client.delete_reference_image(name=name, retry=retry, timeout=timeout, metadata=metadata) + self.log.info('ReferenceImage with the name [%s] deleted.', name) + + return MessageToDict(response) + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def add_product_to_product_set( + self, + product_set_id, + product_id, + location=None, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :py:class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionAddProductToProductSetOperator` + """ + client = self.get_conn() + + product_name = ProductSearchClient.product_path(project_id, location, product_id) + product_set_name = ProductSearchClient.product_set_path(project_id, location, product_set_id) + + self.log.info('Add Product[name=%s] to Product Set[name=%s]', product_name, product_set_name) + + client.add_product_to_product_set( + name=product_set_name, product=product_name, retry=retry, timeout=timeout, metadata=metadata + ) + + self.log.info('Product added to Product Set') + + @GoogleCloudBaseHook.catch_http_exception + @GoogleCloudBaseHook.fallback_to_default_project_id + def remove_product_from_product_set( + self, + product_set_id, + product_id, + location=None, + project_id=None, + retry=None, + timeout=None, + metadata=None, + ): + """ + For the documentation see: + :py:class:`~airflow.contrib.operators.gcp_vision_operator.CloudVisionRemoveProductFromProductSetOperator` + """ + client = self.get_conn() + + product_name = ProductSearchClient.product_path(project_id, location, product_id) + product_set_name = ProductSearchClient.product_set_path(project_id, location, product_set_id) + + self.log.info('Remove Product[name=%s] from Product Set[name=%s]', product_name, 
product_set_name) + + client.remove_product_from_product_set( + name=product_set_name, product=product_name, retry=retry, timeout=timeout, metadata=metadata + ) + + self.log.info('Product removed from Product Set') + + @GoogleCloudBaseHook.catch_http_exception + def annotate_image(self, request, retry=None, timeout=None): + """ + For the documentation see: + :py:class:`~airflow.contrib.operators.gcp_vision_image_annotator_operator.CloudVisionAnnotateImage` + """ + client = self.annotator_client + + self.log.info('Annotating image') + + response = client.annotate_image(request=request, retry=retry, timeout=timeout) + + self.log.info('Image annotated') + + return MessageToDict(response) + + @staticmethod + def _get_autogenerated_id(response): + try: + name = response.name + except AttributeError as e: + raise AirflowException('Unable to get name from response... [{}]\n{}'.format(response, e)) + if '/' not in name: + raise AirflowException('Unable to get id from name... [{}]'.format(name)) + return name.rsplit('/', 1)[1] diff --git a/airflow/contrib/hooks/gcs_hook.py b/airflow/contrib/hooks/gcs_hook.py index c5e356f41cfc0..cab06b92d5dec 100644 --- a/airflow/contrib/hooks/gcs_hook.py +++ b/airflow/contrib/hooks/gcs_hook.py @@ -17,14 +17,18 @@ # specific language governing permissions and limitations # under the License. # -from apiclient.discovery import build -from apiclient.http import MediaFileUpload -from googleapiclient import errors + +from googleapiclient.discovery import build +from googleapiclient.http import MediaFileUpload +from googleapiclient.errors import HttpError from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook from airflow.exceptions import AirflowException +import gzip as gz +import shutil import re +import os class GoogleCloudStorageHook(GoogleCloudBaseHook): @@ -57,14 +61,15 @@ def copy(self, source_bucket, source_object, destination_bucket=None, source bucket/object is used, but not both. :param source_bucket: The bucket of the object to copy from. - :type source_bucket: string + :type source_bucket: str :param source_object: The object to copy. - :type source_object: string + :type source_object: str :param destination_bucket: The destination of the object to copied to. Can be omitted; then the same bucket is used. - :type destination_bucket: string + :type destination_bucket: str :param destination_object: The (renamed) path of the object if given. Can be omitted; then the same name is used. + :type destination_object: str """ destination_bucket = destination_bucket or source_bucket destination_object = destination_object or source_object @@ -87,7 +92,7 @@ def copy(self, source_bucket, source_object, destination_bucket=None, destinationObject=destination_object, body='') \ .execute() return True - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': return False raise @@ -102,11 +107,11 @@ def rewrite(self, source_bucket, source_object, destination_bucket, destination_object can be omitted, in which case source_object is used. :param source_bucket: The bucket of the object to copy from. - :type source_bucket: string + :type source_bucket: str :param source_object: The object to copy. - :type source_object: string + :type source_object: str :param destination_bucket: The destination of the object to copied to. - :type destination_bucket: string + :type destination_bucket: str :param destination_object: The (renamed) path of the object if given. Can be omitted; then the same name is used. 
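
# Illustrative annotate_image call on the CloudVisionHook shown above; the GCS URI
# is a placeholder and the feature type value follows the Vision API Feature enum.
from airflow.contrib.hooks.gcp_vision_hook import CloudVisionHook

vision_hook = CloudVisionHook(gcp_conn_id='google_cloud_default')
annotations = vision_hook.annotate_image(request={
    'image': {'source': {'image_uri': 'gs://my-bucket/example.png'}},
    'features': [{'type': 4}],   # 4 == LABEL_DETECTION
})
# annotate_image returns MessageToDict() output, e.g. annotations.get('labelAnnotations')
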
""" @@ -139,7 +144,7 @@ def rewrite(self, source_bucket, source_object, destination_bucket, .execute() self.log.info('Rewrite request #%s: %s', request_count, result) return True - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': return False raise @@ -150,11 +155,11 @@ def download(self, bucket, object, filename=None): Get a file from Google Cloud Storage. :param bucket: The bucket to fetch from. - :type bucket: string + :type bucket: str :param object: The object to fetch. - :type object: string + :type object: str :param filename: If set, a local file path where the file should be written to. - :type filename: string + :type filename: str """ service = self.get_conn() downloaded_file_bytes = service \ @@ -171,42 +176,90 @@ def download(self, bucket, object, filename=None): return downloaded_file_bytes # pylint:disable=redefined-builtin - def upload(self, bucket, object, filename, mime_type='application/octet-stream'): + def upload(self, bucket, object, filename, + mime_type='application/octet-stream', gzip=False, + multipart=False, num_retries=0): """ Uploads a local file to Google Cloud Storage. :param bucket: The bucket to upload to. - :type bucket: string + :type bucket: str :param object: The object name to set when uploading the local file. - :type object: string + :type object: str :param filename: The local file path to the file to be uploaded. - :type filename: string + :type filename: str :param mime_type: The MIME type to set when uploading the file. - :type mime_type: string + :type mime_type: str + :param gzip: Option to compress file for upload + :type gzip: bool + :param multipart: If True, the upload will be split into multiple HTTP requests. The + default size is 256MiB per request. Pass a number instead of True to + specify the request size, which must be a multiple of 262144 (256KiB). + :type multipart: bool or int + :param num_retries: The number of times to attempt to re-upload the file (or individual + chunks, in the case of multipart uploads). Retries are attempted + with exponential backoff. 
+ :type num_retries: int """ service = self.get_conn() - media = MediaFileUpload(filename, mime_type) + + if gzip: + filename_gz = filename + '.gz' + + with open(filename, 'rb') as f_in: + with gz.open(filename_gz, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + filename = filename_gz + try: - service \ - .objects() \ - .insert(bucket=bucket, name=object, media_body=media) \ - .execute() - return True - except errors.HttpError as ex: + if multipart: + if multipart is True: + chunksize = 256 * 1024 * 1024 + else: + chunksize = multipart + + if chunksize % (256 * 1024) > 0 or chunksize < 0: + raise ValueError("Multipart size is not a multiple of 262144 (256KiB)") + + media = MediaFileUpload(filename, mimetype=mime_type, + chunksize=chunksize, resumable=True) + + request = service.objects().insert(bucket=bucket, name=object, media_body=media) + response = None + while response is None: + status, response = request.next_chunk(num_retries=num_retries) + if status: + self.log.info("Upload progress %.1f%%", status.progress() * 100) + + else: + media = MediaFileUpload(filename, mime_type) + + service \ + .objects() \ + .insert(bucket=bucket, name=object, media_body=media) \ + .execute(num_retries=num_retries) + + except HttpError as ex: if ex.resp['status'] == '404': return False raise + finally: + if gzip: + os.remove(filename) + + return True + # pylint:disable=redefined-builtin def exists(self, bucket, object): """ Checks for the existence of a file in Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. - :type object: string + :type object: str """ service = self.get_conn() try: @@ -215,7 +268,7 @@ def exists(self, bucket, object): .get(bucket=bucket, object=object) \ .execute() return True - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': return False raise @@ -226,12 +279,12 @@ def is_updated_after(self, bucket, object, ts): Checks if an object is updated in Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. - :type object: string + :type object: str :param ts: The timestamp to check against. - :type ts: datetime + :type ts: datetime.datetime """ service = self.get_conn() try: @@ -253,7 +306,7 @@ def is_updated_after(self, bucket, object, ts): if updated > ts: return True - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] != '404': raise @@ -265,11 +318,11 @@ def delete(self, bucket, object, generation=None): parameter is used. 
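
# Illustrative use of the extended GoogleCloudStorageHook.upload above: a gzipped
# single-request upload and a resumable multipart upload with an explicit chunk
# size. Bucket, object and file names are placeholders.
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

gcs_hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='google_cloud_default')
gcs_hook.upload(bucket='my-bucket', object='data/events.json.gz',
                filename='/tmp/events.json', gzip=True)
gcs_hook.upload(bucket='my-bucket', object='data/large.bin',
                filename='/tmp/large.bin',
                multipart=5 * 262144,   # must be a multiple of 262144 (256KiB)
                num_retries=3)
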
:param bucket: name of the bucket, where the object resides - :type bucket: string + :type bucket: str :param object: name of the object to delete - :type object: string + :type object: str :param generation: if present, permanently delete the object of this generation - :type generation: string + :type generation: str :return: True if succeeded """ service = self.get_conn() @@ -280,7 +333,7 @@ def delete(self, bucket, object, generation=None): .delete(bucket=bucket, object=object, generation=generation) \ .execute() return True - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': return False raise @@ -290,23 +343,23 @@ def list(self, bucket, versions=None, maxResults=None, prefix=None, delimiter=No List all objects from the bucket with the give string prefix in name :param bucket: bucket name - :type bucket: string + :type bucket: str :param versions: if true, list all versions of the objects - :type versions: boolean + :type versions: bool :param maxResults: max count of items to return in a single page of responses - :type maxResults: integer + :type maxResults: int :param prefix: prefix string which filters objects whose name begin with this prefix - :type prefix: string + :type prefix: str :param delimiter: filters objects based on the delimiter (for e.g '.csv') - :type delimiter: string + :type delimiter: str :return: a stream of object names matching the filtering criteria """ service = self.get_conn() ids = list() pageToken = None - while(True): + while True: response = service.objects().list( bucket=bucket, versions=versions, @@ -343,9 +396,9 @@ def get_size(self, bucket, object): Gets the size of a file in Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. - :type object: string + :type object: str """ self.log.info('Checking the file size of object: %s in bucket: %s', @@ -365,7 +418,7 @@ def get_size(self, bucket, object): return size else: raise ValueError('Object is not a file') - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': raise ValueError('Object Not Found') @@ -374,10 +427,10 @@ def get_crc32c(self, bucket, object): Gets the CRC32c checksum of an object in Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. - :type object: string + :type object: str """ self.log.info('Retrieving the crc32c checksum of ' 'object: %s in bucket: %s', object, bucket) @@ -392,7 +445,7 @@ def get_crc32c(self, bucket, object): self.log.info('The crc32c checksum of %s is %s', object, crc32c) return crc32c - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': raise ValueError('Object Not Found') @@ -401,10 +454,10 @@ def get_md5hash(self, bucket, object): Gets the MD5 hash of an object in Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. 
- :type object: string + :type object: str """ self.log.info('Retrieving the MD5 hash of ' 'object: %s in bucket: %s', object, bucket) @@ -419,12 +472,13 @@ def get_md5hash(self, bucket, object): self.log.info('The md5Hash of %s is %s', object, md5hash) return md5hash - except errors.HttpError as ex: + except HttpError as ex: if ex.resp['status'] == '404': raise ValueError('Object Not Found') def create_bucket(self, bucket_name, + resource=None, storage_class='MULTI_REGIONAL', location='US', project_id=None, @@ -439,7 +493,11 @@ def create_bucket(self, https://cloud.google.com/storage/docs/bucketnaming.html#requirements :param bucket_name: The name of the bucket. - :type bucket_name: string + :type bucket_name: str + :param resource: An optional dict with parameters for creating the bucket. + For information on available parameters, see Cloud Storage API doc: + https://cloud.google.com/storage/docs/json_api/v1/buckets/insert + :type resource: dict :param storage_class: This defines how objects in the bucket are stored and determines the SLA and the cost of storage. Values include @@ -448,9 +506,10 @@ def create_bucket(self, - ``STANDARD`` - ``NEARLINE`` - ``COLDLINE``. + If this value is not specified when the bucket is created, it will default to STANDARD. - :type storage_class: string + :type storage_class: str :param location: The location of the bucket. Object data for objects in the bucket resides in physical storage within this region. Defaults to US. @@ -458,9 +517,9 @@ def create_bucket(self, .. seealso:: https://developers.google.com/storage/docs/bucket-locations - :type location: string + :type location: str :param project_id: The ID of the GCP Project. - :type project_id: string + :type project_id: str :param labels: User-provided labels, in key/value pairs. :type labels: dict :return: If successful, it returns the ``id`` of the bucket. @@ -477,22 +536,24 @@ def create_bucket(self, self.log.info('Creating Bucket: %s; Location: %s; Storage Class: %s', bucket_name, location, storage_class) - assert storage_class in storage_classes, \ - 'Invalid value ({}) passed to storage_class. Value should be ' \ - 'one of {}'.format(storage_class, storage_classes) + if storage_class not in storage_classes: + raise ValueError( + 'Invalid value ({}) passed to storage_class. Value should be ' + 'one of {}'.format(storage_class, storage_classes)) - assert re.match('[a-zA-Z0-9]+', bucket_name[0]), \ - 'Bucket names must start with a number or letter.' + if not re.match('[a-zA-Z0-9]+', bucket_name[0]): + raise ValueError('Bucket names must start with a number or letter.') - assert re.match('[a-zA-Z0-9]+', bucket_name[-1]), \ - 'Bucket names must end with a number or letter.' + if not re.match('[a-zA-Z0-9]+', bucket_name[-1]): + raise ValueError('Bucket names must end with a number or letter.') service = self.get_conn() - bucket_resource = { + bucket_resource = resource or {} + bucket_resource.update({ 'name': bucket_name, 'location': location, 'storageClass': storage_class - } + }) self.log.info('The Default Project ID is %s', self.project_id) @@ -509,11 +570,145 @@ def create_bucket(self, return response['id'] - except errors.HttpError as ex: + except HttpError as ex: raise AirflowException( 'Bucket creation failed. Error was: {}'.format(ex.content) ) + def insert_bucket_acl(self, bucket, entity, role, user_project): + """ + Creates a new ACL entry on the specified bucket. + See: https://cloud.google.com/storage/docs/json_api/v1/bucketAccessControls/insert + + :param bucket: Name of a bucket. 
+ :type bucket: str + :param entity: The entity holding the permission, in one of the following forms: + user-userId, user-email, group-groupId, group-email, domain-domain, + project-team-projectId, allUsers, allAuthenticatedUsers. + See: https://cloud.google.com/storage/docs/access-control/lists#scopes + :type entity: str + :param role: The access permission for the entity. + Acceptable values are: "OWNER", "READER", "WRITER". + :type role: str + :param user_project: (Optional) The project to be billed for this request. + Required for Requester Pays buckets. + :type user_project: str + """ + self.log.info('Creating a new ACL entry in bucket: %s', bucket) + service = self.get_conn() + try: + response = service.bucketAccessControls().insert( + bucket=bucket, + body={ + "entity": entity, + "role": role + }, + userProject=user_project + ).execute() + if response: + self.log.info('A new ACL entry created in bucket: %s', bucket) + except HttpError as ex: + raise AirflowException( + 'Bucket ACL entry creation failed. Error was: {}'.format(ex.content) + ) + + def insert_object_acl(self, bucket, object_name, entity, role, generation, + user_project): + """ + Creates a new ACL entry on the specified object. + See: https://cloud.google.com/storage/docs/json_api/v1/objectAccessControls/insert + + :param bucket: Name of a bucket. + :type bucket: str + :param object_name: Name of the object. For information about how to URL encode + object names to be path safe, see: + https://cloud.google.com/storage/docs/json_api/#encoding + :type object_name: str + :param entity: The entity holding the permission, in one of the following forms: + user-userId, user-email, group-groupId, group-email, domain-domain, + project-team-projectId, allUsers, allAuthenticatedUsers + See: https://cloud.google.com/storage/docs/access-control/lists#scopes + :type entity: str + :param role: The access permission for the entity. + Acceptable values are: "OWNER", "READER". + :type role: str + :param generation: (Optional) If present, selects a specific revision of this + object (as opposed to the latest version, the default). + :type generation: str + :param user_project: (Optional) The project to be billed for this request. + Required for Requester Pays buckets. + :type user_project: str + """ + self.log.info('Creating a new ACL entry for object: %s in bucket: %s', + object_name, bucket) + service = self.get_conn() + try: + response = service.objectAccessControls().insert( + bucket=bucket, + object=object_name, + body={ + "entity": entity, + "role": role + }, + generation=generation, + userProject=user_project + ).execute() + if response: + self.log.info('A new ACL entry created for object: %s in bucket: %s', + object_name, bucket) + except HttpError as ex: + raise AirflowException( + 'Object ACL entry creation failed. Error was: {}'.format(ex.content) + ) + + def compose(self, bucket, source_objects, destination_object, num_retries=5): + """ + Composes a list of existing object into a new object in the same storage bucket + + Currently it only supports up to 32 objects that can be concatenated + in a single operation + + https://cloud.google.com/storage/docs/json_api/v1/objects/compose + + :param bucket: The name of the bucket containing the source objects. + This is also the same bucket to store the composed destination object. + :type bucket: str + :param source_objects: The list of source objects that will be composed + into a single object. 
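A hedged sketch of the new `resource` argument to `create_bucket()` and the ACL helpers above; every concrete value (bucket name, entity, project) is made up, and the import path is assumed:

```python
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

hook = GoogleCloudStorageHook()

# `resource` merges extra settings from the JSON API bucket resource into the
# insert body; name, location and storageClass are still set by the hook.
bucket_id = hook.create_bucket(bucket_name='example-bucket',
                               resource={'versioning': {'enabled': True}},
                               storage_class='REGIONAL',
                               location='EU',
                               project_id='example-project')

# Grant read access on the bucket and on a single object.
hook.insert_bucket_acl(bucket='example-bucket',
                       entity='user-someone@example.com',
                       role='READER',
                       user_project=None)
hook.insert_object_acl(bucket='example-bucket',
                       object_name='exports/report.csv',
                       entity='allAuthenticatedUsers',
                       role='READER',
                       generation=None,
                       user_project=None)
```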
+ :type source_objects: list + :param destination_object: The path of the object if given. + :type destination_object: str + """ + + if not source_objects or not len(source_objects): + raise ValueError('source_objects cannot be empty.') + + if not bucket or not destination_object: + raise ValueError('bucket and destination_object cannot be empty.') + + service = self.get_conn() + + dict_source_objects = [{'name': source_object} + for source_object in source_objects] + body = { + 'sourceObjects': dict_source_objects + } + + try: + self.log.info("Composing %s to %s in the bucket %s", + source_objects, destination_object, bucket) + service \ + .objects() \ + .compose(destinationBucket=bucket, + destinationObject=destination_object, + body=body) \ + .execute(num_retries=num_retries) + return True + except HttpError as ex: + if ex.resp['status'] == '404': + return False + raise + def _parse_gcs_url(gsurl): """ diff --git a/airflow/contrib/hooks/imap_hook.py b/airflow/contrib/hooks/imap_hook.py new file mode 100644 index 0000000000000..79f08cd5649cb --- /dev/null +++ b/airflow/contrib/hooks/imap_hook.py @@ -0,0 +1,316 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import email +import imaplib +import os.path +import re + +from airflow import LoggingMixin, AirflowException +from airflow.hooks.base_hook import BaseHook + + +class ImapHook(BaseHook): + """ + This hook connects to a mail server by using the imap protocol. + + :param imap_conn_id: The connection id that contains the information used to authenticate the client. + :type imap_conn_id: str + """ + + def __init__(self, imap_conn_id='imap_default'): + super(ImapHook, self).__init__(imap_conn_id) + self.conn = self.get_connection(imap_conn_id) + self.mail_client = imaplib.IMAP4_SSL(self.conn.host) + + def __enter__(self): + self.mail_client.login(self.conn.login, self.conn.password) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.mail_client.logout() + + def has_mail_attachment(self, name, mail_folder='INBOX', check_regex=False): + """ + Checks the mail folder for mails containing attachments with the given name. + + :param name: The name of the attachment that will be searched for. + :type name: str + :param mail_folder: The mail folder where to look at. + :type mail_folder: str + :param check_regex: Checks the name for a regular expression. + :type check_regex: bool + :returns: True if there is an attachment with the given name and False if not. 
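For the `compose()` method added in the GCS hook hunk above, a short sketch (object names are illustrative; the call returns `False` when a source object is missing, mirroring the 404 handling):

```python
from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

hook = GoogleCloudStorageHook()

composed = hook.compose(bucket='example-bucket',
                        source_objects=['exports/part-0001.csv',
                                        'exports/part-0002.csv',
                                        'exports/part-0003.csv'],
                        destination_object='exports/full_export.csv',
                        num_retries=5)
if not composed:
    raise RuntimeError('compose failed: a source object returned HTTP 404')
```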
+ :rtype: bool + """ + mail_attachments = self._retrieve_mails_attachments_by_name(name, + mail_folder, + check_regex, + latest_only=True) + return len(mail_attachments) > 0 + + def retrieve_mail_attachments(self, + name, + mail_folder='INBOX', + check_regex=False, + latest_only=False, + not_found_mode='raise'): + """ + Retrieves mail's attachments in the mail folder by its name. + + :param name: The name of the attachment that will be downloaded. + :type name: str + :param mail_folder: The mail folder where to look at. + :type mail_folder: str + :param check_regex: Checks the name for a regular expression. + :type check_regex: bool + :param latest_only: If set to True it will only retrieve + the first matched attachment. + :type latest_only: bool + :param not_found_mode: Specify what should happen if no attachment has been found. + Supported values are 'raise', 'warn' and 'ignore'. + If it is set to 'raise' it will raise an exception, + if set to 'warn' it will only print a warning and + if set to 'ignore' it won't notify you at all. + :type not_found_mode: str + :returns: a list of tuple each containing the attachment filename and its payload. + :rtype: a list of tuple + """ + mail_attachments = self._retrieve_mails_attachments_by_name(name, + mail_folder, + check_regex, + latest_only) + if not mail_attachments: + self._handle_not_found_mode(not_found_mode) + + return mail_attachments + + def download_mail_attachments(self, + name, + local_output_directory, + mail_folder='INBOX', + check_regex=False, + latest_only=False, + not_found_mode='raise'): + """ + Downloads mail's attachments in the mail folder by its name to the local directory. + + :param name: The name of the attachment that will be downloaded. + :type name: str + :param local_output_directory: The output directory on the local machine + where the files will be downloaded to. + :type local_output_directory: str + :param mail_folder: The mail folder where to look at. + :type mail_folder: str + :param check_regex: Checks the name for a regular expression. + :type check_regex: bool + :param latest_only: If set to True it will only download + the first matched attachment. + :type latest_only: bool + :param not_found_mode: Specify what should happen if no attachment has been found. + Supported values are 'raise', 'warn' and 'ignore'. + If it is set to 'raise' it will raise an exception, + if set to 'warn' it will only print a warning and + if set to 'ignore' it won't notify you at all. + :type not_found_mode: str + """ + mail_attachments = self._retrieve_mails_attachments_by_name(name, + mail_folder, + check_regex, + latest_only) + + if not mail_attachments: + self._handle_not_found_mode(not_found_mode) + + self._create_files(mail_attachments, local_output_directory) + + def _handle_not_found_mode(self, not_found_mode): + if not_found_mode == 'raise': + raise AirflowException('No mail attachments found!') + elif not_found_mode == 'warn': + self.log.warning('No mail attachments found!') + elif not_found_mode == 'ignore': + pass # Do not notify if the attachment has not been found. 
+ else: + self.log.error('Invalid "not_found_mode" %s', not_found_mode) + + def _retrieve_mails_attachments_by_name(self, name, mail_folder, check_regex, latest_only): + all_matching_attachments = [] + + self.mail_client.select(mail_folder) + + for mail_id in self._list_mail_ids_desc(): + response_mail_body = self._fetch_mail_body(mail_id) + matching_attachments = self._check_mail_body(response_mail_body, name, check_regex, latest_only) + + if matching_attachments: + all_matching_attachments.extend(matching_attachments) + if latest_only: + break + + self.mail_client.close() + + return all_matching_attachments + + def _list_mail_ids_desc(self): + result, data = self.mail_client.search(None, 'All') + mail_ids = data[0].split() + return reversed(mail_ids) + + def _fetch_mail_body(self, mail_id): + result, data = self.mail_client.fetch(mail_id, '(RFC822)') + mail_body = data[0][1] # The mail body is always in this specific location + mail_body_str = mail_body.decode('utf-8') + return mail_body_str + + def _check_mail_body(self, response_mail_body, name, check_regex, latest_only): + mail = Mail(response_mail_body) + if mail.has_attachments(): + return mail.get_attachments_by_name(name, check_regex, find_first=latest_only) + + def _create_files(self, mail_attachments, local_output_directory): + for name, payload in mail_attachments: + if self._is_symlink(name): + self.log.error('Can not create file because it is a symlink!') + elif self._is_escaping_current_directory(name): + self.log.error('Can not create file because it is escaping the current directory!') + else: + self._create_file(name, payload, local_output_directory) + + def _is_symlink(self, name): + # IMPORTANT NOTE: os.path.islink is not working for windows symlinks + # See: https://stackoverflow.com/a/11068434 + return os.path.islink(name) + + def _is_escaping_current_directory(self, name): + return '../' in name + + def _correct_path(self, name, local_output_directory): + return local_output_directory + name if local_output_directory.endswith('/') \ + else local_output_directory + '/' + name + + def _create_file(self, name, payload, local_output_directory): + file_path = self._correct_path(name, local_output_directory) + + with open(file_path, 'wb') as file: + file.write(payload) + + +class Mail(LoggingMixin): + """ + This class simplifies working with mails returned by the imaplib client. + + :param mail_body: The mail body of a mail received from imaplib client. + :type mail_body: str + """ + + def __init__(self, mail_body): + super(Mail, self).__init__() + self.mail = email.message_from_string(mail_body) + + def has_attachments(self): + """ + Checks the mail for a attachments. + + :returns: True if it has attachments and False if not. + :rtype: bool + """ + return self.mail.get_content_maintype() == 'multipart' + + def get_attachments_by_name(self, name, check_regex, find_first=False): + """ + Gets all attachments by name for the mail. + + :param name: The name of the attachment to look for. + :type name: str + :param check_regex: Checks the name for a regular expression. + :type check_regex: bool + :param find_first: If set to True it will only find the first match and then quit. + :type find_first: bool + :returns: a list of tuples each containing name and payload + where the attachments name matches the given name. 
+ :rtype: list of tuple + """ + attachments = [] + + for part in self.mail.walk(): + mail_part = MailPart(part) + if mail_part.is_attachment(): + found_attachment = mail_part.has_matching_name(name) if check_regex \ + else mail_part.has_equal_name(name) + if found_attachment: + file_name, file_payload = mail_part.get_file() + self.log.info('Found attachment: {}'.format(file_name)) + attachments.append((file_name, file_payload)) + if find_first: + break + + return attachments + + +class MailPart: + """ + This class is a wrapper for a Mail object's part and gives it more features. + + :param part: The mail part in a Mail object. + :type part: any + """ + + def __init__(self, part): + self.part = part + + def is_attachment(self): + """ + Checks if the part is a valid mail attachment. + + :returns: True if it is an attachment and False if not. + :rtype: bool + """ + return self.part.get_content_maintype() != 'multipart' and self.part.get('Content-Disposition') + + def has_matching_name(self, name): + """ + Checks if the given name matches the part's name. + + :param name: The name to look for. + :type name: str + :returns: True if it matches the name (including regular expression). + :rtype: tuple + """ + return re.match(name, self.part.get_filename()) + + def has_equal_name(self, name): + """ + Checks if the given name is equal to the part's name. + + :param name: The name to look for. + :type name: str + :returns: True if it is equal to the given name. + :rtype: bool + """ + return self.part.get_filename() == name + + def get_file(self): + """ + Gets the file including name and payload. + + :returns: the part's name and payload. + :rtype: tuple + """ + return self.part.get_filename(), self.part.get_payload(decode=True) diff --git a/airflow/contrib/hooks/jira_hook.py b/airflow/contrib/hooks/jira_hook.py index c6806d935acf2..e3c4a12ffe69b 100644 --- a/airflow/contrib/hooks/jira_hook.py +++ b/airflow/contrib/hooks/jira_hook.py @@ -21,15 +21,14 @@ from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin -class JiraHook(BaseHook, LoggingMixin): +class JiraHook(BaseHook): """ Jira interaction hook, a Wrapper around JIRA Python SDK. :param jira_conn_id: reference to a pre-defined Jira Connection - :type jira_conn_id: string + :type jira_conn_id: str """ def __init__(self, jira_conn_id='jira_default', diff --git a/airflow/contrib/hooks/mongo_hook.py b/airflow/contrib/hooks/mongo_hook.py index 6ae71a8c8d461..959d4cc86274c 100644 --- a/airflow/contrib/hooks/mongo_hook.py +++ b/airflow/contrib/hooks/mongo_hook.py @@ -1,20 +1,25 @@ # -*- coding: utf-8 -*- # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
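A usage sketch for the new `ImapHook` above (the connection id is the hook's default; the attachment pattern and output directory are hypothetical):

```python
from airflow.contrib.hooks.imap_hook import ImapHook

# The context manager logs in on entry and logs out on exit.
with ImapHook(imap_conn_id='imap_default') as imap_hook:
    if imap_hook.has_mail_attachment(name=r'report_\d{8}\.csv', check_regex=True):
        imap_hook.download_mail_attachments(name=r'report_\d{8}\.csv',
                                            local_output_directory='/tmp/reports',
                                            check_regex=True,
                                            latest_only=True,
                                            not_found_mode='warn')
```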
You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. from ssl import CERT_NONE from airflow.hooks.base_hook import BaseHook -from pymongo import MongoClient +from pymongo import MongoClient, ReplaceOne class MongoHook(BaseHook): @@ -24,10 +29,9 @@ class MongoHook(BaseHook): https://docs.mongodb.com/manual/reference/connection-string/index.html You can specify connection string options in extra field of your connection https://docs.mongodb.com/manual/reference/connection-string/index.html#connection-string-options - ex. - {replicaSet: test, ssl: True, connectTimeoutMS: 30000} + ex. ``{replicaSet: test, ssl: True, connectTimeoutMS: 30000}`` """ - conn_type = 'MongoDb' + conn_type = 'mongo' def __init__(self, conn_id='mongo_default', *args, **kwargs): super(MongoHook, self).__init__(source='mongo') @@ -37,6 +41,13 @@ def __init__(self, conn_id='mongo_default', *args, **kwargs): self.extras = self.connection.extra_dejson self.client = None + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.client is not None: + self.close_conn() + def get_conn(self): """ Fetches PyMongo Client @@ -49,11 +60,11 @@ def get_conn(self): uri = 'mongodb://{creds}{host}{port}/{database}'.format( creds='{}:{}@'.format( conn.login, conn.password - ) if conn.login is not None else '', + ) if conn.login else '', host=conn.host, port='' if conn.port is None else ':{}'.format(conn.port), - database='' if conn.schema is None else conn.schema + database=conn.schema ) # Mongo Connection Options dict that is unpacked when passed to MongoClient @@ -67,6 +78,12 @@ def get_conn(self): return self.client + def close_conn(self): + client = self.client + if client is not None: + client.close() + self.client = None + def get_collection(self, mongo_collection, mongo_db=None): """ Fetches a mongo collection object for querying. @@ -117,3 +134,158 @@ def insert_many(self, mongo_collection, docs, mongo_db=None, **kwargs): collection = self.get_collection(mongo_collection, mongo_db=mongo_db) return collection.insert_many(docs, **kwargs) + + def update_one(self, mongo_collection, filter_doc, update_doc, + mongo_db=None, **kwargs): + """ + Updates a single document in a mongo collection. + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.update_one + + :param mongo_collection: The name of the collection to update. + :type mongo_collection: str + :param filter_doc: A query that matches the documents to update. + :type filter_doc: dict + :param update_doc: The modifications to apply. + :type update_doc: dict + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. 
+ :type mongo_db: str + + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + return collection.update_one(filter_doc, update_doc, **kwargs) + + def update_many(self, mongo_collection, filter_doc, update_doc, + mongo_db=None, **kwargs): + """ + Updates one or more documents in a mongo collection. + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.update_many + + :param mongo_collection: The name of the collection to update. + :type mongo_collection: str + :param filter_doc: A query that matches the documents to update. + :type filter_doc: dict + :param update_doc: The modifications to apply. + :type update_doc: dict + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. + :type mongo_db: str + + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + return collection.update_many(filter_doc, update_doc, **kwargs) + + def replace_one(self, mongo_collection, doc, filter_doc=None, + mongo_db=None, **kwargs): + """ + Replaces a single document in a mongo collection. + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.replace_one + + .. note:: + If no ``filter_doc`` is given, it is assumed that the replacement + document contain the ``_id`` field which is then used as filters. + + :param mongo_collection: The name of the collection to update. + :type mongo_collection: str + :param doc: The new document. + :type doc: dict + :param filter_doc: A query that matches the documents to replace. + Can be omitted; then the _id field from doc will be used. + :type filter_doc: dict + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. + :type mongo_db: str + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + if not filter_doc: + filter_doc = {'_id': doc['_id']} + + return collection.replace_one(filter_doc, doc, **kwargs) + + def replace_many(self, mongo_collection, docs, + filter_docs=None, mongo_db=None, upsert=False, collation=None, + **kwargs): + """ + Replaces many documents in a mongo collection. + + Uses bulk_write with multiple ReplaceOne operations + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write + + .. note:: + If no ``filter_docs``are given, it is assumed that all + replacement documents contain the ``_id`` field which are then + used as filters. + + :param mongo_collection: The name of the collection to update. + :type mongo_collection: str + :param docs: The new documents. + :type docs: list[dict] + :param filter_docs: A list of queries that match the documents to replace. + Can be omitted; then the _id fields from docs will be used. + :type filter_docs: list[dict] + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. + :type mongo_db: str + :param upsert: If ``True``, perform an insert if no documents + match the filters for the replace operation. + :type upsert: bool + :param collation: An instance of + :class:`~pymongo.collation.Collation`. This option is only + supported on MongoDB 3.4 and above. 
+ :type collation: pymongo.collation.Collation + + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + if not filter_docs: + filter_docs = [{'_id': doc['_id']} for doc in docs] + + requests = [ + ReplaceOne( + filter_docs[i], + docs[i], + upsert=upsert, + collation=collation) + for i in range(len(docs)) + ] + + return collection.bulk_write(requests, **kwargs) + + def delete_one(self, mongo_collection, filter_doc, mongo_db=None, **kwargs): + """ + Deletes a single document in a mongo collection. + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.delete_one + + :param mongo_collection: The name of the collection to delete from. + :type mongo_collection: str + :param filter_doc: A query that matches the document to delete. + :type filter_doc: dict + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. + :type mongo_db: str + + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + return collection.delete_one(filter_doc, **kwargs) + + def delete_many(self, mongo_collection, filter_doc, mongo_db=None, **kwargs): + """ + Deletes one or more documents in a mongo collection. + https://api.mongodb.com/python/current/api/pymongo/collection.html#pymongo.collection.Collection.delete_many + + :param mongo_collection: The name of the collection to delete from. + :type mongo_collection: str + :param filter_doc: A query that matches the documents to delete. + :type filter_doc: dict + :param mongo_db: The name of the database to use. + Can be omitted; then the database from the connection string is used. + :type mongo_db: str + + """ + collection = self.get_collection(mongo_collection, mongo_db=mongo_db) + + return collection.delete_many(filter_doc, **kwargs) diff --git a/airflow/contrib/hooks/openfaas_hook.py b/airflow/contrib/hooks/openfaas_hook.py new file mode 100644 index 0000000000000..8f5062cbc708f --- /dev/null +++ b/airflow/contrib/hooks/openfaas_hook.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
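A sketch of the new `MongoHook` write helpers (collection names and documents are hypothetical; `mongo_default` is the hook's default connection id):

```python
from airflow.contrib.hooks.mongo_hook import MongoHook

# The new __enter__/__exit__ methods close the client when the block exits.
with MongoHook(conn_id='mongo_default') as mongo:
    mongo.update_one('users',
                     filter_doc={'_id': 42},
                     update_doc={'$set': {'active': True}})

    # replace_many issues one ReplaceOne per document via bulk_write; without
    # filter_docs it keys each replacement on the document's own _id.
    mongo.replace_many('users',
                       docs=[{'_id': 1, 'name': 'a'}, {'_id': 2, 'name': 'b'}],
                       upsert=True)

    mongo.delete_many('users', filter_doc={'active': False})
```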
+ +from airflow.hooks.base_hook import BaseHook +import requests +from airflow import AirflowException + +OK_STATUS_CODE = 202 + + +class OpenFaasHook(BaseHook): + """ + Interact with Openfaas to query, deploy, invoke and update function + + :param function_name: Name of the function, Defaults to None + :type query: str + :param conn_id: openfass connection to use, Defaults to open_faas_default + for example host : http://openfaas.faas.com, Conn Type : Http + :type conn_id: str + """ + + GET_FUNCTION = "/system/function/" + INVOKE_ASYNC_FUNCTION = "/async-function/" + DEPLOY_FUNCTION = "/system/functions" + UPDATE_FUNCTION = "/system/functions" + + def __init__(self, + function_name=None, + conn_id='open_faas_default', + *args, **kwargs): + self.function_name = function_name + self.conn_id = conn_id + super(BaseHook, self).__init__(*args, **kwargs) + + def get_conn(self): + conn = self.get_connection(self.conn_id) + return conn + + def deploy_function(self, overwrite_function_if_exist, body): + if overwrite_function_if_exist: + self.log.info("Function already exist " + self.function_name + " going to update") + self.update_function(body) + else: + url = self.get_conn().host + self.DEPLOY_FUNCTION + self.log.info("Deploying function " + url) + response = requests.post(url, body) + if (response.status_code != OK_STATUS_CODE): + self.log.error("Response status " + str(response.status_code)) + self.log.error("Failed to deploy") + raise AirflowException('failed to deploy') + else: + self.log.info("Function deployed " + self.function_name) + + def invoke_async_function(self, body): + url = self.get_conn().host + self.INVOKE_ASYNC_FUNCTION + self.function_name + self.log.info("Invoking function " + url) + response = requests.post(url, body) + if (response.ok): + self.log.info("Invoked " + self.function_name) + else: + self.log.error("Response status " + str(response.status_code)) + raise AirflowException('failed to invoke function') + + def update_function(self, body): + url = self.get_conn().host + self.UPDATE_FUNCTION + self.log.info("Updating function " + url) + response = requests.put(url, body) + if (response.status_code != OK_STATUS_CODE): + self.log.error("Response status " + str(response.status_code)) + self.log.error("Failed to update response " + response.content.decode("utf-8")) + raise AirflowException('failed to update ' + self.function_name) + else: + self.log.info("Function was updated") + + def does_function_exist(self): + url = self.get_conn().host + self.GET_FUNCTION + self.function_name + + response = requests.get(url) + if (response.ok): + return True + else: + self.log.error("Failed to find function " + self.function_name) + return False diff --git a/airflow/contrib/hooks/opsgenie_alert_hook.py b/airflow/contrib/hooks/opsgenie_alert_hook.py new file mode 100644 index 0000000000000..d576c08c23982 --- /dev/null +++ b/airflow/contrib/hooks/opsgenie_alert_hook.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
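A hedged sketch for the new `OpenFaasHook`; the function name, gateway connection and both request bodies are illustrative, not values taken from this change:

```python
import json

from airflow.contrib.hooks.openfaas_hook import OpenFaasHook

hook = OpenFaasHook(function_name='resize-image', conn_id='open_faas_default')

# Function spec posted to /system/functions and a payload for the async call.
deploy_body = json.dumps({'service': 'resize-image',
                          'image': 'functions/resize-image:latest'})
invoke_body = json.dumps({'image': 'gs://bucket/cat.png', 'width': 128})

# deploy_function() falls back to update_function() when asked to overwrite.
hook.deploy_function(overwrite_function_if_exist=hook.does_function_exist(),
                     body=deploy_body)
hook.invoke_async_function(invoke_body)
```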
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import json + +import requests + +from airflow.hooks.http_hook import HttpHook +from airflow import AirflowException + + +class OpsgenieAlertHook(HttpHook): + """ + This hook allows you to post alerts to Opsgenie. + Accepts a connection that has an Opsgenie API key as the connection's password. + This hook sets the domain to conn_id.host, and if not set will default + to ``https://api.opsgenie.com``. + + Each Opsgenie API key can be pre-configured to a team integration. + You can override these defaults in this hook. + + :param opsgenie_conn_id: The name of the Opsgenie connection to use + :type opsgenie_conn_id: str + + """ + def __init__(self, + opsgenie_conn_id='opsgenie_default', + *args, + **kwargs + ): + super(OpsgenieAlertHook, self).__init__(http_conn_id=opsgenie_conn_id, *args, **kwargs) + + def _get_api_key(self): + """ + Get Opsgenie api_key for creating alert + """ + conn = self.get_connection(self.http_conn_id) + api_key = conn.password + if not api_key: + raise AirflowException('Opsgenie API Key is required for this hook, ' + 'please check your conn_id configuration.') + return api_key + + def get_conn(self, headers=None): + """ + Overwrite HttpHook get_conn because this hook just needs base_url + and headers, and does not need generic params + + :param headers: additional headers to be passed through as a dictionary + :type headers: dict + """ + conn = self.get_connection(self.http_conn_id) + self.base_url = conn.host if conn.host else 'https://api.opsgenie.com' + session = requests.Session() + if headers: + session.headers.update(headers) + return session + + def execute(self, payload={}): + """ + Execute the Opsgenie Alert call + + :param payload: Opsgenie API Create Alert payload values + See https://docs.opsgenie.com/docs/alert-api#section-create-alert + :type payload: dict + """ + api_key = self._get_api_key() + return self.run(endpoint='v2/alerts', + data=json.dumps(payload), + headers={'Content-Type': 'application/json', + 'Authorization': 'GenieKey %s' % api_key}) diff --git a/airflow/contrib/hooks/qubole_hook.py b/airflow/contrib/hooks/qubole_hook.py index fb45862e88e02..43eb5e6f82e15 100755 --- a/airflow/contrib/hooks/qubole_hook.py +++ b/airflow/contrib/hooks/qubole_hook.py @@ -123,7 +123,7 @@ def execute(self, context): def kill(self, ti): """ - Kill (cancel) a Qubole commmand + Kill (cancel) a Qubole command :param ti: Task Instance of the dag, used to determine the Quboles command id :return: response from Qubole """ @@ -176,7 +176,7 @@ def get_jobs_id(self, ti): """ Get jobs associated with a Qubole commands :param ti: Task Instance of the dag, used to determine the Quboles command id - :return: Job informations assoiciated with command + :return: Job information associated with command """ if self.cmd is None: cmd_id = ti.xcom_pull(key="qbol_cmd_id", task_ids=self.task_id) diff --git a/airflow/contrib/hooks/redis_hook.py b/airflow/contrib/hooks/redis_hook.py index 1de75dbca905f..a63c839e34c85 100644 --- a/airflow/contrib/hooks/redis_hook.py +++ b/airflow/contrib/hooks/redis_hook.py @@ -20,16 +20,13 @@ """ RedisHook module """ -from redis import 
StrictRedis - -from airflow.exceptions import AirflowException +from redis import Redis from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin -class RedisHook(BaseHook, LoggingMixin): +class RedisHook(BaseHook): """ - Hook to interact with Redis database + Wrapper for connection to interact with Redis in-memory data structure store """ def __init__(self, redis_conn_id='redis_default'): """ @@ -39,55 +36,31 @@ def __init__(self, redis_conn_id='redis_default'): we need to connect to Redis. """ self.redis_conn_id = redis_conn_id - self.client = None - conn = self.get_connection(self.redis_conn_id) - self.host = conn.host - self.port = int(conn.port) - self.password = conn.password - self.db = int(conn.extra_dejson.get('db', 0)) - - self.log.debug( - '''Connection "{conn}": - \thost: {host} - \tport: {port} - \textra: {extra} - '''.format( - conn=self.redis_conn_id, - host=self.host, - port=self.port, - extra=conn.extra_dejson - ) - ) + self.redis = None + self.host = None + self.port = None + self.password = None + self.db = None def get_conn(self): """ Returns a Redis connection. """ - if not self.client: + conn = self.get_connection(self.redis_conn_id) + self.host = conn.host + self.port = conn.port + self.password = None if str(conn.password).lower() in ['none', 'false', ''] else conn.password + self.db = conn.extra_dejson.get('db', None) + + if not self.redis: self.log.debug( - 'generating Redis client for conn_id "%s" on %s:%s:%s', + 'Initializing redis object for conn_id "%s" on %s:%s:%s', self.redis_conn_id, self.host, self.port, self.db ) - try: - self.client = StrictRedis( - host=self.host, - port=self.port, - password=self.password, - db=self.db) - except Exception as general_error: - raise AirflowException( - 'Failed to create Redis client, error: {error}'.format( - error=str(general_error) - ) - ) - - return self.client - - def key_exists(self, key): - """ - Checks if a key exists in Redis database + self.redis = Redis( + host=self.host, + port=self.port, + password=self.password, + db=self.db) - :param key: The key to check the existence. - :type key: string - """ - return self.get_conn().exists(key) + return self.redis diff --git a/airflow/contrib/hooks/sagemaker_hook.py b/airflow/contrib/hooks/sagemaker_hook.py new file mode 100644 index 0000000000000..80777821fef39 --- /dev/null +++ b/airflow/contrib/hooks/sagemaker_hook.py @@ -0,0 +1,770 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
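A usage sketch for the `OpsgenieAlertHook` introduced earlier in this diff; the payload fields follow Opsgenie's create-alert API and the values are invented:

```python
from airflow.contrib.hooks.opsgenie_alert_hook import OpsgenieAlertHook

hook = OpsgenieAlertHook(opsgenie_conn_id='opsgenie_default')

# The connection's password supplies the GenieKey; the host overrides the
# default https://api.opsgenie.com domain if set.
hook.execute(payload={
    'message': 'Nightly export DAG failed',
    'description': 'Task export_to_gcs failed three times in a row',
    'priority': 'P2',
    'tags': ['airflow', 'nightly-export'],
})
```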
+import tarfile +import tempfile +import time +import os +import collections + +import botocore.config +from botocore.exceptions import ClientError + +from airflow.exceptions import AirflowException +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.hooks.S3_hook import S3Hook +from airflow.utils import timezone + + +class LogState(object): + STARTING = 1 + WAIT_IN_PROGRESS = 2 + TAILING = 3 + JOB_COMPLETE = 4 + COMPLETE = 5 + + +# Position is a tuple that includes the last read timestamp and the number of items that were read +# at that time. This is used to figure out which event to start with on the next read. +Position = collections.namedtuple('Position', ['timestamp', 'skip']) + + +def argmin(arr, f): + """Return the index, i, in arr that minimizes f(arr[i])""" + m = None + i = None + for idx, item in enumerate(arr): + if item is not None: + if m is None or f(item) < m: + m = f(item) + i = idx + return i + + +def secondary_training_status_changed(current_job_description, prev_job_description): + """ + Returns true if training job's secondary status message has changed. + + :param current_job_description: Current job description, returned from DescribeTrainingJob call. + :type current_job_description: dict + :param prev_job_description: Previous job description, returned from DescribeTrainingJob call. + :type prev_job_description: dict + + :return: Whether the secondary status message of a training job changed or not. + """ + current_secondary_status_transitions = current_job_description.get('SecondaryStatusTransitions') + if current_secondary_status_transitions is None or len(current_secondary_status_transitions) == 0: + return False + + prev_job_secondary_status_transitions = prev_job_description.get('SecondaryStatusTransitions') \ + if prev_job_description is not None else None + + last_message = prev_job_secondary_status_transitions[-1]['StatusMessage'] \ + if prev_job_secondary_status_transitions is not None \ + and len(prev_job_secondary_status_transitions) > 0 else '' + + message = current_job_description['SecondaryStatusTransitions'][-1]['StatusMessage'] + + return message != last_message + + +def secondary_training_status_message(job_description, prev_description): + """ + Returns a string contains start time and the secondary training job status message. + + :param job_description: Returned response from DescribeTrainingJob call + :type job_description: dict + :param prev_description: Previous job description from DescribeTrainingJob call + :type prev_description: dict + + :return: Job status string to be printed. 
+ """ + + if job_description is None or job_description.get('SecondaryStatusTransitions') is None\ + or len(job_description.get('SecondaryStatusTransitions')) == 0: + return '' + + prev_description_secondary_transitions = prev_description.get('SecondaryStatusTransitions')\ + if prev_description is not None else None + prev_transitions_num = len(prev_description['SecondaryStatusTransitions'])\ + if prev_description_secondary_transitions is not None else 0 + current_transitions = job_description['SecondaryStatusTransitions'] + + transitions_to_print = current_transitions[-1:] if len(current_transitions) == prev_transitions_num else \ + current_transitions[prev_transitions_num - len(current_transitions):] + + status_strs = [] + for transition in transitions_to_print: + message = transition['StatusMessage'] + time_str = timezone.convert_to_utc(job_description['LastModifiedTime']).strftime('%Y-%m-%d %H:%M:%S') + status_strs.append('{} {} - {}'.format(time_str, transition['Status'], message)) + + return '\n'.join(status_strs) + + +class SageMakerHook(AwsHook): + """ + Interact with Amazon SageMaker. + """ + non_terminal_states = {'InProgress', 'Stopping'} + endpoint_non_terminal_states = {'Creating', 'Updating', 'SystemUpdating', + 'RollingBack', 'Deleting'} + failed_states = {'Failed'} + + def __init__(self, + *args, **kwargs): + super(SageMakerHook, self).__init__(*args, **kwargs) + self.s3_hook = S3Hook(aws_conn_id=self.aws_conn_id) + + def tar_and_s3_upload(self, path, key, bucket): + """ + Tar the local file or directory and upload to s3 + + :param path: local file or directory + :type path: str + :param key: s3 key + :type key: str + :param bucket: s3 bucket + :type bucket: str + :return: None + """ + with tempfile.TemporaryFile() as temp_file: + if os.path.isdir(path): + files = [os.path.join(path, name) for name in os.listdir(path)] + else: + files = [path] + with tarfile.open(mode='w:gz', fileobj=temp_file) as tar_file: + for f in files: + tar_file.add(f, arcname=os.path.basename(f)) + temp_file.seek(0) + self.s3_hook.load_file_obj(temp_file, key, bucket, replace=True) + + def configure_s3_resources(self, config): + """ + Extract the S3 operations from the configuration and execute them. 
+ + :param config: config of SageMaker operation + :type config: dict + :rtype: dict + """ + s3_operations = config.pop('S3Operations', None) + + if s3_operations is not None: + create_bucket_ops = s3_operations.get('S3CreateBucket', []) + upload_ops = s3_operations.get('S3Upload', []) + for op in create_bucket_ops: + self.s3_hook.create_bucket(bucket_name=op['Bucket']) + for op in upload_ops: + if op['Tar']: + self.tar_and_s3_upload(op['Path'], op['Key'], + op['Bucket']) + else: + self.s3_hook.load_file(op['Path'], op['Key'], + op['Bucket']) + + def check_s3_url(self, s3url): + """ + Check if an S3 URL exists + + :param s3url: S3 url + :type s3url: str + :rtype: bool + """ + bucket, key = S3Hook.parse_s3_url(s3url) + if not self.s3_hook.check_for_bucket(bucket_name=bucket): + raise AirflowException( + "The input S3 Bucket {} does not exist ".format(bucket)) + if key and not self.s3_hook.check_for_key(key=key, bucket_name=bucket)\ + and not self.s3_hook.check_for_prefix( + prefix=key, bucket_name=bucket, delimiter='/'): + # check if s3 key exists in the case user provides a single file + # or if s3 prefix exists in the case user provides multiple files in + # a prefix + raise AirflowException("The input S3 Key " + "or Prefix {} does not exist in the Bucket {}" + .format(s3url, bucket)) + return True + + def check_training_config(self, training_config): + """ + Check if a training configuration is valid + + :param training_config: training_config + :type training_config: dict + :return: None + """ + for channel in training_config['InputDataConfig']: + self.check_s3_url(channel['DataSource']['S3DataSource']['S3Uri']) + + def check_tuning_config(self, tuning_config): + """ + Check if a tuning configuration is valid + + :param tuning_config: tuning_config + :type tuning_config: dict + :return: None + """ + for channel in tuning_config['TrainingJobDefinition']['InputDataConfig']: + self.check_s3_url(channel['DataSource']['S3DataSource']['S3Uri']) + + def get_conn(self): + """ + Establish an AWS connection for SageMaker + + :rtype: :py:class:`SageMaker.Client` + """ + return self.get_client_type('sagemaker') + + def get_log_conn(self): + """ + Establish an AWS connection for retrieving logs during training + + :rtype: CloudWatchLogs.Client + """ + config = botocore.config.Config(retries={'max_attempts': 15}) + return self.get_client_type('logs', config=config) + + def log_stream(self, log_group, stream_name, start_time=0, skip=0): + """ + A generator for log items in a single stream. This will yield all the + items that are available at the current moment. + + :param log_group: The name of the log group. + :type log_group: str + :param stream_name: The name of the specific stream. + :type stream_name: str + :param start_time: The time stamp value to start reading the logs from (default: 0). + :type start_time: int + :param skip: The number of log entries to skip at the start (default: 0). + This is for when there are multiple entries at the same timestamp. + :type skip: int + :rtype: dict + :return: | A CloudWatch log event with the following key-value pairs: + | 'timestamp' (int): The time in milliseconds of the event. + | 'message' (str): The log event data. + | 'ingestionTime' (int): The time in milliseconds the event was ingested. 
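Based on the `configure_s3_resources()` code above, the `S3Operations` block it consumes looks roughly like this (bucket, path and key values are hypothetical):

```python
from airflow.contrib.hooks.sagemaker_hook import SageMakerHook

config = {
    'TrainingJobName': 'example-training-job',
    # ...the rest of the SageMaker request...
    'S3Operations': {
        'S3CreateBucket': [
            {'Bucket': 'example-sagemaker-bucket'},
        ],
        'S3Upload': [
            # Tar=True tars the local path before uploading it to s3://Bucket/Key.
            {'Path': '/tmp/model_code', 'Key': 'code/sourcedir.tar.gz',
             'Bucket': 'example-sagemaker-bucket', 'Tar': True},
            {'Path': '/tmp/hyperparams.json', 'Key': 'config/hyperparams.json',
             'Bucket': 'example-sagemaker-bucket', 'Tar': False},
        ],
    },
}

hook = SageMakerHook(aws_conn_id='aws_default')
hook.configure_s3_resources(config)  # pops 'S3Operations' and performs the uploads
```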
+ """ + + next_token = None + + event_count = 1 + while event_count > 0: + if next_token is not None: + token_arg = {'nextToken': next_token} + else: + token_arg = {} + + response = self.get_log_conn().get_log_events(logGroupName=log_group, + logStreamName=stream_name, + startTime=start_time, + startFromHead=True, + **token_arg) + next_token = response['nextForwardToken'] + events = response['events'] + event_count = len(events) + if event_count > skip: + events = events[skip:] + skip = 0 + else: + skip = skip - event_count + events = [] + for ev in events: + yield ev + + def multi_stream_iter(self, log_group, streams, positions=None): + """ + Iterate over the available events coming from a set of log streams in a single log group + interleaving the events from each stream so they're yielded in timestamp order. + + :param log_group: The name of the log group. + :type log_group: str + :param streams: A list of the log stream names. The position of the stream in this list is + the stream number. + :type streams: list + :param positions: A list of pairs of (timestamp, skip) which represents the last record + read from each stream. + :type positions: list + :return: A tuple of (stream number, cloudwatch log event). + """ + positions = positions or {s: Position(timestamp=0, skip=0) for s in streams} + event_iters = [self.log_stream(log_group, s, positions[s].timestamp, positions[s].skip) + for s in streams] + events = [] + for s in event_iters: + if not s: + events.append(None) + continue + try: + events.append(next(s)) + except StopIteration: + events.append(None) + + while any(events): + i = argmin(events, lambda x: x['timestamp'] if x else 9999999999) + yield (i, events[i]) + try: + events[i] = next(event_iters[i]) + except StopIteration: + events[i] = None + + def create_training_job(self, config, wait_for_completion=True, print_log=True, + check_interval=30, max_ingestion_time=None): + """ + Create a training job + + :param config: the config for training + :type config: dict + :param wait_for_completion: if the program should keep running until job finishes + :type wait_for_completion: bool + :param check_interval: the time interval in seconds which the operator + will check the status of any SageMaker job + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. 
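A minimal sketch of tailing CloudWatch logs with the `log_stream()`/`multi_stream_iter()` helpers above; the log group mirrors SageMaker's convention and the stream name is invented:

```python
from airflow.contrib.hooks.sagemaker_hook import SageMakerHook

hook = SageMakerHook(aws_conn_id='aws_default')

log_group = '/aws/sagemaker/TrainingJobs'
streams = ['example-training-job/algo-1-1550000000']

# multi_stream_iter() merges per-instance streams in timestamp order and yields
# (stream index, CloudWatch event) pairs.
for stream_idx, event in hook.multi_stream_iter(log_group, streams):
    print(streams[stream_idx], event['timestamp'], event['message'])
```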
+ :type max_ingestion_time: int + :return: A response to training job creation + """ + + self.check_training_config(config) + + response = self.get_conn().create_training_job(**config) + if print_log: + self.check_training_status_with_log(config['TrainingJobName'], + self.non_terminal_states, + self.failed_states, + wait_for_completion, + check_interval, max_ingestion_time + ) + elif wait_for_completion: + describe_response = self.check_status(config['TrainingJobName'], + 'TrainingJobStatus', + self.describe_training_job, + check_interval, max_ingestion_time + ) + + billable_time = \ + (describe_response['TrainingEndTime'] - describe_response['TrainingStartTime']) * \ + describe_response['ResourceConfig']['InstanceCount'] + self.log.info('Billable seconds:{}'.format(int(billable_time.total_seconds()) + 1)) + + return response + + def create_tuning_job(self, config, wait_for_completion=True, + check_interval=30, max_ingestion_time=None): + """ + Create a tuning job + + :param config: the config for tuning + :type config: dict + :param wait_for_completion: if the program should keep running until job finishes + :type wait_for_completion: bool + :param check_interval: the time interval in seconds which the operator + will check the status of any SageMaker job + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. + :type max_ingestion_time: int + :return: A response to tuning job creation + """ + + self.check_tuning_config(config) + + response = self.get_conn().create_hyper_parameter_tuning_job(**config) + if wait_for_completion: + self.check_status(config['HyperParameterTuningJobName'], + 'HyperParameterTuningJobStatus', + self.describe_tuning_job, + check_interval, max_ingestion_time + ) + return response + + def create_transform_job(self, config, wait_for_completion=True, + check_interval=30, max_ingestion_time=None): + """ + Create a transform job + + :param config: the config for transform job + :type config: dict + :param wait_for_completion: if the program should keep running until job finishes + :type wait_for_completion: bool + :param check_interval: the time interval in seconds which the operator + will check the status of any SageMaker job + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. 
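A hedged sketch of launching a training job with `create_training_job()`; the request body follows the CreateTrainingJob API shape, and every concrete value (image, role, S3 URIs) is hypothetical:

```python
from airflow.contrib.hooks.sagemaker_hook import SageMakerHook

hook = SageMakerHook(aws_conn_id='aws_default')

training_config = {
    'TrainingJobName': 'churn-model-20190101',
    'AlgorithmSpecification': {
        'TrainingImage': '123456789012.dkr.ecr.us-east-1.amazonaws.com/xgboost:1',
        'TrainingInputMode': 'File',
    },
    'RoleArn': 'arn:aws:iam::123456789012:role/sagemaker-execution-role',
    'InputDataConfig': [{
        'ChannelName': 'train',
        'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
                                        'S3Uri': 's3://example-bucket/train/',
                                        'S3DataDistributionType': 'FullyReplicated'}},
    }],
    'OutputDataConfig': {'S3OutputPath': 's3://example-bucket/output/'},
    'ResourceConfig': {'InstanceCount': 1, 'InstanceType': 'ml.m5.xlarge',
                       'VolumeSizeInGB': 30},
    'StoppingCondition': {'MaxRuntimeInSeconds': 3600},
}

# print_log=True tails CloudWatch while polling; max_ingestion_time caps the wait.
hook.create_training_job(training_config,
                         wait_for_completion=True,
                         print_log=True,
                         check_interval=60,
                         max_ingestion_time=4 * 3600)
```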
+ :type max_ingestion_time: int + :return: A response to transform job creation + """ + + self.check_s3_url(config['TransformInput']['DataSource']['S3DataSource']['S3Uri']) + + response = self.get_conn().create_transform_job(**config) + if wait_for_completion: + self.check_status(config['TransformJobName'], + 'TransformJobStatus', + self.describe_transform_job, + check_interval, max_ingestion_time + ) + return response + + def create_model(self, config): + """ + Create a model job + + :param config: the config for model + :type config: dict + :return: A response to model creation + """ + + return self.get_conn().create_model(**config) + + def create_endpoint_config(self, config): + """ + Create an endpoint config + + :param config: the config for endpoint-config + :type config: dict + :return: A response to endpoint config creation + """ + + return self.get_conn().create_endpoint_config(**config) + + def create_endpoint(self, config, wait_for_completion=True, + check_interval=30, max_ingestion_time=None): + """ + Create an endpoint + + :param config: the config for endpoint + :type config: dict + :param wait_for_completion: if the program should keep running until job finishes + :type wait_for_completion: bool + :param check_interval: the time interval in seconds which the operator + will check the status of any SageMaker job + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. + :type max_ingestion_time: int + :return: A response to endpoint creation + """ + + response = self.get_conn().create_endpoint(**config) + if wait_for_completion: + self.check_status(config['EndpointName'], + 'EndpointStatus', + self.describe_endpoint, + check_interval, max_ingestion_time, + non_terminal_states=self.endpoint_non_terminal_states + ) + return response + + def update_endpoint(self, config, wait_for_completion=True, + check_interval=30, max_ingestion_time=None): + """ + Update an endpoint + + :param config: the config for endpoint + :type config: dict + :param wait_for_completion: if the program should keep running until job finishes + :type wait_for_completion: bool + :param check_interval: the time interval in seconds which the operator + will check the status of any SageMaker job + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. 
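A sketch of promoting a trained model to a live endpoint with the three helpers above; the model name, image and artifact path are hypothetical:

```python
from airflow.contrib.hooks.sagemaker_hook import SageMakerHook

hook = SageMakerHook(aws_conn_id='aws_default')

hook.create_model({
    'ModelName': 'churn-model',
    'PrimaryContainer': {
        'Image': '123456789012.dkr.ecr.us-east-1.amazonaws.com/xgboost:1',
        'ModelDataUrl': 's3://example-bucket/output/churn/model.tar.gz',
    },
    'ExecutionRoleArn': 'arn:aws:iam::123456789012:role/sagemaker-execution-role',
})

hook.create_endpoint_config({
    'EndpointConfigName': 'churn-model-config',
    'ProductionVariants': [{'VariantName': 'AllTraffic',
                            'ModelName': 'churn-model',
                            'InitialInstanceCount': 1,
                            'InstanceType': 'ml.m5.large'}],
})

# create_endpoint() polls describe_endpoint() until the status leaves the
# endpoint_non_terminal_states set ('Creating', 'Updating', ...).
hook.create_endpoint({'EndpointName': 'churn-model',
                      'EndpointConfigName': 'churn-model-config'},
                     wait_for_completion=True,
                     check_interval=30)
```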
+ :type max_ingestion_time: int + :return: A response to endpoint update + """ + + response = self.get_conn().update_endpoint(**config) + if wait_for_completion: + self.check_status(config['EndpointName'], + 'EndpointStatus', + self.describe_endpoint, + check_interval, max_ingestion_time, + non_terminal_states=self.endpoint_non_terminal_states + ) + return response + + def describe_training_job(self, name): + """ + Return the training job info associated with the name + + :param name: the name of the training job + :type name: str + :return: A dict contains all the training job info + """ + + return self.get_conn().describe_training_job(TrainingJobName=name) + + def describe_training_job_with_log(self, job_name, positions, stream_names, + instance_count, state, last_description, + last_describe_job_call): + """ + Return the training job info associated with job_name and print CloudWatch logs + """ + log_group = '/aws/sagemaker/TrainingJobs' + + if len(stream_names) < instance_count: + # Log streams are created whenever a container starts writing to stdout/err, so this list + # may be dynamic until we have a stream for every instance. + logs_conn = self.get_log_conn() + try: + streams = logs_conn.describe_log_streams( + logGroupName=log_group, + logStreamNamePrefix=job_name + '/', + orderBy='LogStreamName', + limit=instance_count + ) + stream_names = [s['logStreamName'] for s in streams['logStreams']] + positions.update([(s, Position(timestamp=0, skip=0)) + for s in stream_names if s not in positions]) + except logs_conn.exceptions.ResourceNotFoundException: + # On the very first training job run on an account, there's no log group until + # the container starts logging, so ignore any errors thrown about that + pass + + if len(stream_names) > 0: + for idx, event in self.multi_stream_iter(log_group, stream_names, positions): + self.log.info(event['message']) + ts, count = positions[stream_names[idx]] + if event['timestamp'] == ts: + positions[stream_names[idx]] = Position(timestamp=ts, skip=count + 1) + else: + positions[stream_names[idx]] = Position(timestamp=event['timestamp'], skip=1) + + if state == LogState.COMPLETE: + return state, last_description, last_describe_job_call + + if state == LogState.JOB_COMPLETE: + state = LogState.COMPLETE + elif time.time() - last_describe_job_call >= 30: + description = self.describe_training_job(job_name) + last_describe_job_call = time.time() + + if secondary_training_status_changed(description, last_description): + self.log.info(secondary_training_status_message(description, last_description)) + last_description = description + + status = description['TrainingJobStatus'] + + if status not in self.non_terminal_states: + state = LogState.JOB_COMPLETE + return state, last_description, last_describe_job_call + + def describe_tuning_job(self, name): + """ + Return the tuning job info associated with the name + + :param name: the name of the tuning job + :type name: string + :return: A dict contains all the tuning job info + """ + + return self.get_conn().describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=name) + + def describe_model(self, name): + """ + Return the SageMaker model info associated with the name + + :param name: the name of the SageMaker model + :type name: string + :return: A dict contains all the model info + """ + + return self.get_conn().describe_model(ModelName=name) + + def describe_transform_job(self, name): + """ + Return the transform job info associated with the name + + :param name: the name of the transform job + 
:type name: string
+        :return: A dict contains all the transform job info
+        """
+
+        return self.get_conn().describe_transform_job(TransformJobName=name)
+
+    def describe_endpoint_config(self, name):
+        """
+        Return the endpoint config info associated with the name
+
+        :param name: the name of the endpoint config
+        :type name: string
+        :return: A dict contains all the endpoint config info
+        """
+
+        return self.get_conn().describe_endpoint_config(EndpointConfigName=name)
+
+    def describe_endpoint(self, name):
+        """
+        :param name: the name of the endpoint
+        :type name: string
+        :return: A dict contains all the endpoint info
+        """
+
+        return self.get_conn().describe_endpoint(EndpointName=name)
+
+    def check_status(self, job_name, key,
+                     describe_function, check_interval,
+                     max_ingestion_time,
+                     non_terminal_states=None):
+        """
+        Check status of a SageMaker job
+
+        :param job_name: name of the job to check status
+        :type job_name: str
+        :param key: the key of the response dict
+            that points to the state
+        :type key: str
+        :param describe_function: the function used to retrieve the status
+        :type describe_function: python callable
+        :param check_interval: the time interval in seconds which the operator
+            will check the status of any SageMaker job
+        :type check_interval: int
+        :param max_ingestion_time: the maximum ingestion time in seconds. Any
+            SageMaker jobs that run longer than this will fail. Setting this to
+            None implies no timeout for any SageMaker job.
+        :type max_ingestion_time: int
+        :param non_terminal_states: the set of non-terminal states
+        :type non_terminal_states: set
+        :return: response of describe call after job is done
+        """
+        if not non_terminal_states:
+            non_terminal_states = self.non_terminal_states
+
+        sec = 0
+        running = True
+
+        while running:
+            time.sleep(check_interval)
+            sec = sec + check_interval
+
+            try:
+                response = describe_function(job_name)
+                status = response[key]
+                self.log.info('Job still running for %s seconds... '
+                              'current status is %s' % (sec, status))
+            except KeyError:
+                raise AirflowException('Could not get status of the SageMaker job')
+            except ClientError:
+                raise AirflowException('AWS request failed, check logs for more info')
+
+            if status in non_terminal_states:
+                running = True
+            elif status in self.failed_states:
+                raise AirflowException('SageMaker job failed because %s' % response['FailureReason'])
+            else:
+                running = False
+
+            if max_ingestion_time and sec > max_ingestion_time:
+                # ensure that the job gets killed if the max ingestion time is exceeded
+                raise AirflowException('SageMaker job took more than %s seconds', max_ingestion_time)
+
+        self.log.info('SageMaker Job Completed')
+        response = describe_function(job_name)
+        return response
+
+    def check_training_status_with_log(self, job_name, non_terminal_states, failed_states,
+                                       wait_for_completion, check_interval, max_ingestion_time):
+        """
+        Display the logs for a given training job, optionally tailing them until the
+        job is complete.
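
`check_status()` above is the generic polling primitive the `create_*` methods delegate to; it only needs a describe callable and the response key that carries the state. A hedged sketch of calling it directly follows; hook construction and the job name are placeholders, as in the earlier sketch.

```python
# Poll an already-submitted transform job every 30 seconds with no upper
# time limit; aws_conn_id and the job name are assumptions.
hook = SageMakerHook(aws_conn_id='aws_default')

final_description = hook.check_status(
    job_name='example-transform-job',
    key='TransformJobStatus',
    describe_function=hook.describe_transform_job,
    check_interval=30,
    max_ingestion_time=None,
)
print(final_description['TransformJobStatus'])
```
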
+ + :param job_name: name of the training job to check status and display logs for + :type job_name: str + :param non_terminal_states: the set of non_terminal states + :type non_terminal_states: set + :param failed_states: the set of failed states + :type failed_states: set + :param wait_for_completion: Whether to keep looking for new log entries + until the job completes + :type wait_for_completion: bool + :param check_interval: The interval in seconds between polling for new log entries and job completion + :type check_interval: int + :param max_ingestion_time: the maximum ingestion time in seconds. Any + SageMaker jobs that run longer than this will fail. Setting this to + None implies no timeout for any SageMaker job. + :type max_ingestion_time: int + :return: None + """ + + sec = 0 + description = self.describe_training_job(job_name) + self.log.info(secondary_training_status_message(description, None)) + instance_count = description['ResourceConfig']['InstanceCount'] + status = description['TrainingJobStatus'] + + stream_names = [] # The list of log streams + positions = {} # The current position in each stream, map of stream name -> position + + job_already_completed = status not in non_terminal_states + + state = LogState.TAILING if wait_for_completion and not job_already_completed else LogState.COMPLETE + + # The loop below implements a state machine that alternates between checking the job status and + # reading whatever is available in the logs at this point. Note, that if we were called with + # wait_for_completion == False, we never check the job status. + # + # If wait_for_completion == TRUE and job is not completed, the initial state is TAILING + # If wait_for_completion == FALSE, the initial state is COMPLETE + # (doesn't matter if the job really is complete). + # + # The state table: + # + # STATE ACTIONS CONDITION NEW STATE + # ---------------- ---------------- ----------------- ---------------- + # TAILING Read logs, Pause, Get status Job complete JOB_COMPLETE + # Else TAILING + # JOB_COMPLETE Read logs, Pause Any COMPLETE + # COMPLETE Read logs, Exit N/A + # + # Notes: + # - The JOB_COMPLETE state forces us to do an extra pause and read any items that + # got to Cloudwatch after the job was marked complete. 
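
The state table above reads more easily as a tiny standalone loop. The sketch below is illustrative only and deliberately decoupled from SageMaker: `job_is_running` and `read_new_log_lines` are hypothetical stand-ins for `describe_training_job` and the CloudWatch stream reader.

```python
import time
from enum import Enum


class State(Enum):
    TAILING = 1       # job still running: read logs, pause, then re-check status
    JOB_COMPLETE = 2  # job finished: one more pause and read for late log events
    COMPLETE = 3      # read whatever is left, then exit


def tail_logs(job_is_running, read_new_log_lines, check_interval=30):
    state = State.TAILING
    while True:
        time.sleep(check_interval)
        for line in read_new_log_lines():   # the "Read logs" action in every state
            print(line)
        if state is State.COMPLETE:
            return
        if state is State.JOB_COMPLETE:
            state = State.COMPLETE
        elif not job_is_running():          # "Get status" only happens while TAILING
            state = State.JOB_COMPLETE
```
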
+ last_describe_job_call = time.time() + last_description = description + + while True: + time.sleep(check_interval) + sec = sec + check_interval + + state, last_description, last_describe_job_call = \ + self.describe_training_job_with_log(job_name, positions, stream_names, + instance_count, state, last_description, + last_describe_job_call) + if state == LogState.COMPLETE: + break + + if max_ingestion_time and sec > max_ingestion_time: + # ensure that the job gets killed if the max ingestion time is exceeded + raise AirflowException('SageMaker job took more than %s seconds', max_ingestion_time) + + if wait_for_completion: + status = last_description['TrainingJobStatus'] + if status in failed_states: + reason = last_description.get('FailureReason', '(No reason provided)') + raise AirflowException('Error training {}: {} Reason: {}'.format(job_name, status, reason)) + billable_time = (last_description['TrainingEndTime'] - last_description['TrainingStartTime']) \ + * instance_count + self.log.info('Billable seconds:{}'.format(int(billable_time.total_seconds()) + 1)) diff --git a/airflow/contrib/hooks/salesforce_hook.py b/airflow/contrib/hooks/salesforce_hook.py index ee18b353d2e82..a1756b6530b6a 100644 --- a/airflow/contrib/hooks/salesforce_hook.py +++ b/airflow/contrib/hooks/salesforce_hook.py @@ -37,7 +37,7 @@ from airflow.utils.log.logging_mixin import LoggingMixin -class SalesforceHook(BaseHook, LoggingMixin): +class SalesforceHook(BaseHook): def __init__( self, conn_id, @@ -53,14 +53,14 @@ def __init__( :param conn_id: the name of the connection that has the parameters we need to connect to Salesforce. - The conenction shoud be type `http` and include a + The connection should be type `http` and include a user's security token in the `Extras` field. .. note:: For the HTTP connection type, you can include a JSON structure in the `Extras` field. We need a user's security token to connect to Salesforce. So we define it in the `Extras` field as: - `{"security_token":"YOUR_SECRUITY_TOKEN"}` + `{"security_token":"YOUR_SECURITY_TOKEN"}` """ self.conn_id = conn_id self._args = args @@ -135,7 +135,8 @@ def get_available_fields(self, obj): return [f['name'] for f in desc['fields']] - def _build_field_list(self, fields): + @staticmethod + def _build_field_list(fields): # join all of the fields in a comma separated list return ",".join(fields) @@ -275,7 +276,7 @@ def write_object_to_file( schema = self.describe_object(object_name) - # possible columns that can be convereted to timestamps + # possible columns that can be converted to timestamps # are the ones that are either date or datetime types # strings are too general and we risk unintentional conversion possible_timestamp_cols = [ diff --git a/airflow/contrib/hooks/segment_hook.py b/airflow/contrib/hooks/segment_hook.py index 874d35d0743b9..a072a9f1eaf99 100644 --- a/airflow/contrib/hooks/segment_hook.py +++ b/airflow/contrib/hooks/segment_hook.py @@ -29,10 +29,8 @@ from airflow.hooks.base_hook import BaseHook from airflow.exceptions import AirflowException -from airflow.utils.log.logging_mixin import LoggingMixin - -class SegmentHook(BaseHook, LoggingMixin): +class SegmentHook(BaseHook): def __init__( self, segment_conn_id='segment_default', @@ -54,7 +52,7 @@ def __init__( :type segment_conn_id: str :param segment_debug_mode: Determines whether Segment should run in debug mode. Defaults to False - :type segment_debug_mode: boolean + :type segment_debug_mode: bool .. note:: You must include a JSON structure in the `Extras` field. 
We need a user's security token to connect to Segment. diff --git a/airflow/contrib/hooks/sftp_hook.py b/airflow/contrib/hooks/sftp_hook.py index 33c8b19e0a732..6fc01babe5ab2 100644 --- a/airflow/contrib/hooks/sftp_hook.py +++ b/airflow/contrib/hooks/sftp_hook.py @@ -19,52 +19,98 @@ import stat import pysftp -import logging import datetime -from airflow.hooks.base_hook import BaseHook +from airflow.contrib.hooks.ssh_hook import SSHHook -class SFTPHook(BaseHook): +class SFTPHook(SSHHook): """ + This hook is inherited from SSH hook. Please refer to SSH hook for the input + arguments. + Interact with SFTP. Aims to be interchangeable with FTPHook. - Pitfalls: - In contrast with FTPHook describe_directory only returns size, type and - modify. It doesn't return unix.owner, unix.mode, perm, unix.group and - unique. - - retrieve_file and store_file only take a local full path and not a - buffer. - - If no mode is passed to create_directory it will be created with 777 - permissions. + :Pitfalls:: + + - In contrast with FTPHook describe_directory only returns size, type and + modify. It doesn't return unix.owner, unix.mode, perm, unix.group and + unique. + - retrieve_file and store_file only take a local full path and not a + buffer. + - If no mode is passed to create_directory it will be created with 777 + permissions. Errors that may occur throughout but should be handled downstream. """ - def __init__(self, ftp_conn_id='sftp_default'): - self.ftp_conn_id = ftp_conn_id + def __init__(self, ftp_conn_id='sftp_default', *args, **kwargs): + kwargs['ssh_conn_id'] = ftp_conn_id + super(SFTPHook, self).__init__(*args, **kwargs) + self.conn = None + self.private_key_pass = None + + # Fail for unverified hosts, unless this is explicitly allowed + self.no_host_key_check = False + + if self.ssh_conn_id is not None: + conn = self.get_connection(self.ssh_conn_id) + if conn.extra is not None: + extra_options = conn.extra_dejson + if 'private_key_pass' in extra_options: + self.private_key_pass = extra_options.get('private_key_pass', None) + + # For backward compatibility + # TODO: remove in Airflow 2.1 + import warnings + if 'ignore_hostkey_verification' in extra_options: + warnings.warn( + 'Extra option `ignore_hostkey_verification` is deprecated.' + 'Please use `no_host_key_check` instead.' + 'This option will be removed in Airflow 2.1', + DeprecationWarning, + stacklevel=2, + ) + self.no_host_key_check = str( + extra_options['ignore_hostkey_verification'] + ).lower() == 'true' + + if 'no_host_key_check' in extra_options: + self.no_host_key_check = str( + extra_options['no_host_key_check']).lower() == 'true' + + if 'private_key' in extra_options: + warnings.warn( + 'Extra option `private_key` is deprecated.' + 'Please use `key_file` instead.' 
+ 'This option will be removed in Airflow 2.1', + DeprecationWarning, + stacklevel=2, + ) + self.key_file = extra_options.get('private_key') def get_conn(self): """ Returns an SFTP connection object """ if self.conn is None: - params = self.get_connection(self.ftp_conn_id) cnopts = pysftp.CnOpts() - if ('ignore_hostkey_verification' in params.extra_dejson and - params.extra_dejson['ignore_hostkey_verification']): + if self.no_host_key_check: cnopts.hostkeys = None + cnopts.compression = self.compress conn_params = { - 'host': params.host, - 'port': params.port, - 'username': params.login, + 'host': self.remote_host, + 'port': self.port, + 'username': self.username, 'cnopts': cnopts } - if params.password is not None: - conn_params['password'] = params.password - if 'private_key' in params.extra_dejson: - conn_params['private_key'] = params.extra_dejson['private_key'] - if 'private_key_pass' in params.extra_dejson: - conn_params['private_key_pass'] = params.extra_dejson['private_key_pass'] + if self.password and self.password.strip(): + conn_params['password'] = self.password + if self.key_file: + conn_params['private_key'] = self.key_file + if self.private_key_pass: + conn_params['private_key_pass'] = self.private_key_pass + self.conn = pysftp.Connection(**conn_params) return self.conn @@ -136,10 +182,9 @@ def retrieve_file(self, remote_full_path, local_full_path): :type local_full_path: str """ conn = self.get_conn() - logging.info('Retrieving file from FTP: {}'.format(remote_full_path)) + self.log.info('Retrieving file from FTP: %s', remote_full_path) conn.get(remote_full_path, local_full_path) - logging.info('Finished retrieving file from FTP: {}'.format( - remote_full_path)) + self.log.info('Finished retrieving file from FTP: %s', remote_full_path) def store_file(self, remote_full_path, local_full_path): """ diff --git a/airflow/contrib/hooks/slack_webhook_hook.py b/airflow/contrib/hooks/slack_webhook_hook.py index 670d40105e669..aca10385ff901 100644 --- a/airflow/contrib/hooks/slack_webhook_hook.py +++ b/airflow/contrib/hooks/slack_webhook_hook.py @@ -38,6 +38,9 @@ class SlackWebhookHook(HttpHook): :type webhook_token: str :param message: The message you want to send on Slack :type message: str + :param attachments: The attachments to send on Slack. Should be a list of + dictionaries representing Slack attachments. 
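
For the new `attachments` argument documented above, each element is a plain Slack attachment dict that the hook forwards untouched into the webhook payload. A hedged usage sketch follows; the connection id, channel, and attachment fields are placeholders.

```python
from airflow.contrib.hooks.slack_webhook_hook import SlackWebhookHook

# All identifiers below are examples; the attachment dict uses standard
# Slack attachment fields, which this hook passes through as-is.
hook = SlackWebhookHook(
    http_conn_id='slack_default',
    message='Nightly DAG finished',
    attachments=[{
        'color': '#36a64f',
        'title': 'etl_nightly',
        'text': 'All tasks succeeded in 42 minutes',
    }],
    channel='#data-alerts',
    username='airflow',
)
hook.execute()
```
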
+ :type attachments: list :param channel: The channel the message should be posted to :type channel: str :param username: The username to post to slack with @@ -54,6 +57,7 @@ def __init__(self, http_conn_id=None, webhook_token=None, message="", + attachments=None, channel=None, username=None, icon_emoji=None, @@ -63,9 +67,9 @@ def __init__(self, **kwargs ): super(SlackWebhookHook, self).__init__(*args, **kwargs) - self.http_conn_id = http_conn_id self.webhook_token = self._get_token(webhook_token, http_conn_id) self.message = message + self.attachments = attachments self.channel = channel self.username = username self.icon_emoji = icon_emoji @@ -76,7 +80,9 @@ def _get_token(self, token, http_conn_id): """ Given either a manually set token or a conn_id, return the webhook_token to use :param token: The manually provided token - :param conn_id: The conn_id provided + :type token: str + :param http_conn_id: The conn_id provided + :type http_conn_id: str :return: webhook_token (str) to use """ if token: @@ -105,17 +111,15 @@ def _build_slack_message(self): cmd['icon_emoji'] = self.icon_emoji if self.link_names: cmd['link_names'] = 1 + if self.attachments: + cmd['attachments'] = self.attachments - # there should always be a message to post ;-) cmd['text'] = self.message return json.dumps(cmd) def execute(self): """ Remote Popen (actually execute the slack webhook call) - - :param cmd: command to remotely execute - :param kwargs: extra arguments to Popen (see subprocess.Popen) """ proxies = {} if self.proxy: diff --git a/airflow/contrib/hooks/spark_jdbc_hook.py b/airflow/contrib/hooks/spark_jdbc_hook.py index b55e4ef060419..c188b1e863de2 100644 --- a/airflow/contrib/hooks/spark_jdbc_hook.py +++ b/airflow/contrib/hooks/spark_jdbc_hook.py @@ -62,7 +62,7 @@ class SparkJDBCHook(SparkSubmitHook): :param jdbc_table: The name of the JDBC table :type jdbc_table: str :param jdbc_conn_id: Connection id used for connection to JDBC database - :type: jdbc_conn_id: str + :type jdbc_conn_id: str :param jdbc_driver: Name of the JDBC driver to use for the JDBC connection. This driver (usually a jar) should be passed in the 'jars' parameter :type jdbc_driver: str diff --git a/airflow/contrib/hooks/spark_sql_hook.py b/airflow/contrib/hooks/spark_sql_hook.py index c1fd2ce21c89f..25c0bc53c5576 100644 --- a/airflow/contrib/hooks/spark_sql_hook.py +++ b/airflow/contrib/hooks/spark_sql_hook.py @@ -27,6 +27,7 @@ class SparkSqlHook(BaseHook): """ This hook is a wrapper around the spark-sql binary. It requires that the "spark-sql" binary is in the PATH. + :param sql: The SQL query to execute :type sql: str :param conf: arbitrary Spark configuration property @@ -91,6 +92,7 @@ def _prepare_command(self, cmd): """ Construct the spark-sql command to execute. Verbose output is enabled as default. 
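
The wrapper described at the top of this hook simply shells out to the `spark-sql` binary. A minimal hedged usage sketch follows, assuming the hook's `run_query()` entry point (not shown in this hunk), a configured `spark_sql_default` connection, and `spark-sql` on the PATH.

```python
from airflow.contrib.hooks.spark_sql_hook import SparkSqlHook

# Placeholder query and connection id; run_query() builds and runs the
# spark-sql command line.
hook = SparkSqlHook(
    sql='SELECT COUNT(*) FROM logs.events WHERE ds = "2019-01-01"',
    conn_id='spark_sql_default',
)
hook.run_query()
```
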
+ :param cmd: command to append to the spark-sql command :type cmd: str :return: full command to be executed diff --git a/airflow/contrib/hooks/spark_submit_hook.py b/airflow/contrib/hooks/spark_submit_hook.py index 0185cab283345..677ea9000843b 100644 --- a/airflow/contrib/hooks/spark_submit_hook.py +++ b/airflow/contrib/hooks/spark_submit_hook.py @@ -26,7 +26,6 @@ from airflow.exceptions import AirflowException from airflow.utils.log.logging_mixin import LoggingMixin from airflow.contrib.kubernetes import kube_client -from kubernetes.client.rest import ApiException class SparkSubmitHook(BaseHook, LoggingMixin): @@ -34,37 +33,40 @@ class SparkSubmitHook(BaseHook, LoggingMixin): This hook is a wrapper around the spark-submit binary to kick off a spark-submit job. It requires that the "spark-submit" binary is in the PATH or the spark_home to be supplied. + :param conf: Arbitrary Spark configuration properties :type conf: dict :param conn_id: The connection id as configured in Airflow administration. When an - invalid connection_id is supplied, it will default to yarn. + invalid connection_id is supplied, it will default to yarn. :type conn_id: str :param files: Upload additional files to the executor running the job, separated by a - comma. Files will be placed in the working directory of each executor. - For example, serialized objects. + comma. Files will be placed in the working directory of each executor. + For example, serialized objects. :type files: str :param py_files: Additional python files used by the job, can be .zip, .egg or .py. :type py_files: str - :param driver_classpath: Additional, driver-specific, classpath settings. - :type driver_classpath: str + :param: archives: Archives that spark should unzip (and possibly tag with #ALIAS) into + the application working directory. + :param driver_class_path: Additional, driver-specific, classpath settings. + :type driver_class_path: str :param jars: Submit additional jars to upload and place them in executor classpath. :type jars: str :param java_class: the main class of the Java application :type java_class: str :param packages: Comma-separated list of maven coordinates of jars to include on the - driver and executor classpaths + driver and executor classpaths :type packages: str :param exclude_packages: Comma-separated list of maven coordinates of jars to exclude - while resolving the dependencies provided in 'packages' + while resolving the dependencies provided in 'packages' :type exclude_packages: str :param repositories: Comma-separated list of additional remote repositories to search - for the maven coordinates given with 'packages' + for the maven coordinates given with 'packages' :type repositories: str :param total_executor_cores: (Standalone & Mesos only) Total cores for all executors - (Default: all the available cores on the worker) + (Default: all the available cores on the worker) :type total_executor_cores: int :param executor_cores: (Standalone, YARN and Kubernetes only) Number of cores per - executor (Default: 2) + executor (Default: 2) :type executor_cores: int :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G) :type executor_memory: str @@ -81,17 +83,21 @@ class SparkSubmitHook(BaseHook, LoggingMixin): :param application_args: Arguments for the application being submitted :type application_args: list :param env_vars: Environment variables for spark-submit. It - supports yarn and k8s mode too. + supports yarn and k8s mode too. 
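
The new `archives`, `driver_class_path`, and `spark_binary` arguments documented in this block are plain pass-throughs to the spark-submit command line. A hedged construction sketch follows; the connection id, paths, application, and the `submit()` call are assumptions based on the surrounding hook, not something introduced by this hunk.

```python
from airflow.contrib.hooks.spark_submit_hook import SparkSubmitHook

# Paths, connection id and application are placeholders.
hook = SparkSubmitHook(
    conn_id='spark_default',
    conf={'spark.yarn.maxAppAttempts': '1'},
    archives='hdfs:///deps/venv.zip#venv',
    driver_class_path='/opt/drivers/postgresql.jar',
    env_vars={'HADOOP_CONF_DIR': '/etc/hadoop/conf'},
    spark_binary='spark2-submit',
    verbose=True,
)
hook.submit(application='/opt/jobs/aggregate_events.py')
```
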
:type env_vars: dict :param verbose: Whether to pass the verbose flag to spark-submit process for debugging :type verbose: bool + :param spark_binary: The command to use for spark submit. + Some distros may use spark2-submit. + :type spark_binary: str """ def __init__(self, conf=None, conn_id='spark_default', files=None, py_files=None, - driver_classpath=None, + archives=None, + driver_class_path=None, jars=None, java_class=None, packages=None, @@ -107,12 +113,14 @@ def __init__(self, num_executors=None, application_args=None, env_vars=None, - verbose=False): + verbose=False, + spark_binary="spark-submit"): self._conf = conf self._conn_id = conn_id self._files = files self._py_files = py_files - self._driver_classpath = driver_classpath + self._archives = archives + self._driver_class_path = driver_class_path self._jars = jars self._java_class = java_class self._packages = packages @@ -132,10 +140,15 @@ def __init__(self, self._submit_sp = None self._yarn_application_id = None self._kubernetes_driver_pod = None + self._spark_binary = spark_binary self._connection = self._resolve_connection() self._is_yarn = 'yarn' in self._connection['master'] self._is_kubernetes = 'k8s' in self._connection['master'] + if self._is_kubernetes and kube_client is None: + raise RuntimeError( + "{} specified by kubernetes dependencies are not installed!".format( + self._connection['master'])) self._should_track_driver_status = self._resolve_should_track_driver_status() self._driver_id = None @@ -157,7 +170,7 @@ def _resolve_connection(self): 'queue': None, 'deploy_mode': None, 'spark_home': None, - 'spark_binary': 'spark-submit', + 'spark_binary': self._spark_binary, 'namespace': 'default'} try: @@ -174,7 +187,7 @@ def _resolve_connection(self): conn_data['queue'] = extra.get('queue', None) conn_data['deploy_mode'] = extra.get('deploy-mode', None) conn_data['spark_home'] = extra.get('spark-home', None) - conn_data['spark_binary'] = extra.get('spark-binary', 'spark-submit') + conn_data['spark_binary'] = extra.get('spark-binary', "spark-submit") conn_data['namespace'] = extra.get('namespace', 'default') except AirflowException: self.log.debug( @@ -235,8 +248,10 @@ def _build_spark_submit_command(self, application): connection_cmd += ["--files", self._files] if self._py_files: connection_cmd += ["--py-files", self._py_files] - if self._driver_classpath: - connection_cmd += ["--driver-classpath", self._driver_classpath] + if self._archives: + connection_cmd += ["--archives", self._archives] + if self._driver_class_path: + connection_cmd += ["--driver-class-path", self._driver_class_path] if self._jars: connection_cmd += ["--jars", self._jars] if self._packages: @@ -387,14 +402,14 @@ def _process_spark_submit_log(self, itr): # If we run Kubernetes cluster mode, we want to extract the driver pod id # from the logs so we can kill the application when we stop it unexpectedly elif self._is_kubernetes: - match = re.search('\s*pod name: ((.+?)-([a-z0-9]+)-driver)', line) + match = re.search(r'\s*pod name: ((.+?)-([a-z0-9]+)-driver)', line) if match: self._kubernetes_driver_pod = match.groups()[0] self.log.info("Identified spark driver pod: %s", self._kubernetes_driver_pod) # Store the Spark Exit code - match_exit_code = re.search('\s*Exit code: (\d+)', line) + match_exit_code = re.search(r'\s*Exit code: (\d+)', line) if match_exit_code: self._spark_exit_code = int(match_exit_code.groups()[0]) @@ -402,7 +417,7 @@ def _process_spark_submit_log(self, itr): # we need to extract the driver id from the logs. 
This allows us to poll for # the status using the driver id. Also, we can kill the driver when needed. elif self._should_track_driver_status and not self._driver_id: - match_driver_id = re.search('(driver-[0-9\-]+)', line) + match_driver_id = re.search(r'(driver-[0-9\-]+)', line) if match_driver_id: self._driver_id = match_driver_id.groups()[0] self.log.info("identified spark driver id: {}" @@ -437,16 +452,25 @@ def _start_driver_status_tracking(self): Finish failed when the status is ERROR/UNKNOWN/KILLED/FAILED. Possible status: - SUBMITTED: Submitted but not yet scheduled on a worker - RUNNING: Has been allocated to a worker to run - FINISHED: Previously ran and exited cleanly - RELAUNCHING: Exited non-zero or due to worker failure, but has not yet + + SUBMITTED + Submitted but not yet scheduled on a worker + RUNNING + Has been allocated to a worker to run + FINISHED + Previously ran and exited cleanly + RELAUNCHING + Exited non-zero or due to worker failure, but has not yet started running again - UNKNOWN: The status of the driver is temporarily not known due to - master failure recovery - KILLED: A user manually killed this driver - FAILED: The driver exited non-zero and was not supervised - ERROR: Unable to run or restart due to an unrecoverable error + UNKNOWN + The status of the driver is temporarily not known due to + master failure recovery + KILLED + A user manually killed this driver + FAILED + The driver exited non-zero and was not supervised + ERROR + Unable to run or restart due to an unrecoverable error (e.g. missing jar file) """ @@ -559,6 +583,6 @@ def on_kill(self): self.log.info("Spark on K8s killed with response: %s", api_response) - except ApiException as e: + except kube_client.ApiException as e: self.log.info("Exception when attempting to kill Spark on K8s:") self.log.exception(e) diff --git a/airflow/contrib/hooks/sqoop_hook.py b/airflow/contrib/hooks/sqoop_hook.py index 74cddc2b21027..aa5ec5222afd0 100644 --- a/airflow/contrib/hooks/sqoop_hook.py +++ b/airflow/contrib/hooks/sqoop_hook.py @@ -25,24 +25,24 @@ from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin from copy import deepcopy -class SqoopHook(BaseHook, LoggingMixin): +class SqoopHook(BaseHook): """ This hook is a wrapper around the sqoop 1 binary. To be able to use the hook it is required that "sqoop" is in the PATH. Additional arguments that can be passed via the 'extra' JSON field of the sqoop connection: - * job_tracker: Job tracker local|jobtracker:port. - * namenode: Namenode. - * lib_jars: Comma separated jar files to include in the classpath. - * files: Comma separated files to be copied to the map reduce cluster. - * archives: Comma separated archives to be unarchived on the compute - machines. - * password_file: Path to file containing the password. + + * ``job_tracker``: Job tracker local|jobtracker:port. + * ``namenode``: Namenode. + * ``lib_jars``: Comma separated jar files to include in the classpath. + * ``files``: Comma separated files to be copied to the map reduce cluster. + * ``archives``: Comma separated archives to be unarchived on the compute + machines. + * ``password_file``: Path to file containing the password. :param conn_id: Reference to the sqoop connection. 
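
Given the extra options listed above, a typical import call might look like the sketch below; the connection id, table, and HDFS path are placeholders. Note that values in `extra_import_options` may now be non-strings, since the command builder casts them with `str()`.

```python
from airflow.contrib.hooks.sqoop_hook import SqoopHook

# Placeholder connection, table and destination directory.
hook = SqoopHook(conn_id='sqoop_default')
hook.import_table(
    table='orders',
    target_dir='/warehouse/staging/orders',
    split_by='order_id',
    extra_import_options={'num-mappers': 4, 'fetch-size': 1000},
)
```
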
:type conn_id: str @@ -195,7 +195,7 @@ def _import_cmd(self, target_dir, append, file_type, split_by, direct, for key, value in extra_import_options.items(): cmd += ['--{}'.format(key)] if value: - cmd += [value] + cmd += [str(value)] return cmd @@ -205,6 +205,7 @@ def import_table(self, table, target_dir=None, append=False, file_type="text", """ Imports table from remote location to target dir. Arguments are copies of direct sqoop command line arguments + :param table: Table to read :param target_dir: HDFS destination dir :param append: Append data to an existing dataset in HDFS @@ -235,6 +236,7 @@ def import_query(self, query, target_dir, append=False, file_type="text", split_by=None, direct=None, driver=None, extra_import_options=None): """ Imports a specific query from the rdbms to hdfs + :param query: Free format query to run :param target_dir: HDFS destination dir :param append: Append data to an existing dataset in HDFS @@ -302,7 +304,7 @@ def _export_cmd(self, table, export_dir, input_null_string, for key, value in extra_export_options.items(): cmd += ['--{}'.format(key)] if value: - cmd += [value] + cmd += [str(value)] # The required option cmd += ["--table", table] @@ -319,6 +321,7 @@ def export_table(self, table, export_dir, input_null_string, """ Exports Hive table to remote location. Arguments are copies of direct sqoop command line Arguments + :param table: Table remote destination :param export_dir: Hive table to export :param input_null_string: The string to be interpreted as null for diff --git a/airflow/contrib/hooks/ssh_hook.py b/airflow/contrib/hooks/ssh_hook.py index f51f0fbd11948..956a7fd4419d5 100755 --- a/airflow/contrib/hooks/ssh_hook.py +++ b/airflow/contrib/hooks/ssh_hook.py @@ -1,8 +1,5 @@ # -*- coding: utf-8 -*- # -# Copyright 2012-2015 Spotify AB -# Ported to Airflow by Bolke de Bruin -# # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -22,17 +19,17 @@ import getpass import os +import warnings import paramiko from paramiko.config import SSH_PORT +from sshtunnel import SSHTunnelForwarder -from contextlib import contextmanager from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin -class SSHHook(BaseHook, LoggingMixin): +class SSHHook(BaseHook): """ Hook for ssh remote execution using Paramiko. 
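
Later in this file the hook gains `get_tunnel()`, which returns an `sshtunnel.SSHTunnelForwarder`. A hedged usage sketch is below; the connection id, hosts, and ports are placeholders.

```python
from airflow.contrib.hooks.ssh_hook import SSHHook

# Forward local port 15432 to a database reachable from the SSH host;
# every identifier below is an example only.
hook = SSHHook(ssh_conn_id='ssh_default')
tunnel = hook.get_tunnel(remote_port=5432,
                         remote_host='db.internal.example.com',
                         local_port=15432)
tunnel.start()
try:
    # ... connect to localhost:15432 while the tunnel is up ...
    pass
finally:
    tunnel.stop()
```
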
ref: https://github.com/paramiko/paramiko @@ -65,7 +62,7 @@ def __init__(self, username=None, password=None, key_file=None, - port=SSH_PORT, + port=None, timeout=10, keepalive_interval=30 ): @@ -75,162 +72,177 @@ def __init__(self, self.username = username self.password = password self.key_file = key_file + self.port = port self.timeout = timeout self.keepalive_interval = keepalive_interval + # Default values, overridable from Connection self.compress = True self.no_host_key_check = True + self.allow_host_key_change = False + self.host_proxy = None + + # Placeholder for deprecated __enter__ self.client = None - self.port = port + + # Use connection to override defaults + if self.ssh_conn_id is not None: + conn = self.get_connection(self.ssh_conn_id) + if self.username is None: + self.username = conn.login + if self.password is None: + self.password = conn.password + if self.remote_host is None: + self.remote_host = conn.host + if self.port is None: + self.port = conn.port + if conn.extra is not None: + extra_options = conn.extra_dejson + self.key_file = extra_options.get("key_file") + + if "timeout" in extra_options: + self.timeout = int(extra_options["timeout"], 10) + + if "compress" in extra_options\ + and str(extra_options["compress"]).lower() == 'false': + self.compress = False + if "no_host_key_check" in extra_options\ + and\ + str(extra_options["no_host_key_check"]).lower() == 'false': + self.no_host_key_check = False + if "allow_host_key_change" in extra_options\ + and\ + str(extra_options["allow_host_key_change"]).lower() == 'true': + self.allow_host_key_change = True + + if not self.remote_host: + raise AirflowException("Missing required param: remote_host") + + # Auto detecting username values from system + if not self.username: + self.log.debug( + "username to ssh to host: %s is not specified for connection id" + " %s. 
Using system's default provided by getpass.getuser()", + self.remote_host, self.ssh_conn_id + ) + self.username = getpass.getuser() + + user_ssh_config_filename = os.path.expanduser('~/.ssh/config') + if os.path.isfile(user_ssh_config_filename): + ssh_conf = paramiko.SSHConfig() + ssh_conf.parse(open(user_ssh_config_filename)) + host_info = ssh_conf.lookup(self.remote_host) + if host_info and host_info.get('proxycommand'): + self.host_proxy = paramiko.ProxyCommand(host_info.get('proxycommand')) + + if not (self.password or self.key_file): + if host_info and host_info.get('identityfile'): + self.key_file = host_info.get('identityfile')[0] + + self.port = self.port or SSH_PORT def get_conn(self): - if not self.client: - self.log.debug('Creating SSH client for conn_id: %s', self.ssh_conn_id) - if self.ssh_conn_id is not None: - conn = self.get_connection(self.ssh_conn_id) - if self.username is None: - self.username = conn.login - if self.password is None: - self.password = conn.password - if self.remote_host is None: - self.remote_host = conn.host - if conn.port is not None: - self.port = conn.port - if conn.extra is not None: - extra_options = conn.extra_dejson - self.key_file = extra_options.get("key_file") - - if "timeout" in extra_options: - self.timeout = int(extra_options["timeout"], 10) - - if "compress" in extra_options \ - and str(extra_options["compress"]).lower() == 'false': - self.compress = False - if "no_host_key_check" in extra_options \ - and \ - str(extra_options["no_host_key_check"]).lower() == 'false': - self.no_host_key_check = False - - if not self.remote_host: - raise AirflowException("Missing required param: remote_host") - - # Auto detecting username values from system - if not self.username: - self.log.debug( - "username to ssh to host: %s is not specified for connection id" - " %s. 
Using system's default provided by getpass.getuser()", - self.remote_host, self.ssh_conn_id - ) - self.username = getpass.getuser() - - host_proxy = None - user_ssh_config_filename = os.path.expanduser('~/.ssh/config') - if os.path.isfile(user_ssh_config_filename): - ssh_conf = paramiko.SSHConfig() - ssh_conf.parse(open(user_ssh_config_filename)) - host_info = ssh_conf.lookup(self.remote_host) - if host_info and host_info.get('proxycommand'): - host_proxy = paramiko.ProxyCommand(host_info.get('proxycommand')) - - if not (self.password or self.key_file): - if host_info and host_info.get('identityfile'): - self.key_file = host_info.get('identityfile')[0] - - try: - client = paramiko.SSHClient() - client.load_system_host_keys() - if self.no_host_key_check: - # Default is RejectPolicy - client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) - - if self.password and self.password.strip(): - client.connect(hostname=self.remote_host, - username=self.username, - password=self.password, - timeout=self.timeout, - compress=self.compress, - port=self.port, - sock=host_proxy) - else: - client.connect(hostname=self.remote_host, - username=self.username, - key_filename=self.key_file, - timeout=self.timeout, - compress=self.compress, - port=self.port, - sock=host_proxy) - - if self.keepalive_interval: - client.get_transport().set_keepalive(self.keepalive_interval) - - self.client = client - except paramiko.AuthenticationException as auth_error: - self.log.error( - "Auth failed while connecting to host: %s, error: %s", - self.remote_host, auth_error - ) - except paramiko.SSHException as ssh_error: - self.log.error( - "Failed connecting to host: %s, error: %s", - self.remote_host, ssh_error - ) - except Exception as error: - self.log.error( - "Error connecting to host: %s, error: %s", - self.remote_host, error - ) - return self.client - - @contextmanager - def create_tunnel(self, local_port, remote_port=None, remote_host="localhost"): """ - Creates a tunnel between two hosts. Like ssh -L :host:. - Remember to close() the returned "tunnel" object in order to clean up - after yourself when you are done with the tunnel. + Opens a ssh connection to the remote host. - :param local_port: - :type local_port: int - :param remote_port: - :type remote_port: int - :param remote_host: - :type remote_host: str - :return: + :rtype: paramiko.client.SSHClient """ - import subprocess - # this will ensure the connection to the ssh.remote_host from where the tunnel - # is getting created - self.get_conn() - - tunnel_host = "{0}:{1}:{2}".format(local_port, remote_host, remote_port) - - ssh_cmd = ["ssh", "{0}@{1}".format(self.username, self.remote_host), - "-o", "ControlMaster=no", - "-o", "UserKnownHostsFile=/dev/null", - "-o", "StrictHostKeyChecking=no"] - - ssh_tunnel_cmd = ["-L", tunnel_host, - "echo -n ready && cat" - ] - - ssh_cmd += ssh_tunnel_cmd - self.log.debug("Creating tunnel with cmd: %s", ssh_cmd) - - proc = subprocess.Popen(ssh_cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - close_fds=True) - ready = proc.stdout.read(5) - assert ready == b"ready", \ - "Did not get 'ready' from remote, got '{0}' instead".format(ready) - yield - proc.communicate() - assert proc.returncode == 0, \ - "Tunnel process did unclean exit (returncode {}".format(proc.returncode) + self.log.debug('Creating SSH client for conn_id: %s', self.ssh_conn_id) + client = paramiko.SSHClient() + if not self.allow_host_key_change: + self.log.warning('Remote Identification Change is not verified. 
' + 'This wont protect against Man-In-The-Middle attacks') + client.load_system_host_keys() + if self.no_host_key_check: + self.log.warning('No Host Key Verification. This wont protect ' + 'against Man-In-The-Middle attacks') + # Default is RejectPolicy + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + + if self.password and self.password.strip(): + client.connect(hostname=self.remote_host, + username=self.username, + password=self.password, + key_filename=self.key_file, + timeout=self.timeout, + compress=self.compress, + port=self.port, + sock=self.host_proxy) + else: + client.connect(hostname=self.remote_host, + username=self.username, + key_filename=self.key_file, + timeout=self.timeout, + compress=self.compress, + port=self.port, + sock=self.host_proxy) + + if self.keepalive_interval: + client.get_transport().set_keepalive(self.keepalive_interval) + + self.client = client + return client def __enter__(self): + warnings.warn('The contextmanager of SSHHook is deprecated.' + 'Please use get_conn() as a contextmanager instead.' + 'This method will be removed in Airflow 2.0', + category=DeprecationWarning) return self def __exit__(self, exc_type, exc_val, exc_tb): if self.client is not None: self.client.close() + self.client = None + + def get_tunnel(self, remote_port, remote_host="localhost", local_port=None): + """ + Creates a tunnel between two hosts. Like ssh -L :host:. + + :param remote_port: The remote port to create a tunnel to + :type remote_port: int + :param remote_host: The remote host to create a tunnel to (default localhost) + :type remote_host: str + :param local_port: The local port to attach the tunnel to + :type local_port: int + + :return: sshtunnel.SSHTunnelForwarder object + """ + + if local_port: + local_bind_address = ('localhost', local_port) + else: + local_bind_address = ('localhost',) + + if self.password and self.password.strip(): + client = SSHTunnelForwarder(self.remote_host, + ssh_port=self.port, + ssh_username=self.username, + ssh_password=self.password, + ssh_pkey=self.key_file, + ssh_proxy=self.host_proxy, + local_bind_address=local_bind_address, + remote_bind_address=(remote_host, remote_port), + logger=self.log) + else: + client = SSHTunnelForwarder(self.remote_host, + ssh_port=self.port, + ssh_username=self.username, + ssh_pkey=self.key_file, + ssh_proxy=self.host_proxy, + local_bind_address=local_bind_address, + remote_bind_address=(remote_host, remote_port), + host_pkey_directories=[], + logger=self.log) + + return client + + def create_tunnel(self, local_port, remote_port=None, remote_host="localhost"): + warnings.warn('SSHHook.create_tunnel is deprecated, Please' + 'use get_tunnel() instead. 
But please note that the' + 'order of the parameters have changed' + 'This method will be removed in Airflow 2.0', + category=DeprecationWarning) + + return self.get_tunnel(remote_port, remote_host, local_port) diff --git a/airflow/contrib/hooks/vertica_hook.py b/airflow/contrib/hooks/vertica_hook.py index f3411de994d7f..e6b36b51d5937 100644 --- a/airflow/contrib/hooks/vertica_hook.py +++ b/airflow/contrib/hooks/vertica_hook.py @@ -41,9 +41,9 @@ def get_conn(self): "user": conn.login, "password": conn.password or '', "database": conn.schema, + "host": conn.host or 'localhost' } - conn_config["host"] = conn.host or 'localhost' if not conn.port: conn_config["port"] = 5433 else: diff --git a/airflow/contrib/hooks/wasb_hook.py b/airflow/contrib/hooks/wasb_hook.py index 1d73abd78b3dd..d3a766cf69de7 100644 --- a/airflow/contrib/hooks/wasb_hook.py +++ b/airflow/contrib/hooks/wasb_hook.py @@ -18,6 +18,7 @@ # under the License. # +from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook from azure.storage.blob import BlockBlobService @@ -58,7 +59,7 @@ def check_for_blob(self, container_name, blob_name, **kwargs): `BlockBlobService.exists()` takes. :type kwargs: object :return: True if the blob exists, False otherwise. - :rtype bool + :rtype: bool """ return self.connection.exists(container_name, blob_name, **kwargs) @@ -74,7 +75,7 @@ def check_for_prefix(self, container_name, prefix, **kwargs): `BlockBlobService.list_blobs()` takes. :type kwargs: object :return: True if blobs matching the prefix exist, False otherwise. - :rtype bool + :rtype: bool """ matches = self.connection.list_blobs(container_name, prefix, num_results=1, **kwargs) @@ -148,3 +149,43 @@ def read_file(self, container_name, blob_name, **kwargs): return self.connection.get_blob_to_text(container_name, blob_name, **kwargs).content + + def delete_file(self, container_name, blob_name, is_prefix=False, + ignore_if_missing=False, **kwargs): + """ + Delete a file from Azure Blob Storage. + + :param container_name: Name of the container. + :type container_name: str + :param blob_name: Name of the blob. + :type blob_name: str + :param is_prefix: If blob_name is a prefix, delete all matching files + :type is_prefix: bool + :param ignore_if_missing: if True, then return success even if the + blob does not exist. + :type ignore_if_missing: bool + :param kwargs: Optional keyword arguments that + `BlockBlobService.create_blob_from_path()` takes. + :type kwargs: object + """ + + if is_prefix: + blobs_to_delete = [ + blob.name for blob in self.connection.list_blobs( + container_name, prefix=blob_name, **kwargs + ) + ] + elif self.check_for_blob(container_name, blob_name): + blobs_to_delete = [blob_name] + else: + blobs_to_delete = [] + + if not ignore_if_missing and len(blobs_to_delete) == 0: + raise AirflowException('Blob(s) not found: {}'.format(blob_name)) + + for blob_uri in blobs_to_delete: + self.log.info("Deleting blob: " + blob_uri) + self.connection.delete_blob(container_name, + blob_uri, + delete_snapshots='include', + **kwargs) diff --git a/airflow/contrib/hooks/winrm_hook.py b/airflow/contrib/hooks/winrm_hook.py index 0be904b9bcdca..336dc2d818c62 100644 --- a/airflow/contrib/hooks/winrm_hook.py +++ b/airflow/contrib/hooks/winrm_hook.py @@ -17,121 +17,234 @@ # specific language governing permissions and limitations # under the License. 
# + import getpass + from winrm.protocol import Protocol + from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.log.logging_mixin import LoggingMixin -class WinRMHook(BaseHook, LoggingMixin): - +class WinRMHook(BaseHook): """ Hook for winrm remote execution using pywinrm. + :seealso: https://github.com/diyan/pywinrm/blob/master/winrm/protocol.py + :param ssh_conn_id: connection id from airflow Connections from where all - the required parameters can be fetched like username, password or key_file. + the required parameters can be fetched like username and password. Thought the priority is given to the param passed during init :type ssh_conn_id: str - :param remote_host: remote host to connect + :param endpoint: When set to `None`, endpoint will be constructed like this: + 'http://{remote_host}:{remote_port}/wsman' + :type endpoint: str + :param remote_host: Remote host to connect to. + Ignored if `endpoint` is not `None`. :type remote_host: str + :param remote_port: Remote port to connect to. + Ignored if `endpoint` is not `None`. + :type remote_port: int + :param transport: transport type, one of 'plaintext' (default), 'kerberos', 'ssl', + 'ntlm', 'credssp' + :type transport: str :param username: username to connect to the remote_host :type username: str :param password: password of the username to connect to the remote_host :type password: str - :param key_file: key file to use to connect to the remote_host. - :type key_file: str - :param timeout: timeout for the attempt to connect to the remote_host. - :type timeout: int - :param keepalive_interval: send a keepalive packet to remote host - every keepalive_interval seconds - :type keepalive_interval: int + :param service: the service name, default is HTTP + :type service: str + :param keytab: the path to a keytab file if you are using one + :type keytab: str + :param ca_trust_path: Certification Authority trust path + :type ca_trust_path: str + :param cert_pem: client authentication certificate file path in PEM format + :type cert_pem: str + :param cert_key_pem: client authentication certificate key file path in PEM format + :type cert_key_pem: str + :param server_cert_validation: whether server certificate should be validated on + Python versions that suppport it; one of 'validate' (default), 'ignore' + :type server_cert_validation: str + :param kerberos_delegation: if True, TGT is sent to target server to + allow multiple hops + :type kerberos_delegation: bool + :param read_timeout_sec: maximum seconds to wait before an HTTP connect/read times out + (default 30). This value should be slightly higher than operation_timeout_sec, + as the server can block *at least* that long. + :type read_timeout_sec: int + :param operation_timeout_sec: maximum allowed time in seconds for any single wsman + HTTP operation (default 20). Note that operation timeouts while receiving output + (the only wsman operation that should take any significant time, and where these + timeouts are expected) will be silently retried indefinitely. + :type operation_timeout_sec: int + :param kerberos_hostname_override: the hostname to use for the kerberos exchange + (defaults to the hostname in the endpoint URL) + :type kerberos_hostname_override: str + :param message_encryption_enabled: Will encrypt the WinRM messages if set to True and + the transport auth supports message encryption (Default True). 
+ :type message_encryption_enabled: bool + :param credssp_disable_tlsv1_2: Whether to disable TLSv1.2 support and work with older + protocols like TLSv1.0, default is False + :type credssp_disable_tlsv1_2: bool + :param send_cbt: Will send the channel bindings over a HTTPS channel (Default: True) + :type send_cbt: bool """ - def __init__(self, ssh_conn_id=None, + endpoint=None, remote_host=None, + remote_port=5985, + transport='plaintext', username=None, password=None, - key_file=None, - timeout=10, - keepalive_interval=30 + service='HTTP', + keytab=None, + ca_trust_path=None, + cert_pem=None, + cert_key_pem=None, + server_cert_validation='validate', + kerberos_delegation=False, + read_timeout_sec=30, + operation_timeout_sec=20, + kerberos_hostname_override=None, + message_encryption='auto', + credssp_disable_tlsv1_2=False, + send_cbt=True, ): super(WinRMHook, self).__init__(ssh_conn_id) - # TODO make new win rm connection class self.ssh_conn_id = ssh_conn_id + self.endpoint = endpoint self.remote_host = remote_host + self.remote_port = remote_port + self.transport = transport self.username = username self.password = password - self.key_file = key_file - self.timeout = timeout - self.keepalive_interval = keepalive_interval - # Default values, overridable from Connection - self.compress = True - self.no_host_key_check = True + self.service = service + self.keytab = keytab + self.ca_trust_path = ca_trust_path + self.cert_pem = cert_pem + self.cert_key_pem = cert_key_pem + self.server_cert_validation = server_cert_validation + self.kerberos_delegation = kerberos_delegation + self.read_timeout_sec = read_timeout_sec + self.operation_timeout_sec = operation_timeout_sec + self.kerberos_hostname_override = kerberos_hostname_override + self.message_encryption = message_encryption + self.credssp_disable_tlsv1_2 = credssp_disable_tlsv1_2 + self.send_cbt = send_cbt + self.client = None self.winrm_protocol = None def get_conn(self): - if not self.client: - self.log.debug('Creating WinRM client for conn_id: %s', self.ssh_conn_id) - if self.ssh_conn_id is not None: - conn = self.get_connection(self.ssh_conn_id) - if self.username is None: - self.username = conn.login - if self.password is None: - self.password = conn.password - if self.remote_host is None: - self.remote_host = conn.host - if conn.extra is not None: - extra_options = conn.extra_dejson - self.key_file = extra_options.get("key_file") - - if "timeout" in extra_options: - self.timeout = int(extra_options["timeout"], 10) - - if "compress" in extra_options \ - and extra_options["compress"].lower() == 'false': - self.compress = False - if "no_host_key_check" in extra_options \ - and extra_options["no_host_key_check"].lower() == 'false': - self.no_host_key_check = False - - if not self.remote_host: - raise AirflowException("Missing required param: remote_host") - - # Auto detecting username values from system - if not self.username: - self.log.debug( - "username to ssh to host: %s is not specified for connection id" - " %s. 
Using system's default provided by getpass.getuser()", - self.remote_host, self.ssh_conn_id - ) - self.username = getpass.getuser() - - try: - - if self.password and self.password.strip(): - self.winrm_protocol = Protocol( - # TODO pass in port from ssh conn - endpoint='http://' + self.remote_host + ':5985/wsman', - # TODO get cert transport working - # transport='certificate', - transport='plaintext', - # cert_pem=r'publickey.pem', - # cert_key_pem=r'dev.pem', - read_timeout_sec=70, - operation_timeout_sec=60, - username=self.username, - password=self.password, - server_cert_validation='ignore') - - self.log.info("Opening WinRM shell") - self.client = self.winrm_protocol.open_shell() - - except Exception as error: - self.log.error( - "Error connecting to host: %s, error: %s", - self.remote_host, error + if self.client: + return self.client + + self.log.debug('Creating WinRM client for conn_id: %s', self.ssh_conn_id) + if self.ssh_conn_id is not None: + conn = self.get_connection(self.ssh_conn_id) + + if self.username is None: + self.username = conn.login + if self.password is None: + self.password = conn.password + if self.remote_host is None: + self.remote_host = conn.host + + if conn.extra is not None: + extra_options = conn.extra_dejson + + if "endpoint" in extra_options: + self.endpoint = str(extra_options["endpoint"]) + if "remote_port" in extra_options: + self.remote_port = int(extra_options["remote_port"]) + if "transport" in extra_options: + self.transport = str(extra_options["transport"]) + if "service" in extra_options: + self.service = str(extra_options["service"]) + if "keytab" in extra_options: + self.keytab = str(extra_options["keytab"]) + if "ca_trust_path" in extra_options: + self.ca_trust_path = str(extra_options["ca_trust_path"]) + if "cert_pem" in extra_options: + self.cert_pem = str(extra_options["cert_pem"]) + if "cert_key_pem" in extra_options: + self.cert_key_pem = str(extra_options["cert_key_pem"]) + if "server_cert_validation" in extra_options: + self.server_cert_validation = \ + str(extra_options["server_cert_validation"]) + if "kerberos_delegation" in extra_options: + self.kerberos_delegation = \ + str(extra_options["kerberos_delegation"]).lower() == 'true' + if "read_timeout_sec" in extra_options: + self.read_timeout_sec = int(extra_options["read_timeout_sec"]) + if "operation_timeout_sec" in extra_options: + self.operation_timeout_sec = \ + int(extra_options["operation_timeout_sec"]) + if "kerberos_hostname_override" in extra_options: + self.kerberos_hostname_override = \ + str(extra_options["kerberos_hostname_override"]) + if "message_encryption" in extra_options: + self.message_encryption = str(extra_options["message_encryption"]) + if "credssp_disable_tlsv1_2" in extra_options: + self.credssp_disable_tlsv1_2 = \ + str(extra_options["credssp_disable_tlsv1_2"]).lower() == 'true' + if "send_cbt" in extra_options: + self.send_cbt = str(extra_options["send_cbt"]).lower() == 'true' + + if not self.remote_host: + raise AirflowException("Missing required param: remote_host") + + # Auto detecting username values from system + if not self.username: + self.log.debug( + "username to WinRM to host: %s is not specified for connection id" + " %s. Using system's default provided by getpass.getuser()", + self.remote_host, self.ssh_conn_id + ) + self.username = getpass.getuser() + + # If endpoint is not set, then build a standard wsman endpoint from host and port. 
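
The branch above copies every supported key out of the connection's Extra JSON. A hypothetical Extra payload exercising a few of those keys could be produced like this; all values are placeholders.

```python
import json

# Hypothetical "Extra" field for a WinRM connection; the keys match the ones
# parsed above, the values are examples only.
winrm_extra = json.dumps({
    'endpoint': 'https://winhost.example.com:5986/wsman',
    'transport': 'ntlm',
    'server_cert_validation': 'ignore',
    'read_timeout_sec': 60,
    'operation_timeout_sec': 50,
    'message_encryption': 'always',
})
print(winrm_extra)
```
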
+ if not self.endpoint: + self.endpoint = 'http://{0}:{1}/wsman'.format( + self.remote_host, + self.remote_port + ) + + try: + if self.password and self.password.strip(): + self.winrm_protocol = Protocol( + endpoint=self.endpoint, + transport=self.transport, + username=self.username, + password=self.password, + service=self.service, + keytab=self.keytab, + ca_trust_path=self.ca_trust_path, + cert_pem=self.cert_pem, + cert_key_pem=self.cert_key_pem, + server_cert_validation=self.server_cert_validation, + kerberos_delegation=self.kerberos_delegation, + read_timeout_sec=self.read_timeout_sec, + operation_timeout_sec=self.operation_timeout_sec, + kerberos_hostname_override=self.kerberos_hostname_override, + message_encryption=self.message_encryption, + credssp_disable_tlsv1_2=self.credssp_disable_tlsv1_2, + send_cbt=self.send_cbt ) + + self.log.info( + "Establishing WinRM connection to host: %s", + self.remote_host + ) + self.client = self.winrm_protocol.open_shell() + + except Exception as error: + error_msg = "Error connecting to host: {0}, error: {1}".format( + self.remote_host, + error + ) + self.log.error(error_msg) + raise AirflowException(error_msg) + return self.client diff --git a/airflow/contrib/kubernetes/kube_client.py b/airflow/contrib/kubernetes/kube_client.py index 8b71f41242329..cc699e7104475 100644 --- a/airflow/contrib/kubernetes/kube_client.py +++ b/airflow/contrib/kubernetes/kube_client.py @@ -17,9 +17,21 @@ from airflow.configuration import conf from six import PY2 +try: + from kubernetes import config, client + from kubernetes.client.rest import ApiException + has_kubernetes = True +except ImportError as e: + # We need an exception class to be able to use it in ``except`` elsewhere + # in the code base + ApiException = BaseException + has_kubernetes = False + _import_err = e + def _load_kube_config(in_cluster, cluster_context, config_file): - from kubernetes import config, client + if not has_kubernetes: + raise _import_err if in_cluster: config.load_incluster_config() else: @@ -36,4 +48,9 @@ def _load_kube_config(in_cluster, cluster_context, config_file): def get_kube_client(in_cluster=conf.getboolean('kubernetes', 'in_cluster'), cluster_context=None, config_file=None): + if not in_cluster: + if cluster_context is None: + cluster_context = conf.get('kubernetes', 'cluster_context', fallback=None) + if config_file is None: + config_file = conf.get('kubernetes', 'config_file', fallback=None) return _load_kube_config(in_cluster, cluster_context, config_file) diff --git a/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py b/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py index 7133125ab14e3..d1b6d4eaf4dca 100644 --- a/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py +++ b/airflow/contrib/kubernetes/kubernetes_request_factory/kubernetes_request_factory.py @@ -75,6 +75,12 @@ def extract_affinity(pod, req): for k, v in six.iteritems(pod.affinity): req['spec']['affinity'][k] = v + @staticmethod + def extract_node_selector(pod, req): + req['spec']['nodeSelector'] = req['spec'].get('nodeSelector', {}) + for k, v in six.iteritems(pod.node_selectors): + req['spec']['nodeSelector'][k] = v + @staticmethod def extract_cmds(pod, req): req['spec']['containers'][0]['command'] = pod.cmds @@ -83,11 +89,6 @@ def extract_cmds(pod, req): def extract_args(pod, req): req['spec']['containers'][0]['args'] = pod.args - @staticmethod - def extract_node_selector(pod, req): - if 
len(pod.node_selectors) > 0: - req['spec']['nodeSelector'] = pod.node_selectors - @staticmethod def attach_volumes(pod, req): req['spec']['volumes'] = ( @@ -130,15 +131,21 @@ def extract_volume_secrets(pod, req): @staticmethod def extract_env_and_secrets(pod, req): - env_secrets = [s for s in pod.secrets if s.deploy_type == 'env'] - if len(pod.envs) > 0 or len(env_secrets) > 0: + envs_from_key_secrets = [ + env for env in pod.secrets if env.deploy_type == 'env' and env.key is not None + ] + + if len(pod.envs) > 0 or len(envs_from_key_secrets) > 0: env = [] for k in pod.envs.keys(): env.append({'name': k, 'value': pod.envs[k]}) - for secret in env_secrets: + for secret in envs_from_key_secrets: KubernetesRequestFactory.add_secret_to_env(env, secret) + req['spec']['containers'][0]['env'] = env + KubernetesRequestFactory._apply_env_from(pod, req) + @staticmethod def extract_resources(pod, req): if not pod.resources or pod.resources.is_empty_resource_request(): @@ -174,9 +181,51 @@ def extract_service_account_name(pod, req): if pod.service_account_name: req['spec']['serviceAccountName'] = pod.service_account_name + @staticmethod + def extract_hostnetwork(pod, req): + if pod.hostnetwork: + req['spec']['hostNetwork'] = pod.hostnetwork + @staticmethod def extract_image_pull_secrets(pod, req): if pod.image_pull_secrets: req['spec']['imagePullSecrets'] = [{ 'name': pull_secret } for pull_secret in pod.image_pull_secrets.split(',')] + + @staticmethod + def extract_tolerations(pod, req): + if pod.tolerations: + req['spec']['tolerations'] = pod.tolerations + + @staticmethod + def extract_security_context(pod, req): + if pod.security_context: + req['spec']['securityContext'] = pod.security_context + + @staticmethod + def _apply_env_from(pod, req): + envs_from_secrets = [ + env for env in pod.secrets if env.deploy_type == 'env' and env.key is None + ] + + if pod.configmaps or envs_from_secrets: + req['spec']['containers'][0]['envFrom'] = [] + + for secret in envs_from_secrets: + req['spec']['containers'][0]['envFrom'].append( + { + 'secretRef': { + 'name': secret.secret + } + } + ) + + for configmap in pod.configmaps: + req['spec']['containers'][0]['envFrom'].append( + { + 'configMapRef': { + 'name': configmap + } + } + ) diff --git a/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py b/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py index 95d6c829dec59..3aea85df3ea18 100644 --- a/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py +++ b/airflow/contrib/kubernetes/kubernetes_request_factory/pod_request_factory.py @@ -16,6 +16,7 @@ # under the License. import yaml +from airflow.contrib.kubernetes.pod import Pod from airflow.contrib.kubernetes.kubernetes_request_factory.kubernetes_request_factory \ import KubernetesRequestFactory @@ -59,16 +60,18 @@ def create(self, pod): self.extract_image_pull_secrets(pod, req) self.extract_annotations(pod, req) self.extract_affinity(pod, req) + self.extract_hostnetwork(pod, req) + self.extract_tolerations(pod, req) + self.extract_security_context(pod, req) return req class ExtractXcomPodRequestFactory(KubernetesRequestFactory): - - XCOM_MOUNT_PATH = '/airflow/xcom' - SIDECAR_CONTAINER_NAME = 'airflow-xcom-sidecar' """ Request generator for a pod with sidecar container. 
""" + XCOM_MOUNT_PATH = '/airflow/xcom' + SIDECAR_CONTAINER_NAME = 'airflow-xcom-sidecar' _yaml = """apiVersion: v1 kind: Pod metadata: @@ -86,7 +89,16 @@ class ExtractXcomPodRequestFactory(KubernetesRequestFactory): mountPath: {xcomMountPath} - name: {sidecarContainerName} image: python:3.5-alpine - command: ["python", "-m", "http.server"] + command: + - python + - -c + - | + import time + while True: + try: + time.sleep(3600) + except KeyboardInterrupt: + exit(0) volumeMounts: - name: xcom mountPath: {xcomMountPath} @@ -116,4 +128,7 @@ def create(self, pod): self.extract_image_pull_secrets(pod, req) self.extract_annotations(pod, req) self.extract_affinity(pod, req) + self.extract_hostnetwork(pod, req) + self.extract_tolerations(pod, req) + self.extract_security_context(pod, req) return req diff --git a/airflow/contrib/kubernetes/pod.py b/airflow/contrib/kubernetes/pod.py index c42221482ff88..f8c0bdffaee08 100644 --- a/airflow/contrib/kubernetes/pod.py +++ b/airflow/contrib/kubernetes/pod.py @@ -46,16 +46,29 @@ class Pod: :param envs: A dict containing the environment variables :type envs: dict :param cmds: The command to be run on the pod - :type cmds: list str + :type cmds: list[str] :param secrets: Secrets to be launched to the pod - :type secrets: list Secret + :type secrets: list[airflow.contrib.kubernetes.secret.Secret] :param result: The result that will be returned to the operator after - successful execution of the pod + successful execution of the pod :type result: any :param image_pull_policy: Specify a policy to cache or always pull an image :type image_pull_policy: str + :param image_pull_secrets: Any image pull secrets to be given to the pod. + If more than one secret is required, provide a comma separated list: + secret_a,secret_b + :type image_pull_secrets: str :param affinity: A dict containing a group of affinity scheduling rules :type affinity: dict + :param hostnetwork: If True enable host networking on the pod + :type hostnetwork: bool + :param tolerations: A list of kubernetes tolerations + :type tolerations: list + :param security_context: A dict containing the security context for the pod + :type security_context: dict + :param configmaps: A list containing names of configmaps object + mounting env variables to the pod + :type configmaps: list[str] """ def __init__( self, @@ -77,7 +90,11 @@ def __init__( service_account_name=None, resources=None, annotations=None, - affinity=None + affinity=None, + hostnetwork=False, + tolerations=None, + security_context=None, + configmaps=None ): self.image = image self.envs = envs or {} @@ -89,7 +106,7 @@ def __init__( self.name = name self.volumes = volumes or [] self.volume_mounts = volume_mounts or [] - self.node_selectors = node_selectors or [] + self.node_selectors = node_selectors or {} self.namespace = namespace self.image_pull_policy = image_pull_policy self.image_pull_secrets = image_pull_secrets @@ -98,3 +115,7 @@ def __init__( self.resources = resources or Resources() self.annotations = annotations or {} self.affinity = affinity or {} + self.hostnetwork = hostnetwork or False + self.tolerations = tolerations or [] + self.security_context = security_context + self.configmaps = configmaps or [] diff --git a/airflow/contrib/kubernetes/pod_generator.py b/airflow/contrib/kubernetes/pod_generator.py index 6d8d83ef054a9..bee7f5b9572a0 100644 --- a/airflow/contrib/kubernetes/pod_generator.py +++ b/airflow/contrib/kubernetes/pod_generator.py @@ -149,7 +149,7 @@ def make_pod(self, namespace, image, pod_id, cmds, arguments, 
labels): return Pod( namespace=namespace, - name=pod_id + "-" + str(uuid.uuid1())[:8], + name=pod_id + "-" + str(uuid.uuid4())[:8], image=image, cmds=cmds, args=arguments, diff --git a/airflow/contrib/kubernetes/pod_launcher.py b/airflow/contrib/kubernetes/pod_launcher.py index 8ac5108507345..2d46d277b8b8e 100644 --- a/airflow/contrib/kubernetes/pod_launcher.py +++ b/airflow/contrib/kubernetes/pod_launcher.py @@ -17,12 +17,14 @@ import json import time +from typing import Tuple, Optional from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State from datetime import datetime as dt +from airflow.contrib.kubernetes.pod import Pod from airflow.contrib.kubernetes.kubernetes_request_factory import \ pod_request_factory as pod_factory -from kubernetes import watch +from kubernetes import watch, client from kubernetes.client.rest import ApiException from kubernetes.stream import stream as kubernetes_stream from airflow import AirflowException @@ -59,8 +61,17 @@ def run_pod_async(self, pod): raise return resp + def delete_pod(self, pod): + try: + self._client.delete_namespaced_pod( + pod.name, pod.namespace, body=client.V1DeleteOptions()) + except ApiException as e: + # If the pod is already deleted + if e.status != 404: + raise + def run_pod(self, pod, startup_timeout=120, get_logs=True): - # type: (Pod) -> (State, result) + # type: (Pod, int, bool) -> Tuple[State, Optional[str]] """ Launches the pod synchronously and waits for completion. Args: @@ -81,7 +92,7 @@ def run_pod(self, pod, startup_timeout=120, get_logs=True): return self._monitor_pod(pod, get_logs) def _monitor_pod(self, pod, get_logs): - # type: (Pod) -> (State, content) + # type: (Pod, bool) -> Tuple[State, Optional[str]] if get_logs: logs = self._client.read_namespaced_pod_log( @@ -104,7 +115,7 @@ def _monitor_pod(self, pod, get_logs): while self.pod_is_running(pod): self.log.info('Pod %s has state %s', pod.name, State.RUNNING) time.sleep(2) - return (self._task_status(self.read_pod(pod)), result) + return self._task_status(self.read_pod(pod)), result def _task_status(self, event): self.log.info( @@ -154,7 +165,7 @@ def _extract_xcom(self, pod): def _exec_pod_command(self, resp, command): if resp.is_open(): - self.log.info('Running command... %s\n' % command) + self.log.info('Running command... %s\n', command) resp.write_stdin(command + '\n') while resp.is_open(): resp.update(timeout=1) diff --git a/airflow/contrib/kubernetes/secret.py b/airflow/contrib/kubernetes/secret.py index 5c1038cd84e9a..73c51e900acf9 100644 --- a/airflow/contrib/kubernetes/secret.py +++ b/airflow/contrib/kubernetes/secret.py @@ -14,28 +14,55 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from airflow.exceptions import AirflowConfigException -class Secret: +class Secret(object): """Defines Kubernetes Secret Volume""" - def __init__(self, deploy_type, deploy_target, secret, key): + def __init__(self, deploy_type, deploy_target, secret, key=None): """Initialize a Kubernetes Secret Object. Used to track requested secrets from the user. 
:param deploy_type: The type of secret deploy in Kubernetes, either `env` or `volume` - :type deploy_type: ``str`` - :param deploy_target: The environment variable when `deploy_type` `env` or - file path when `deploy_type` `volume` where expose secret - :type deploy_target: ``str`` + :type deploy_type: str + :param deploy_target: (Optional) The environment variable when + `deploy_type` `env` or file path when `deploy_type` `volume` where + expose secret. If `key` is not provided deploy target should be None. + :type deploy_target: str or None :param secret: Name of the secrets object in Kubernetes - :type secret: ``str`` - :param key: Key of the secret within the Kubernetes Secret - :type key: ``str`` + :type secret: str + :param key: (Optional) Key of the secret within the Kubernetes Secret + if not provided in `deploy_type` `env` it will mount all secrets in object + :type key: str or None """ self.deploy_type = deploy_type - self.deploy_target = deploy_target.upper() - if deploy_type == 'volume': - self.deploy_target = deploy_target + self.deploy_target = deploy_target + + if deploy_target is not None and deploy_type == 'env': + # if deploying to env, capitalize the deploy target + self.deploy_target = deploy_target.upper() + + if key is not None and deploy_target is None: + raise AirflowConfigException( + 'If `key` is set, `deploy_target` should not be None' + ) + self.secret = secret self.key = key + + def __eq__(self, other): + return ( + self.deploy_type == other.deploy_type and + self.deploy_target == other.deploy_target and + self.secret == other.secret and + self.key == other.key + ) + + def __repr__(self): + return 'Secret({}, {}, {}, {})'.format( + self.deploy_type, + self.deploy_target, + self.secret, + self.key + ) diff --git a/airflow/contrib/kubernetes/volume.py b/airflow/contrib/kubernetes/volume.py index d5b4f60cc3850..94003fe48dcb3 100644 --- a/airflow/contrib/kubernetes/volume.py +++ b/airflow/contrib/kubernetes/volume.py @@ -23,11 +23,11 @@ def __init__(self, name, configs): """ Adds Kubernetes Volume to pod. allows pod to access features like ConfigMaps and Persistent Volumes :param name: the name of the volume mount - :type: name: str + :type name: str :param configs: dictionary of any features needed for volume. We purposely keep this vague since there are multiple volume types with changing configs. - :type: configs: dict + :type configs: dict """ self.name = name self.configs = configs diff --git a/airflow/contrib/kubernetes/worker_configuration.py b/airflow/contrib/kubernetes/worker_configuration.py index 26540856f980d..b9896b5b886d9 100644 --- a/airflow/contrib/kubernetes/worker_configuration.py +++ b/airflow/contrib/kubernetes/worker_configuration.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. 
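As a quick illustration of the Secret semantics documented above (a sketch, not part of the patch; the secret, key and variable names are invented)::

    from airflow.contrib.kubernetes.secret import Secret

    # Expose a single key of the Kubernetes Secret as one environment variable
    sql_conn = Secret('env', 'SQL_CONN', 'airflow-secrets', 'sql_alchemy_conn')

    # With key=None (and deploy_target=None) every key in 'airflow-secrets' is
    # injected through an 'envFrom' -> 'secretRef' entry on the container
    all_vars = Secret('env', None, 'airflow-secrets')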
-import copy import os import six +from airflow.configuration import conf from airflow.contrib.kubernetes.pod import Pod, Resources from airflow.contrib.kubernetes.secret import Secret from airflow.utils.log.logging_mixin import LoggingMixin @@ -27,17 +27,26 @@ class WorkerConfiguration(LoggingMixin): """Contains Kubernetes Airflow Worker configuration logic""" + dags_volume_name = 'airflow-dags' + logs_volume_name = 'airflow-logs' + git_sync_ssh_secret_volume_name = 'git-sync-ssh-key' + git_ssh_key_secret_key = 'gitSshKey' + git_sync_ssh_known_hosts_volume_name = 'git-sync-known-hosts' + git_ssh_known_hosts_configmap_key = 'known_hosts' + def __init__(self, kube_config): self.kube_config = kube_config self.worker_airflow_home = self.kube_config.airflow_home self.worker_airflow_dags = self.kube_config.dags_folder self.worker_airflow_logs = self.kube_config.base_log_folder + super(WorkerConfiguration, self).__init__() - def _get_init_containers(self, volume_mounts): + def _get_init_containers(self): """When using git to retrieve the DAGs, use the GitSync Init Container""" # If we're using volume claims to mount the dags, no init container is needed - if self.kube_config.dags_volume_claim: + if self.kube_config.dags_volume_claim or \ + self.kube_config.dags_volume_host or self.kube_config.dags_in_image: return [] # Otherwise, define a git-sync init container @@ -49,10 +58,13 @@ def _get_init_containers(self, volume_mounts): 'value': self.kube_config.git_branch }, { 'name': 'GIT_SYNC_ROOT', - 'value': '/tmp' + 'value': self.kube_config.git_sync_root }, { 'name': 'GIT_SYNC_DEST', - 'value': 'dags' + 'value': self.kube_config.git_sync_dest + }, { + 'name': 'GIT_SYNC_DEPTH', + 'value': '1' }, { 'name': 'GIT_SYNC_ONE_TIME', 'value': 'true' @@ -68,32 +80,103 @@ def _get_init_containers(self, volume_mounts): 'value': self.kube_config.git_password }) - volume_mounts[0]['readOnly'] = False + volume_mounts = [{ + 'mountPath': self.kube_config.git_sync_root, + 'name': self.dags_volume_name, + 'readOnly': False + }] + if self.kube_config.git_ssh_key_secret_name: + volume_mounts.append({ + 'name': self.git_sync_ssh_secret_volume_name, + 'mountPath': '/etc/git-secret/ssh', + 'subPath': 'ssh' + }) + init_environment.extend([ + { + 'name': 'GIT_SSH_KEY_FILE', + 'value': '/etc/git-secret/ssh' + }, + { + 'name': 'GIT_SYNC_SSH', + 'value': 'true' + }]) + if self.kube_config.git_ssh_known_hosts_configmap_name: + volume_mounts.append({ + 'name': self.git_sync_ssh_known_hosts_volume_name, + 'mountPath': '/etc/git-secret/known_hosts', + 'subPath': 'known_hosts' + }) + init_environment.extend([ + { + 'name': 'GIT_KNOWN_HOSTS', + 'value': 'true' + }, + { + 'name': 'GIT_SSH_KNOWN_HOSTS_FILE', + 'value': '/etc/git-secret/known_hosts' + } + ]) + else: + init_environment.append({ + 'name': 'GIT_KNOWN_HOSTS', + 'value': 'false' + }) + return [{ 'name': self.kube_config.git_sync_init_container_name, 'image': self.kube_config.git_sync_container, - 'securityContext': {'runAsUser': 0}, + 'securityContext': {'runAsUser': 65533}, # git-sync user 'env': init_environment, 'volumeMounts': volume_mounts }] def _get_environment(self): """Defines any necessary environment variables for the pod executor""" - env = { - 'AIRFLOW__CORE__DAGS_FOLDER': '/tmp/dags', - 'AIRFLOW__CORE__EXECUTOR': 'LocalExecutor' - } + env = {} + + for env_var_name, env_var_val in six.iteritems(self.kube_config.kube_env_vars): + env[env_var_name] = env_var_val + + env["AIRFLOW__CORE__EXECUTOR"] = "LocalExecutor" + if self.kube_config.airflow_configmap: - 
env['AIRFLOW__CORE__AIRFLOW_HOME'] = self.worker_airflow_home + env['AIRFLOW_HOME'] = self.worker_airflow_home + env['AIRFLOW__CORE__DAGS_FOLDER'] = self.worker_airflow_dags + if (not self.kube_config.airflow_configmap and + 'AIRFLOW__CORE__SQL_ALCHEMY_CONN' not in self.kube_config.kube_secrets): + env['AIRFLOW__CORE__SQL_ALCHEMY_CONN'] = conf.get("core", "SQL_ALCHEMY_CONN") + if self.kube_config.git_dags_folder_mount_point: + # /root/airflow/dags/repo/dags + dag_volume_mount_path = os.path.join( + self.kube_config.git_dags_folder_mount_point, + self.kube_config.git_sync_dest, # repo + self.kube_config.git_subpath # dags + ) + env['AIRFLOW__CORE__DAGS_FOLDER'] = dag_volume_mount_path return env + def _get_configmaps(self): + """Extracts any configmapRefs to envFrom""" + if not self.kube_config.env_from_configmap_ref: + return [] + return self.kube_config.env_from_configmap_ref.split(',') + def _get_secrets(self): """Defines any necessary secrets for the pod executor""" worker_secrets = [] + for env_var_name, obj_key_pair in six.iteritems(self.kube_config.kube_secrets): k8s_secret_obj, k8s_secret_key = obj_key_pair.split('=') worker_secrets.append( - Secret('env', env_var_name, k8s_secret_obj, k8s_secret_key)) + Secret('env', env_var_name, k8s_secret_obj, k8s_secret_key) + ) + + if self.kube_config.env_from_secret_ref: + for secret_ref in self.kube_config.env_from_secret_ref.split(','): + worker_secrets.append( + Secret('env', None, secret_ref) + ) + return worker_secrets def _get_image_pull_secrets(self): @@ -102,79 +185,129 @@ def _get_image_pull_secrets(self): return [] return self.kube_config.image_pull_secrets.split(',') - def init_volumes_and_mounts(self): - dags_volume_name = 'airflow-dags' - logs_volume_name = 'airflow-logs' + def _get_security_context(self): + """Defines the security context""" + security_context = {} + + if self.kube_config.worker_run_as_user: + security_context['runAsUser'] = self.kube_config.worker_run_as_user + + if self.kube_config.worker_fs_group: + security_context['fsGroup'] = self.kube_config.worker_fs_group - def _construct_volume(name, claim, subpath=None): - vo = { + # set fs_group to 65533 if not explicitly specified and using git ssh keypair auth + if self.kube_config.git_ssh_key_secret_name and security_context.get('fsGroup') is None: + security_context['fsGroup'] = 65533 + + return security_context + + def _get_volumes_and_mounts(self): + def _construct_volume(name, claim, host): + volume = { 'name': name } if claim: - vo['persistentVolumeClaim'] = { + volume['persistentVolumeClaim'] = { 'claimName': claim } - if subpath: - vo['subPath'] = subpath + elif host: + volume['hostPath'] = { + 'path': host, + 'type': '' + } else: - vo['emptyDir'] = {} - return vo + volume['emptyDir'] = {} + return volume - volumes = [ - _construct_volume( - dags_volume_name, + volumes = { + self.dags_volume_name: _construct_volume( + self.dags_volume_name, self.kube_config.dags_volume_claim, - self.kube_config.dags_volume_subpath + self.kube_config.dags_volume_host ), - _construct_volume( - logs_volume_name, + self.logs_volume_name: _construct_volume( + self.logs_volume_name, self.kube_config.logs_volume_claim, - self.kube_config.logs_volume_subpath + self.kube_config.logs_volume_host ) - ] + } - dag_volume_mount_path = "" - if self.kube_config.dags_volume_claim: - dag_volume_mount_path = self.worker_airflow_dags - else: - dag_volume_mount_path = os.path.join( - self.worker_airflow_dags, - self.kube_config.git_subpath - ) + volume_mounts = { + self.dags_volume_name: { + 
'name': self.dags_volume_name, + 'mountPath': self.generate_dag_volume_mount_path(), + 'readOnly': True, + }, + self.logs_volume_name: { + 'name': self.logs_volume_name, + 'mountPath': self.worker_airflow_logs, + } + } - volume_mounts = [{ - 'name': dags_volume_name, - 'mountPath': dag_volume_mount_path, - 'readOnly': True - }, { - 'name': logs_volume_name, - 'mountPath': self.worker_airflow_logs - }] + if self.kube_config.dags_volume_subpath: + volume_mounts[self.dags_volume_name]['subPath'] = self.kube_config.dags_volume_subpath + + if self.kube_config.logs_volume_subpath: + volume_mounts[self.logs_volume_name]['subPath'] = self.kube_config.logs_volume_subpath + + if self.kube_config.dags_in_image: + del volumes[self.dags_volume_name] + del volume_mounts[self.dags_volume_name] + + # Get the SSH key from secrets as a volume + if self.kube_config.git_ssh_key_secret_name: + volumes[self.git_sync_ssh_secret_volume_name] = { + 'name': self.git_sync_ssh_secret_volume_name, + 'secret': { + 'secretName': self.kube_config.git_ssh_key_secret_name, + 'items': [{ + 'key': self.git_ssh_key_secret_key, + 'path': 'ssh', + 'mode': 0o440 + }] + } + } + + if self.kube_config.git_ssh_known_hosts_configmap_name: + volumes[self.git_sync_ssh_known_hosts_volume_name] = { + 'name': self.git_sync_ssh_known_hosts_volume_name, + 'configMap': { + 'name': self.kube_config.git_ssh_known_hosts_configmap_name + }, + 'mode': 0o440 + } # Mount the airflow.cfg file via a configmap the user has specified if self.kube_config.airflow_configmap: config_volume_name = 'airflow-config' config_path = '{}/airflow.cfg'.format(self.worker_airflow_home) - volumes.append({ + volumes[config_volume_name] = { 'name': config_volume_name, 'configMap': { 'name': self.kube_config.airflow_configmap } - }) - volume_mounts.append({ + } + volume_mounts[config_volume_name] = { 'name': config_volume_name, 'mountPath': config_path, 'subPath': 'airflow.cfg', 'readOnly': True - }) + } return volumes, volume_mounts + def generate_dag_volume_mount_path(self): + if self.kube_config.dags_volume_claim or self.kube_config.dags_volume_host: + dag_volume_mount_path = self.worker_airflow_dags + else: + dag_volume_mount_path = self.kube_config.git_dags_folder_mount_point + + return dag_volume_mount_path + def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_date, - airflow_command, kube_executor_config): - volumes, volume_mounts = self.init_volumes_and_mounts() - worker_init_container_spec = self._get_init_containers( - copy.deepcopy(volume_mounts)) + try_number, airflow_command, kube_executor_config): + volumes_dict, volume_mounts_dict = self._get_volumes_and_mounts() + worker_init_container_spec = self._get_init_containers() resources = Resources( request_memory=kube_executor_config.request_memory, request_cpu=kube_executor_config.request_cpu, @@ -182,9 +315,15 @@ def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_da limit_cpu=kube_executor_config.limit_cpu ) gcp_sa_key = kube_executor_config.gcp_service_account_key - annotations = { - 'iam.cloud.google.com/service-account': gcp_sa_key - } if gcp_sa_key else {} + annotations = dict(kube_executor_config.annotations) or self.kube_config.kube_annotations + if gcp_sa_key: + annotations['iam.cloud.google.com/service-account'] = gcp_sa_key + + volumes = [value for value in volumes_dict.values()] + kube_executor_config.volumes + volume_mounts = [value for value in volume_mounts_dict.values()] + kube_executor_config.volume_mounts + + affinity = 
kube_executor_config.affinity or self.kube_config.kube_affinity + tolerations = kube_executor_config.tolerations or self.kube_config.kube_tolerations return Pod( namespace=namespace, @@ -192,13 +331,13 @@ def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_da image=kube_executor_config.image or self.kube_config.kube_image, image_pull_policy=(kube_executor_config.image_pull_policy or self.kube_config.kube_image_pull_policy), - cmds=['bash', '-cx', '--'], - args=[airflow_command], + cmds=airflow_command, labels={ 'airflow-worker': worker_uuid, 'dag_id': dag_id, 'task_id': task_id, - 'execution_date': execution_date + 'execution_date': execution_date, + 'try_number': str(try_number), }, envs=self._get_environment(), secrets=self._get_secrets(), @@ -208,5 +347,11 @@ def make_pod(self, namespace, worker_uuid, pod_id, dag_id, task_id, execution_da volumes=volumes, volume_mounts=volume_mounts, resources=resources, - annotations=annotations + annotations=annotations, + node_selectors=(kube_executor_config.node_selectors or + self.kube_config.kube_node_selectors), + affinity=affinity, + tolerations=tolerations, + security_context=self._get_security_context(), + configmaps=self._get_configmaps() ) diff --git a/airflow/contrib/operators/adls_list_operator.py b/airflow/contrib/operators/adls_list_operator.py new file mode 100644 index 0000000000000..33c99064aa27b --- /dev/null +++ b/airflow/contrib/operators/adls_list_operator.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Iterable + +from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class AzureDataLakeStorageListOperator(BaseOperator): + """ + List all files from the specified path + + This operator returns a python list with the names of files which can be used by + `xcom` in the downstream tasks. + + :param path: The Azure Data Lake path to find the objects. Supports glob + strings (templated) + :type path: str + :param azure_data_lake_conn_id: The connection ID to use when + connecting to Azure Data Lake Storage. 
+ :type azure_data_lake_conn_id: str + + **Example**: + The following Operator would list all the Parquet files from ``folder/output/`` + folder in the specified ADLS account :: + + adls_files = AzureDataLakeStorageListOperator( + task_id='adls_files', + path='folder/output/*.parquet', + azure_data_lake_conn_id='azure_data_lake_default' + ) + """ + template_fields = ('path',) # type: Iterable[str] + ui_color = '#901dd2' + + @apply_defaults + def __init__(self, + path, + azure_data_lake_conn_id='azure_data_lake_default', + *args, + **kwargs): + super(AzureDataLakeStorageListOperator, self).__init__(*args, **kwargs) + self.path = path + self.azure_data_lake_conn_id = azure_data_lake_conn_id + + def execute(self, context): + + hook = AzureDataLakeHook( + azure_data_lake_conn_id=self.azure_data_lake_conn_id + ) + + self.log.info('Getting list of ADLS files in path: %s', self.path) + + return hook.list(path=self.path) diff --git a/airflow/contrib/operators/adls_to_gcs.py b/airflow/contrib/operators/adls_to_gcs.py new file mode 100644 index 0000000000000..5d99251d3f6d2 --- /dev/null +++ b/airflow/contrib/operators/adls_to_gcs.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +from tempfile import NamedTemporaryFile + +from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook +from airflow.contrib.operators.adls_list_operator import AzureDataLakeStorageListOperator +from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook, _parse_gcs_url +from airflow.utils.decorators import apply_defaults + + +class AdlsToGoogleCloudStorageOperator(AzureDataLakeStorageListOperator): + """ + Synchronizes an Azure Data Lake Storage path with a GCS bucket + + :param src_adls: The Azure Data Lake path to find the objects (templated) + :type src_adls: str + :param dest_gcs: The Google Cloud Storage bucket and prefix to + store the objects. (templated) + :type dest_gcs: str + :param replace: If true, replaces same-named files in GCS + :type replace: bool + :param azure_data_lake_conn_id: The connection ID to use when + connecting to Azure Data Lake Storage. + :type azure_data_lake_conn_id: str + :param google_cloud_storage_conn_id: The connection ID to use when + connecting to Google Cloud Storage. + :type google_cloud_storage_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + + **Examples**: + The following Operator would copy a single file named + ``hello/world.avro`` from ADLS to the GCS bucket ``mybucket``. 
Its full + resulting gcs path will be ``gs://mybucket/hello/world.avro`` :: + + copy_single_file = AdlsToGoogleCloudStorageOperator( + task_id='copy_single_file', + src_adls='hello/world.avro', + dest_gcs='gs://mybucket', + replace=False, + azure_data_lake_conn_id='azure_data_lake_default', + google_cloud_storage_conn_id='google_cloud_default' + ) + + The following Operator would copy all parquet files from ADLS + to the GCS bucket ``mybucket``. :: + + copy_all_files = AdlsToGoogleCloudStorageOperator( + task_id='copy_all_files', + src_adls='*.parquet', + dest_gcs='gs://mybucket', + replace=False, + azure_data_lake_conn_id='azure_data_lake_default', + google_cloud_storage_conn_id='google_cloud_default' + ) + + The following Operator would copy all parquet files from ADLS + path ``/hello/world``to the GCS bucket ``mybucket``. :: + + copy_world_files = AdlsToGoogleCloudStorageOperator( + task_id='copy_world_files', + src_adls='hello/world/*.parquet', + dest_gcs='gs://mybucket', + replace=False, + azure_data_lake_conn_id='azure_data_lake_default', + google_cloud_storage_conn_id='google_cloud_default' + ) + """ + template_fields = ('src_adls', 'dest_gcs') + ui_color = '#f0eee4' + + @apply_defaults + def __init__(self, + src_adls, + dest_gcs, + azure_data_lake_conn_id, + google_cloud_storage_conn_id, + delegate_to=None, + replace=False, + *args, + **kwargs): + + super(AdlsToGoogleCloudStorageOperator, self).__init__( + path=src_adls, + azure_data_lake_conn_id=azure_data_lake_conn_id, + *args, + **kwargs + ) + self.src_adls = src_adls + self.dest_gcs = dest_gcs + self.replace = replace + self.google_cloud_storage_conn_id = google_cloud_storage_conn_id + self.delegate_to = delegate_to + + def execute(self, context): + # use the super to list all files in an Azure Data Lake path + files = super(AdlsToGoogleCloudStorageOperator, self).execute(context) + g_hook = GoogleCloudStorageHook( + google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, + delegate_to=self.delegate_to) + + if not self.replace: + # if we are not replacing -> list all files in the ADLS path + # and only keep those files which are present in + # ADLS and not in Google Cloud Storage + bucket_name, prefix = _parse_gcs_url(self.dest_gcs) + existing_files = g_hook.list(bucket=bucket_name, prefix=prefix) + files = set(files) - set(existing_files) + + if files: + hook = AzureDataLakeHook( + azure_data_lake_conn_id=self.azure_data_lake_conn_id + ) + + for obj in files: + with NamedTemporaryFile(mode='wb', delete=True) as f: + hook.download_file(local_path=f.name, remote_path=obj) + f.flush() + dest_gcs_bucket, dest_gcs_prefix = _parse_gcs_url(self.dest_gcs) + dest_path = os.path.join(dest_gcs_prefix, obj) + self.log.info("Saving file to %s", dest_path) + + g_hook.upload(bucket=dest_gcs_bucket, object=dest_path, filename=f.name) + + self.log.info("All done, uploaded %d files to GCS", len(files)) + else: + self.log.info("In sync, no files needed to be uploaded to GCS") + + return files diff --git a/airflow/contrib/operators/aws_athena_operator.py b/airflow/contrib/operators/aws_athena_operator.py new file mode 100644 index 0000000000000..28054d320168a --- /dev/null +++ b/airflow/contrib/operators/aws_athena_operator.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +from uuid import uuid4 + +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.contrib.hooks.aws_athena_hook import AWSAthenaHook + + +class AWSAthenaOperator(BaseOperator): + """ + An operator that submits a Presto query to Athena. + + :param query: Presto query to be run on Athena. (templated) + :type query: str + :param database: Database to select. (templated) + :type database: str + :param output_location: s3 path to write the query results into. (templated) + :type output_location: str + :param aws_conn_id: AWS connection to use + :type aws_conn_id: str + :param sleep_time: Time to wait between two consecutive calls to check query status on Athena + :type sleep_time: int + """ + + ui_color = '#44b5e2' + template_fields = ('query', 'database', 'output_location') + + @apply_defaults + def __init__(self, query, database, output_location, aws_conn_id='aws_default', client_request_token=None, + query_execution_context=None, result_configuration=None, sleep_time=30, *args, **kwargs): + super(AWSAthenaOperator, self).__init__(*args, **kwargs) + self.query = query + self.database = database + self.output_location = output_location + self.aws_conn_id = aws_conn_id + self.client_request_token = client_request_token or str(uuid4()) + self.query_execution_context = query_execution_context or {} + self.result_configuration = result_configuration or {} + self.sleep_time = sleep_time + self.query_execution_id = None + self.hook = None + + def get_hook(self): + return AWSAthenaHook(self.aws_conn_id, self.sleep_time) + + def execute(self, context): + """ + Run Presto Query on Athena + """ + self.hook = self.get_hook() + self.hook.get_conn() + + self.query_execution_context['Database'] = self.database + self.result_configuration['OutputLocation'] = self.output_location + self.query_execution_id = self.hook.run_query(self.query, self.query_execution_context, + self.result_configuration, self.client_request_token) + self.hook.poll_query_status(self.query_execution_id) + + def on_kill(self): + """ + Cancel the submitted Athena query + """ + if self.query_execution_id: + self.log.info('⚰️⚰️⚰️ Received a kill Signal. Time to Die') + self.log.info( + 'Stopping Query with executionId - %s', self.query_execution_id + ) + response = self.hook.stop_query(self.query_execution_id) + http_status_code = None + try: + http_status_code = response['ResponseMetadata']['HTTPStatusCode'] + except Exception as ex: + self.log.error('Exception while cancelling query: %s', ex) + finally: + if http_status_code is None or http_status_code != 200: + self.log.error('Unable to request query cancel on Athena. 
Exiting') + else: + self.log.info( + 'Polling Athena for query with id %s to reach final state', self.query_execution_id + ) + self.hook.poll_query_status(self.query_execution_id) diff --git a/airflow/contrib/operators/awsbatch_operator.py b/airflow/contrib/operators/awsbatch_operator.py index d23b44e0d4471..c0b96d819c820 100644 --- a/airflow/contrib/operators/awsbatch_operator.py +++ b/airflow/contrib/operators/awsbatch_operator.py @@ -33,7 +33,7 @@ class AWSBatchOperator(BaseOperator): """ Execute a job on AWS Batch Service - .. warning: the queue parameter was renamed to job_queue to segreggate the + .. warning: the queue parameter was renamed to job_queue to segregate the internal CeleryExecutor queue from the AWS Batch internal queue. :param job_name: the name for the job that will run on AWS Batch @@ -42,18 +42,20 @@ class AWSBatchOperator(BaseOperator): :type job_definition: str :param job_queue: the queue name on AWS Batch :type job_queue: str - :param: overrides: the same parameter that boto3 will receive on - containerOverrides (templated): - http://boto3.readthedocs.io/en/latest/reference/services/batch.html#submit_job - :type: overrides: dict - :param max_retries: exponential backoff retries while waiter is not merged, 4200 = 48 hours + :param overrides: the same parameter that boto3 will receive on + containerOverrides (templated): + http://boto3.readthedocs.io/en/latest/reference/services/batch.html#submit_job + :type overrides: dict + :param max_retries: exponential backoff retries while waiter is not + merged, 4200 = 48 hours :type max_retries: int :param aws_conn_id: connection id of AWS credentials / region name. If None, - credential boto3 strategy will be used - (http://boto3.readthedocs.io/en/latest/guide/configuration.html). + credential boto3 strategy will be used + (http://boto3.readthedocs.io/en/latest/guide/configuration.html). :type aws_conn_id: str :param region_name: region name to use in AWS Hook. 
Override the region_name in connection (if provided) + :type region_name: str """ ui_color = '#c3dae0' @@ -139,7 +141,7 @@ def _wait_for_task_ended(self): if response['jobs'][-1]['status'] in ['SUCCEEDED', 'FAILED']: retry = False - sleep( 1 + pow(retries * 0.1, 2)) + sleep(1 + pow(retries * 0.1, 2)) retries += 1 def _check_success_task(self): @@ -152,17 +154,19 @@ def _check_success_task(self): raise AirflowException('No job found for {}'.format(response)) for job in response['jobs']: - if 'attempts' in job: - containers = job['attempts'] - for container in containers: - if (job['status'] == 'FAILED' or - container['container']['exitCode'] != 0): - raise AirflowException( - 'This containers encounter an error during ' - 'execution {}'.format(job)) - elif job['status'] is not 'SUCCEEDED': + job_status = job['status'] + if job_status == 'FAILED': + reason = job['statusReason'] + raise AirflowException('Job failed with status {}'.format(reason)) + elif job_status in [ + 'SUBMITTED', + 'PENDING', + 'RUNNABLE', + 'STARTING', + 'RUNNING' + ]: raise AirflowException( - 'This task is still pending {}'.format(job['status'])) + 'This task is still pending {}'.format(job_status)) def get_hook(self): return AwsHook( diff --git a/airflow/contrib/operators/azure_container_instances_operator.py b/airflow/contrib/operators/azure_container_instances_operator.py new file mode 100644 index 0000000000000..2d01e13f65d1b --- /dev/null +++ b/airflow/contrib/operators/azure_container_instances_operator.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
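To put the renamed ``job_queue`` parameter and the boto3-style ``containerOverrides`` dict described above in context, a hypothetical task could look like this (job name, queue, region and command are invented)::

    from airflow.contrib.operators.awsbatch_operator import AWSBatchOperator

    submit_batch_job = AWSBatchOperator(
        task_id='submit_batch_job',
        job_name='nightly-etl',
        job_definition='nightly-etl-job-def:1',
        job_queue='default-batch-queue',
        overrides={'command': ['python', 'etl.py', '--date', '{{ ds }}']},
        region_name='eu-west-1',
    )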
+ +from collections import namedtuple +from time import sleep +from typing import Dict, Sequence + +from airflow.contrib.hooks.azure_container_instance_hook import AzureContainerInstanceHook +from airflow.contrib.hooks.azure_container_registry_hook import AzureContainerRegistryHook +from airflow.contrib.hooks.azure_container_volume_hook import AzureContainerVolumeHook + +from airflow.exceptions import AirflowException, AirflowTaskTimeout +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + +from azure.mgmt.containerinstance.models import (EnvironmentVariable, + VolumeMount, + ResourceRequests, + ResourceRequirements, + Container, + ContainerGroup) +from msrestazure.azure_exceptions import CloudError + + +Volume = namedtuple( + 'Volume', + ['conn_id', 'account_name', 'share_name', 'mount_path', 'read_only'], +) + +DEFAULT_ENVIRONMENT_VARIABLES = {} # type: Dict[str, str] +DEFAULT_VOLUMES = [] # type: Sequence[Volume] +DEFAULT_MEMORY_IN_GB = 2.0 +DEFAULT_CPU = 1.0 + + +class AzureContainerInstancesOperator(BaseOperator): + """ + Start a container on Azure Container Instances + + :param ci_conn_id: connection id of a service principal which will be used + to start the container instance + :type ci_conn_id: str + :param registry_conn_id: connection id of a user which can login to a + private docker registry. If None, we assume a public registry + :type registry_conn_id: str + :param resource_group: name of the resource group wherein this container + instance should be started + :type resource_group: str + :param name: name of this container instance. Please note this name has + to be unique in order to run containers in parallel. + :type name: str + :param image: the docker image to be used + :type image: str + :param region: the region wherein this container instance should be started + :type region: str + :param environment_variables: key,value pairs containing environment + variables which will be passed to the running container + :type environment_variables: dict + :param volumes: list of volumes to be mounted to the container. + Currently only Azure Fileshares are supported. 
+ :type volumes: list[] + :param memory_in_gb: the amount of memory to allocate to this container + :type memory_in_gb: double + :param cpu: the number of cpus to allocate to this container + :type cpu: double + :param command: the command to run inside the container + :type command: str + + :Example: + + >>> a = AzureContainerInstancesOperator( + 'azure_service_principal', + 'azure_registry_user', + 'my-resource-group', + 'my-container-name-{{ ds }}', + 'myprivateregistry.azurecr.io/my_container:latest', + 'westeurope', + {'EXECUTION_DATE': '{{ ds }}'}, + [('azure_wasb_conn_id', + 'my_storage_container', + 'my_fileshare', + '/input-data', + True),], + memory_in_gb=14.0, + cpu=4.0, + command='python /app/myfile.py', + task_id='start_container' + ) + """ + + template_fields = ('name', 'environment_variables') + + @apply_defaults + def __init__(self, ci_conn_id, registry_conn_id, resource_group, name, image, region, + environment_variables=None, volumes=None, memory_in_gb=None, cpu=None, + command=None, remove_on_error=True, fail_if_exists=True, *args, **kwargs): + super(AzureContainerInstancesOperator, self).__init__(*args, **kwargs) + + self.ci_conn_id = ci_conn_id + self.resource_group = resource_group + self.name = name + self.image = image + self.region = region + self.registry_conn_id = registry_conn_id + self.environment_variables = environment_variables or DEFAULT_ENVIRONMENT_VARIABLES + self.volumes = volumes or DEFAULT_VOLUMES + self.memory_in_gb = memory_in_gb or DEFAULT_MEMORY_IN_GB + self.cpu = cpu or DEFAULT_CPU + self.command = command + self.remove_on_error = remove_on_error + self.fail_if_exists = fail_if_exists + + def execute(self, context): + ci_hook = AzureContainerInstanceHook(self.ci_conn_id) + + if self.fail_if_exists: + self.log.info("Testing if container group already exists") + if ci_hook.exists(self.resource_group, self.name): + raise AirflowException("Container group exists") + + if self.registry_conn_id: + registry_hook = AzureContainerRegistryHook(self.registry_conn_id) + image_registry_credentials = [registry_hook.connection, ] + else: + image_registry_credentials = None + + environment_variables = [] + for key, value in self.environment_variables.items(): + environment_variables.append(EnvironmentVariable(key, value)) + + volumes = [] + volume_mounts = [] + for conn_id, account_name, share_name, mount_path, read_only in self.volumes: + hook = AzureContainerVolumeHook(conn_id) + + mount_name = "mount-%d" % len(volumes) + volumes.append(hook.get_file_volume(mount_name, + share_name, + account_name, + read_only)) + volume_mounts.append(VolumeMount(mount_name, mount_path, read_only)) + + exit_code = 1 + try: + self.log.info("Starting container group with %.1f cpu %.1f mem", + self.cpu, self.memory_in_gb) + + resources = ResourceRequirements(requests=ResourceRequests( + memory_in_gb=self.memory_in_gb, + cpu=self.cpu)) + + container = Container( + name=self.name, + image=self.image, + resources=resources, + command=self.command, + environment_variables=environment_variables, + volume_mounts=volume_mounts) + + container_group = ContainerGroup( + location=self.region, + containers=[container, ], + image_registry_credentials=image_registry_credentials, + volumes=volumes, + restart_policy='Never', + os_type='Linux') + + ci_hook.create_or_update(self.resource_group, self.name, container_group) + + self.log.info("Container group started %s/%s", self.resource_group, self.name) + + exit_code = self._monitor_logging(ci_hook, self.resource_group, self.name) + + 
self.log.info("Container had exit code: %s", exit_code) + if exit_code != 0: + raise AirflowException("Container had a non-zero exit code, %s" + % exit_code) + + except CloudError: + self.log.exception("Could not start container group") + raise AirflowException("Could not start container group") + + finally: + if exit_code == 0 or self.remove_on_error: + self.log.info("Deleting container group") + try: + ci_hook.delete(self.resource_group, self.name) + except Exception: + self.log.exception("Could not delete container group") + + def _monitor_logging(self, ci_hook, resource_group, name): + last_state = None + last_message_logged = None + last_line_logged = None + for _ in range(43200): # roughly 12 hours + try: + state, exit_code, detail_status = ci_hook.get_state_exitcode_details(resource_group, name) + if state != last_state: + self.log.info("Container group state changed to %s", state) + last_state = state + + messages = ci_hook.get_messages(resource_group, name) + last_message_logged = self._log_last(messages, last_message_logged) + + if state in ["Running", "Terminated"]: + try: + logs = ci_hook.get_logs(resource_group, name) + last_line_logged = self._log_last(logs, last_line_logged) + except CloudError: + self.log.exception("Exception while getting logs from " + "container instance, retrying...") + + if state == "Terminated": + self.log.info("Container exited with detail_status %s", detail_status) + return exit_code + + except CloudError as err: + if 'ResourceNotFound' in str(err): + self.log.warning("ResourceNotFound, container is probably removed " + "by another process " + "(make sure that the name is unique).") + return 1 + else: + self.log.exception("Exception while getting container groups") + except Exception: + self.log.exception("Exception while getting container groups") + + sleep(1) + + # no return -> hence still running + raise AirflowTaskTimeout("Did not complete on time") + + def _log_last(self, logs, last_line_logged): + if logs: + # determine the last line which was logged before + last_line_index = 0 + for i in range(len(logs) - 1, -1, -1): + if logs[i] == last_line_logged: + # this line is the same, hence print from i+1 + last_line_index = i + 1 + break + + # log all new ones + for line in logs[last_line_index:]: + self.log.info(line.rstrip()) + + return logs[-1] diff --git a/airflow/contrib/operators/azure_cosmos_operator.py b/airflow/contrib/operators/azure_cosmos_operator.py new file mode 100644 index 0000000000000..e0fbaf2dd2f72 --- /dev/null +++ b/airflow/contrib/operators/azure_cosmos_operator.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class AzureCosmosInsertDocumentOperator(BaseOperator): + """ + Inserts a new document into the specified Cosmos database and collection + It will create both the database and collection if they do not already exist + + :param database_name: The name of the database. (templated) + :type database_name: str + :param collection_name: The name of the collection. (templated) + :type collection_name: str + :param document: The document to insert + :type document: dict + :param azure_cosmos_conn_id: reference to a CosmosDB connection. + :type azure_cosmos_conn_id: str + """ + template_fields = ('database_name', 'collection_name') + ui_color = '#e4f0e8' + + @apply_defaults + def __init__(self, + database_name, + collection_name, + document, + azure_cosmos_conn_id='azure_cosmos_default', + *args, + **kwargs): + super(AzureCosmosInsertDocumentOperator, self).__init__(*args, **kwargs) + self.database_name = database_name + self.collection_name = collection_name + self.document = document + self.azure_cosmos_conn_id = azure_cosmos_conn_id + + def execute(self, context): + # Create the hook + hook = AzureCosmosDBHook(azure_cosmos_conn_id=self.azure_cosmos_conn_id) + + # Create the DB if it doesn't already exist + if not hook.does_database_exist(self.database_name): + hook.create_database(self.database_name) + + # Create the collection as well + if not hook.does_collection_exist(self.collection_name, self.database_name): + hook.create_collection(self.collection_name, self.database_name) + + # finally insert the document + hook.upsert_document(self.document, self.database_name, self.collection_name) diff --git a/airflow/contrib/operators/bigquery_check_operator.py b/airflow/contrib/operators/bigquery_check_operator.py index 59ef5d377d182..afb600a3d9120 100644 --- a/airflow/contrib/operators/bigquery_check_operator.py +++ b/airflow/contrib/operators/bigquery_check_operator.py @@ -48,28 +48,32 @@ class BigQueryCheckOperator(CheckOperator): This operator can be used as a data quality check in your pipeline, and depending on where you put it in your DAG, you have the choice to stop the critical path, preventing from - publishing dubious data, or on the side and receive email alterts + publishing dubious data, or on the side and receive email alerts without stopping the progress of the DAG. :param sql: the sql to be executed - :type sql: string + :type sql: str :param bigquery_conn_id: reference to the BigQuery database - :type bigquery_conn_id: string + :type bigquery_conn_id: str + :param use_legacy_sql: Whether to use legacy SQL (true) + or standard SQL (false). + :type use_legacy_sql: bool """ @apply_defaults - def __init__( - self, - sql, - bigquery_conn_id='bigquery_default', - *args, - **kwargs): + def __init__(self, + sql, + bigquery_conn_id='bigquery_default', + use_legacy_sql=True, + *args, **kwargs): super(BigQueryCheckOperator, self).__init__(sql=sql, *args, **kwargs) self.bigquery_conn_id = bigquery_conn_id self.sql = sql + self.use_legacy_sql = use_legacy_sql def get_db_hook(self): - return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id) + return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + use_legacy_sql=self.use_legacy_sql) class BigQueryValueCheckOperator(ValueCheckOperator): @@ -77,21 +81,28 @@ class BigQueryValueCheckOperator(ValueCheckOperator): Performs a simple value check using sql code. 
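Looking back at the AzureCosmosInsertDocumentOperator added above, a minimal usage sketch (database, collection and document contents are invented for illustration) could be::

    from airflow.contrib.operators.azure_cosmos_operator import (
        AzureCosmosInsertDocumentOperator)

    upsert_status = AzureCosmosInsertDocumentOperator(
        task_id='upsert_run_status',
        database_name='airflow',
        collection_name='dag_runs',
        document={'id': 'my-run-1', 'state': 'done'},
        azure_cosmos_conn_id='azure_cosmos_default',
    )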
:param sql: the sql to be executed - :type sql: string + :type sql: str + :param use_legacy_sql: Whether to use legacy SQL (true) + or standard SQL (false). + :type use_legacy_sql: bool """ @apply_defaults - def __init__( - self, sql, pass_value, tolerance=None, - bigquery_conn_id='bigquery_default', - *args, **kwargs): + def __init__(self, sql, + pass_value, + tolerance=None, + bigquery_conn_id='bigquery_default', + use_legacy_sql=True, + *args, **kwargs): super(BigQueryValueCheckOperator, self).__init__( sql=sql, pass_value=pass_value, tolerance=tolerance, *args, **kwargs) self.bigquery_conn_id = bigquery_conn_id + self.use_legacy_sql = use_legacy_sql def get_db_hook(self): - return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id) + return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + use_legacy_sql=self.use_legacy_sql) class BigQueryIntervalCheckOperator(IntervalCheckOperator): @@ -102,7 +113,7 @@ class BigQueryIntervalCheckOperator(IntervalCheckOperator): This method constructs a query like so :: SELECT {metrics_threshold_dict_key} FROM {table} - WHERE {date_filter_column}= + WHERE {date_filter_column}= :param table: the table name :type table: str @@ -113,19 +124,22 @@ class BigQueryIntervalCheckOperator(IntervalCheckOperator): example 'COUNT(*)': 1.5 would require a 50 percent or less difference between the current day, and the prior days_back. :type metrics_threshold: dict + :param use_legacy_sql: Whether to use legacy SQL (true) + or standard SQL (false). + :type use_legacy_sql: bool """ @apply_defaults - def __init__( - self, table, metrics_thresholds, - date_filter_column='ds', days_back=-7, - bigquery_conn_id='bigquery_default', - *args, **kwargs): + def __init__(self, table, metrics_thresholds, date_filter_column='ds', + days_back=-7, bigquery_conn_id='bigquery_default', + use_legacy_sql=True, *args, **kwargs): super(BigQueryIntervalCheckOperator, self).__init__( table=table, metrics_thresholds=metrics_thresholds, date_filter_column=date_filter_column, days_back=days_back, *args, **kwargs) self.bigquery_conn_id = bigquery_conn_id + self.use_legacy_sql = use_legacy_sql def get_db_hook(self): - return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id) + return BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + use_legacy_sql=self.use_legacy_sql) diff --git a/airflow/contrib/operators/bigquery_get_data.py b/airflow/contrib/operators/bigquery_get_data.py index ab8f71b717716..f5e6e50f066d5 100644 --- a/airflow/contrib/operators/bigquery_get_data.py +++ b/airflow/contrib/operators/bigquery_get_data.py @@ -51,21 +51,21 @@ class BigQueryGetDataOperator(BaseOperator): ) :param dataset_id: The dataset ID of the requested table. (templated) - :type destination_dataset_table: string + :type dataset_id: str :param table_id: The table ID of the requested table. (templated) - :type table_id: string + :type table_id: str :param max_results: The maximum number of records (rows) to be fetched from the table. (templated) - :type max_results: string + :type max_results: str :param selected_fields: List of fields to return (comma-separated). If unspecified, all fields are returned. - :type selected_fields: string + :type selected_fields: str :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. 
- :type delegate_to: string + :type delegate_to: str """ template_fields = ('dataset_id', 'table_id', 'max_results') ui_color = '#e4f0e8' diff --git a/airflow/contrib/operators/bigquery_operator.py b/airflow/contrib/operators/bigquery_operator.py index b36efbd6bfd43..746abac5360ce 100644 --- a/airflow/contrib/operators/bigquery_operator.py +++ b/airflow/contrib/operators/bigquery_operator.py @@ -39,60 +39,80 @@ class BigQueryOperator(BaseOperator): a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql'. :param destination_dataset_table: A dotted - (.|:).
<table> that, if set, will store the results +        ``(<project>.|<project>:)<dataset>.<table>
`` that, if set, will store the results of the query. (templated) - :type destination_dataset_table: string + :type destination_dataset_table: str :param write_disposition: Specifies the action that occurs if the destination table already exists. (default: 'WRITE_EMPTY') - :type write_disposition: string + :type write_disposition: str :param create_disposition: Specifies whether the job is allowed to create new tables. (default: 'CREATE_IF_NEEDED') - :type create_disposition: string + :type create_disposition: str :param allow_large_results: Whether to allow large results. - :type allow_large_results: boolean + :type allow_large_results: bool :param flatten_results: If true and query uses legacy SQL dialect, flattens all nested and repeated fields in the query results. ``allow_large_results`` must be ``true`` if this is set to ``false``. For standard SQL queries, this flag is ignored and results are never flattened. - :type flatten_results: boolean + :type flatten_results: bool :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param udf_config: The User Defined Function configuration for the query. See https://cloud.google.com/bigquery/user-defined-functions for details. :type udf_config: list :param use_legacy_sql: Whether to use legacy SQL (true) or standard SQL (false). - :type use_legacy_sql: boolean + :type use_legacy_sql: bool :param maximum_billing_tier: Positive integer that serves as a multiplier of the basic price. Defaults to None, in which case it uses the value set in the project. - :type maximum_billing_tier: integer + :type maximum_billing_tier: int :param maximum_bytes_billed: Limits the bytes billed for this job. Queries that will have bytes billed beyond this limit will fail (without incurring a charge). If unspecified, this will be set to your project default. :type maximum_bytes_billed: float + :param api_resource_configs: a dictionary that contain params + 'configuration' applied for Google BigQuery Jobs API: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs + for example, {'query': {'useQueryCache': False}}. You could use it + if you need to provide some params that are not supported by BigQueryOperator + like args. + :type api_resource_configs: dict :param schema_update_options: Allows the schema of the destination table to be updated as a side effect of the load job. :type schema_update_options: tuple - :param query_params: a dictionary containing query parameter types and - values, passed to BigQuery. - :type query_params: dict + :param query_params: a list of dictionary containing query parameter types and + values, passed to BigQuery. The structure of dictionary should look like + 'queryParameters' in Google BigQuery Jobs API: + https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs. + For example, [{ 'name': 'corpus', 'parameterType': { 'type': 'STRING' }, + 'parameterValue': { 'value': 'romeoandjuliet' } }]. + :type query_params: list + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict :param priority: Specifies a priority for the query. Possible values include INTERACTIVE and BATCH. The default value is INTERACTIVE. 
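**Usage sketch** (hedged; project, labels and connection ids are placeholders) combining the new ``query_params``, ``labels`` and ``api_resource_configs`` arguments::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.bigquery_operator import BigQueryOperator

    dag = DAG('example_bq_query', start_date=datetime(2018, 1, 1))

    query = BigQueryOperator(
        task_id='parameterized_query',
        sql="SELECT word FROM `bigquery-public-data.samples.shakespeare` WHERE corpus = @corpus",
        use_legacy_sql=False,
        destination_dataset_table='my_dataset.shakespeare_subset',
        write_disposition='WRITE_TRUNCATE',
        query_params=[{'name': 'corpus',
                       'parameterType': {'type': 'STRING'},
                       'parameterValue': {'value': 'romeoandjuliet'}}],
        labels={'team': 'analytics', 'env': 'dev'},
        api_resource_configs={'query': {'useQueryCache': False}},
        dag=dag)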
- :type priority: string + :type priority: str :param time_partitioning: configure optional time partitioning fields i.e. - partition by field, type and - expiration as per API specifications. Note that 'field' is not available in - conjunction with dataset.table$partition. + partition by field, type and expiration as per API specifications. :type time_partitioning: dict + :param cluster_fields: Request that the result of this query be stored sorted + by one or more columns. This is only available in conjunction with + time_partitioning. The order of columns given determines the sort order. + :type cluster_fields: list[str] + :param location: The geographic location of the job. Required except for + US and EU. See details at + https://cloud.google.com/bigquery/docs/locations#specifying_your_location + :type location: str """ - template_fields = ('bql', 'sql', 'destination_dataset_table') + template_fields = ('bql', 'sql', 'destination_dataset_table', 'labels') template_ext = ('.sql', ) ui_color = '#e4f0e8' @@ -100,21 +120,25 @@ class BigQueryOperator(BaseOperator): def __init__(self, bql=None, sql=None, - destination_dataset_table=False, + destination_dataset_table=None, write_disposition='WRITE_EMPTY', allow_large_results=False, - flatten_results=False, + flatten_results=None, bigquery_conn_id='bigquery_default', delegate_to=None, - udf_config=False, + udf_config=None, use_legacy_sql=True, maximum_billing_tier=None, maximum_bytes_billed=None, create_disposition='CREATE_IF_NEEDED', schema_update_options=(), query_params=None, + labels=None, priority='INTERACTIVE', - time_partitioning={}, + time_partitioning=None, + api_resource_configs=None, + cluster_fields=None, + location=None, *args, **kwargs): super(BigQueryOperator, self).__init__(*args, **kwargs) @@ -133,9 +157,13 @@ def __init__(self, self.maximum_bytes_billed = maximum_bytes_billed self.schema_update_options = schema_update_options self.query_params = query_params + self.labels = labels self.bq_cursor = None self.priority = priority self.time_partitioning = time_partitioning + self.api_resource_configs = api_resource_configs + self.cluster_fields = cluster_fields + self.location = location # TODO remove `bql` in Airflow 2.0 if self.bql: @@ -157,11 +185,13 @@ def execute(self, context): hook = BigQueryHook( bigquery_conn_id=self.bigquery_conn_id, use_legacy_sql=self.use_legacy_sql, - delegate_to=self.delegate_to) + delegate_to=self.delegate_to, + location=self.location, + ) conn = hook.get_conn() self.bq_cursor = conn.cursor() self.bq_cursor.run_query( - self.sql, + sql=self.sql, destination_dataset_table=self.destination_dataset_table, write_disposition=self.write_disposition, allow_large_results=self.allow_large_results, @@ -171,15 +201,18 @@ def execute(self, context): maximum_bytes_billed=self.maximum_bytes_billed, create_disposition=self.create_disposition, query_params=self.query_params, + labels=self.labels, schema_update_options=self.schema_update_options, priority=self.priority, - time_partitioning=self.time_partitioning + time_partitioning=self.time_partitioning, + api_resource_configs=self.api_resource_configs, + cluster_fields=self.cluster_fields, ) def on_kill(self): super(BigQueryOperator, self).on_kill() if self.bq_cursor is not None: - self.log.info('Canceling running query due to execution timeout') + self.log.info('Cancelling running query') self.bq_cursor.cancel_query() @@ -195,11 +228,11 @@ class BigQueryCreateEmptyTableOperator(BaseOperator): You can also create a table without schema. 
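For example, a schemaless, ingestion-time partitioned table could be created roughly like this (a hedged sketch; project, dataset and table ids are placeholders)::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.bigquery_operator import BigQueryCreateEmptyTableOperator

    dag = DAG('example_bq_create_table', start_date=datetime(2018, 1, 1))

    # No schema_fields and no gcs_schema_object: the table is created without a schema,
    # partitioned by ingestion day, and tagged with the new ``labels`` argument.
    create_table = BigQueryCreateEmptyTableOperator(
        task_id='create_partitioned_table',
        project_id='my-gcp-project',
        dataset_id='my_dataset',
        table_id='events',
        time_partitioning={'type': 'DAY'},
        labels={'env': 'dev'},
        dag=dag)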
:param project_id: The project to create the table into. (templated) - :type project_id: string + :type project_id: str :param dataset_id: The dataset to create the table into. (templated) - :type dataset_id: string + :type dataset_id: str :param table_id: The Name of the table to be created. (templated) - :type table_id: string + :type table_id: str :param schema_fields: If set, the schema field list as defined here: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema @@ -212,7 +245,7 @@ class BigQueryCreateEmptyTableOperator(BaseOperator): :param gcs_schema_object: Full path to the JSON file containing schema (templated). For example: ``gs://test-bucket/dir1/dir2/employee_schema.json`` - :type gcs_schema_object: string + :type gcs_schema_object: str :param time_partitioning: configure optional time partitioning fields i.e. partition by field, type and expiration as per API specifications. @@ -220,57 +253,60 @@ class BigQueryCreateEmptyTableOperator(BaseOperator): https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#timePartitioning :type time_partitioning: dict :param bigquery_conn_id: Reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param google_cloud_storage_conn_id: Reference to a specific Google cloud storage hook. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string - - **Example (with schema JSON in GCS)**: :: - - CreateTable = BigQueryCreateEmptyTableOperator( - task_id='BigQueryCreateEmptyTableOperator_task', - dataset_id='ODS', - table_id='Employees', - project_id='internal-gcp-project', - gcs_schema_object='gs://schema-bucket/employee_schema.json', - bigquery_conn_id='airflow-service-account', - google_cloud_storage_conn_id='airflow-service-account' - ) - - **Corresponding Schema file** (``employee_schema.json``): :: - - [ - { - "mode": "NULLABLE", - "name": "emp_name", - "type": "STRING" - }, - { - "mode": "REQUIRED", - "name": "salary", - "type": "INTEGER" - } - ] - - **Example (with schema in the DAG)**: :: - - CreateTable = BigQueryCreateEmptyTableOperator( - task_id='BigQueryCreateEmptyTableOperator_task', - dataset_id='ODS', - table_id='Employees', - project_id='internal-gcp-project', - schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, - {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}], - bigquery_conn_id='airflow-service-account', - google_cloud_storage_conn_id='airflow-service-account' - ) + :type delegate_to: str + :param labels: a dictionary containing labels for the table, passed to BigQuery + + **Example (with schema JSON in GCS)**: :: + + CreateTable = BigQueryCreateEmptyTableOperator( + task_id='BigQueryCreateEmptyTableOperator_task', + dataset_id='ODS', + table_id='Employees', + project_id='internal-gcp-project', + gcs_schema_object='gs://schema-bucket/employee_schema.json', + bigquery_conn_id='airflow-service-account', + google_cloud_storage_conn_id='airflow-service-account' + ) + + **Corresponding Schema file** (``employee_schema.json``): :: + + [ + { + "mode": "NULLABLE", + "name": "emp_name", + "type": "STRING" + }, + { + "mode": "REQUIRED", + "name": "salary", + "type": "INTEGER" + } + ] + + **Example (with schema in the DAG)**: :: + + CreateTable = BigQueryCreateEmptyTableOperator( + 
task_id='BigQueryCreateEmptyTableOperator_task', + dataset_id='ODS', + table_id='Employees', + project_id='internal-gcp-project', + schema_fields=[{"name": "emp_name", "type": "STRING", "mode": "REQUIRED"}, + {"name": "salary", "type": "INTEGER", "mode": "NULLABLE"}], + bigquery_conn_id='airflow-service-account', + google_cloud_storage_conn_id='airflow-service-account' + ) + :type labels: dict """ - template_fields = ('dataset_id', 'table_id', 'project_id', 'gcs_schema_object') + template_fields = ('dataset_id', 'table_id', 'project_id', + 'gcs_schema_object', 'labels') ui_color = '#f0eee4' @apply_defaults @@ -280,10 +316,11 @@ def __init__(self, project_id=None, schema_fields=None, gcs_schema_object=None, - time_partitioning={}, + time_partitioning=None, bigquery_conn_id='bigquery_default', google_cloud_storage_conn_id='google_cloud_default', delegate_to=None, + labels=None, *args, **kwargs): super(BigQueryCreateEmptyTableOperator, self).__init__(*args, **kwargs) @@ -296,7 +333,8 @@ def __init__(self, self.bigquery_conn_id = bigquery_conn_id self.google_cloud_storage_conn_id = google_cloud_storage_conn_id self.delegate_to = delegate_to - self.time_partitioning = time_partitioning + self.time_partitioning = {} if time_partitioning is None else time_partitioning + self.labels = labels def execute(self, context): bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, @@ -323,7 +361,8 @@ def execute(self, context): dataset_id=self.dataset_id, table_id=self.table_id, schema_fields=schema_fields, - time_partitioning=self.time_partitioning + time_partitioning=self.time_partitioning, + labels=self.labels ) @@ -338,15 +377,15 @@ class BigQueryCreateExternalTableOperator(BaseOperator): Google cloud storage must be a JSON file with the schema fields in it. :param bucket: The bucket to point the external table to. (templated) - :type bucket: string + :type bucket: str :param source_objects: List of Google cloud storage URIs to point table to. (templated) If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI. - :type object: list - :param destination_project_dataset_table: The dotted (.).
- BigQuery table to load data into (templated). If <project> is not included, +        :type source_objects: list +        :param destination_project_dataset_table: The dotted ``(<project>.)<dataset>.<table>
`` + BigQuery table to load data into (templated). If ```` is not included, project will be the project defined in the connection json. - :type destination_project_dataset_table: string + :type destination_project_dataset_table: str :param schema_fields: If set, the schema field list as defined here: https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs#configuration.load.schema @@ -359,26 +398,26 @@ class BigQueryCreateExternalTableOperator(BaseOperator): :type schema_fields: list :param schema_object: If set, a GCS object path pointing to a .json file that contains the schema for the table. (templated) - :param schema_object: string + :type schema_object: str :param source_format: File format of the data. - :type source_format: string + :type source_format: str :param compression: [Optional] The compression type of the data source. Possible values include GZIP and NONE. The default value is NONE. This setting is ignored for Google Cloud Bigtable, Google Cloud Datastore backups and Avro formats. - :type compression: string + :type compression: str :param skip_leading_rows: Number of rows to skip when loading from a CSV. :type skip_leading_rows: int :param field_delimiter: The delimiter to use for the CSV. - :type field_delimiter: string + :type field_delimiter: str :param max_bad_records: The maximum number of bad records that BigQuery can ignore when running the job. :type max_bad_records: int :param quote_character: The value that is used to quote data sections in a CSV file. - :type quote_character: string + :type quote_character: str :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false). - :type allow_quoted_newlines: boolean + :type allow_quoted_newlines: bool :param allow_jagged_rows: Accept rows that are missing trailing optional columns. The missing values are treated as nulls. If false, records with missing trailing columns are treated as bad records, and if there are too many bad records, an @@ -386,19 +425,21 @@ class BigQueryCreateExternalTableOperator(BaseOperator): for other formats. :type allow_jagged_rows: bool :param bigquery_conn_id: Reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param google_cloud_storage_conn_id: Reference to a specific Google cloud storage hook. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. 
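**Usage sketch** (hedged; bucket, object paths and dataset are placeholders) for an external table over CSV files in GCS::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.bigquery_operator import BigQueryCreateExternalTableOperator

    dag = DAG('example_bq_external_table', start_date=datetime(2018, 1, 1))

    create_external = BigQueryCreateExternalTableOperator(
        task_id='create_external_table',
        bucket='my-data-bucket',
        source_objects=['exports/2018/*.csv'],
        destination_project_dataset_table='my-gcp-project.my_dataset.raw_events',
        schema_object='schemas/raw_events.json',
        source_format='CSV',
        skip_leading_rows=1,
        field_delimiter=',',
        labels={'env': 'dev'},
        dag=dag)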
- :type delegate_to: string + :type delegate_to: str :param src_fmt_configs: configure optional fields specific to the source format :type src_fmt_configs: dict + :param labels: a dictionary containing labels for the table, passed to BigQuery + :type labels: dict """ template_fields = ('bucket', 'source_objects', - 'schema_object', 'destination_project_dataset_table') + 'schema_object', 'destination_project_dataset_table', 'labels') ui_color = '#f0eee4' @apply_defaults @@ -420,6 +461,7 @@ def __init__(self, google_cloud_storage_conn_id='google_cloud_default', delegate_to=None, src_fmt_configs={}, + labels=None, *args, **kwargs): super(BigQueryCreateExternalTableOperator, self).__init__(*args, **kwargs) @@ -446,6 +488,7 @@ def __init__(self, self.delegate_to = delegate_to self.src_fmt_configs = src_fmt_configs + self.labels = labels def execute(self, context): bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, @@ -479,5 +522,121 @@ def execute(self, context): quote_character=self.quote_character, allow_quoted_newlines=self.allow_quoted_newlines, allow_jagged_rows=self.allow_jagged_rows, - src_fmt_configs=self.src_fmt_configs + src_fmt_configs=self.src_fmt_configs, + labels=self.labels + ) + + +class BigQueryDeleteDatasetOperator(BaseOperator): + """ + This operator deletes an existing dataset from your Project in Big query. + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets/delete + + :param project_id: The project id of the dataset. + :type project_id: str + :param dataset_id: The dataset to be deleted. + :type dataset_id: str + + **Example**: :: + + delete_temp_data = BigQueryDeleteDatasetOperator(dataset_id = 'temp-dataset', + project_id = 'temp-project', + bigquery_conn_id='_my_gcp_conn_', + task_id='Deletetemp', + dag=dag) + """ + + template_fields = ('dataset_id', 'project_id') + ui_color = '#f00004' + + @apply_defaults + def __init__(self, + dataset_id, + project_id=None, + bigquery_conn_id='bigquery_default', + delegate_to=None, + *args, **kwargs): + self.dataset_id = dataset_id + self.project_id = project_id + self.bigquery_conn_id = bigquery_conn_id + self.delegate_to = delegate_to + + self.log.info('Dataset id: %s', self.dataset_id) + self.log.info('Project id: %s', self.project_id) + + super(BigQueryDeleteDatasetOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + delegate_to=self.delegate_to) + + conn = bq_hook.get_conn() + cursor = conn.cursor() + + cursor.delete_dataset( + project_id=self.project_id, + dataset_id=self.dataset_id ) + + +class BigQueryCreateEmptyDatasetOperator(BaseOperator): + """ + This operator is used to create new dataset for your Project in Big query. + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource + + :param project_id: The name of the project where we want to create the dataset. + Don't need to provide, if projectId in dataset_reference. + :type project_id: str + :param dataset_id: The id of dataset. Don't need to provide, + if datasetId in dataset_reference. + :type dataset_id: str + :param dataset_reference: Dataset reference that could be provided with request body. 
+ More info: + https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets#resource + :type dataset_reference: dict + + **Example**: :: + + create_new_dataset = BigQueryCreateEmptyDatasetOperator( + dataset_id = 'new-dataset', + project_id = 'my-project', + dataset_reference = {"friendlyName": "New Dataset"} + bigquery_conn_id='_my_gcp_conn_', + task_id='newDatasetCreator', + dag=dag) + + """ + + template_fields = ('dataset_id', 'project_id') + ui_color = '#f0eee4' + + @apply_defaults + def __init__(self, + dataset_id, + project_id=None, + dataset_reference=None, + bigquery_conn_id='bigquery_default', + delegate_to=None, + *args, **kwargs): + self.dataset_id = dataset_id + self.project_id = project_id + self.bigquery_conn_id = bigquery_conn_id + self.dataset_reference = dataset_reference if dataset_reference else {} + self.delegate_to = delegate_to + + self.log.info('Dataset id: %s', self.dataset_id) + self.log.info('Project id: %s', self.project_id) + + super(BigQueryCreateEmptyDatasetOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, + delegate_to=self.delegate_to) + + conn = bq_hook.get_conn() + cursor = conn.cursor() + + cursor.create_empty_dataset( + project_id=self.project_id, + dataset_id=self.dataset_id, + dataset_reference=self.dataset_reference) diff --git a/airflow/contrib/operators/bigquery_table_delete_operator.py b/airflow/contrib/operators/bigquery_table_delete_operator.py index a16107d8c41ec..106afd1e94f0e 100644 --- a/airflow/contrib/operators/bigquery_table_delete_operator.py +++ b/airflow/contrib/operators/bigquery_table_delete_operator.py @@ -27,18 +27,18 @@ class BigQueryTableDeleteOperator(BaseOperator): Deletes BigQuery tables :param deletion_dataset_table: A dotted - (.|:).
<table> that indicates which table +        ``(<project>.|<project>:)<dataset>.<table>
`` that indicates which table will be deleted. (templated) - :type deletion_dataset_table: string + :type deletion_dataset_table: str :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param ignore_if_missing: if True, then return success even if the requested table does not exist. - :type ignore_if_missing: boolean + :type ignore_if_missing: bool """ template_fields = ('deletion_dataset_table',) ui_color = '#ffd1dc' diff --git a/airflow/contrib/operators/bigquery_to_bigquery.py b/airflow/contrib/operators/bigquery_to_bigquery.py index 93a52b31020ce..288731e157de7 100644 --- a/airflow/contrib/operators/bigquery_to_bigquery.py +++ b/airflow/contrib/operators/bigquery_to_bigquery.py @@ -31,27 +31,30 @@ class BigQueryToBigQueryOperator(BaseOperator): https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.copy :param source_project_dataset_tables: One or more - dotted (project:|project.).
<table> BigQuery tables to use as the -        source data. If <project> is not included, project will be the +        dotted ``(project:|project.)<dataset>.<table>
`` BigQuery tables to use as the +        source data. If ``<project>`` is not included, project will be the project defined in the connection json. Use a list if there are multiple source tables. (templated) :type source_project_dataset_tables: list|string :param destination_project_dataset_table: The destination BigQuery -        table. Format is: (project:|project.)<dataset>.<table>
(templated) -    :type destination_project_dataset_table: string +        table. Format is: ``(project:|project.)<dataset>.<table>
`` (templated) + :type destination_project_dataset_table: str :param write_disposition: The write disposition if the table already exists. - :type write_disposition: string + :type write_disposition: str :param create_disposition: The create disposition if the table doesn't exist. - :type create_disposition: string + :type create_disposition: str :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict """ template_fields = ('source_project_dataset_tables', - 'destination_project_dataset_table') + 'destination_project_dataset_table', 'labels') template_ext = ('.sql',) ui_color = '#e6f0e4' @@ -63,6 +66,7 @@ def __init__(self, create_disposition='CREATE_IF_NEEDED', bigquery_conn_id='bigquery_default', delegate_to=None, + labels=None, *args, **kwargs): super(BigQueryToBigQueryOperator, self).__init__(*args, **kwargs) @@ -72,6 +76,7 @@ def __init__(self, self.create_disposition = create_disposition self.bigquery_conn_id = bigquery_conn_id self.delegate_to = delegate_to + self.labels = labels def execute(self, context): self.log.info( @@ -86,4 +91,5 @@ def execute(self, context): self.source_project_dataset_tables, self.destination_project_dataset_table, self.write_disposition, - self.create_disposition) + self.create_disposition, + self.labels) diff --git a/airflow/contrib/operators/bigquery_to_gcs.py b/airflow/contrib/operators/bigquery_to_gcs.py index e2ce93068f433..19b004fdfe2fb 100644 --- a/airflow/contrib/operators/bigquery_to_gcs.py +++ b/airflow/contrib/operators/bigquery_to_gcs.py @@ -31,31 +31,35 @@ class BigQueryToCloudStorageOperator(BaseOperator): https://cloud.google.com/bigquery/docs/reference/v2/jobs :param source_project_dataset_table: The dotted - (.|:).
<table> BigQuery table to use as the source -        data. If <project> is not included, project will be the project +        ``(<project>.|<project>:)<dataset>.<table>
`` BigQuery table to use as the + source data. If ```` is not included, project will be the project defined in the connection json. (templated) - :type source_project_dataset_table: string + :type source_project_dataset_table: str :param destination_cloud_storage_uris: The destination Google Cloud Storage URI (e.g. gs://some-bucket/some-file.txt). (templated) Follows convention defined here: https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple :type destination_cloud_storage_uris: list :param compression: Type of compression to use. - :type compression: string + :type compression: str :param export_format: File format to export. - :type field_delimiter: string + :type export_format: str :param field_delimiter: The delimiter to use when extracting to a CSV. - :type field_delimiter: string + :type field_delimiter: str :param print_header: Whether to print a header for a CSV file extract. - :type print_header: boolean + :type print_header: bool :param bigquery_conn_id: reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str + :param labels: a dictionary containing labels for the job/query, + passed to BigQuery + :type labels: dict """ - template_fields = ('source_project_dataset_table', 'destination_cloud_storage_uris') + template_fields = ('source_project_dataset_table', + 'destination_cloud_storage_uris', 'labels') template_ext = ('.sql',) ui_color = '#e4e6f0' @@ -69,6 +73,7 @@ def __init__(self, print_header=True, bigquery_conn_id='bigquery_default', delegate_to=None, + labels=None, *args, **kwargs): super(BigQueryToCloudStorageOperator, self).__init__(*args, **kwargs) @@ -80,6 +85,7 @@ def __init__(self, self.print_header = print_header self.bigquery_conn_id = bigquery_conn_id self.delegate_to = delegate_to + self.labels = labels def execute(self, context): self.log.info('Executing extract of %s into: %s', @@ -95,4 +101,5 @@ def execute(self, context): self.compression, self.export_format, self.field_delimiter, - self.print_header) + self.print_header, + self.labels) diff --git a/airflow/contrib/operators/cassandra_to_gcs.py b/airflow/contrib/operators/cassandra_to_gcs.py index 211444b96d1b9..6819eca404ebb 100644 --- a/airflow/contrib/operators/cassandra_to_gcs.py +++ b/airflow/contrib/operators/cassandra_to_gcs.py @@ -60,18 +60,18 @@ def __init__(self, **kwargs): """ :param cql: The CQL to execute on the Cassandra table. - :type cql: string + :type cql: str :param bucket: The bucket to upload to. - :type bucket: string + :type bucket: str :param filename: The filename to use as the object name when uploading to Google cloud storage. A {} should be specified in the filename to allow the operator to inject file numbers in cases where the file is split due to size. - :type filename: string + :type filename: str :param schema_filename: If set, the filename to use as the object name when uploading a .json file containing the BigQuery schema fields for the table that was dumped from MySQL. - :type schema_filename: string + :type schema_filename: str :param approx_max_file_size_bytes: This operator supports the ability to split large table dumps into multiple files (see notes in the filenamed param docs above). Google cloud storage allows for files @@ -79,14 +79,14 @@ def __init__(self, file size of the splits. 
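**Usage sketch** (hedged; bucket and table names are placeholders) showing the new ``labels`` argument on the BigQuery-to-GCS export operator above::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator

    dag = DAG('example_bq_export', start_date=datetime(2018, 1, 1))

    export = BigQueryToCloudStorageOperator(
        task_id='export_events',
        source_project_dataset_table='my-gcp-project.my_dataset.events',
        destination_cloud_storage_uris=['gs://my-export-bucket/events/{{ ds }}/part-*.csv'],
        export_format='CSV',
        field_delimiter=',',
        print_header=True,
        labels={'env': 'dev'},
        dag=dag)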
:type approx_max_file_size_bytes: long :param cassandra_conn_id: Reference to a specific Cassandra hook. - :type cassandra_conn_id: string + :type cassandra_conn_id: str :param google_cloud_storage_conn_id: Reference to a specific Google cloud storage hook. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(CassandraToGoogleCloudStorageOperator, self).__init__(*args, **kwargs) self.cql = cql @@ -266,7 +266,7 @@ def convert_tuple_type(cls, name, value): """ Converts a tuple to RECORD that contains n fields, each will be converted to its corresponding data type in bq and will be named 'field_', where - index is determined by the order of the tuple elments defined in cassandra. + index is determined by the order of the tuple elements defined in cassandra. """ names = ['field_' + str(i) for i in range(len(value))] values = [cls.convert_value(name, value) for name, value in zip(names, value)] @@ -276,7 +276,7 @@ def convert_tuple_type(cls, name, value): def convert_map_type(cls, name, value): """ Converts a map to a repeated RECORD that contains two fields: 'key' and 'value', - each will be converted to its corresopnding data type in BQ. + each will be converted to its corresponding data type in BQ. """ converted_map = [] for k, v in zip(value.keys(), value.values()): diff --git a/airflow/contrib/operators/databricks_operator.py b/airflow/contrib/operators/databricks_operator.py index 7b8d522dba85b..df3f903725ef6 100644 --- a/airflow/contrib/operators/databricks_operator.py +++ b/airflow/contrib/operators/databricks_operator.py @@ -24,15 +24,76 @@ from airflow.exceptions import AirflowException from airflow.contrib.hooks.databricks_hook import DatabricksHook from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults XCOM_RUN_ID_KEY = 'run_id' XCOM_RUN_PAGE_URL_KEY = 'run_page_url' +def _deep_string_coerce(content, json_path='json'): + """ + Coerces content or all values of content if it is a dict to a string. The + function will throw if content contains non-string or non-numeric types. + + The reason why we have this function is because the ``self.json`` field must be a + dict with only string values. This is because ``render_template`` will fail + for numerical values. + """ + c = _deep_string_coerce + if isinstance(content, six.string_types): + return content + elif isinstance(content, six.integer_types + (float,)): + # Databricks can tolerate either numeric or string types in the API backend. 
+ return str(content) + elif isinstance(content, (list, tuple)): + return [c(e, '{0}[{1}]'.format(json_path, i)) for i, e in enumerate(content)] + elif isinstance(content, dict): + return {k: c(v, '{0}[{1}]'.format(json_path, k)) + for k, v in list(content.items())} + else: + param_type = type(content) + msg = 'Type {0} used for parameter {1} is not a number or a string' \ + .format(param_type, json_path) + raise AirflowException(msg) + + +def _handle_databricks_operator_execution(operator, hook, log, context): + """ + Handles the Airflow + Databricks lifecycle logic for a Databricks operator + :param operator: Databricks operator being handled + :param context: Airflow context + """ + if operator.do_xcom_push: + context['ti'].xcom_push(key=XCOM_RUN_ID_KEY, value=operator.run_id) + log.info('Run submitted with run_id: %s', operator.run_id) + run_page_url = hook.get_run_page_url(operator.run_id) + if operator.do_xcom_push: + context['ti'].xcom_push(key=XCOM_RUN_PAGE_URL_KEY, value=run_page_url) + + log.info('View run status, Spark UI, and logs at %s', run_page_url) + while True: + run_state = hook.get_run_state(operator.run_id) + if run_state.is_terminal: + if run_state.is_successful: + log.info('%s completed successfully.', operator.task_id) + log.info('View run status, Spark UI, and logs at %s', run_page_url) + return + else: + error_message = '{t} failed with terminal state: {s}'.format( + t=operator.task_id, + s=run_state) + raise AirflowException(error_message) + else: + log.info('%s in run state: %s', operator.task_id, run_state) + log.info('View run status, Spark UI, and logs at %s', run_page_url) + log.info('Sleeping for %s seconds.', operator.polling_period_seconds) + time.sleep(operator.polling_period_seconds) + + class DatabricksSubmitRunOperator(BaseOperator): """ - Submits an Spark job run to Databricks using the + Submits a Spark job run to Databricks using the `api/2.0/jobs/runs/submit `_ API endpoint. @@ -43,6 +104,7 @@ class DatabricksSubmitRunOperator(BaseOperator): to call the ``api/2.0/jobs/runs/submit`` endpoint and pass it directly to our ``DatabricksSubmitRunOperator`` through the ``json`` parameter. For example :: + json = { 'new_cluster': { 'spark_version': '2.1.0-db3-scala2.11', @@ -58,6 +120,7 @@ class DatabricksSubmitRunOperator(BaseOperator): of the ``DatabricksSubmitRunOperator`` directly. Note that there is exactly one named parameter for each top level parameter in the ``runs/submit`` endpoint. In this method, your code would look like this: :: + new_cluster = { 'spark_version': '2.1.0-db3-scala2.11', 'num_workers': 2 @@ -119,7 +182,7 @@ class DatabricksSubmitRunOperator(BaseOperator): :param existing_cluster_id: ID for existing cluster on which to run this task. *EITHER* ``new_cluster`` *OR* ``existing_cluster_id`` should be specified. This field will be templated. - :type existing_cluster_id: string + :type existing_cluster_id: str :param libraries: Libraries which this run will use. This field will be templated. @@ -130,7 +193,7 @@ class DatabricksSubmitRunOperator(BaseOperator): By default this will be set to the Airflow ``task_id``. This ``task_id`` is a required parameter of the superclass ``BaseOperator``. This field will be templated. - :type run_name: string + :type run_name: str :param timeout_seconds: The timeout for this run. By default a value of 0 is used which means to have no timeout. This field will be templated. 
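To illustrate the module-level ``_deep_string_coerce`` helper factored out above (values are illustrative; the helper is private to this module)::

    from airflow.contrib.operators.databricks_operator import _deep_string_coerce

    payload = {
        'job_id': 42,
        'notebook_params': {'retries': 3, 'name': 'john doe'},
        'python_params': ['douglas adams', 42.0],
    }

    # Numeric leaves are stringified recursively; dict/list structure is preserved.
    # Any other non-string, non-numeric type raises AirflowException.
    coerced = _deep_string_coerce(payload)
    # {'job_id': '42',
    #  'notebook_params': {'retries': '3', 'name': 'john doe'},
    #  'python_params': ['douglas adams', '42.0']}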
@@ -139,15 +202,18 @@ class DatabricksSubmitRunOperator(BaseOperator): By default and in the common case this will be ``databricks_default``. To use token based authentication, provide the key ``token`` in the extra field for the connection. - :type databricks_conn_id: string + :type databricks_conn_id: str :param polling_period_seconds: Controls the rate which we poll for the result of this run. By default the operator will poll every 30 seconds. :type polling_period_seconds: int :param databricks_retry_limit: Amount of times retry if the Databricks backend is unreachable. Its value must be greater than or equal to 1. :type databricks_retry_limit: int + :param databricks_retry_delay: Number of seconds to wait between retries (it + might be a floating point number). + :type databricks_retry_delay: float :param do_xcom_push: Whether we should push run_id and run_page_url to xcom. - :type do_xcom_push: boolean + :type do_xcom_push: bool """ # Used in airflow.models.BaseOperator template_fields = ('json',) @@ -155,6 +221,7 @@ class DatabricksSubmitRunOperator(BaseOperator): ui_color = '#1CB1C2' ui_fgcolor = '#fff' + @apply_defaults def __init__( self, json=None, @@ -168,6 +235,7 @@ def __init__( databricks_conn_id='databricks_default', polling_period_seconds=30, databricks_retry_limit=3, + databricks_retry_delay=1, do_xcom_push=False, **kwargs): """ @@ -178,6 +246,7 @@ def __init__( self.databricks_conn_id = databricks_conn_id self.polling_period_seconds = polling_period_seconds self.databricks_retry_limit = databricks_retry_limit + self.databricks_retry_delay = databricks_retry_delay if spark_jar_task is not None: self.json['spark_jar_task'] = spark_jar_task if notebook_task is not None: @@ -195,72 +264,217 @@ def __init__( if 'run_name' not in self.json: self.json['run_name'] = run_name or kwargs['task_id'] - self.json = self._deep_string_coerce(self.json) + self.json = _deep_string_coerce(self.json) # This variable will be used in case our task gets killed. self.run_id = None self.do_xcom_push = do_xcom_push - def _deep_string_coerce(self, content, json_path='json'): - """ - Coerces content or all values of content if it is a dict to a string. The - function will throw if content contains non-string or non-numeric types. + def get_hook(self): + return DatabricksHook( + self.databricks_conn_id, + retry_limit=self.databricks_retry_limit, + retry_delay=self.databricks_retry_delay) + + def execute(self, context): + hook = self.get_hook() + self.run_id = hook.submit_run(self.json) + _handle_databricks_operator_execution(self, hook, self.log, context) + + def on_kill(self): + hook = self.get_hook() + hook.cancel_run(self.run_id) + self.log.info( + 'Task: %s with run_id: %s was requested to be cancelled.', + self.task_id, self.run_id + ) + + +class DatabricksRunNowOperator(BaseOperator): + """ + Runs an existing Spark job run to Databricks using the + `api/2.0/jobs/run-now + `_ + API endpoint. + + There are two ways to instantiate this operator. + + In the first way, you can take the JSON payload that you typically use + to call the ``api/2.0/jobs/run-now`` endpoint and pass it directly + to our ``DatabricksRunNowOperator`` through the ``json`` parameter. + For example :: + + json = { + "job_id": 42, + "notebook_params": { + "dry-run": "true", + "oldest-time-to-consider": "1457570074236" + } + } + + notebook_run = DatabricksRunNowOperator(task_id='notebook_run', json=json) + + Another way to accomplish the same thing is to use the named parameters + of the ``DatabricksRunNowOperator`` directly. 
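**Usage sketch** (hedged; cluster spec and notebook path are placeholders) for the new ``databricks_retry_delay`` knob together with the existing retry limit::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.databricks_operator import DatabricksSubmitRunOperator

    dag = DAG('example_databricks_submit', start_date=datetime(2018, 1, 1))

    notebook_run = DatabricksSubmitRunOperator(
        task_id='notebook_run',
        new_cluster={'spark_version': '2.1.0-db3-scala2.11', 'num_workers': 2},
        notebook_task={'notebook_path': '/Users/airflow@example.com/PrepareData'},
        databricks_retry_limit=5,
        databricks_retry_delay=10,   # seconds between retries of Databricks API calls
        polling_period_seconds=60,
        do_xcom_push=True,
        dag=dag)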
Note that there is exactly + one named parameter for each top level parameter in the ``run-now`` + endpoint. In this method, your code would look like this: :: + + job_id=42 + + notebook_params = { + "dry-run": "true", + "oldest-time-to-consider": "1457570074236" + } + + python_params = ["douglas adams", "42"] + + spark_submit_params = ["--class", "org.apache.spark.examples.SparkPi"] + + notebook_run = DatabricksRunNowOperator( + job_id=job_id, + notebook_params=notebook_params, + python_params=python_params, + spark_submit_params=spark_submit_params + ) + + In the case where both the json parameter **AND** the named parameters + are provided, they will be merged together. If there are conflicts during the merge, + the named parameters will take precedence and override the top level ``json`` keys. + + Currently the named parameters that ``DatabricksRunNowOperator`` supports are + - ``job_id`` + - ``json`` + - ``notebook_params`` + - ``python_params`` + - ``spark_submit_params`` + + + :param job_id: the job_id of the existing Databricks job. + This field will be templated. + + .. seealso:: + https://docs.databricks.com/api/latest/jobs.html#run-now + :type job_id: str + :param json: A JSON object containing API parameters which will be passed + directly to the ``api/2.0/jobs/run-now`` endpoint. The other named parameters + (i.e. ``notebook_params``, ``spark_submit_params``..) to this operator will + be merged with this json dictionary if they are provided. + If there are conflicts during the merge, the named parameters will + take precedence and override the top level json keys. (templated) + + .. seealso:: + For more information about templating see :ref:`jinja-templating`. + https://docs.databricks.com/api/latest/jobs.html#run-now + :type json: dict + :param notebook_params: A dict from keys to values for jobs with notebook task, + e.g. "notebook_params": {"name": "john doe", "age": "35"}. + The map is passed to the notebook and will be accessible through the + dbutils.widgets.get function. See Widgets for more information. + If not specified upon run-now, the triggered run will use the + job’s base parameters. notebook_params cannot be + specified in conjunction with jar_params. The json representation + of this field (i.e. {"notebook_params":{"name":"john doe","age":"35"}}) + cannot exceed 10,000 bytes. + This field will be templated. + + .. seealso:: + https://docs.databricks.com/user-guide/notebooks/widgets.html + :type notebook_params: dict + :param python_params: A list of parameters for jobs with python tasks, + e.g. "python_params": ["john doe", "35"]. + The parameters will be passed to python file as command line parameters. + If specified upon run-now, it would overwrite the parameters specified in + job setting. + The json representation of this field (i.e. {"python_params":["john doe","35"]}) + cannot exceed 10,000 bytes. + This field will be templated. + + .. seealso:: + https://docs.databricks.com/api/latest/jobs.html#run-now + :type python_params: list[str] + :param spark_submit_params: A list of parameters for jobs with spark submit task, + e.g. "spark_submit_params": ["--class", "org.apache.spark.examples.SparkPi"]. + The parameters will be passed to spark-submit script as command line parameters. + If specified upon run-now, it would overwrite the parameters specified + in job setting. + The json representation of this field cannot exceed 10,000 bytes. + This field will be templated. + + .. 
seealso:: + https://docs.databricks.com/api/latest/jobs.html#run-now + :type spark_submit_params: list[str] + :param timeout_seconds: The timeout for this run. By default a value of 0 is used + which means to have no timeout. + This field will be templated. + :type timeout_seconds: int32 + :param databricks_conn_id: The name of the Airflow connection to use. + By default and in the common case this will be ``databricks_default``. To use + token based authentication, provide the key ``token`` in the extra field for the + connection. + :type databricks_conn_id: str + :param polling_period_seconds: Controls the rate which we poll for the result of + this run. By default the operator will poll every 30 seconds. + :type polling_period_seconds: int + :param databricks_retry_limit: Amount of times retry if the Databricks backend is + unreachable. Its value must be greater than or equal to 1. + :type databricks_retry_limit: int + :param do_xcom_push: Whether we should push run_id and run_page_url to xcom. + :type do_xcom_push: bool + """ + # Used in airflow.models.BaseOperator + template_fields = ('json',) + # Databricks brand color (blue) under white text + ui_color = '#1CB1C2' + ui_fgcolor = '#fff' + + @apply_defaults + def __init__( + self, + job_id, + json=None, + notebook_params=None, + python_params=None, + spark_submit_params=None, + databricks_conn_id='databricks_default', + polling_period_seconds=30, + databricks_retry_limit=3, + databricks_retry_delay=1, + do_xcom_push=False, + **kwargs): - The reason why we have this function is because the ``self.json`` field must be a - dict with only string values. This is because ``render_template`` will fail - for numerical values. """ - c = self._deep_string_coerce - if isinstance(content, six.string_types): - return content - elif isinstance(content, six.integer_types + (float,)): - # Databricks can tolerate either numeric or string types in the API backend. - return str(content) - elif isinstance(content, (list, tuple)): - return [c(e, '{0}[{1}]'.format(json_path, i)) for i, e in enumerate(content)] - elif isinstance(content, dict): - return {k: c(v, '{0}[{1}]'.format(json_path, k)) - for k, v in list(content.items())} - else: - param_type = type(content) - msg = 'Type {0} used for parameter {1} is not a number or a string'\ - .format(param_type, json_path) - raise AirflowException(msg) + Creates a new ``DatabricksRunNowOperator``. + """ + super(DatabricksRunNowOperator, self).__init__(**kwargs) + self.json = json or {} + self.databricks_conn_id = databricks_conn_id + self.polling_period_seconds = polling_period_seconds + self.databricks_retry_limit = databricks_retry_limit + self.databricks_retry_delay = databricks_retry_delay - def _log_run_page_url(self, url): - self.log.info('View run status, Spark UI, and logs at %s', url) + if job_id is not None: + self.json['job_id'] = job_id + if notebook_params is not None: + self.json['notebook_params'] = notebook_params + if python_params is not None: + self.json['python_params'] = python_params + if spark_submit_params is not None: + self.json['spark_submit_params'] = spark_submit_params + + self.json = _deep_string_coerce(self.json) + # This variable will be used in case our task gets killed. 
+ self.run_id = None + self.do_xcom_push = do_xcom_push def get_hook(self): return DatabricksHook( self.databricks_conn_id, - retry_limit=self.databricks_retry_limit) + retry_limit=self.databricks_retry_limit, + retry_delay=self.databricks_retry_delay) def execute(self, context): hook = self.get_hook() - self.run_id = hook.submit_run(self.json) - if self.do_xcom_push: - context['ti'].xcom_push(key=XCOM_RUN_ID_KEY, value=self.run_id) - self.log.info('Run submitted with run_id: %s', self.run_id) - run_page_url = hook.get_run_page_url(self.run_id) - if self.do_xcom_push: - context['ti'].xcom_push(key=XCOM_RUN_PAGE_URL_KEY, value=run_page_url) - self._log_run_page_url(run_page_url) - while True: - run_state = hook.get_run_state(self.run_id) - if run_state.is_terminal: - if run_state.is_successful: - self.log.info('%s completed successfully.', self.task_id) - self._log_run_page_url(run_page_url) - return - else: - error_message = '{t} failed with terminal state: {s}'.format( - t=self.task_id, - s=run_state) - raise AirflowException(error_message) - else: - self.log.info('%s in run state: %s', self.task_id, run_state) - self._log_run_page_url(run_page_url) - self.log.info('Sleeping for %s seconds.', self.polling_period_seconds) - time.sleep(self.polling_period_seconds) + self.run_id = hook.run_now(self.json) + _handle_databricks_operator_execution(self, hook, self.log, context) def on_kill(self): hook = self.get_hook() diff --git a/airflow/contrib/operators/dataflow_operator.py b/airflow/contrib/operators/dataflow_operator.py index e3c8c1fff1572..e880642f6067c 100644 --- a/airflow/contrib/operators/dataflow_operator.py +++ b/airflow/contrib/operators/dataflow_operator.py @@ -16,7 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - +import os import re import uuid import copy @@ -33,6 +33,77 @@ class DataFlowJavaOperator(BaseOperator): Start a Java Cloud DataFlow batch job. The parameters of the operation will be passed to the job. + **Example**: :: + + default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': + (2016, 8, 1), + 'email': ['alex@vanboxel.be'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 1, + 'retry_delay': timedelta(minutes=30), + 'dataflow_default_options': { + 'project': 'my-gcp-project', + 'zone': 'us-central1-f', + 'stagingLocation': 'gs://bucket/tmp/dataflow/staging/', + } + } + + dag = DAG('test-dag', default_args=default_args) + + task = DataFlowJavaOperator( + gcp_conn_id='gcp_default', + task_id='normalize-cal', + jar='{{var.value.gcp_dataflow_base}}pipeline-ingress-cal-normalize-1.0.jar', + options={ + 'autoscalingAlgorithm': 'BASIC', + 'maxNumWorkers': '50', + 'start': '{{ds}}', + 'partitionType': 'DAY' + + }, + dag=dag) + + .. seealso:: + For more detail on job submission have a look at the reference: + https://cloud.google.com/dataflow/pipelines/specifying-exec-params + + :param jar: The reference to a self executing DataFlow jar (templated). + :type jar: str + :param job_name: The 'jobName' to use when executing the DataFlow job + (templated). This ends up being set in the pipeline options, so any entry + with key ``'jobName'`` in ``options`` will be overwritten. + :type job_name: str + :param dataflow_default_options: Map of default job options. + :type dataflow_default_options: dict + :param options: Map of job specific options. 
+ :type options: dict + :param gcp_conn_id: The connection ID to use connecting to Google Cloud + Platform. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + :param poll_sleep: The time in seconds to sleep between polling Google + Cloud Platform for the dataflow job status while the job is in the + JOB_STATE_RUNNING state. + :type poll_sleep: int + :param job_class: The name of the dataflow job class to be executed, it + is often not the main class configured in the dataflow jar file. + :type job_class: str + + ``jar``, ``options``, and ``job_name`` are templated so you can use variables in them. + + Note that both + ``dataflow_default_options`` and ``options`` will be merged to specify pipeline + execution parameter, and ``dataflow_default_options`` is expected to save + high-level options, for instances, project and zone information, which + apply to all dataflow operators in the DAG. + It's a good practice to define dataflow_* parameters in the default_args of the dag like the project, zone and staging location. @@ -53,7 +124,7 @@ class DataFlowJavaOperator(BaseOperator): .. code-block:: python - t1 = DataFlowOperation( + t1 = DataFlowJavaOperator( task_id='datapflow_example', jar='{{var.value.gcp_dataflow_base}}pipeline/build/libs/pipeline-example-1.0.jar', options={ @@ -66,15 +137,15 @@ class DataFlowJavaOperator(BaseOperator): gcp_conn_id='gcp-airflow-service-account', dag=my-dag) - Both ``jar`` and ``options`` are templated so you can use variables in them. """ - template_fields = ['options', 'jar'] + template_fields = ['options', 'jar', 'job_name'] ui_color = '#0273d4' @apply_defaults def __init__( self, jar, + job_name='{{task.task_id}}', dataflow_default_options=None, options=None, gcp_conn_id='google_cloud_default', @@ -83,39 +154,6 @@ def __init__( job_class=None, *args, **kwargs): - """ - Create a new DataFlowJavaOperator. Note that both - dataflow_default_options and options will be merged to specify pipeline - execution parameter, and dataflow_default_options is expected to save - high-level options, for instances, project and zone information, which - apply to all dataflow operators in the DAG. - - - .. seealso:: - For more detail on job submission have a look at the reference: - https://cloud.google.com/dataflow/pipelines/specifying-exec-params - - :param jar: The reference to a self executing DataFlow jar. - :type jar: string - :param dataflow_default_options: Map of default job options. - :type dataflow_default_options: dict - :param options: Map of job specific options. - :type options: dict - :param gcp_conn_id: The connection ID to use connecting to Google Cloud - Platform. - :type gcp_conn_id: string - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must have - domain-wide delegation enabled. - :type delegate_to: string - :param poll_sleep: The time in seconds to sleep between polling Google - Cloud Platform for the dataflow job status while the job is in the - JOB_STATE_RUNNING state. - :type poll_sleep: int - :param job_class: The name of the dataflow job class to be executued, it - is often not the main class configured in the dataflow jar file. 
- :type job_class: string - """ super(DataFlowJavaOperator, self).__init__(*args, **kwargs) dataflow_default_options = dataflow_default_options or {} @@ -125,6 +163,7 @@ def __init__( self.gcp_conn_id = gcp_conn_id self.delegate_to = delegate_to self.jar = jar + self.job_name = job_name self.dataflow_default_options = dataflow_default_options self.options = options self.poll_sleep = poll_sleep @@ -141,7 +180,7 @@ def execute(self, context): dataflow_options = copy.copy(self.dataflow_default_options) dataflow_options.update(self.options) - hook.start_java_dataflow(self.task_id, dataflow_options, + hook.start_java_dataflow(self.job_name, dataflow_options, self.jar, self.job_class) @@ -149,6 +188,27 @@ class DataflowTemplateOperator(BaseOperator): """ Start a Templated Cloud DataFlow batch job. The parameters of the operation will be passed to the job. + + :param template: The reference to the DataFlow template. + :type template: str + :param job_name: The 'jobName' to use when executing the DataFlow template + (templated). + :param dataflow_default_options: Map of default job environment options. + :type dataflow_default_options: dict + :param parameters: Map of job specific parameters for the template. + :type parameters: dict + :param gcp_conn_id: The connection ID to use connecting to Google Cloud + Platform. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + :param poll_sleep: The time in seconds to sleep between polling Google + Cloud Platform for the dataflow job status while the job is in the + JOB_STATE_RUNNING state. + :type poll_sleep: int + It's a good practice to define dataflow_* parameters in the default_args of the dag like the project, zone and staging location. @@ -160,9 +220,10 @@ class DataflowTemplateOperator(BaseOperator): default_args = { 'dataflow_default_options': { - 'project': 'my-gcp-project' + 'project': 'my-gcp-project', + 'region': 'europe-west1', 'zone': 'europe-west1-d', - 'tempLocation': 'gs://my-staging-bucket/staging/' + 'tempLocation': 'gs://my-staging-bucket/staging/', } } } @@ -183,16 +244,27 @@ class DataflowTemplateOperator(BaseOperator): gcp_conn_id='gcp-airflow-service-account', dag=my-dag) - ``template``, ``dataflow_default_options`` and ``parameters`` are templated so you can - use variables in them. + ``template``, ``dataflow_default_options``, ``parameters``, and ``job_name`` are + templated so you can use variables in them. + + Note that ``dataflow_default_options`` is expected to save high-level options + for project information, which apply to all dataflow operators in the DAG. + + .. seealso:: + https://cloud.google.com/dataflow/docs/reference/rest/v1b3 + /LaunchTemplateParameters + https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment + For more detail on job template execution have a look at the reference: + https://cloud.google.com/dataflow/docs/templates/executing-templates """ - template_fields = ['parameters', 'dataflow_default_options', 'template'] + template_fields = ['parameters', 'dataflow_default_options', 'template', 'job_name'] ui_color = '#0273d4' @apply_defaults def __init__( self, template, + job_name='{{task.task_id}}', dataflow_default_options=None, parameters=None, gcp_conn_id='google_cloud_default', @@ -200,36 +272,6 @@ def __init__( poll_sleep=10, *args, **kwargs): - """ - Create a new DataflowTemplateOperator. 
Note that - dataflow_default_options is expected to save high-level options - for project information, which apply to all dataflow operators in the DAG. - - .. seealso:: - https://cloud.google.com/dataflow/docs/reference/rest/v1b3 - /LaunchTemplateParameters - https://cloud.google.com/dataflow/docs/reference/rest/v1b3/RuntimeEnvironment - For more detail on job template execution have a look at the reference: - https://cloud.google.com/dataflow/docs/templates/executing-templates - - :param template: The reference to the DataFlow template. - :type template: string - :param dataflow_default_options: Map of default job environment options. - :type dataflow_default_options: dict - :param parameters: Map of job specific parameters for the template. - :type parameters: dict - :param gcp_conn_id: The connection ID to use connecting to Google Cloud - Platform. - :type gcp_conn_id: string - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must have - domain-wide delegation enabled. - :type delegate_to: string - :param poll_sleep: The time in seconds to sleep between polling Google - Cloud Platform for the dataflow job status while the job is in the - JOB_STATE_RUNNING state. - :type poll_sleep: int - """ super(DataflowTemplateOperator, self).__init__(*args, **kwargs) dataflow_default_options = dataflow_default_options or {} @@ -240,6 +282,7 @@ def __init__( self.dataflow_default_options = dataflow_default_options self.poll_sleep = poll_sleep self.template = template + self.job_name = job_name self.parameters = parameters def execute(self, context): @@ -247,18 +290,54 @@ def execute(self, context): delegate_to=self.delegate_to, poll_sleep=self.poll_sleep) - hook.start_template_dataflow(self.task_id, self.dataflow_default_options, + hook.start_template_dataflow(self.job_name, self.dataflow_default_options, self.parameters, self.template) class DataFlowPythonOperator(BaseOperator): + """ + Launching Cloud Dataflow jobs written in python. Note that both + dataflow_default_options and options will be merged to specify pipeline + execution parameter, and dataflow_default_options is expected to save + high-level options, for instances, project and zone information, which + apply to all dataflow operators in the DAG. - template_fields = ['options', 'dataflow_default_options'] + .. seealso:: + For more detail on job submission have a look at the reference: + https://cloud.google.com/dataflow/pipelines/specifying-exec-params + + :param py_file: Reference to the python dataflow pipeline file.py, e.g., + /some/local/file/path/to/your/python/pipeline/file. + :type py_file: str + :param job_name: The 'job_name' to use when executing the DataFlow job + (templated). This ends up being set in the pipeline options, so any entry + with key ``'jobName'`` or ``'job_name'`` in ``options`` will be overwritten. + :type job_name: str + :param py_options: Additional python options, e.g., ["-m", "-v"]. + :type pyt_options: list[str] + :param dataflow_default_options: Map of default job options. + :type dataflow_default_options: dict + :param options: Map of job specific options. + :type options: dict + :param gcp_conn_id: The connection ID to use connecting to Google Cloud + Platform. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. 
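**Usage sketch** (hedged; file path, project and buckets are placeholders) for the python operator with the new templated ``job_name``::

    from datetime import datetime

    from airflow import DAG
    from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

    dag = DAG('example_dataflow_py', start_date=datetime(2018, 1, 1))

    run_pipeline = DataFlowPythonOperator(
        task_id='run_pipeline',
        py_file='/home/airflow/dags/pipelines/wordcount.py',
        job_name='wordcount-{{ ds_nodash }}',
        dataflow_default_options={
            'project': 'my-gcp-project',
            'tempLocation': 'gs://my-staging-bucket/tmp/',
        },
        options={'output': 'gs://my-output-bucket/wordcount/{{ ds }}/'},
        poll_sleep=30,
        dag=dag)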
+ :type delegate_to: str + :param poll_sleep: The time in seconds to sleep between polling Google + Cloud Platform for the dataflow job status while the job is in the + JOB_STATE_RUNNING state. + :type poll_sleep: int + """ + template_fields = ['options', 'dataflow_default_options', 'job_name'] @apply_defaults def __init__( self, py_file, + job_name='{{task.task_id}}', py_options=None, dataflow_default_options=None, options=None, @@ -267,41 +346,11 @@ def __init__( poll_sleep=10, *args, **kwargs): - """ - Create a new DataFlowPythonOperator. Note that both - dataflow_default_options and options will be merged to specify pipeline - execution parameter, and dataflow_default_options is expected to save - high-level options, for instances, project and zone information, which - apply to all dataflow operators in the DAG. - .. seealso:: - For more detail on job submission have a look at the reference: - https://cloud.google.com/dataflow/pipelines/specifying-exec-params - - :param py_file: Reference to the python dataflow pipleline file.py, e.g., - /some/local/file/path/to/your/python/pipeline/file. - :type py_file: string - :param py_options: Additional python options. - :type pyt_options: list of strings, e.g., ["-m", "-v"]. - :param dataflow_default_options: Map of default job options. - :type dataflow_default_options: dict - :param options: Map of job specific options. - :type options: dict - :param gcp_conn_id: The connection ID to use connecting to Google Cloud - Platform. - :type gcp_conn_id: string - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must have - domain-wide delegation enabled. - :type delegate_to: string - :param poll_sleep: The time in seconds to sleep between polling Google - Cloud Platform for the dataflow job status while the job is in the - JOB_STATE_RUNNING state. - :type poll_sleep: int - """ super(DataFlowPythonOperator, self).__init__(*args, **kwargs) self.py_file = py_file + self.job_name = job_name self.py_options = py_options or [] self.dataflow_default_options = dataflow_default_options or {} self.options = options or {} @@ -327,11 +376,11 @@ def execute(self, context): formatted_options = {camel_to_snake(key): dataflow_options[key] for key in dataflow_options} hook.start_python_dataflow( - self.task_id, formatted_options, + self.job_name, formatted_options, self.py_file, self.py_options) -class GoogleCloudBucketHelper(): +class GoogleCloudBucketHelper(object): """GoogleCloudStorageHook helper class to download GCS object.""" GCS_PREFIX_LENGTH = 5 @@ -348,9 +397,9 @@ def google_cloud_to_local(self, file_name): will be returned immediately. :param file_name: The full path of input file. - :type file_name: string + :type file_name: str :return: The full path of local file. - :type: string + :rtype: str """ if not file_name.startswith('gs://'): return file_name @@ -358,19 +407,19 @@ def google_cloud_to_local(self, file_name): # Extracts bucket_id and object_id by first removing 'gs://' prefix and # then split the remaining by path delimiter '/'. path_components = file_name[self.GCS_PREFIX_LENGTH:].split('/') - if path_components < 2: + if len(path_components) < 2: raise Exception( - 'Invalid Google Cloud Storage (GCS) object path: {}.' 
+ 'Invalid Google Cloud Storage (GCS) object path: {}' .format(file_name)) bucket_id = path_components[0] object_id = '/'.join(path_components[1:]) - local_file = '/tmp/dataflow{}-{}'.format(str(uuid.uuid1())[:8], + local_file = '/tmp/dataflow{}-{}'.format(str(uuid.uuid4())[:8], path_components[-1]) - file_size = self._gcs_hook.download(bucket_id, object_id, local_file) + self._gcs_hook.download(bucket_id, object_id, local_file) - if file_size > 0: + if os.stat(local_file).st_size > 0: return local_file raise Exception( - 'Failed to download Google Cloud Storage GCS object: {}' + 'Failed to download Google Cloud Storage (GCS) object: {}' .format(file_name)) diff --git a/airflow/contrib/operators/dataproc_operator.py b/airflow/contrib/operators/dataproc_operator.py index 5d59f7fb6eb58..f525db96836e6 100644 --- a/airflow/contrib/operators/dataproc_operator.py +++ b/airflow/contrib/operators/dataproc_operator.py @@ -48,37 +48,50 @@ class DataprocClusterCreateOperator(BaseOperator): parameters detailed in the link are available as a parameter to this operator. :param cluster_name: The name of the DataProc cluster to create. (templated) - :type cluster_name: string + :type cluster_name: str :param project_id: The ID of the google cloud project in which to create the cluster. (templated) - :type project_id: string - :param num_workers: The # of workers to spin up + :type project_id: str + :param num_workers: The # of workers to spin up. If set to zero will + spin up cluster in a single node mode :type num_workers: int :param storage_bucket: The storage bucket to use, setting to None lets dataproc generate a custom one for you - :type storage_bucket: string + :type storage_bucket: str :param init_actions_uris: List of GCS uri's containing dataproc initialization scripts - :type init_actions_uris: list[string] + :type init_actions_uris: list[str] :param init_action_timeout: Amount of time executable scripts in init_actions_uris has to complete - :type init_action_timeout: string + :type init_action_timeout: str :param metadata: dict of key-value google compute engine metadata entries to add to all instances :type metadata: dict :param image_version: the version of software inside the Dataproc cluster - :type image_version: string + :type image_version: str + :param custom_image: custom Dataproc image for more info see + https://cloud.google.com/dataproc/docs/guides/dataproc-images + :type: custom_image: str :param properties: dict of properties to set on config files (e.g. spark-defaults.conf), see - https://cloud.google.com/dataproc/docs/reference/rest/v1/ \ - projects.regions.clusters#SoftwareConfig + https://cloud.google.com/dataproc/docs/reference/rest/v1/projects.regions.clusters#SoftwareConfig :type properties: dict :param master_machine_type: Compute engine machine type to use for the master node - :type master_machine_type: string + :type master_machine_type: str + :param master_disk_type: Type of the boot disk for the master node + (default is ``pd-standard``). + Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or + ``pd-standard`` (Persistent Disk Hard Disk Drive). + :type master_disk_type: str :param master_disk_size: Disk size for the master node :type master_disk_size: int :param worker_machine_type: Compute engine machine type to use for the worker nodes - :type worker_machine_type: string + :type worker_machine_type: str + :param worker_disk_type: Type of the boot disk for the worker node + (default is ``pd-standard``). 
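Editor's note: the bucket/object split performed by ``google_cloud_to_local`` above can be checked in isolation. The snippet below only restates the parsing logic from this diff for illustration and needs no GCS access::

    GCS_PREFIX_LENGTH = 5  # length of 'gs://'

    def split_gcs_path(file_name):
        # Mirrors GoogleCloudBucketHelper: strip 'gs://', then split on '/'.
        path_components = file_name[GCS_PREFIX_LENGTH:].split('/')
        if len(path_components) < 2:
            raise ValueError(
                'Invalid Google Cloud Storage (GCS) object path: {}'.format(file_name))
        return path_components[0], '/'.join(path_components[1:])

    assert split_gcs_path('gs://my-bucket/path/to/file.py') == ('my-bucket', 'path/to/file.py')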
+ Valid values: ``pd-ssd`` (Persistent Disk Solid State Drive) or + ``pd-standard`` (Persistent Disk Hard Disk Drive). + :type worker_disk_type: str :param worker_disk_size: Disk size for the worker nodes :type worker_disk_size: int :param num_preemptible_workers: The # of preemptible worker nodes to spin up @@ -86,30 +99,31 @@ class DataprocClusterCreateOperator(BaseOperator): :param labels: dict of labels to add to the cluster :type labels: dict :param zone: The zone where the cluster will be located. (templated) - :type zone: string + :type zone: str :param network_uri: The network uri to be used for machine communication, cannot be specified with subnetwork_uri - :type network_uri: string + :type network_uri: str :param subnetwork_uri: The subnetwork uri to be used for machine communication, cannot be specified with network_uri - :type subnetwork_uri: string + :type subnetwork_uri: str :param internal_ip_only: If true, all instances in the cluster will only have internal IP addresses. This can only be enabled for subnetwork enabled networks :type internal_ip_only: bool :param tags: The GCE tags to add to all instances - :type tags: list[string] + :type tags: list[str] :param region: leave as 'global', might become relevant in the future. (templated) + :type region: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param service_account: The service account of the dataproc instances. - :type service_account: string + :type service_account: str :param service_account_scopes: The URIs of service account scopes to be included. - :type service_account_scopes: list[string] + :type service_account_scopes: list[str] :param idle_delete_ttl: The longest duration that cluster would keep alive while staying idle. Passing this threshold will cause cluster to be auto-deleted. A duration in seconds. 
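Editor's note: a hedged example of the new cluster-creation options documented above (project, cluster and image names are made up): explicit boot-disk types, and single-node mode via ``num_workers=0``, which rules out preemptible workers and, here, uses ``custom_image`` instead of ``image_version``::

    from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator

    create_cluster = DataprocClusterCreateOperator(
        task_id='create-cluster',
        project_id='my-gcp-project',
        cluster_name='analytics-{{ ds_nodash }}',
        num_workers=0,                    # 0 => single-node cluster, so no preemptible workers
        master_machine_type='n1-standard-4',
        master_disk_type='pd-ssd',
        master_disk_size=100,
        worker_disk_type='pd-standard',
        custom_image='my-custom-dataproc-image',  # placeholder; mutually exclusive with image_version
        zone='europe-west1-d',
        dag=dag)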
@@ -138,11 +152,14 @@ def __init__(self, init_actions_uris=None, init_action_timeout="10m", metadata=None, + custom_image=None, image_version=None, properties=None, master_machine_type='n1-standard-4', + master_disk_type='pd-standard', master_disk_size=500, worker_machine_type='n1-standard-4', + worker_disk_type='pd-standard', worker_disk_size=500, num_preemptible_workers=0, labels=None, @@ -168,11 +185,14 @@ def __init__(self, self.init_actions_uris = init_actions_uris self.init_action_timeout = init_action_timeout self.metadata = metadata + self.custom_image = custom_image self.image_version = image_version - self.properties = properties + self.properties = properties or dict() self.master_machine_type = master_machine_type + self.master_disk_type = master_disk_type self.master_disk_size = master_disk_size self.worker_machine_type = worker_machine_type + self.worker_disk_type = worker_disk_type self.worker_disk_size = worker_disk_size self.labels = labels self.zone = zone @@ -186,6 +206,19 @@ def __init__(self, self.idle_delete_ttl = idle_delete_ttl self.auto_delete_time = auto_delete_time self.auto_delete_ttl = auto_delete_ttl + self.single_node = num_workers == 0 + + assert not (self.custom_image and self.image_version), \ + "custom_image and image_version can't be both set" + + assert ( + not self.single_node or ( + self.single_node and self.num_preemptible_workers == 0 + ) + ), "num_workers == 0 means single node mode - no preemptibles allowed" + + assert not (self.custom_image and self.image_version), \ + "custom_image and image_version can't be both set" def _get_cluster_list_for_project(self, service): result = service.projects().regions().clusters().list( @@ -272,6 +305,7 @@ def _build_cluster_data(self): 'numInstances': 1, 'machineTypeUri': master_type_uri, 'diskConfig': { + 'bootDiskType': self.master_disk_type, 'bootDiskSizeGb': self.master_disk_size } }, @@ -279,6 +313,7 @@ def _build_cluster_data(self): 'numInstances': self.num_workers, 'machineTypeUri': worker_type_uri, 'diskConfig': { + 'bootDiskType': self.worker_disk_type, 'bootDiskSizeGb': self.worker_disk_size } }, @@ -292,6 +327,7 @@ def _build_cluster_data(self): 'numInstances': self.num_preemptible_workers, 'machineTypeUri': worker_type_uri, 'diskConfig': { + 'bootDiskType': self.worker_disk_type, 'bootDiskSizeGb': self.worker_disk_size }, 'isPreemptible': True @@ -321,6 +357,17 @@ def _build_cluster_data(self): cluster_data['config']['gceClusterConfig']['tags'] = self.tags if self.image_version: cluster_data['config']['softwareConfig']['imageVersion'] = self.image_version + elif self.custom_image: + custom_image_url = 'https://www.googleapis.com/compute/beta/projects/' \ + '{}/global/images/{}'.format(self.project_id, + self.custom_image) + cluster_data['config']['masterConfig']['imageUri'] = custom_image_url + if not self.single_node: + cluster_data['config']['workerConfig']['imageUri'] = custom_image_url + + if self.single_node: + self.properties["dataproc:dataproc.allow.zero.workers"] = "true" + if self.properties: cluster_data['config']['softwareConfig']['properties'] = self.properties if self.idle_delete_ttl: @@ -395,39 +442,39 @@ class DataprocClusterScaleOperator(BaseOperator): **Example**: :: - t1 = DataprocClusterScaleOperator( - task_id='dataproc_scale', - project_id='my-project', - cluster_name='cluster-1', - num_workers=10, - num_preemptible_workers=10, - graceful_decommission_timeout='1h' - dag=dag) + t1 = DataprocClusterScaleOperator( + task_id='dataproc_scale', + project_id='my-project', + 
cluster_name='cluster-1', + num_workers=10, + num_preemptible_workers=10, + graceful_decommission_timeout='1h', + dag=dag) .. seealso:: For more detail on about scaling clusters have a look at the reference: https://cloud.google.com/dataproc/docs/concepts/configuring-clusters/scaling-clusters :param cluster_name: The name of the cluster to scale. (templated) - :type cluster_name: string + :type cluster_name: str :param project_id: The ID of the google cloud project in which the cluster runs. (templated) - :type project_id: string + :type project_id: str :param region: The region for the dataproc cluster. (templated) - :type region: string + :type region: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param num_workers: The new number of workers :type num_workers: int :param num_preemptible_workers: The new number of preemptible workers :type num_preemptible_workers: int :param graceful_decommission_timeout: Timeout for graceful YARN decomissioning. Maximum value is 1d - :type graceful_decommission_timeout: string + :type graceful_decommission_timeout: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ['cluster_name', 'project_id', 'region'] @@ -491,7 +538,8 @@ def _build_scale_cluster_data(self): } return scale_data - def _get_graceful_decommission_timeout(self, timeout): + @staticmethod + def _get_graceful_decommission_timeout(timeout): match = re.match(r"^(\d+)(s|m|h|d)$", timeout) if match: if match.group(2) == "s": @@ -542,18 +590,18 @@ class DataprocClusterDeleteOperator(BaseOperator): cluster is destroyed. :param cluster_name: The name of the cluster to create. (templated) - :type cluster_name: string + :type cluster_name: str :param project_id: The ID of the google cloud project in which the cluster runs. (templated) - :type project_id: string + :type project_id: str :param region: leave as 'global', might become relevant in the future. (templated) - :type region: string + :type region: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ['cluster_name', 'project_id', 'region'] @@ -575,7 +623,8 @@ def __init__(self, self.project_id = project_id self.region = region - def _wait_for_done(self, service, operation_name): + @staticmethod + def _wait_for_done(service, operation_name): time.sleep(15) while True: response = service.projects().regions().operations().get( @@ -643,18 +692,18 @@ class DataProcPigOperator(BaseOperator): :param query: The query or reference to the query file (pg or pig extension). (templated) - :type query: string + :type query: str :param query_uri: The uri of a pig script on Cloud Storage. - :type query_uri: string + :type query_uri: str :param variables: Map of named parameters for the query. (templated) :type variables: dict :param job_name: The job name used in the DataProc cluster. This name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. 
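Editor's note: the ``graceful_decommission_timeout`` string is parsed with the pattern ``^(\d+)(s|m|h|d)$`` shown above. The stand-alone sketch below uses the same regex and unit handling, written here only to show which values are accepted::

    import re

    def to_seconds(timeout):
        # Mirrors DataprocClusterScaleOperator._get_graceful_decommission_timeout
        match = re.match(r"^(\d+)(s|m|h|d)$", timeout)
        if not match:
            raise ValueError("Unexpected timeout format, expected e.g. '30s', '5m', '1h' or '1d'")
        value, unit = int(match.group(1)), match.group(2)
        return value * {'s': 1, 'm': 60, 'h': 3600, 'd': 86400}[unit]

    assert to_seconds('30s') == 30
    assert to_seconds('1h') == 3600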
(templated) - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. (templated) - :type cluster_name: string + :type cluster_name: str :param dataproc_pig_properties: Map for the Pig properties. Ideal to put in default arguments :type dataproc_pig_properties: dict @@ -662,15 +711,27 @@ class DataProcPigOperator(BaseOperator): UDFs and libs) and are ideal to put in default arguments. :type dataproc_pig_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] template_ext = ('.pg', '.pig',) ui_color = '#0273d4' @@ -687,6 +748,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -701,6 +763,7 @@ def __init__( self.dataproc_properties = dataproc_pig_properties self.dataproc_jars = dataproc_pig_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, @@ -716,7 +779,10 @@ def execute(self, context): job.add_jar_file_uris(self.dataproc_jars) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataProcHiveOperator(BaseOperator): @@ -724,17 +790,17 @@ class DataProcHiveOperator(BaseOperator): Start a Hive query Job on a Cloud DataProc cluster. :param query: The query or reference to the query file (q extension). - :type query: string + :type query: str :param query_uri: The uri of a hive script on Cloud Storage. - :type query_uri: string + :type query_uri: str :param variables: Map of named parameters for the query. :type variables: dict :param job_name: The job name used in the DataProc cluster. This name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. 
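Editor's note: the ``job_error_states`` / ``dataproc_job_id`` pattern introduced here for the Pig operator recurs for the Hive, Spark SQL, Spark, Hadoop and PySpark operators below, so a single hedged usage sketch should suffice; query, cluster and region values are placeholders::

    from airflow.contrib.operators.dataproc_operator import DataProcPigOperator

    run_pig = DataProcPigOperator(
        task_id='run-pig',
        query='sh ls',                    # any Pig statement; placeholder only
        cluster_name='analytics-cluster',
        region='europe-west1',
        # Treat cancelled jobs as failures too, not only ERROR.
        job_error_states=['ERROR', 'CANCELLED'],
        dag=dag)

    # After execute() runs, run_pig.dataproc_job_id holds the jobId actually submitted
    # to the Dataproc API (per the docstring, the job name plus an 8 character random suffix).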
- :type cluster_name: string + :type cluster_name: str :param dataproc_hive_properties: Map for the Pig properties. Ideal to put in default arguments :type dataproc_hive_properties: dict @@ -742,15 +808,27 @@ class DataProcHiveOperator(BaseOperator): UDFs and libs) and are ideal to put in default arguments. :type dataproc_hive_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] template_ext = ('.q',) ui_color = '#0273d4' @@ -767,6 +845,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -781,6 +860,7 @@ def __init__( self.dataproc_properties = dataproc_hive_properties self.dataproc_jars = dataproc_hive_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, @@ -797,7 +877,10 @@ def execute(self, context): job.add_jar_file_uris(self.dataproc_jars) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataProcSparkSqlOperator(BaseOperator): @@ -805,18 +888,18 @@ class DataProcSparkSqlOperator(BaseOperator): Start a Spark SQL query Job on a Cloud DataProc cluster. :param query: The query or reference to the query file (q extension). (templated) - :type query: string + :type query: str :param query_uri: The uri of a spark sql script on Cloud Storage. - :type query_uri: string + :type query_uri: str :param variables: Map of named parameters for the query. (templated) :type variables: dict :param job_name: The job name used in the DataProc cluster. This name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. (templated) - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. (templated) - :type cluster_name: string + :type cluster_name: str :param dataproc_spark_properties: Map for the Pig properties. 
Ideal to put in default arguments :type dataproc_spark_properties: dict @@ -824,15 +907,27 @@ class DataProcSparkSqlOperator(BaseOperator): for UDFs and libs) and are ideal to put in default arguments. :type dataproc_spark_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['query', 'variables', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] template_ext = ('.q',) ui_color = '#0273d4' @@ -849,6 +944,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -863,6 +959,7 @@ def __init__( self.dataproc_properties = dataproc_spark_properties self.dataproc_jars = dataproc_spark_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, @@ -879,7 +976,10 @@ def execute(self, context): job.add_jar_file_uris(self.dataproc_jars) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataProcSparkOperator(BaseOperator): @@ -888,10 +988,10 @@ class DataProcSparkOperator(BaseOperator): :param main_jar: URI of the job jar provisioned on Cloud Storage. (use this or the main_class, not both together). - :type main_jar: string + :type main_jar: str :param main_class: Name of the job class. (use this or the main_jar, not both together). - :type main_class: string + :type main_class: str :param arguments: Arguments for the job. (templated) :type arguments: list :param archives: List of archived files that will be unpacked in the work @@ -903,9 +1003,9 @@ class DataProcSparkOperator(BaseOperator): name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. (templated) - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. (templated) - :type cluster_name: string + :type cluster_name: str :param dataproc_spark_properties: Map for the Pig properties. 
Ideal to put in default arguments :type dataproc_spark_properties: dict @@ -913,16 +1013,28 @@ class DataProcSparkOperator(BaseOperator): for UDFs and libs) and are ideal to put in default arguments. :type dataproc_spark_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['arguments', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['arguments', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] ui_color = '#0273d4' @apply_defaults @@ -940,6 +1052,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -956,6 +1069,7 @@ def __init__( self.dataproc_properties = dataproc_spark_properties self.dataproc_jars = dataproc_spark_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, @@ -970,7 +1084,10 @@ def execute(self, context): job.add_file_uris(self.files) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataProcHadoopOperator(BaseOperator): @@ -979,10 +1096,10 @@ class DataProcHadoopOperator(BaseOperator): :param main_jar: URI of the job jar provisioned on Cloud Storage. (use this or the main_class, not both together). - :type main_jar: string + :type main_jar: str :param main_class: Name of the job class. (use this or the main_jar, not both together). - :type main_class: string + :type main_class: str :param arguments: Arguments for the job. (templated) :type arguments: list :param archives: List of archived files that will be unpacked in the work @@ -994,9 +1111,9 @@ class DataProcHadoopOperator(BaseOperator): name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. (templated) - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. (templated) - :type cluster_name: string + :type cluster_name: str :param dataproc_hadoop_properties: Map for the Pig properties. 
Ideal to put in default arguments :type dataproc_hadoop_properties: dict @@ -1004,16 +1121,28 @@ class DataProcHadoopOperator(BaseOperator): for UDFs and libs) and are ideal to put in default arguments. :type dataproc_hadoop_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['arguments', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['arguments', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] ui_color = '#0273d4' @apply_defaults @@ -1031,6 +1160,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -1047,6 +1177,7 @@ def __init__( self.dataproc_properties = dataproc_hadoop_properties self.dataproc_jars = dataproc_hadoop_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook(gcp_conn_id=self.gcp_conn_id, @@ -1061,7 +1192,10 @@ def execute(self, context): job.add_file_uris(self.files) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataProcPySparkOperator(BaseOperator): @@ -1070,7 +1204,7 @@ class DataProcPySparkOperator(BaseOperator): :param main: [Required] The Hadoop Compatible Filesystem (HCFS) URI of the main Python file to use as the driver. Must be a .py file. - :type main: string + :type main: str :param arguments: Arguments for the job. (templated) :type arguments: list :param archives: List of archived files that will be unpacked in the work @@ -1085,9 +1219,9 @@ class DataProcPySparkOperator(BaseOperator): name by default is the task_id appended with the execution data, but can be templated. The name will always be appended with a random number to avoid name clashes. (templated) - :type job_name: string + :type job_name: str :param cluster_name: The name of the DataProc cluster. - :type cluster_name: string + :type cluster_name: str :param dataproc_pyspark_properties: Map for the Pig properties. Ideal to put in default arguments :type dataproc_pyspark_properties: dict @@ -1095,22 +1229,34 @@ class DataProcPySparkOperator(BaseOperator): for UDFs and libs) and are ideal to put in default arguments. 
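Editor's note: a short, hedged Hadoop job sketch tied to the operator above. ``main_jar`` and ``main_class`` are mutually exclusive; the examples jar path is the one commonly shipped on Dataproc images and may vary by image version, so treat it as an assumption::

    from airflow.contrib.operators.dataproc_operator import DataProcHadoopOperator

    hadoop_wordcount = DataProcHadoopOperator(
        task_id='hadoop-wordcount',
        # Examples jar bundled with Dataproc images; path may differ by image version.
        main_jar='file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar',
        arguments=['wordcount',
                   'gs://my-example-bucket/input.txt',
                   'gs://my-example-bucket/wordcount-output/'],
        cluster_name='analytics-cluster',
        region='europe-west1',
        dag=dag)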
:type dataproc_pyspark_jars: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param region: The specified region where the dataproc cluster is created. - :type region: string + :type region: str + :param job_error_states: Job states that should be considered error states. + Any states in this list will result in an error being raised and failure of the + task. Eg, if the ``CANCELLED`` state should also be considered a task failure, + pass in ``['ERROR', 'CANCELLED']``. Possible values are currently only + ``'ERROR'`` and ``'CANCELLED'``, but could change in the future. Defaults to + ``['ERROR']``. + :type job_error_states: list + :var dataproc_job_id: The actual "jobId" as submitted to the Dataproc API. + This is useful for identifying or linking to the job in the Google Cloud Console + Dataproc UI, as the actual "jobId" submitted to the Dataproc API is appended with + an 8 character random string. + :vartype dataproc_job_id: str """ - template_fields = ['arguments', 'job_name', 'cluster_name', 'dataproc_jars'] + template_fields = ['arguments', 'job_name', 'cluster_name', 'region', 'dataproc_jars'] ui_color = '#0273d4' @staticmethod def _generate_temp_filename(filename): dt = time.strftime('%Y%m%d%H%M%S') - return "{}_{}_{}".format(dt, str(uuid.uuid1())[:8], ntpath.basename(filename)) + return "{}_{}_{}".format(dt, str(uuid.uuid4())[:8], ntpath.basename(filename)) """ Upload a local file to a Google Cloud Storage bucket @@ -1149,6 +1295,7 @@ def __init__( gcp_conn_id='google_cloud_default', delegate_to=None, region='global', + job_error_states=['ERROR'], *args, **kwargs): @@ -1165,6 +1312,7 @@ def __init__( self.dataproc_properties = dataproc_pyspark_properties self.dataproc_jars = dataproc_pyspark_jars self.region = region + self.job_error_states = job_error_states def execute(self, context): hook = DataProcHook( @@ -1192,7 +1340,10 @@ def execute(self, context): job.add_python_file_uris(self.pyfiles) job.set_job_name(self.job_name) - hook.submit(hook.project_id, job.build(), self.region) + job_to_submit = job.build() + self.dataproc_job_id = job_to_submit["job"]["reference"]["jobId"] + + hook.submit(hook.project_id, job_to_submit, self.region, self.job_error_states) class DataprocWorkflowTemplateBaseOperator(BaseOperator): @@ -1216,10 +1367,10 @@ def __init__(self, ) def execute(self, context): - self.hook.await(self.start()) + self.hook.wait(self.start()) def start(self, context): - raise AirflowException('plese start a workflow operation') + raise AirflowException('Please start a workflow operation') class DataprocWorkflowTemplateInstantiateOperator(DataprocWorkflowTemplateBaseOperator): @@ -1232,18 +1383,18 @@ class DataprocWorkflowTemplateInstantiateOperator(DataprocWorkflowTemplateBaseOp https://cloud.google.com/dataproc/docs/reference/rest/v1beta2/projects.regions.workflowTemplates/instantiate :param template_id: The id of the template. 
(templated) - :type template_id: string + :type template_id: str :param project_id: The ID of the google cloud project in which the template runs - :type project_id: string + :type project_id: str :param region: leave as 'global', might become relevant in the future - :type region: string + :type region: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ['template_id'] @@ -1261,7 +1412,7 @@ def start(self): .instantiate( name=('projects/%s/regions/%s/workflowTemplates/%s' % (self.project_id, self.region, self.template_id)), - body={'instanceId': str(uuid.uuid1())}) + body={'instanceId': str(uuid.uuid4())}) .execute()) @@ -1279,15 +1430,15 @@ class DataprocWorkflowTemplateInstantiateInlineOperator( :type template: map :param project_id: The ID of the google cloud project in which the template runs - :type project_id: string + :type project_id: str :param region: leave as 'global', might become relevant in the future - :type region: string + :type region: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ['template'] @@ -1304,6 +1455,6 @@ def start(self): self.hook.get_conn().projects().regions().workflowTemplates() .instantiateInline( parent='projects/%s/regions/%s' % (self.project_id, self.region), - instanceId=str(uuid.uuid1()), + instanceId=str(uuid.uuid4()), body=self.template) .execute()) diff --git a/airflow/contrib/operators/datastore_export_operator.py b/airflow/contrib/operators/datastore_export_operator.py index f6dc7cc571b41..9d95eadc74822 100644 --- a/airflow/contrib/operators/datastore_export_operator.py +++ b/airflow/contrib/operators/datastore_export_operator.py @@ -29,19 +29,19 @@ class DatastoreExportOperator(BaseOperator): Export entities from Google Cloud Datastore to Cloud Storage :param bucket: name of the cloud storage bucket to backup data - :type bucket: string + :type bucket: str :param namespace: optional namespace path in the specified Cloud Storage bucket to backup data. If this namespace does not exist in GCS, it will be created. :type namespace: str :param datastore_conn_id: the name of the Datastore connection id to use - :type datastore_conn_id: string + :type datastore_conn_id: str :param cloud_storage_conn_id: the name of the cloud storage connection id to force-write backup - :type cloud_storage_conn_id: string + :type cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. 
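Editor's note: a hedged usage sketch for the workflow-template operator documented above; project and template ids are placeholders. As the diff shows, each run is now submitted with a uuid4-based ``instanceId``::

    from airflow.contrib.operators.dataproc_operator import (
        DataprocWorkflowTemplateInstantiateOperator,
    )

    run_template = DataprocWorkflowTemplateInstantiateOperator(
        task_id='run-workflow-template',
        template_id='my-workflow-template',   # placeholder template id
        project_id='my-gcp-project',
        region='global',
        gcp_conn_id='google_cloud_default',
        dag=dag)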
- :type delegate_to: string + :type delegate_to: str :param entity_filter: description of what data from the project is included in the export, refer to https://cloud.google.com/datastore/docs/reference/rest/Shared.Types/EntityFilter diff --git a/airflow/contrib/operators/datastore_import_operator.py b/airflow/contrib/operators/datastore_import_operator.py index 401d36e05b7a8..c79767f35e9cd 100644 --- a/airflow/contrib/operators/datastore_import_operator.py +++ b/airflow/contrib/operators/datastore_import_operator.py @@ -28,10 +28,10 @@ class DatastoreImportOperator(BaseOperator): Import entities from Cloud Storage to Google Cloud Datastore :param bucket: container in Cloud Storage to store data - :type bucket: string + :type bucket: str :param file: path of the backup metadata file in the specified Cloud Storage bucket. It should have the extension .overall_export_metadata - :type file: string + :type file: str :param namespace: optional namespace of the backup metadata file in the specified Cloud Storage bucket. :type namespace: str @@ -42,11 +42,11 @@ class DatastoreImportOperator(BaseOperator): :param labels: client-assigned labels for cloud storage :type labels: dict :param datastore_conn_id: the name of the connection id to use - :type datastore_conn_id: string + :type datastore_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param polling_interval_in_seconds: number of seconds to wait before polling for execution status again :type polling_interval_in_seconds: int diff --git a/airflow/contrib/operators/dingding_operator.py b/airflow/contrib/operators/dingding_operator.py new file mode 100644 index 0000000000000..2d7ea2225e90e --- /dev/null +++ b/airflow/contrib/operators/dingding_operator.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.dingding_hook import DingdingHook +from airflow.operators.bash_operator import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class DingdingOperator(BaseOperator): + """ + This operator allows you send Dingding message using Dingding custom bot. + Get Dingding token from conn_id.password. And prefer set domain to + conn_id.host, if not will use default ``https://oapi.dingtalk.com``. 
+ + For more detail message in + `Dingding custom bot `_ + + :param dingding_conn_id: The name of the Dingding connection to use + :type dingding_conn_id: str + :param message_type: Message type you want to send to Dingding, support five type so far + including text, link, markdown, actionCard, feedCard + :type message_type: str + :param message: The message send to Dingding chat group + :type message: str or dict + :param at_mobiles: Remind specific users with this message + :type at_mobiles: list[str] + :param at_all: Remind all people in group or not. If True, will overwrite ``at_mobiles`` + :type at_all: bool + """ + template_fields = ('message',) + ui_color = '#4ea4d4' # Dingding icon color + + @apply_defaults + def __init__(self, + dingding_conn_id='dingding_default', + message_type='text', + message=None, + at_mobiles=None, + at_all=False, + *args, + **kwargs): + super(DingdingOperator, self).__init__(*args, **kwargs) + self.dingding_conn_id = dingding_conn_id + self.message_type = message_type + self.message = message + self.at_mobiles = at_mobiles + self.at_all = at_all + + def execute(self, context): + self.log.info('Sending Dingding message.') + hook = DingdingHook( + self.dingding_conn_id, + self.message_type, + self.message, + self.at_mobiles, + self.at_all + ) + hook.send() diff --git a/airflow/contrib/operators/druid_operator.py b/airflow/contrib/operators/druid_operator.py index 426393deeb315..75d552fec5a5b 100644 --- a/airflow/contrib/operators/druid_operator.py +++ b/airflow/contrib/operators/druid_operator.py @@ -21,6 +21,7 @@ from airflow.hooks.druid_hook import DruidHook from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults class DruidOperator(BaseOperator): @@ -36,6 +37,7 @@ class DruidOperator(BaseOperator): template_fields = ('index_spec_str',) template_ext = ('.json',) + @apply_defaults def __init__(self, json_index_file, druid_ingest_conn_id='druid_ingest_default', max_ingestion_time=None, @@ -58,5 +60,5 @@ def execute(self, context): druid_ingest_conn_id=self.conn_id, max_ingestion_time=self.max_ingestion_time ) - self.log.info("Sumitting %s", self.index_spec_str) + self.log.info("Submitting %s", self.index_spec_str) hook.submit_indexing_job(self.index_spec_str) diff --git a/airflow/contrib/operators/ecs_operator.py b/airflow/contrib/operators/ecs_operator.py index 60540f5e289d6..2f7d8c20013e4 100644 --- a/airflow/contrib/operators/ecs_operator.py +++ b/airflow/contrib/operators/ecs_operator.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. import sys +import re from airflow.exceptions import AirflowException from airflow.models import BaseOperator @@ -33,17 +34,27 @@ class ECSOperator(BaseOperator): :type task_definition: str :param cluster: the cluster name on EC2 Container Service :type cluster: str - :param: overrides: the same parameter that boto3 will receive (templated): - http://boto3.readthedocs.org/en/latest/reference/services/ecs.html#ECS.Client.run_task - :type: overrides: dict + :param overrides: the same parameter that boto3 will receive (templated): + http://boto3.readthedocs.org/en/latest/reference/services/ecs.html#ECS.Client.run_task + :type overrides: dict :param aws_conn_id: connection id of AWS credentials / region name. If None, - credential boto3 strategy will be used - (http://boto3.readthedocs.io/en/latest/guide/configuration.html). + credential boto3 strategy will be used + (http://boto3.readthedocs.io/en/latest/guide/configuration.html). 
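Editor's note: a hedged example of the new ``DingdingOperator``. The connection id and phone number are placeholders, and the bot token is expected in the connection's password field as the docstring describes; ``message`` is templated::

    from airflow.contrib.operators.dingding_operator import DingdingOperator

    notify_group = DingdingOperator(
        task_id='notify-dingding',
        dingding_conn_id='dingding_default',
        message_type='text',
        message='Airflow DAG {{ dag.dag_id }} finished for {{ ds }}',
        at_mobiles=['13800000000'],   # placeholder phone number
        at_all=False,
        dag=dag)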
:type aws_conn_id: str :param region_name: region name to use in AWS Hook. Override the region_name in connection (if provided) + :type region_name: str :param launch_type: the launch type on which to run your task ('EC2' or 'FARGATE') - :type: launch_type: str + :type launch_type: str + :param group: the name of the task group associated with the task + :type group: str + :param placement_constraints: an array of placement constraint objects to use for + the task + :type placement_constraints: list + :param platform_version: the platform version on which your task is running + :type platform_version: str + :param network_configuration: the network configuration for the task + :type network_configuration: dict """ ui_color = '#f0ede4' @@ -53,7 +64,9 @@ class ECSOperator(BaseOperator): @apply_defaults def __init__(self, task_definition, cluster, overrides, - aws_conn_id=None, region_name=None, launch_type='EC2', **kwargs): + aws_conn_id=None, region_name=None, launch_type='EC2', + group=None, placement_constraints=None, platform_version='LATEST', + network_configuration=None, **kwargs): super(ECSOperator, self).__init__(**kwargs) self.aws_conn_id = aws_conn_id @@ -62,6 +75,10 @@ def __init__(self, task_definition, cluster, overrides, self.cluster = cluster self.overrides = overrides self.launch_type = launch_type + self.group = group + self.placement_constraints = placement_constraints + self.platform_version = platform_version + self.network_configuration = network_configuration self.hook = self.get_hook() @@ -77,13 +94,23 @@ def execute(self, context): region_name=self.region_name ) - response = self.client.run_task( - cluster=self.cluster, - taskDefinition=self.task_definition, - overrides=self.overrides, - startedBy=self.owner, - launchType=self.launch_type - ) + run_opts = { + 'cluster': self.cluster, + 'taskDefinition': self.task_definition, + 'overrides': self.overrides, + 'startedBy': self.owner, + 'launchType': self.launch_type, + } + + if self.launch_type == 'FARGATE': + run_opts['platformVersion'] = self.platform_version + if self.group is not None: + run_opts['group'] = self.group + if self.placement_constraints is not None: + run_opts['placementConstraints'] = self.placement_constraints + if self.network_configuration is not None: + run_opts['networkConfiguration'] = self.network_configuration + response = self.client.run_task(**run_opts) failures = response['failures'] if len(failures) > 0: @@ -115,6 +142,15 @@ def _check_success_task(self): raise AirflowException(response) for task in response['tasks']: + # This is a `stoppedReason` that indicates a task has not + # successfully finished, but there is no other indication of failure + # in the response. + # See, https://docs.aws.amazon.com/AmazonECS/latest/developerguide/stopped-task-errors.html # noqa E501 + if re.match(r'Host EC2 \(instance .+?\) (stopped|terminated)\.', + task.get('stoppedReason', '')): + raise AirflowException( + 'The task was stopped because the host instance terminated: {}'. 
+ format(task.get('stoppedReason', ''))) containers = task['containers'] for container in containers: if container.get('lastStatus') == 'STOPPED' and \ diff --git a/airflow/contrib/operators/emr_add_steps_operator.py b/airflow/contrib/operators/emr_add_steps_operator.py index 643ffe9c1b94c..44e2e674524f0 100644 --- a/airflow/contrib/operators/emr_add_steps_operator.py +++ b/airflow/contrib/operators/emr_add_steps_operator.py @@ -27,7 +27,7 @@ class EmrAddStepsOperator(BaseOperator): An operator that adds steps to an existing EMR job_flow. :param job_flow_id: id of the JobFlow to add steps to. (templated) - :type job_flow_name: str + :type job_flow_id: str :param aws_conn_id: aws connection to uses :type aws_conn_id: str :param steps: boto3 style steps to be added to the jobflow. (templated) diff --git a/airflow/contrib/operators/emr_create_job_flow_operator.py b/airflow/contrib/operators/emr_create_job_flow_operator.py index 89be12f0657e4..62c21a7533fe1 100644 --- a/airflow/contrib/operators/emr_create_job_flow_operator.py +++ b/airflow/contrib/operators/emr_create_job_flow_operator.py @@ -33,8 +33,8 @@ class EmrCreateJobFlowOperator(BaseOperator): :param emr_conn_id: emr connection to use :type emr_conn_id: str :param job_flow_overrides: boto3 style arguments to override - emr_connection extra. (templated) - :type steps: dict + emr_connection extra. (templated) + :type job_flow_overrides: dict """ template_fields = ['job_flow_overrides'] template_ext = () @@ -46,6 +46,7 @@ def __init__( aws_conn_id='s3_default', emr_conn_id='emr_default', job_flow_overrides=None, + region_name=None, *args, **kwargs): super(EmrCreateJobFlowOperator, self).__init__(*args, **kwargs) self.aws_conn_id = aws_conn_id @@ -53,9 +54,12 @@ def __init__( if job_flow_overrides is None: job_flow_overrides = {} self.job_flow_overrides = job_flow_overrides + self.region_name = region_name def execute(self, context): - emr = EmrHook(aws_conn_id=self.aws_conn_id, emr_conn_id=self.emr_conn_id) + emr = EmrHook(aws_conn_id=self.aws_conn_id, + emr_conn_id=self.emr_conn_id, + region_name=self.region_name) self.log.info( 'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s', diff --git a/airflow/contrib/operators/emr_terminate_job_flow_operator.py b/airflow/contrib/operators/emr_terminate_job_flow_operator.py index 50407a14acad9..0a72f4ac73e81 100644 --- a/airflow/contrib/operators/emr_terminate_job_flow_operator.py +++ b/airflow/contrib/operators/emr_terminate_job_flow_operator.py @@ -27,7 +27,7 @@ class EmrTerminateJobFlowOperator(BaseOperator): Operator to terminate EMR JobFlows. :param job_flow_id: id of the JobFlow to terminate. (templated) - :type job_flow_name: str + :type job_flow_id: str :param aws_conn_id: aws connection to uses :type aws_conn_id: str """ diff --git a/airflow/contrib/operators/file_to_gcs.py b/airflow/contrib/operators/file_to_gcs.py index 807385b43c3a7..de40abd473e86 100644 --- a/airflow/contrib/operators/file_to_gcs.py +++ b/airflow/contrib/operators/file_to_gcs.py @@ -25,20 +25,23 @@ class FileToGoogleCloudStorageOperator(BaseOperator): """ - Uploads a file to Google Cloud Storage + Uploads a file to Google Cloud Storage. + Optionally can compress the file for upload. :param src: Path to the local file. (templated) - :type src: string + :type src: str :param dst: Destination path within the specified bucket. (templated) - :type dst: string + :type dst: str :param bucket: The bucket to upload to. 
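Editor's note: because ``run_opts`` above only includes the new fields when they are set, a Fargate invocation can pass ``platform_version`` and ``network_configuration`` while an EC2 one omits them. A hedged sketch follows; cluster, task definition, subnet and security-group ids are placeholders, and the network block follows boto3's ``run_task`` shape::

    from airflow.contrib.operators.ecs_operator import ECSOperator

    run_on_fargate = ECSOperator(
        task_id='run-on-fargate',
        cluster='my-ecs-cluster',
        task_definition='my-task-def:1',
        launch_type='FARGATE',
        platform_version='LATEST',          # only sent for FARGATE launches
        overrides={'containerOverrides': []},
        group='nightly-batch',
        network_configuration={
            'awsvpcConfiguration': {
                'subnets': ['subnet-0123456789abcdef0'],
                'securityGroups': ['sg-0123456789abcdef0'],
                'assignPublicIp': 'ENABLED',
            }
        },
        aws_conn_id='aws_default',
        region_name='eu-west-1',
        dag=dag)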
(templated) - :type bucket: string + :type bucket: str :param google_cloud_storage_conn_id: The Airflow connection ID to upload with - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param mime_type: The mime-type string - :type mime_type: string + :type mime_type: str :param delegate_to: The account to impersonate, if any - :type delegate_to: string + :type delegate_to: str + :param gzip: Allows for file to be compressed and uploaded as gzip + :type gzip: bool """ template_fields = ('src', 'dst', 'bucket') @@ -50,6 +53,7 @@ def __init__(self, google_cloud_storage_conn_id='google_cloud_default', mime_type='application/octet-stream', delegate_to=None, + gzip=False, *args, **kwargs): super(FileToGoogleCloudStorageOperator, self).__init__(*args, **kwargs) @@ -59,6 +63,7 @@ def __init__(self, self.google_cloud_storage_conn_id = google_cloud_storage_conn_id self.mime_type = mime_type self.delegate_to = delegate_to + self.gzip = gzip def execute(self, context): """ @@ -72,4 +77,6 @@ def execute(self, context): bucket=self.bucket, object=self.dst, mime_type=self.mime_type, - filename=self.src) + filename=self.src, + gzip=self.gzip, + ) diff --git a/airflow/contrib/operators/gcp_bigtable_operator.py b/airflow/contrib/operators/gcp_bigtable_operator.py new file mode 100644 index 0000000000000..d99746798dc28 --- /dev/null +++ b/airflow/contrib/operators/gcp_bigtable_operator.py @@ -0,0 +1,485 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from typing import Iterable +import google.api_core.exceptions + +from airflow import AirflowException +from airflow.models import BaseOperator +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.contrib.hooks.gcp_bigtable_hook import BigtableHook +from airflow.utils.decorators import apply_defaults +from google.cloud.bigtable_admin_v2 import enums +from google.cloud.bigtable.table import ClusterState + + +class BigtableValidationMixin(object): + """ + Common class for Cloud Bigtable operators for validating required fields. + """ + + REQUIRED_ATTRIBUTES = [] # type: Iterable[str] + + def _validate_inputs(self): + for attr_name in self.REQUIRED_ATTRIBUTES: + if not getattr(self, attr_name): + raise AirflowException('Empty parameter: {}'.format(attr_name)) + + +class BigtableInstanceCreateOperator(BaseOperator, BigtableValidationMixin): + """ + Creates a new Cloud Bigtable instance. + If the Cloud Bigtable instance with the given ID exists, the operator does not + compare its configuration + and immediately succeeds. No changes are made to the existing instance. 
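Editor's note: a hedged sketch of the new ``gzip`` flag documented above (paths and bucket are placeholders): the local file is compressed before the GCS hook uploads it::

    from airflow.contrib.operators.file_to_gcs import FileToGoogleCloudStorageOperator

    upload_report = FileToGoogleCloudStorageOperator(
        task_id='upload-report',
        src='/tmp/exported/report-{{ ds }}.csv',   # placeholder local path (templated)
        dst='reports/report-{{ ds }}.csv',
        bucket='my-example-bucket',
        mime_type='text/csv',
        gzip=True,   # new flag: compress the file before upload
        dag=dag)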
+ + For more details about instance creation have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/instance.html#google.cloud.bigtable.instance.Instance.create + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableInstanceCreateOperator` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance to create. + :type main_cluster_id: str + :param main_cluster_id: The ID for main cluster for the new instance. + :type main_cluster_zone: str + :param main_cluster_zone: The zone for main cluster + See https://cloud.google.com/bigtable/docs/locations for more details. + :type project_id: str + :param project_id: Optional, the ID of the GCP project. If set to None or missing, + the default project_id from the GCP connection is used. + :type replica_cluster_id: str + :param replica_cluster_id: (optional) The ID for replica cluster for the new instance. + :type replica_cluster_zone: str + :param replica_cluster_zone: (optional) The zone for replica cluster. + :type instance_type: enums.IntEnum + :param instance_type: (optional) The type of the instance. + :type instance_display_name: str + :param instance_display_name: (optional) Human-readable name of the instance. Defaults + to ``instance_id``. + :type instance_labels: dict + :param instance_labels: (optional) Dictionary of labels to associate + with the instance. + :type cluster_nodes: int + :param cluster_nodes: (optional) Number of nodes for cluster. + :type cluster_storage_type: enums.IntEnum + :param cluster_storage_type: (optional) The type of storage. + :type timeout: int + :param timeout: (optional) timeout (in seconds) for instance creation. + If None is not specified, Operator will wait indefinitely. + """ + + REQUIRED_ATTRIBUTES = ('instance_id', 'main_cluster_id', + 'main_cluster_zone') + template_fields = ['project_id', 'instance_id', 'main_cluster_id', + 'main_cluster_zone'] + + @apply_defaults + def __init__(self, + instance_id, + main_cluster_id, + main_cluster_zone, + project_id=None, + replica_cluster_id=None, + replica_cluster_zone=None, + instance_display_name=None, + instance_type=None, + instance_labels=None, + cluster_nodes=None, + cluster_storage_type=None, + timeout=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self.main_cluster_id = main_cluster_id + self.main_cluster_zone = main_cluster_zone + self.replica_cluster_id = replica_cluster_id + self.replica_cluster_zone = replica_cluster_zone + self.instance_display_name = instance_display_name + self.instance_type = instance_type + self.instance_labels = instance_labels + self.cluster_nodes = cluster_nodes + self.cluster_storage_type = cluster_storage_type + self.timeout = timeout + self._validate_inputs() + self.hook = BigtableHook() + super(BigtableInstanceCreateOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + instance = self.hook.get_instance(project_id=self.project_id, + instance_id=self.instance_id) + if instance: + # Based on Instance.__eq__ instance with the same ID and client is + # considered as equal. + self.log.info( + "The instance '%s' already exists in this project. 
" + "Consider it as created", + self.instance_id + ) + return + try: + self.hook.create_instance( + project_id=self.project_id, + instance_id=self.instance_id, + main_cluster_id=self.main_cluster_id, + main_cluster_zone=self.main_cluster_zone, + replica_cluster_id=self.replica_cluster_id, + replica_cluster_zone=self.replica_cluster_zone, + instance_display_name=self.instance_display_name, + instance_type=self.instance_type, + instance_labels=self.instance_labels, + cluster_nodes=self.cluster_nodes, + cluster_storage_type=self.cluster_storage_type, + timeout=self.timeout, + ) + except google.api_core.exceptions.GoogleAPICallError as e: + self.log.error('An error occurred. Exiting.') + raise e + + +class BigtableInstanceDeleteOperator(BaseOperator, BigtableValidationMixin): + """ + Deletes the Cloud Bigtable instance, including its clusters and all related tables. + + For more details about deleting instance have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/instance.html#google.cloud.bigtable.instance.Instance.delete + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableInstanceDeleteOperator` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance to delete. + :param project_id: Optional, the ID of the GCP project. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + """ + REQUIRED_ATTRIBUTES = ('instance_id',) + template_fields = ['project_id', 'instance_id'] + + @apply_defaults + def __init__(self, + instance_id, + project_id=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self._validate_inputs() + self.hook = BigtableHook() + super(BigtableInstanceDeleteOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + try: + self.hook.delete_instance(project_id=self.project_id, + instance_id=self.instance_id) + except google.api_core.exceptions.NotFound: + self.log.info( + "The instance '%s' does not exist in project '%s'. " + "Consider it as deleted", + self.instance_id, self.project_id + ) + except google.api_core.exceptions.GoogleAPICallError as e: + self.log.error('An error occurred. Exiting.') + raise e + + +class BigtableTableCreateOperator(BaseOperator, BigtableValidationMixin): + """ + Creates the table in the Cloud Bigtable instance. + + For more details about creating table have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/table.html#google.cloud.bigtable.table.Table.create + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableTableCreateOperator` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance that will + hold the new table. + :type table_id: str + :param table_id: The ID of the table to be created. + :type project_id: str + :param project_id: Optional, the ID of the GCP project. If set to None or missing, + the default project_id from the GCP connection is used. + :type initial_split_keys: list + :param initial_split_keys: (Optional) list of row keys in bytes that will be used to + initially split the table into several tablets. + :type column_families: dict + :param column_families: (Optional) A map columns to create. 
+ The key is the column_id str and the value is a + :class:`google.cloud.bigtable.column_family.GarbageCollectionRule` + """ + REQUIRED_ATTRIBUTES = ('instance_id', 'table_id') + template_fields = ['project_id', 'instance_id', 'table_id'] + + @apply_defaults + def __init__(self, + instance_id, + table_id, + project_id=None, + initial_split_keys=None, + column_families=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self.table_id = table_id + self.initial_split_keys = initial_split_keys or list() + self.column_families = column_families or dict() + self._validate_inputs() + self.hook = BigtableHook() + self.instance = None + super(BigtableTableCreateOperator, self).__init__(*args, **kwargs) + + def _compare_column_families(self): + table_column_families = self.hook.get_column_families_for_table(self.instance, + self.table_id) + if set(table_column_families.keys()) != set(self.column_families.keys()): + self.log.error("Table '%s' has different set of Column Families", + self.table_id) + self.log.error("Expected: %s", self.column_families.keys()) + self.log.error("Actual: %s", table_column_families.keys()) + return False + + for key in table_column_families.keys(): + # There is difference in structure between local Column Families + # and remote ones + # Local `self.column_families` is dict with column_id as key + # and GarbageCollectionRule as value. + # Remote `table_column_families` is list of ColumnFamily objects. + # For more information about ColumnFamily please refer to the documentation: + # https://googleapis.github.io/google-cloud-python/latest/bigtable/column-family.html#google.cloud.bigtable.column_family.ColumnFamily + if table_column_families[key].gc_rule != self.column_families[key]: + self.log.error("Column Family '%s' differs for table '%s'.", key, + self.table_id) + return False + return True + + def execute(self, context): + self.instance = self.hook.get_instance(project_id=self.project_id, + instance_id=self.instance_id) + if not self.instance: + raise AirflowException( + "Dependency: instance '{}' does not exist in project '{}'.". + format(self.instance_id, self.project_id)) + try: + self.hook.create_table( + instance=self.instance, + table_id=self.table_id, + initial_split_keys=self.initial_split_keys, + column_families=self.column_families + ) + except google.api_core.exceptions.AlreadyExists: + if not self._compare_column_families(): + raise AirflowException( + "Table '{}' already exists with different Column Families.". + format(self.table_id)) + self.log.info("The table '%s' already exists. Consider it as created", + self.table_id) + + +class BigtableTableDeleteOperator(BaseOperator, BigtableValidationMixin): + """ + Deletes the Cloud Bigtable table. + + For more details about deleting table have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/table.html#google.cloud.bigtable.table.Table.delete + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableTableDeleteOperator` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance. + :type table_id: str + :param table_id: The ID of the table to be deleted. + :type project_id: str + :param project_id: Optional, the ID of the GCP project. If set to None or missing, + the default project_id from the GCP connection is used. + :type app_profile_id: str + :parm app_profile_id: Application profile. 
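+
+    A minimal usage sketch (identifiers are placeholders; the optional ``project_id``
+    and ``app_profile_id`` parameters are omitted here)::
+
+        delete_bt_table = BigtableTableDeleteOperator(
+            task_id='delete_bt_table',
+            instance_id='my-bt-instance',
+            table_id='my-bt-table',
+        )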
+ """ + REQUIRED_ATTRIBUTES = ('instance_id', 'table_id') + template_fields = ['project_id', 'instance_id', 'table_id'] + + @apply_defaults + def __init__(self, + instance_id, + table_id, + project_id=None, + app_profile_id=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self.table_id = table_id + self.app_profile_id = app_profile_id + self._validate_inputs() + self.hook = BigtableHook() + super(BigtableTableDeleteOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + instance = self.hook.get_instance(project_id=self.project_id, + instance_id=self.instance_id) + if not instance: + raise AirflowException("Dependency: instance '{}' does not exist.".format( + self.instance_id)) + + try: + self.hook.delete_table( + project_id=self.project_id, + instance_id=self.instance_id, + table_id=self.table_id, + ) + except google.api_core.exceptions.NotFound: + # It's OK if table doesn't exists. + self.log.info("The table '%s' no longer exists. Consider it as deleted", + self.table_id) + except google.api_core.exceptions.GoogleAPICallError as e: + self.log.error('An error occurred. Exiting.') + raise e + + +class BigtableClusterUpdateOperator(BaseOperator, BigtableValidationMixin): + """ + Updates a Cloud Bigtable cluster. + + For more details about updating a Cloud Bigtable cluster, + have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/cluster.html#google.cloud.bigtable.cluster.Cluster.update + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableClusterUpdateOperator` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance. + :type cluster_id: str + :param cluster_id: The ID of the Cloud Bigtable cluster to update. + :type nodes: int + :param nodes: The desired number of nodes for the Cloud Bigtable cluster. + :type project_id: str + :param project_id: Optional, the ID of the GCP project. + """ + REQUIRED_ATTRIBUTES = ('instance_id', 'cluster_id', 'nodes') + template_fields = ['project_id', 'instance_id', 'cluster_id', 'nodes'] + + @apply_defaults + def __init__(self, + instance_id, + cluster_id, + nodes, + project_id=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self.cluster_id = cluster_id + self.nodes = nodes + self._validate_inputs() + self.hook = BigtableHook() + super(BigtableClusterUpdateOperator, self).__init__(*args, **kwargs) + + def execute(self, context): + instance = self.hook.get_instance(project_id=self.project_id, + instance_id=self.instance_id) + if not instance: + raise AirflowException("Dependency: instance '{}' does not exist.".format( + self.instance_id)) + + try: + self.hook.update_cluster( + instance=instance, + cluster_id=self.cluster_id, + nodes=self.nodes + ) + except google.api_core.exceptions.NotFound: + raise AirflowException( + "Dependency: cluster '{}' does not exist for instance '{}'.". + format(self.cluster_id, self.instance_id)) + except google.api_core.exceptions.GoogleAPICallError as e: + self.log.error('An error occurred. Exiting.') + raise e + + +class BigtableTableWaitForReplicationSensor(BaseSensorOperator, BigtableValidationMixin): + """ + Sensor that waits for Cloud Bigtable table to be fully replicated to its clusters. + No exception will be raised if the instance or the table does not exist. 
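+
+    A minimal usage sketch (identifiers are placeholders; the usual sensor arguments
+    such as ``poke_interval`` and ``timeout`` are inherited from
+    :class:`~airflow.sensors.base_sensor_operator.BaseSensorOperator`)::
+
+        wait_for_replication = BigtableTableWaitForReplicationSensor(
+            task_id='wait_for_bt_table_replication',
+            instance_id='my-bt-instance',
+            table_id='my-bt-table',
+        )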
+ + For more details about cluster states for a table, have a look at the reference: + https://googleapis.github.io/google-cloud-python/latest/bigtable/table.html#google.cloud.bigtable.table.Table.get_cluster_states + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BigtableTableWaitForReplicationSensor` + + :type instance_id: str + :param instance_id: The ID of the Cloud Bigtable instance. + :type table_id: str + :param table_id: The ID of the table to check replication status. + :type project_id: str + :param project_id: Optional, the ID of the GCP project. + """ + REQUIRED_ATTRIBUTES = ('instance_id', 'table_id') + template_fields = ['project_id', 'instance_id', 'table_id'] + + @apply_defaults + def __init__(self, + instance_id, + table_id, + project_id=None, + *args, **kwargs): + self.project_id = project_id + self.instance_id = instance_id + self.table_id = table_id + self._validate_inputs() + self.hook = BigtableHook() + super(BigtableTableWaitForReplicationSensor, self).__init__(*args, **kwargs) + + def poke(self, context): + instance = self.hook.get_instance(project_id=self.project_id, + instance_id=self.instance_id) + if not instance: + self.log.info("Dependency: instance '%s' does not exist.", self.instance_id) + return False + + try: + cluster_states = self.hook.get_cluster_states_for_table(instance=instance, + table_id=self.table_id) + except google.api_core.exceptions.NotFound: + self.log.info( + "Dependency: table '%s' does not exist in instance '%s'.", + self.table_id, self.instance_id) + return False + + ready_state = ClusterState(enums.Table.ClusterState.ReplicationState.READY) + + is_table_replicated = True + for cluster_id in cluster_states.keys(): + if cluster_states[cluster_id] != ready_state: + self.log.info("Table '%s' is not yet replicated on cluster '%s'.", + self.table_id, cluster_id) + is_table_replicated = False + + if not is_table_replicated: + return False + + self.log.info("Table '%s' is replicated.", self.table_id) + return True diff --git a/airflow/contrib/operators/gcp_compute_operator.py b/airflow/contrib/operators/gcp_compute_operator.py new file mode 100644 index 0000000000000..23c339bf32842 --- /dev/null +++ b/airflow/contrib/operators/gcp_compute_operator.py @@ -0,0 +1,484 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
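+"""
+This module contains Google Compute Engine operators:
+``GceInstanceStartOperator``, ``GceInstanceStopOperator``, ``GceSetMachineTypeOperator``,
+``GceInstanceTemplateCopyOperator`` and ``GceInstanceGroupManagerUpdateTemplateOperator``.
+"""
+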
+from copy import deepcopy + +from googleapiclient.errors import HttpError + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_compute_hook import GceHook +from airflow.contrib.utils.gcp_field_sanitizer import GcpBodyFieldSanitizer +from airflow.contrib.utils.gcp_field_validator import GcpBodyFieldValidator +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from json_merge_patch import merge + + +class GceBaseOperator(BaseOperator): + """ + Abstract base operator for Google Compute Engine operators to inherit from. + """ + + @apply_defaults + def __init__(self, + zone, + resource_id, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + *args, **kwargs): + self.project_id = project_id + self.zone = zone + self.resource_id = resource_id + self.gcp_conn_id = gcp_conn_id + self.api_version = api_version + self._validate_inputs() + self._hook = GceHook(gcp_conn_id=self.gcp_conn_id, api_version=self.api_version) + super(GceBaseOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is missing") + if not self.zone: + raise AirflowException("The required parameter 'zone' is missing") + if not self.resource_id: + raise AirflowException("The required parameter 'resource_id' is missing") + + def execute(self, context): + pass + + +class GceInstanceStartOperator(GceBaseOperator): + """ + Starts an instance in Google Compute Engine. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GceInstanceStartOperator` + + :param zone: Google Cloud Platform zone where the instance exists. + :type zone: str + :param resource_id: Name of the Compute Engine instance resource. + :type resource_id: str + :param project_id: Optional, Google Cloud Platform Project ID where the Compute + Engine Instance exists. If set to None or missing, the default project_id from the GCP connection is + used. + :type project_id: str + :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud + Platform. Defaults to 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: Optional, API version used (for example v1 - or beta). Defaults + to v1. + :type api_version: str + :param validate_body: Optional, If set to False, body validation is not performed. + Defaults to False. + """ + # [START gce_instance_start_template_fields] + template_fields = ('project_id', 'zone', 'resource_id', 'gcp_conn_id', 'api_version') + # [END gce_instance_start_template_fields] + + @apply_defaults + def __init__(self, + zone, + resource_id, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + *args, **kwargs): + super(GceInstanceStartOperator, self).__init__( + project_id=project_id, zone=zone, resource_id=resource_id, + gcp_conn_id=gcp_conn_id, api_version=api_version, *args, **kwargs) + + def execute(self, context): + return self._hook.start_instance(zone=self.zone, + resource_id=self.resource_id, + project_id=self.project_id) + + +class GceInstanceStopOperator(GceBaseOperator): + """ + Stops an instance in Google Compute Engine. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GceInstanceStopOperator` + + :param zone: Google Cloud Platform zone where the instance exists. + :type zone: str + :param resource_id: Name of the Compute Engine instance resource. 
+ :type resource_id: str + :param project_id: Optional, Google Cloud Platform Project ID where the Compute + Engine Instance exists. If set to None or missing, the default project_id from the GCP connection is + used. + :type project_id: str + :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud + Platform. Defaults to 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: Optional, API version used (for example v1 - or beta). Defaults + to v1. + :type api_version: str + :param validate_body: Optional, If set to False, body validation is not performed. + Defaults to False. + """ + # [START gce_instance_stop_template_fields] + template_fields = ('project_id', 'zone', 'resource_id', 'gcp_conn_id', 'api_version') + # [END gce_instance_stop_template_fields] + + @apply_defaults + def __init__(self, + zone, + resource_id, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + *args, **kwargs): + super(GceInstanceStopOperator, self).__init__( + project_id=project_id, zone=zone, resource_id=resource_id, + gcp_conn_id=gcp_conn_id, api_version=api_version, *args, **kwargs) + + def execute(self, context): + self._hook.stop_instance(zone=self.zone, + resource_id=self.resource_id, + project_id=self.project_id) + + +SET_MACHINE_TYPE_VALIDATION_SPECIFICATION = [ + dict(name="machineType", regexp="^.+$"), +] + + +class GceSetMachineTypeOperator(GceBaseOperator): + """ + Changes the machine type for a stopped instance to the machine type specified in + the request. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GceSetMachineTypeOperator` + + :param zone: Google Cloud Platform zone where the instance exists. + :type zone: str + :param resource_id: Name of the Compute Engine instance resource. + :type resource_id: str + :param body: Body required by the Compute Engine setMachineType API, as described in + https://cloud.google.com/compute/docs/reference/rest/v1/instances/setMachineType#request-body + :type body: dict + :param project_id: Optional, Google Cloud Platform Project ID where the Compute + Engine Instance exists. If set to None or missing, the default project_id from the GCP connection + is used. + :type project_id: str + :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud + Platform. Defaults to 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: Optional, API version used (for example v1 - or beta). Defaults + to v1. + :type api_version: str + :param validate_body: Optional, If set to False, body validation is not performed. + Defaults to False. 
+ :type validate_body: bool + """ + # [START gce_instance_set_machine_type_template_fields] + template_fields = ('project_id', 'zone', 'resource_id', 'gcp_conn_id', 'api_version') + # [END gce_instance_set_machine_type_template_fields] + + @apply_defaults + def __init__(self, + zone, + resource_id, + body, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + validate_body=True, + *args, **kwargs): + self.body = body + self._field_validator = None + if validate_body: + self._field_validator = GcpBodyFieldValidator( + SET_MACHINE_TYPE_VALIDATION_SPECIFICATION, api_version=api_version) + super(GceSetMachineTypeOperator, self).__init__( + project_id=project_id, zone=zone, resource_id=resource_id, + gcp_conn_id=gcp_conn_id, api_version=api_version, *args, **kwargs) + + def _validate_all_body_fields(self): + if self._field_validator: + self._field_validator.validate(self.body) + + def execute(self, context): + self._validate_all_body_fields() + return self._hook.set_machine_type(zone=self.zone, + resource_id=self.resource_id, + body=self.body, + project_id=self.project_id) + + +GCE_INSTANCE_TEMPLATE_VALIDATION_PATCH_SPECIFICATION = [ + dict(name="name", regexp="^.+$"), + dict(name="description", optional=True), + dict(name="properties", type='dict', optional=True, fields=[ + dict(name="description", optional=True), + dict(name="tags", optional=True, fields=[ + dict(name="items", optional=True) + ]), + dict(name="machineType", optional=True), + dict(name="canIpForward", optional=True), + dict(name="networkInterfaces", optional=True), # not validating deeper + dict(name="disks", optional=True), # not validating the array deeper + dict(name="metadata", optional=True, fields=[ + dict(name="fingerprint", optional=True), + dict(name="items", optional=True), + dict(name="kind", optional=True), + ]), + dict(name="serviceAccounts", optional=True), # not validating deeper + dict(name="scheduling", optional=True, fields=[ + dict(name="onHostMaintenance", optional=True), + dict(name="automaticRestart", optional=True), + dict(name="preemptible", optional=True), + dict(name="nodeAffinitites", optional=True), # not validating deeper + ]), + dict(name="labels", optional=True), + dict(name="guestAccelerators", optional=True), # not validating deeper + dict(name="minCpuPlatform", optional=True), + ]), +] + +GCE_INSTANCE_TEMPLATE_FIELDS_TO_SANITIZE = [ + "kind", + "id", + "name", + "creationTimestamp", + "properties.disks.sha256", + "properties.disks.kind", + "properties.disks.sourceImageEncryptionKey.sha256", + "properties.disks.index", + "properties.disks.licenses", + "properties.networkInterfaces.kind", + "properties.networkInterfaces.accessConfigs.kind", + "properties.networkInterfaces.name", + "properties.metadata.kind", + "selfLink" +] + + +class GceInstanceTemplateCopyOperator(GceBaseOperator): + """ + Copies the instance template, applying specified changes. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GceInstanceTemplateCopyOperator` + + :param resource_id: Name of the Instance Template + :type resource_id: str + :param body_patch: Patch to the body of instanceTemplates object following rfc7386 + PATCH semantics. The body_patch content follows + https://cloud.google.com/compute/docs/reference/rest/v1/instanceTemplates + Name field is required as we need to rename the template, + all the other fields are optional. 
It is important to follow PATCH semantics + - arrays are replaced fully, so if you need to update an array you should + provide the whole target array as patch element. + :type body_patch: dict + :param project_id: Optional, Google Cloud Platform Project ID where the Compute + Engine Instance exists. If set to None or missing, the default project_id from the GCP connection + is used. + :type project_id: str + :param request_id: Optional, unique request_id that you might add to achieve + full idempotence (for example when client call times out repeating the request + with the same request id will not create a new instance template again). + It should be in UUID format as defined in RFC 4122. + :type request_id: str + :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud + Platform. Defaults to 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: Optional, API version used (for example v1 - or beta). Defaults + to v1. + :type api_version: str + :param validate_body: Optional, If set to False, body validation is not performed. + Defaults to False. + :type validate_body: bool + """ + # [START gce_instance_template_copy_operator_template_fields] + template_fields = ('project_id', 'resource_id', 'request_id', + 'gcp_conn_id', 'api_version') + # [END gce_instance_template_copy_operator_template_fields] + + @apply_defaults + def __init__(self, + resource_id, + body_patch, + project_id=None, + request_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + validate_body=True, + *args, **kwargs): + self.body_patch = body_patch + self.request_id = request_id + self._field_validator = None + if 'name' not in self.body_patch: + raise AirflowException("The body '{}' should contain at least " + "name for the new operator in the 'name' field". + format(body_patch)) + if validate_body: + self._field_validator = GcpBodyFieldValidator( + GCE_INSTANCE_TEMPLATE_VALIDATION_PATCH_SPECIFICATION, api_version=api_version) + self._field_sanitizer = GcpBodyFieldSanitizer( + GCE_INSTANCE_TEMPLATE_FIELDS_TO_SANITIZE) + super(GceInstanceTemplateCopyOperator, self).__init__( + project_id=project_id, zone='global', resource_id=resource_id, + gcp_conn_id=gcp_conn_id, api_version=api_version, *args, **kwargs) + + def _validate_all_body_fields(self): + if self._field_validator: + self._field_validator.validate(self.body_patch) + + def execute(self, context): + self._validate_all_body_fields() + try: + # Idempotence check (sort of) - we want to check if the new template + # is already created and if is, then we assume it was created by previous run + # of CopyTemplate operator - we do not check if content of the template + # is as expected. Templates are immutable so we cannot update it anyway + # and deleting/recreating is not worth the hassle especially + # that we cannot delete template if it is already used in some Instance + # Group Manager. We assume success if the template is simply present + existing_template = self._hook.get_instance_template( + resource_id=self.body_patch['name'], project_id=self.project_id) + self.log.info( + "The %s template already existed. It was likely created by previous run of the operator. 
" + "Assuming success.", + existing_template + ) + return existing_template + except HttpError as e: + # We actually expect to get 404 / Not Found here as the template should + # not yet exist + if not e.resp.status == 404: + raise e + old_body = self._hook.get_instance_template(resource_id=self.resource_id, + project_id=self.project_id) + new_body = deepcopy(old_body) + self._field_sanitizer.sanitize(new_body) + new_body = merge(new_body, self.body_patch) + self.log.info("Calling insert instance template with updated body: %s", new_body) + self._hook.insert_instance_template(body=new_body, + request_id=self.request_id, + project_id=self.project_id) + return self._hook.get_instance_template(resource_id=self.body_patch['name'], + project_id=self.project_id) + + +class GceInstanceGroupManagerUpdateTemplateOperator(GceBaseOperator): + """ + Patches the Instance Group Manager, replacing source template URL with the + destination one. API V1 does not have update/patch operations for Instance + Group Manager, so you must use beta or newer API version. Beta is the default. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GceInstanceGroupManagerUpdateTemplateOperator` + + :param resource_id: Name of the Instance Group Manager + :type resource_id: str + :param zone: Google Cloud Platform zone where the Instance Group Manager exists. + :type zone: str + :param source_template: URL of the template to replace. + :type source_template: str + :param destination_template: URL of the target template. + :type destination_template: str + :param project_id: Optional, Google Cloud Platform Project ID where the Compute + Engine Instance exists. If set to None or missing, the default project_id from the GCP connection is + used. + :type project_id: str + :param request_id: Optional, unique request_id that you might add to achieve + full idempotence (for example when client call times out repeating the request + with the same request id will not create a new instance template again). + It should be in UUID format as defined in RFC 4122. + :type request_id: str + :param gcp_conn_id: Optional, The connection ID used to connect to Google Cloud + Platform. Defaults to 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: Optional, API version used (for example v1 - or beta). Defaults + to v1. + :type api_version: str + :param validate_body: Optional, If set to False, body validation is not performed. + Defaults to False. + :type validate_body: bool + """ + # [START gce_igm_update_template_operator_template_fields] + template_fields = ('project_id', 'resource_id', 'zone', 'request_id', + 'source_template', 'destination_template', + 'gcp_conn_id', 'api_version') + # [END gce_igm_update_template_operator_template_fields] + + @apply_defaults + def __init__(self, + resource_id, + zone, + source_template, + destination_template, + project_id=None, + update_policy=None, + request_id=None, + gcp_conn_id='google_cloud_default', + api_version='beta', + *args, **kwargs): + self.zone = zone + self.source_template = source_template + self.destination_template = destination_template + self.request_id = request_id + self.update_policy = update_policy + self._change_performed = False + if api_version == 'v1': + raise AirflowException("Api version v1 does not have update/patch " + "operations for Instance Group Managers. 
Use beta" + " api version or above") + super(GceInstanceGroupManagerUpdateTemplateOperator, self).__init__( + project_id=project_id, zone=self.zone, resource_id=resource_id, + gcp_conn_id=gcp_conn_id, api_version=api_version, *args, **kwargs) + + def _possibly_replace_template(self, dictionary): + # type: (dict) -> None + if dictionary.get('instanceTemplate') == self.source_template: + dictionary['instanceTemplate'] = self.destination_template + self._change_performed = True + + def execute(self, context): + old_instance_group_manager = self._hook.get_instance_group_manager( + zone=self.zone, resource_id=self.resource_id, project_id=self.project_id) + patch_body = {} + if 'versions' in old_instance_group_manager: + patch_body['versions'] = old_instance_group_manager['versions'] + if 'instanceTemplate' in old_instance_group_manager: + patch_body['instanceTemplate'] = old_instance_group_manager['instanceTemplate'] + if self.update_policy: + patch_body['updatePolicy'] = self.update_policy + self._possibly_replace_template(patch_body) + if 'versions' in patch_body: + for version in patch_body['versions']: + self._possibly_replace_template(version) + if self._change_performed or self.update_policy: + self.log.info("Calling patch instance template with updated body: {}". + format(patch_body)) + return self._hook.patch_instance_group_manager( + zone=self.zone, resource_id=self.resource_id, + body=patch_body, request_id=self.request_id, + project_id=self.project_id) + else: + # Idempotence achieved + return True diff --git a/airflow/contrib/operators/gcp_container_operator.py b/airflow/contrib/operators/gcp_container_operator.py index 5648b4d8a01b3..c5325193f6ff0 100644 --- a/airflow/contrib/operators/gcp_container_operator.py +++ b/airflow/contrib/operators/gcp_container_operator.py @@ -17,13 +17,48 @@ # specific language governing permissions and limitations # under the License. # +import os +import subprocess +import tempfile + from airflow import AirflowException from airflow.contrib.hooks.gcp_container_hook import GKEClusterHook +from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults class GKEClusterDeleteOperator(BaseOperator): + """ + Deletes the cluster, including the Kubernetes endpoint and all worker nodes. + + To delete a certain cluster, you must specify the ``project_id``, the ``name`` + of the cluster, the ``location`` that the cluster is in, and the ``task_id``. + + **Operator Creation**: :: + + operator = GKEClusterDeleteOperator( + task_id='cluster_delete', + project_id='my-project', + location='cluster-location' + name='cluster-name') + + .. seealso:: + For more detail about deleting clusters have a look at the reference: + https://google-cloud-python.readthedocs.io/en/latest/container/gapic/v1/api.html#google.cloud.container_v1.ClusterManagerClient.delete_cluster + + :param project_id: The Google Developers Console [project ID or project number] + :type project_id: str + :param name: The name of the resource to delete, in this case cluster name + :type name: str + :param location: The name of the Google Compute Engine zone in which the cluster + resides. + :type location: str + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. 
+ :type gcp_conn_id: str + :param api_version: The api version to use + :type api_version: str + """ template_fields = ['project_id', 'gcp_conn_id', 'name', 'location', 'api_version'] @apply_defaults @@ -35,37 +70,6 @@ def __init__(self, api_version='v2', *args, **kwargs): - """ - Deletes the cluster, including the Kubernetes endpoint and all worker nodes. - - - To delete a certain cluster, you must specify the ``project_id``, the ``name`` - of the cluster, the ``location`` that the cluster is in, and the ``task_id``. - - **Operator Creation**: :: - - operator = GKEClusterDeleteOperator( - task_id='cluster_delete', - project_id='my-project', - location='cluster-location' - name='cluster-name') - - .. seealso:: - For more detail about deleting clusters have a look at the reference: - https://google-cloud-python.readthedocs.io/en/latest/container/gapic/v1/api.html#google.cloud.container_v1.ClusterManagerClient.delete_cluster - - :param project_id: The Google Developers Console [project ID or project number] - :type project_id: str - :param name: The name of the resource to delete, in this case cluster name - :type name: str - :param location: The name of the Google Compute Engine zone in which the cluster - resides. - :type location: str - :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: str - :param api_version: The api version to use - :type api_version: str - """ super(GKEClusterDeleteOperator, self).__init__(*args, **kwargs) self.project_id = project_id @@ -88,61 +92,63 @@ def execute(self, context): class GKEClusterCreateOperator(BaseOperator): + """ + Create a Google Kubernetes Engine Cluster of specified dimensions + The operator will wait until the cluster is created. + + The **minimum** required to define a cluster to create is: + + ``dict()`` :: + cluster_def = {'name': 'my-cluster-name', + 'initial_node_count': 1} + + or + + ``Cluster`` proto :: + from google.cloud.container_v1.types import Cluster + + cluster_def = Cluster(name='my-cluster-name', initial_node_count=1) + + **Operator Creation**: :: + + operator = GKEClusterCreateOperator( + task_id='cluster_create', + project_id='my-project', + location='my-location' + body=cluster_def) + + .. seealso:: + For more detail on about creating clusters have a look at the reference: + :class:`google.cloud.container_v1.types.Cluster` + + :param project_id: The Google Developers Console [project ID or project number] + :type project_id: str + :param location: The name of the Google Compute Engine zone in which the cluster + resides. + :type location: str + :param body: The Cluster definition to create, can be protobuf or python dict, if + dict it must match protobuf message Cluster + :type body: dict or google.cloud.container_v1.types.Cluster + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: The api version to use + :type api_version: str + """ template_fields = ['project_id', 'gcp_conn_id', 'location', 'api_version', 'body'] @apply_defaults def __init__(self, project_id, location, - body={}, + body=None, gcp_conn_id='google_cloud_default', api_version='v2', *args, **kwargs): - """ - Create a Google Kubernetes Engine Cluster of specified dimensions - The operator will wait until the cluster is created. 
- - The **minimum** required to define a cluster to create is: - - ``dict()`` :: - cluster_def = {'name': 'my-cluster-name', - 'initial_node_count': 1} - - or - - ``Cluster`` proto :: - from google.cloud.container_v1.types import Cluster - - cluster_def = Cluster(name='my-cluster-name', initial_node_count=1) - - **Operator Creation**: :: - - operator = GKEClusterCreateOperator( - task_id='cluster_create', - project_id='my-project', - location='my-location' - body=cluster_def) - - .. seealso:: - For more detail on about creating clusters have a look at the reference: - https://google-cloud-python.readthedocs.io/en/latest/container/gapic/v1/types.html#google.cloud.container_v1.types.Cluster - - :param project_id: The Google Developers Console [project ID or project number] - :type project_id: str - :param location: The name of the Google Compute Engine zone in which the cluster - resides. - :type location: str - :param body: The Cluster definition to create, can be protobuf or python dict, if - dict it must match protobuf message Cluster - :type body: dict or google.cloud.container_v1.types.Cluster - :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: str - :param api_version: The api version to use - :type api_version: str - """ super(GKEClusterCreateOperator, self).__init__(*args, **kwargs) + if body is None: + body = {} self.project_id = project_id self.gcp_conn_id = gcp_conn_id self.location = location @@ -170,3 +176,147 @@ def execute(self, context): hook = GKEClusterHook(self.project_id, self.location) create_op = hook.create_cluster(cluster=self.body) return create_op + + +KUBE_CONFIG_ENV_VAR = "KUBECONFIG" +G_APP_CRED = "GOOGLE_APPLICATION_CREDENTIALS" + + +class GKEPodOperator(KubernetesPodOperator): + """ + Executes a task in a Kubernetes pod in the specified Google Kubernetes + Engine cluster + + This Operator assumes that the system has gcloud installed and either + has working default application credentials or has configured a + connection id with a service account. + + The **minimum** required to define a cluster to create are the variables + ``task_id``, ``project_id``, ``location``, ``cluster_name``, ``name``, + ``namespace``, and ``image`` + + **Operator Creation**: :: + + operator = GKEPodOperator(task_id='pod_op', + project_id='my-project', + location='us-central1-a', + cluster_name='my-cluster-name', + name='task-name', + namespace='default', + image='perl') + + .. seealso:: + For more detail about application authentication have a look at the reference: + https://cloud.google.com/docs/authentication/production#providing_credentials_to_your_application + + :param project_id: The Google Developers Console project id + :type project_id: str + :param location: The name of the Google Kubernetes Engine zone in which the + cluster resides, e.g. 'us-central1-a' + :type location: str + :param cluster_name: The name of the Google Kubernetes Engine cluster the pod + should be spawned in + :type cluster_name: str + :param gcp_conn_id: The google cloud connection id to use. This allows for + users to specify a service account. 
+ :type gcp_conn_id: str + """ + template_fields = ('project_id', 'location', + 'cluster_name') + KubernetesPodOperator.template_fields + + @apply_defaults + def __init__(self, + project_id, + location, + cluster_name, + gcp_conn_id='google_cloud_default', + *args, + **kwargs): + super(GKEPodOperator, self).__init__(*args, **kwargs) + self.project_id = project_id + self.location = location + self.cluster_name = cluster_name + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + # Specifying a service account file allows the user to using non default + # authentication for creating a Kubernetes Pod. This is done by setting the + # environment variable `GOOGLE_APPLICATION_CREDENTIALS` that gcloud looks at. + key_file = None + + # If gcp_conn_id is not specified gcloud will use the default + # service account credentials. + if self.gcp_conn_id: + from airflow.hooks.base_hook import BaseHook + # extras is a deserialized json object + extras = BaseHook.get_connection(self.gcp_conn_id).extra_dejson + # key_file only gets set if a json file is created from a JSON string in + # the web ui, else none + key_file = self._set_env_from_extras(extras=extras) + + # Write config to a temp file and set the environment variable to point to it. + # This is to avoid race conditions of reading/writing a single file + with tempfile.NamedTemporaryFile() as conf_file: + os.environ[KUBE_CONFIG_ENV_VAR] = conf_file.name + # Attempt to get/update credentials + # We call gcloud directly instead of using google-cloud-python api + # because there is no way to write kubernetes config to a file, which is + # required by KubernetesPodOperator. + # The gcloud command looks at the env variable `KUBECONFIG` for where to save + # the kubernetes config file. + subprocess.check_call( + ["gcloud", "container", "clusters", "get-credentials", + self.cluster_name, + "--zone", self.location, + "--project", self.project_id]) + + # Since the key file is of type mkstemp() closing the file will delete it from + # the file system so it cannot be accessed after we don't need it anymore + if key_file: + key_file.close() + + # Tell `KubernetesPodOperator` where the config file is located + self.config_file = os.environ[KUBE_CONFIG_ENV_VAR] + super(GKEPodOperator, self).execute(context) + + def _set_env_from_extras(self, extras): + """ + Sets the environment variable `GOOGLE_APPLICATION_CREDENTIALS` with either: + + - The path to the keyfile from the specified connection id + - A generated file's path if the user specified JSON in the connection id. The + file is assumed to be deleted after the process dies due to how mkstemp() + works. + + The environment variable is used inside the gcloud command to determine correct + service account to use. + """ + key_path = self._get_field(extras, 'key_path', False) + keyfile_json_str = self._get_field(extras, 'keyfile_dict', False) + + if not key_path and not keyfile_json_str: + self.log.info('Using gcloud with application default credentials.') + elif key_path: + os.environ[G_APP_CRED] = key_path + else: + # Write service account JSON to secure file for gcloud to reference + service_key = tempfile.NamedTemporaryFile(delete=False) + service_key.write(keyfile_json_str) + os.environ[G_APP_CRED] = service_key.name + # Return file object to have a pointer to close after use, + # thus deleting from file system. + return service_key + + def _get_field(self, extras, field, default=None): + """ + Fetches a field from extras, and returns it. This is some Airflow + magic. 
The google_cloud_platform hook type adds custom UI elements + to the hook page, which allow admins to specify service_account, + key_path, etc. They get formatted as shown below. + """ + long_f = 'extra__google_cloud_platform__{}'.format(field) + if long_f in extras: + return extras[long_f] + else: + self.log.info('Field %s not found in extras.', field) + return default diff --git a/airflow/contrib/operators/gcp_function_operator.py b/airflow/contrib/operators/gcp_function_operator.py new file mode 100644 index 0000000000000..5fc3b82794b80 --- /dev/null +++ b/airflow/contrib/operators/gcp_function_operator.py @@ -0,0 +1,328 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import re + +from googleapiclient.errors import HttpError + +from airflow import AirflowException +from airflow.contrib.utils.gcp_field_validator import GcpBodyFieldValidator, \ + GcpFieldValidationException +from airflow.version import version +from airflow.models import BaseOperator +from airflow.contrib.hooks.gcp_function_hook import GcfHook +from airflow.utils.decorators import apply_defaults + + +def _validate_available_memory_in_mb(value): + if int(value) <= 0: + raise GcpFieldValidationException("The available memory has to be greater than 0") + + +def _validate_max_instances(value): + if int(value) <= 0: + raise GcpFieldValidationException( + "The max instances parameter has to be greater than 0") + + +CLOUD_FUNCTION_VALIDATION = [ + dict(name="name", regexp="^.+$"), + dict(name="description", regexp="^.+$", optional=True), + dict(name="entryPoint", regexp=r'^.+$', optional=True), + dict(name="runtime", regexp=r'^.+$', optional=True), + dict(name="timeout", regexp=r'^.+$', optional=True), + dict(name="availableMemoryMb", custom_validation=_validate_available_memory_in_mb, + optional=True), + dict(name="labels", optional=True), + dict(name="environmentVariables", optional=True), + dict(name="network", regexp=r'^.+$', optional=True), + dict(name="maxInstances", optional=True, custom_validation=_validate_max_instances), + + dict(name="source_code", type="union", fields=[ + dict(name="sourceArchiveUrl", regexp=r'^.+$'), + dict(name="sourceRepositoryUrl", regexp=r'^.+$', api_version='v1beta2'), + dict(name="sourceRepository", type="dict", fields=[ + dict(name="url", regexp=r'^.+$') + ]), + dict(name="sourceUploadUrl") + ]), + + dict(name="trigger", type="union", fields=[ + dict(name="httpsTrigger", type="dict", fields=[ + # This dict should be empty at input (url is added at output) + ]), + dict(name="eventTrigger", type="dict", fields=[ + dict(name="eventType", regexp=r'^.+$'), + dict(name="resource", regexp=r'^.+$'), + dict(name="service", regexp=r'^.+$', optional=True), + dict(name="failurePolicy", type="dict", optional=True, fields=[ + 
dict(name="retry", type="dict", optional=True) + ]) + ]) + ]), +] + + +class GcfFunctionDeployOperator(BaseOperator): + """ + Creates a function in Google Cloud Functions. + If a function with this name already exists, it will be updated. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GcfFunctionDeployOperator` + + :param location: Google Cloud Platform region where the function should be created. + :type location: str + :param body: Body of the Cloud Functions definition. The body must be a + Cloud Functions dictionary as described in: + https://cloud.google.com/functions/docs/reference/rest/v1/projects.locations.functions + . Different API versions require different variants of the Cloud Functions + dictionary. + :type body: dict or google.cloud.functions.v1.CloudFunction + :param project_id: (Optional) Google Cloud Platform project ID where the function + should be created. + :type project_id: str + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud + Platform - default 'google_cloud_default'. + :type gcp_conn_id: str + :param api_version: (Optional) API version used (for example v1 - default - or + v1beta1). + :type api_version: str + :param zip_path: Path to zip file containing source code of the function. If the path + is set, the sourceUploadUrl should not be specified in the body or it should + be empty. Then the zip file will be uploaded using the upload URL generated + via generateUploadUrl from the Cloud Functions API. + :type zip_path: str + :param validate_body: If set to False, body validation is not performed. + :type validate_body: bool + """ + # [START gcf_function_deploy_template_fields] + template_fields = ('project_id', 'location', 'gcp_conn_id', 'api_version') + # [END gcf_function_deploy_template_fields] + + @apply_defaults + def __init__(self, + location, + body, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1', + zip_path=None, + validate_body=True, + *args, **kwargs): + self.project_id = project_id + self.location = location + self.body = body + self.gcp_conn_id = gcp_conn_id + self.api_version = api_version + self.zip_path = zip_path + self.zip_path_preprocessor = ZipPathPreprocessor(body, zip_path) + self._field_validator = None + if validate_body: + self._field_validator = GcpBodyFieldValidator(CLOUD_FUNCTION_VALIDATION, + api_version=api_version) + self._hook = GcfHook(gcp_conn_id=self.gcp_conn_id, api_version=self.api_version) + self._validate_inputs() + super(GcfFunctionDeployOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if not self.location: + raise AirflowException("The required parameter 'location' is missing") + if not self.body: + raise AirflowException("The required parameter 'body' is missing") + self.zip_path_preprocessor.preprocess_body() + + def _validate_all_body_fields(self): + if self._field_validator: + self._field_validator.validate(self.body) + + def _create_new_function(self): + self._hook.create_new_function( + project_id=self.project_id, + location=self.location, + body=self.body) + + def _update_function(self): + self._hook.update_function(self.body['name'], self.body, self.body.keys()) + + def _check_if_function_exists(self): + name = self.body.get('name') + if not name: + raise GcpFieldValidationException("The 'name' field should be present in " + "body: '{}'.".format(self.body)) + try: + self._hook.get_function(name) + except HttpError as e: + status = e.resp.status + if status == 
404: + return False + raise e + return True + + def _upload_source_code(self): + return self._hook.upload_function_zip(project_id=self.project_id, + location=self.location, + zip_path=self.zip_path) + + def _set_airflow_version_label(self): + if 'labels' not in self.body.keys(): + self.body['labels'] = {} + self.body['labels'].update( + {'airflow-version': 'v' + version.replace('.', '-').replace('+', '-')}) + + def execute(self, context): + if self.zip_path_preprocessor.should_upload_function(): + self.body[GCF_SOURCE_UPLOAD_URL] = self._upload_source_code() + self._validate_all_body_fields() + self._set_airflow_version_label() + if not self._check_if_function_exists(): + self._create_new_function() + else: + self._update_function() + + +GCF_SOURCE_ARCHIVE_URL = 'sourceArchiveUrl' +GCF_SOURCE_UPLOAD_URL = 'sourceUploadUrl' +SOURCE_REPOSITORY = 'sourceRepository' +GCF_ZIP_PATH = 'zip_path' + + +class ZipPathPreprocessor: + """ + Pre-processes zip path parameter. + + Responsible for checking if the zip path parameter is correctly specified in + relation with source_code body fields. Non empty zip path parameter is special because + it is mutually exclusive with sourceArchiveUrl and sourceRepository body fields. + It is also mutually exclusive with non-empty sourceUploadUrl. + The pre-process modifies sourceUploadUrl body field in special way when zip_path + is not empty. An extra step is run when execute method is called and sourceUploadUrl + field value is set in the body with the value returned by generateUploadUrl Cloud + Function API method. + + :param body: Body passed to the create/update method calls. + :type body: dict + :param zip_path: path to the zip file containing source code. + :type body: dict + + """ + upload_function = None + + def __init__(self, body, zip_path): + self.body = body + self.zip_path = zip_path + + @staticmethod + def _is_present_and_empty(dictionary, field): + return field in dictionary and not dictionary[field] + + def _verify_upload_url_and_no_zip_path(self): + if self._is_present_and_empty(self.body, GCF_SOURCE_UPLOAD_URL): + if not self.zip_path: + raise AirflowException( + "Parameter '{}' is empty in the body and argument '{}' " + "is missing or empty. You need to have non empty '{}' " + "when '{}' is present and empty.". + format(GCF_SOURCE_UPLOAD_URL, GCF_ZIP_PATH, GCF_ZIP_PATH, GCF_SOURCE_UPLOAD_URL)) + + def _verify_upload_url_and_zip_path(self): + if GCF_SOURCE_UPLOAD_URL in self.body and self.zip_path: + if not self.body[GCF_SOURCE_UPLOAD_URL]: + self.upload_function = True + else: + raise AirflowException("Only one of '{}' in body or '{}' argument " + "allowed. Found both." + .format(GCF_SOURCE_UPLOAD_URL, GCF_ZIP_PATH)) + + def _verify_archive_url_and_zip_path(self): + if GCF_SOURCE_ARCHIVE_URL in self.body and self.zip_path: + raise AirflowException("Only one of '{}' in body or '{}' argument " + "allowed. Found both." 
+ .format(GCF_SOURCE_ARCHIVE_URL, GCF_ZIP_PATH)) + + def should_upload_function(self): + if self.upload_function is None: + raise AirflowException('validate() method has to be invoked before ' + 'should_upload_function') + return self.upload_function + + def preprocess_body(self): + self._verify_archive_url_and_zip_path() + self._verify_upload_url_and_zip_path() + self._verify_upload_url_and_no_zip_path() + if self.upload_function is None: + self.upload_function = False + + +FUNCTION_NAME_PATTERN = '^projects/[^/]+/locations/[^/]+/functions/[^/]+$' +FUNCTION_NAME_COMPILED_PATTERN = re.compile(FUNCTION_NAME_PATTERN) + + +class GcfFunctionDeleteOperator(BaseOperator): + """ + Deletes the specified function from Google Cloud Functions. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GcfFunctionDeleteOperator` + + :param name: A fully-qualified function name, matching + the pattern: `^projects/[^/]+/locations/[^/]+/functions/[^/]+$` + :type name: str + :param gcp_conn_id: The connection ID to use to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: API version used (for example v1 or v1beta1). + :type api_version: str + """ + # [START gcf_function_delete_template_fields] + template_fields = ('name', 'gcp_conn_id', 'api_version') + # [END gcf_function_delete_template_fields] + + @apply_defaults + def __init__(self, + name, + gcp_conn_id='google_cloud_default', + api_version='v1', + *args, **kwargs): + self.name = name + self.gcp_conn_id = gcp_conn_id + self.api_version = api_version + self._validate_inputs() + self.hook = GcfHook(gcp_conn_id=self.gcp_conn_id, api_version=self.api_version) + super(GcfFunctionDeleteOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if not self.name: + raise AttributeError('Empty parameter: name') + else: + pattern = FUNCTION_NAME_COMPILED_PATTERN + if not pattern.match(self.name): + raise AttributeError( + 'Parameter name must match pattern: {}'.format(FUNCTION_NAME_PATTERN)) + + def execute(self, context): + try: + return self.hook.delete_function(self.name) + except HttpError as e: + status = e.resp.status + if status == 404: + self.log.info('The function does not exist in this project') + else: + self.log.error('An error occurred. Exiting.') + raise e diff --git a/airflow/contrib/operators/gcp_natural_language_operator.py b/airflow/contrib/operators/gcp_natural_language_operator.py new file mode 100644 index 0000000000000..0d47ea461ab10 --- /dev/null +++ b/airflow/contrib/operators/gcp_natural_language_operator.py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
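+"""
+This module contains Google Cloud Natural Language operators:
+``CloudLanguageAnalyzeEntitiesOperator``, ``CloudLanguageAnalyzeEntitySentimentOperator``,
+``CloudLanguageAnalyzeSentimentOperator`` and ``CloudLanguageClassifyTextOperator``.
+"""
+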
+from google.protobuf.json_format import MessageToDict + +from airflow.contrib.hooks.gcp_natural_language_hook import CloudNaturalLanguageHook +from airflow.models import BaseOperator + + +class CloudLanguageAnalyzeEntitiesOperator(BaseOperator): + """ + Finds named entities in the text along with entity types, + salience, mentions for each entity, and other properties. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudLanguageAnalyzeEntitiesOperator` + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: seq[tuple[str, str]]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START natural_langauge_analyze_entities_template_fields] + template_fields = ("document", "gcp_conn_id") + # [END natural_langauge_analyze_entities_template_fields] + + def __init__( + self, + document, + encoding_type=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudLanguageAnalyzeEntitiesOperator, self).__init__(*args, **kwargs) + self.document = document + self.encoding_type = encoding_type + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudNaturalLanguageHook(gcp_conn_id=self.gcp_conn_id) + + self.log.info("Start analyzing entities") + response = hook.analyze_entities( + document=self.document, retry=self.retry, timeout=self.timeout, metadata=self.metadata + ) + self.log.info("Finished analyzing entities") + + return MessageToDict(response) + + +class CloudLanguageAnalyzeEntitySentimentOperator(BaseOperator): + """ + Finds entities, similar to AnalyzeEntities in the text and analyzes sentiment associated with each + entity and its mentions. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudLanguageAnalyzeEntitySentimentOperator` + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. 
+ :type metadata: seq[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START natural_langauge_analyze_entity_sentiment_template_fields] + template_fields = ("document", "gcp_conn_id") + # [END natural_langauge_analyze_entity_sentiment_template_fields] + + def __init__( + self, + document, + encoding_type=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudLanguageAnalyzeEntitySentimentOperator, self).__init__(*args, **kwargs) + self.document = document + self.encoding_type = encoding_type + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudNaturalLanguageHook(gcp_conn_id=self.gcp_conn_id) + + self.log.info("Start entity sentiment analyze") + response = hook.analyze_entity_sentiment( + document=self.document, + encoding_type=self.encoding_type, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + self.log.info("Finished entity sentiment analyze") + + return MessageToDict(response) + + +class CloudLanguageAnalyzeSentimentOperator(BaseOperator): + """ + Analyzes the sentiment of the provided text. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudLanguageAnalyzeSentimentOperator` + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or google.cloud.language_v1.types.Document + :param encoding_type: The encoding type used by the API to calculate offsets. + :type encoding_type: google.cloud.language_v1.types.EncodingType + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :rtype: google.cloud.language_v1.types.AnalyzeEntitiesResponse + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START natural_langauge_analyze_sentiment_template_fields] + template_fields = ("document", "gcp_conn_id") + # [END natural_langauge_analyze_sentiment_template_fields] + + def __init__( + self, + document, + encoding_type=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudLanguageAnalyzeSentimentOperator, self).__init__(*args, **kwargs) + self.document = document + self.encoding_type = encoding_type + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudNaturalLanguageHook(gcp_conn_id=self.gcp_conn_id) + + self.log.info("Start sentiment analyze") + response = hook.analyze_sentiment( + document=self.document, retry=self.retry, timeout=self.timeout, metadata=self.metadata + ) + self.log.info("Finished sentiment analyze") + + return MessageToDict(response) + + +class CloudLanguageClassifyTextOperator(BaseOperator): + """ + Classifies a document into categories. + + .. 
seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudLanguageClassifyTextOperator` + + :param document: Input document. + If a dict is provided, it must be of the same form as the protobuf message Document + :type document: dict or google.cloud.language_v1.types.Document + :param retry: A retry object used to retry requests. If None is specified, requests will not be + retried. + :param timeout: The amount of time, in seconds, to wait for the request to complete. Note that if + retry is specified, the timeout applies to each individual attempt. + :type timeout: float + :param metadata: Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]]] + :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START natural_langauge_classify_text_template_fields] + template_fields = ("document", "gcp_conn_id") + # [END natural_langauge_classify_text_template_fields] + + def __init__( + self, + document, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudLanguageClassifyTextOperator, self).__init__(*args, **kwargs) + self.document = document + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudNaturalLanguageHook(gcp_conn_id=self.gcp_conn_id) + + self.log.info("Start text classify") + response = hook.classify_text( + document=self.document, retry=self.retry, timeout=self.timeout, metadata=self.metadata + ) + self.log.info("Finished text classify") + + return MessageToDict(response) diff --git a/airflow/contrib/operators/gcp_spanner_operator.py b/airflow/contrib/operators/gcp_spanner_operator.py new file mode 100644 index 0000000000000..bea4f676e3ccc --- /dev/null +++ b/airflow/contrib/operators/gcp_spanner_operator.py @@ -0,0 +1,432 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import six + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_spanner_hook import CloudSpannerHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class CloudSpannerInstanceDeployOperator(BaseOperator): + """ + Creates a new Cloud Spanner instance, or if an instance with the same instance_id + exists in the specified project, updates the Cloud Spanner instance. + + :param instance_id: Cloud Spanner instance ID. + :type instance_id: str + :param configuration_name: The name of the Cloud Spanner instance configuration + defining how the instance will be created. Required for + instances that do not yet exist. 
+ :type configuration_name: str + :param node_count: (Optional) The number of nodes allocated to the Cloud Spanner + instance. + :type node_count: int + :param display_name: (Optional) The display name for the Cloud Spanner instance in + the GCP Console. (Must be between 4 and 30 characters.) If this value is not set + in the constructor, the name is the same as the instance ID. + :type display_name: str + :param project_id: Optional, the ID of the project which owns the Cloud Spanner + Database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + """ + # [START gcp_spanner_deploy_template_fields] + template_fields = ('project_id', 'instance_id', 'configuration_name', 'display_name', + 'gcp_conn_id') + # [END gcp_spanner_deploy_template_fields] + + @apply_defaults + def __init__(self, + instance_id, + configuration_name, + node_count, + display_name, + project_id=None, + gcp_conn_id='google_cloud_default', + *args, **kwargs): + self.instance_id = instance_id + self.project_id = project_id + self.configuration_name = configuration_name + self.node_count = node_count + self.display_name = display_name + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id) + super(CloudSpannerInstanceDeployOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is empty") + if not self.instance_id: + raise AirflowException("The required parameter 'instance_id' " + "is empty or None") + + def execute(self, context): + if not self._hook.get_instance(project_id=self.project_id, instance_id=self.instance_id): + self.log.info("Creating Cloud Spanner instance '%s'", self.instance_id) + func = self._hook.create_instance + else: + self.log.info("Updating Cloud Spanner instance '%s'", self.instance_id) + func = self._hook.update_instance + func(project_id=self.project_id, + instance_id=self.instance_id, + configuration_name=self.configuration_name, + node_count=self.node_count, + display_name=self.display_name) + + +class CloudSpannerInstanceDeleteOperator(BaseOperator): + """ + Deletes a Cloud Spanner instance. If an instance does not exist, + no action is taken and the operator succeeds. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSpannerInstanceDeleteOperator` + + :param instance_id: The Cloud Spanner instance ID. + :type instance_id: str + :param project_id: Optional, the ID of the project that owns the Cloud Spanner + Database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + """ + # [START gcp_spanner_delete_template_fields] + template_fields = ('project_id', 'instance_id', 'gcp_conn_id') + # [END gcp_spanner_delete_template_fields] + + @apply_defaults + def __init__(self, + instance_id, + project_id=None, + gcp_conn_id='google_cloud_default', + *args, **kwargs): + self.instance_id = instance_id + self.project_id = project_id + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id) + super(CloudSpannerInstanceDeleteOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is empty") + if not self.instance_id: + raise AirflowException("The required parameter 'instance_id' " + "is empty or None") + + def execute(self, context): + if self._hook.get_instance(project_id=self.project_id, instance_id=self.instance_id): + return self._hook.delete_instance(project_id=self.project_id, + instance_id=self.instance_id) + else: + self.log.info("Instance '%s' does not exist in project '%s'. " + "Aborting delete.", self.instance_id, self.project_id) + return True + + +class CloudSpannerInstanceDatabaseQueryOperator(BaseOperator): + """ + Executes an arbitrary DML query (INSERT, UPDATE, DELETE). + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSpannerInstanceDatabaseQueryOperator` + + :param instance_id: The Cloud Spanner instance ID. + :type instance_id: str + :param database_id: The Cloud Spanner database ID. + :type database_id: str + :param query: The query or list of queries to be executed. Can be a path to a SQL + file. + :type query: str or list + :param project_id: Optional, the ID of the project that owns the Cloud Spanner + Database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + """ + # [START gcp_spanner_query_template_fields] + template_fields = ('project_id', 'instance_id', 'database_id', 'query', 'gcp_conn_id') + template_ext = ('.sql',) + # [END gcp_spanner_query_template_fields] + + @apply_defaults + def __init__(self, + instance_id, + database_id, + query, + project_id=None, + gcp_conn_id='google_cloud_default', + *args, **kwargs): + self.instance_id = instance_id + self.project_id = project_id + self.database_id = database_id + self.query = query + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id) + super(CloudSpannerInstanceDatabaseQueryOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is empty") + if not self.instance_id: + raise AirflowException("The required parameter 'instance_id' " + "is empty or None") + if not self.database_id: + raise AirflowException("The required parameter 'database_id' " + "is empty or None") + if not self.query: + raise AirflowException("The required parameter 'query' is empty") + + def execute(self, context): + queries = self.query + if isinstance(self.query, six.string_types): + queries = [x.strip() for x in self.query.split(';')] + self.sanitize_queries(queries) + self.log.info("Executing DML query(-ies) on " + "projects/%s/instances/%s/databases/%s", + self.project_id, self.instance_id, self.database_id) + self.log.info(queries) + self._hook.execute_dml(project_id=self.project_id, + instance_id=self.instance_id, + database_id=self.database_id, + queries=queries) + + @staticmethod + def sanitize_queries(queries): + if len(queries) and queries[-1] == '': + del queries[-1] + + +class CloudSpannerInstanceDatabaseDeployOperator(BaseOperator): + """ + Creates a new Cloud Spanner database, or if database exists, + the operator does nothing. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSpannerInstanceDatabaseDeployOperator` + + :param instance_id: The Cloud Spanner instance ID. + :type instance_id: str + :param database_id: The Cloud Spanner database ID. + :type database_id: str + :param ddl_statements: The string list containing DDL for the new database. + :type ddl_statements: list[str] + :param project_id: Optional, the ID of the project that owns the Cloud Spanner + Database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. 
+    :type gcp_conn_id: str
+    """
+    # [START gcp_spanner_database_deploy_template_fields]
+    template_fields = ('project_id', 'instance_id', 'database_id', 'ddl_statements',
+                       'gcp_conn_id')
+    template_ext = ('.sql', )
+    # [END gcp_spanner_database_deploy_template_fields]
+
+    @apply_defaults
+    def __init__(self,
+                 instance_id,
+                 database_id,
+                 ddl_statements,
+                 project_id=None,
+                 gcp_conn_id='google_cloud_default',
+                 *args, **kwargs):
+        self.instance_id = instance_id
+        self.project_id = project_id
+        self.database_id = database_id
+        self.ddl_statements = ddl_statements
+        self.gcp_conn_id = gcp_conn_id
+        self._validate_inputs()
+        self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id)
+        super(CloudSpannerInstanceDatabaseDeployOperator, self).__init__(*args, **kwargs)
+
+    def _validate_inputs(self):
+        if self.project_id == '':
+            raise AirflowException("The required parameter 'project_id' is empty")
+        if not self.instance_id:
+            raise AirflowException("The required parameter 'instance_id' is empty "
+                                   "or None")
+        if not self.database_id:
+            raise AirflowException("The required parameter 'database_id' is empty"
+                                   " or None")
+
+    def execute(self, context):
+        if not self._hook.get_database(project_id=self.project_id,
+                                       instance_id=self.instance_id,
+                                       database_id=self.database_id):
+            self.log.info("Creating Cloud Spanner database "
+                          "'%s' in project '%s' and instance '%s'",
+                          self.database_id, self.project_id, self.instance_id)
+            return self._hook.create_database(project_id=self.project_id,
+                                              instance_id=self.instance_id,
+                                              database_id=self.database_id,
+                                              ddl_statements=self.ddl_statements)
+        else:
+            self.log.info("The database '%s' in project '%s' and instance '%s'"
+                          " already exists. Nothing to do. Exiting.",
+                          self.database_id, self.project_id, self.instance_id)
+            return True
+
+
+class CloudSpannerInstanceDatabaseUpdateOperator(BaseOperator):
+    """
+    Updates a Cloud Spanner database with the specified DDL statement.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudSpannerInstanceDatabaseUpdateOperator`
+
+    :param instance_id: The Cloud Spanner instance ID.
+    :type instance_id: str
+    :param database_id: The Cloud Spanner database ID.
+    :type database_id: str
+    :param ddl_statements: The string list containing DDL to apply to the database.
+    :type ddl_statements: list[str]
+    :param project_id: Optional, the ID of the project that owns the Cloud Spanner
+        Database. If set to None or missing, the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param operation_id: (Optional) Unique per database operation id that can
+        be specified to implement idempotency check.
+    :type operation_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+ :type gcp_conn_id: str + """ + # [START gcp_spanner_database_update_template_fields] + template_fields = ('project_id', 'instance_id', 'database_id', 'ddl_statements', + 'gcp_conn_id') + template_ext = ('.sql', ) + # [END gcp_spanner_database_update_template_fields] + + @apply_defaults + def __init__(self, + instance_id, + database_id, + ddl_statements, + project_id=None, + operation_id=None, + gcp_conn_id='google_cloud_default', + *args, **kwargs): + self.instance_id = instance_id + self.project_id = project_id + self.database_id = database_id + self.ddl_statements = ddl_statements + self.operation_id = operation_id + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id) + super(CloudSpannerInstanceDatabaseUpdateOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is empty") + if not self.instance_id: + raise AirflowException("The required parameter 'instance_id' is empty" + " or None") + if not self.database_id: + raise AirflowException("The required parameter 'database_id' is empty" + " or None") + if not self.ddl_statements: + raise AirflowException("The required parameter 'ddl_statements' is empty" + " or None") + + def execute(self, context): + if not self._hook.get_database(project_id=self.project_id, + instance_id=self.instance_id, + database_id=self.database_id): + raise AirflowException("The Cloud Spanner database '{}' in project '{}' and " + "instance '{}' is missing. Create the database first " + "before you can update it.".format(self.database_id, + self.project_id, + self.instance_id)) + else: + return self._hook.update_database(project_id=self.project_id, + instance_id=self.instance_id, + database_id=self.database_id, + ddl_statements=self.ddl_statements, + operation_id=self.operation_id) + + +class CloudSpannerInstanceDatabaseDeleteOperator(BaseOperator): + """ + Deletes a Cloud Spanner database. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSpannerInstanceDatabaseDeleteOperator` + + :param instance_id: Cloud Spanner instance ID. + :type instance_id: str + :param database_id: Cloud Spanner database ID. + :type database_id: str + :param project_id: Optional, the ID of the project that owns the Cloud Spanner + Database. If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + """ + # [START gcp_spanner_database_delete_template_fields] + template_fields = ('project_id', 'instance_id', 'database_id', + 'gcp_conn_id') + # [END gcp_spanner_database_delete_template_fields] + + @apply_defaults + def __init__(self, + instance_id, + database_id, + project_id=None, + gcp_conn_id='google_cloud_default', + *args, **kwargs): + self.instance_id = instance_id + self.project_id = project_id + self.database_id = database_id + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + self._hook = CloudSpannerHook(gcp_conn_id=gcp_conn_id) + super(CloudSpannerInstanceDatabaseDeleteOperator, self).__init__(*args, **kwargs) + + def _validate_inputs(self): + if self.project_id == '': + raise AirflowException("The required parameter 'project_id' is empty") + if not self.instance_id: + raise AirflowException("The required parameter 'instance_id' is empty" + " or None") + if not self.database_id: + raise AirflowException("The required parameter 'database_id' is empty" + " or None") + + def execute(self, context): + db = self._hook.get_database(project_id=self.project_id, + instance_id=self.instance_id, + database_id=self.database_id) + if not db: + self.log.info("The Cloud Spanner database was missing: " + "'%s' in project '%s' and instance '%s'. Assuming success.", + self.database_id, self.project_id, self.instance_id) + return True + else: + return self._hook.delete_database(project_id=self.project_id, + instance_id=self.instance_id, + database_id=self.database_id) diff --git a/airflow/contrib/operators/gcp_sql_operator.py b/airflow/contrib/operators/gcp_sql_operator.py new file mode 100644 index 0000000000000..cdce4baa7591b --- /dev/null +++ b/airflow/contrib/operators/gcp_sql_operator.py @@ -0,0 +1,805 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
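# A minimal usage sketch chaining the Cloud Spanner operators added above; this is
# illustrative only and not part of the patch. The project, instance configuration,
# instance/database IDs and the DDL/DML strings are placeholder assumptions.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcp_spanner_operator import (
    CloudSpannerInstanceDatabaseDeployOperator,
    CloudSpannerInstanceDatabaseQueryOperator,
    CloudSpannerInstanceDeployOperator,
)

with DAG(dag_id='example_gcp_spanner', start_date=datetime(2019, 1, 1),
         schedule_interval=None) as dag:
    # Create (or update) the instance, then the database, then run DML against it.
    deploy_instance = CloudSpannerInstanceDeployOperator(
        task_id='deploy_instance',
        instance_id='example-instance',
        configuration_name='projects/example-project/instanceConfigs/regional-europe-west1',
        node_count=1,
        display_name='Example instance',
    )
    deploy_database = CloudSpannerInstanceDatabaseDeployOperator(
        task_id='deploy_database',
        instance_id='example-instance',
        database_id='example_db',
        ddl_statements=['CREATE TABLE users (id INT64) PRIMARY KEY (id)'],
    )
    run_query = CloudSpannerInstanceDatabaseQueryOperator(
        task_id='run_query',
        instance_id='example-instance',
        database_id='example_db',
        query='INSERT INTO users (id) VALUES (1)',
    )
    deploy_instance >> deploy_database >> run_query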
+from googleapiclient.errors import HttpError + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_sql_hook import CloudSqlHook, CloudSqlDatabaseHook +from airflow.contrib.utils.gcp_field_validator import GcpBodyFieldValidator +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.hooks.base_hook import BaseHook + +SETTINGS = 'settings' +SETTINGS_VERSION = 'settingsVersion' + +CLOUD_SQL_CREATE_VALIDATION = [ + dict(name="name", allow_empty=False), + dict(name="settings", type="dict", fields=[ + dict(name="tier", allow_empty=False), + dict(name="backupConfiguration", type="dict", fields=[ + dict(name="binaryLogEnabled", optional=True), + dict(name="enabled", optional=True), + dict(name="replicationLogArchivingEnabled", optional=True), + dict(name="startTime", allow_empty=False, optional=True) + ], optional=True), + dict(name="activationPolicy", allow_empty=False, optional=True), + dict(name="authorizedGaeApplications", type="list", optional=True), + dict(name="crashSafeReplicationEnabled", optional=True), + dict(name="dataDiskSizeGb", optional=True), + dict(name="dataDiskType", allow_empty=False, optional=True), + dict(name="databaseFlags", type="list", optional=True), + dict(name="ipConfiguration", type="dict", fields=[ + dict(name="authorizedNetworks", type="list", fields=[ + dict(name="expirationTime", optional=True), + dict(name="name", allow_empty=False, optional=True), + dict(name="value", allow_empty=False, optional=True) + ], optional=True), + dict(name="ipv4Enabled", optional=True), + dict(name="privateNetwork", allow_empty=False, optional=True), + dict(name="requireSsl", optional=True), + ], optional=True), + dict(name="locationPreference", type="dict", fields=[ + dict(name="followGaeApplication", allow_empty=False, optional=True), + dict(name="zone", allow_empty=False, optional=True), + ], optional=True), + dict(name="maintenanceWindow", type="dict", fields=[ + dict(name="hour", optional=True), + dict(name="day", optional=True), + dict(name="updateTrack", allow_empty=False, optional=True), + ], optional=True), + dict(name="pricingPlan", allow_empty=False, optional=True), + dict(name="replicationType", allow_empty=False, optional=True), + dict(name="storageAutoResize", optional=True), + dict(name="storageAutoResizeLimit", optional=True), + dict(name="userLabels", type="dict", optional=True), + ]), + dict(name="databaseVersion", allow_empty=False, optional=True), + dict(name="failoverReplica", type="dict", fields=[ + dict(name="name", allow_empty=False) + ], optional=True), + dict(name="masterInstanceName", allow_empty=False, optional=True), + dict(name="onPremisesConfiguration", type="dict", optional=True), + dict(name="region", allow_empty=False, optional=True), + dict(name="replicaConfiguration", type="dict", fields=[ + dict(name="failoverTarget", optional=True), + dict(name="mysqlReplicaConfiguration", type="dict", fields=[ + dict(name="caCertificate", allow_empty=False, optional=True), + dict(name="clientCertificate", allow_empty=False, optional=True), + dict(name="clientKey", allow_empty=False, optional=True), + dict(name="connectRetryInterval", optional=True), + dict(name="dumpFilePath", allow_empty=False, optional=True), + dict(name="masterHeartbeatPeriod", optional=True), + dict(name="password", allow_empty=False, optional=True), + dict(name="sslCipher", allow_empty=False, optional=True), + dict(name="username", allow_empty=False, optional=True), + dict(name="verifyServerCertificate", optional=True) + 
        ], optional=True),
+    ], optional=True)
+]
+CLOUD_SQL_EXPORT_VALIDATION = [
+    dict(name="exportContext", type="dict", fields=[
+        dict(name="fileType", allow_empty=False),
+        dict(name="uri", allow_empty=False),
+        dict(name="databases", optional=True, type="list"),
+        dict(name="sqlExportOptions", type="dict", optional=True, fields=[
+            dict(name="tables", optional=True, type="list"),
+            dict(name="schemaOnly", optional=True)
+        ]),
+        dict(name="csvExportOptions", type="dict", optional=True, fields=[
+            dict(name="selectQuery")
+        ])
+    ])
+]
+CLOUD_SQL_IMPORT_VALIDATION = [
+    dict(name="importContext", type="dict", fields=[
+        dict(name="fileType", allow_empty=False),
+        dict(name="uri", allow_empty=False),
+        dict(name="database", optional=True, allow_empty=False),
+        dict(name="importUser", optional=True),
+        dict(name="csvImportOptions", type="dict", optional=True, fields=[
+            dict(name="table"),
+            dict(name="columns", type="list", optional=True)
+        ])
+    ])
+]
+CLOUD_SQL_DATABASE_CREATE_VALIDATION = [
+    dict(name="instance", allow_empty=False),
+    dict(name="name", allow_empty=False),
+    dict(name="project", allow_empty=False),
+]
+CLOUD_SQL_DATABASE_PATCH_VALIDATION = [
+    dict(name="instance", optional=True),
+    dict(name="name", optional=True),
+    dict(name="project", optional=True),
+    dict(name="etag", optional=True),
+    dict(name="charset", optional=True),
+    dict(name="collation", optional=True),
+]
+
+
+class CloudSqlBaseOperator(BaseOperator):
+    """
+    Abstract base operator for Google Cloud SQL operators to inherit from.
+
+    :param instance: Cloud SQL instance ID. This does not include the project ID.
+    :type instance: str
+    :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing,
+        the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1beta4).
+    :type api_version: str
+    """
+    @apply_defaults
+    def __init__(self,
+                 instance,
+                 project_id=None,
+                 gcp_conn_id='google_cloud_default',
+                 api_version='v1beta4',
+                 *args, **kwargs):
+        self.project_id = project_id
+        self.instance = instance
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+        self._hook = CloudSqlHook(gcp_conn_id=self.gcp_conn_id,
+                                  api_version=self.api_version)
+        super(CloudSqlBaseOperator, self).__init__(*args, **kwargs)
+
+    def _validate_inputs(self):
+        if self.project_id == '':
+            raise AirflowException("The required parameter 'project_id' is empty")
+        if not self.instance:
+            raise AirflowException("The required parameter 'instance' is empty or None")
+
+    def _check_if_instance_exists(self, instance):
+        try:
+            return self._hook.get_instance(project_id=self.project_id,
+                                           instance=instance)
+        except HttpError as e:
+            status = e.resp.status
+            if status == 404:
+                return False
+            raise e
+
+    def _check_if_db_exists(self, db_name):
+        try:
+            return self._hook.get_database(
+                project_id=self.project_id,
+                instance=self.instance,
+                database=db_name)
+        except HttpError as e:
+            status = e.resp.status
+            if status == 404:
+                return False
+            raise e
+
+    def execute(self, context):
+        pass
+
+    @staticmethod
+    def _get_settings_version(instance):
+        return instance.get(SETTINGS).get(SETTINGS_VERSION)
+
+
+class CloudSqlInstanceCreateOperator(CloudSqlBaseOperator):
+    """
+    Creates a new Cloud SQL instance.
+    If an instance with the same name exists, no action will be taken and
+    the operator will succeed.
+
+    ..
seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstanceCreateOperator` + + :param body: Body required by the Cloud SQL insert API, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/insert + #request-body + :type body: dict + :param instance: Cloud SQL instance ID. This does not include the project ID. + :type instance: str + :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: API version used (e.g. v1beta4). + :type api_version: str + :param validate_body: True if body should be validated, False otherwise. + :type validate_body: bool + """ + # [START gcp_sql_create_template_fields] + template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version') + # [END gcp_sql_create_template_fields] + + @apply_defaults + def __init__(self, + body, + instance, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1beta4', + validate_body=True, + *args, **kwargs): + self.body = body + self.validate_body = validate_body + super(CloudSqlInstanceCreateOperator, self).__init__( + project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id, + api_version=api_version, *args, **kwargs) + + def _validate_inputs(self): + super(CloudSqlInstanceCreateOperator, self)._validate_inputs() + if not self.body: + raise AirflowException("The required parameter 'body' is empty") + + def _validate_body_fields(self): + if self.validate_body: + GcpBodyFieldValidator(CLOUD_SQL_CREATE_VALIDATION, + api_version=self.api_version).validate(self.body) + + def execute(self, context): + self._validate_body_fields() + if not self._check_if_instance_exists(self.instance): + self._hook.create_instance( + project_id=self.project_id, + body=self.body) + else: + self.log.info("Cloud SQL instance with ID {} already exists. " + "Aborting create.".format(self.instance)) + + instance_resource = self._hook.get_instance(project_id=self.project_id, + instance=self.instance) + service_account_email = instance_resource["serviceAccountEmailAddress"] + task_instance = context['task_instance'] + task_instance.xcom_push(key="service_account_email", value=service_account_email) + + +class CloudSqlInstancePatchOperator(CloudSqlBaseOperator): + """ + Updates settings of a Cloud SQL instance. + + Caution: This is a partial update, so only included values for the settings will be + updated. + + In the request body, supply the relevant portions of an instance resource, according + to the rules of patch semantics. + https://cloud.google.com/sql/docs/mysql/admin-api/how-tos/performance#patch + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstancePatchOperator` + + :param body: Body required by the Cloud SQL patch API, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/patch#request-body + :type body: dict + :param instance: Cloud SQL instance ID. This does not include the project ID. + :type instance: str + :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. 
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1beta4).
+    :type api_version: str
+    """
+    # [START gcp_sql_patch_template_fields]
+    template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version')
+    # [END gcp_sql_patch_template_fields]
+
+    @apply_defaults
+    def __init__(self,
+                 body,
+                 instance,
+                 project_id=None,
+                 gcp_conn_id='google_cloud_default',
+                 api_version='v1beta4',
+                 *args, **kwargs):
+        self.body = body
+        super(CloudSqlInstancePatchOperator, self).__init__(
+            project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id,
+            api_version=api_version, *args, **kwargs)
+
+    def _validate_inputs(self):
+        super(CloudSqlInstancePatchOperator, self)._validate_inputs()
+        if not self.body:
+            raise AirflowException("The required parameter 'body' is empty")
+
+    def execute(self, context):
+        if not self._check_if_instance_exists(self.instance):
+            raise AirflowException('Cloud SQL instance with ID {} does not exist. '
+                                   'Please specify another instance to patch.'
+                                   .format(self.instance))
+        else:
+            return self._hook.patch_instance(
+                project_id=self.project_id,
+                body=self.body,
+                instance=self.instance)
+
+
+class CloudSqlInstanceDeleteOperator(CloudSqlBaseOperator):
+    """
+    Deletes a Cloud SQL instance.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudSqlInstanceDeleteOperator`
+
+    :param instance: Cloud SQL instance ID. This does not include the project ID.
+    :type instance: str
+    :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing,
+        the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1beta4).
+    :type api_version: str
+    """
+    # [START gcp_sql_delete_template_fields]
+    template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version')
+    # [END gcp_sql_delete_template_fields]
+
+    @apply_defaults
+    def __init__(self,
+                 instance,
+                 project_id=None,
+                 gcp_conn_id='google_cloud_default',
+                 api_version='v1beta4',
+                 *args, **kwargs):
+        super(CloudSqlInstanceDeleteOperator, self).__init__(
+            project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id,
+            api_version=api_version, *args, **kwargs)
+
+    def execute(self, context):
+        if not self._check_if_instance_exists(self.instance):
+            self.log.info("Cloud SQL instance with ID {} does not exist. Aborting delete."
+                          .format(self.instance))
+            return True
+        else:
+            return self._hook.delete_instance(
+                project_id=self.project_id,
+                instance=self.instance)
+
+
+class CloudSqlInstanceDatabaseCreateOperator(CloudSqlBaseOperator):
+    """
+    Creates a new database inside a Cloud SQL instance.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudSqlInstanceDatabaseCreateOperator`
+
+    :param instance: Database instance ID. This does not include the project ID.
+    :type instance: str
+    :param body: The request body, as described in
+        https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/databases/insert#request-body
+    :type body: dict
+    :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing,
+        the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1beta4).
+ :type api_version: str + :param validate_body: Whether the body should be validated. Defaults to True. + :type validate_body: bool + """ + # [START gcp_sql_db_create_template_fields] + template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version') + # [END gcp_sql_db_create_template_fields] + + @apply_defaults + def __init__(self, + instance, + body, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1beta4', + validate_body=True, + *args, **kwargs): + self.body = body + self.validate_body = validate_body + super(CloudSqlInstanceDatabaseCreateOperator, self).__init__( + project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id, + api_version=api_version, *args, **kwargs) + + def _validate_inputs(self): + super(CloudSqlInstanceDatabaseCreateOperator, self)._validate_inputs() + if not self.body: + raise AirflowException("The required parameter 'body' is empty") + + def _validate_body_fields(self): + if self.validate_body: + GcpBodyFieldValidator(CLOUD_SQL_DATABASE_CREATE_VALIDATION, + api_version=self.api_version).validate(self.body) + + def execute(self, context): + self._validate_body_fields() + database = self.body.get("name") + if not database: + self.log.error("Body doesn't contain 'name'. Cannot check if the" + " database already exists in the instance {}." + .format(self.instance)) + return False + if self._check_if_db_exists(database): + self.log.info("Cloud SQL instance with ID {} already contains database" + " '{}'. Aborting database insert." + .format(self.instance, database)) + return True + else: + return self._hook.create_database(project_id=self.project_id, + instance=self.instance, + body=self.body) + + +class CloudSqlInstanceDatabasePatchOperator(CloudSqlBaseOperator): + """ + Updates a resource containing information about a database inside a Cloud SQL + instance using patch semantics. + See: https://cloud.google.com/sql/docs/mysql/admin-api/how-tos/performance#patch + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstanceDatabasePatchOperator` + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param database: Name of the database to be updated in the instance. + :type database: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/databases/patch#request-body + :type body: dict + :param project_id: Optional, Google Cloud Platform Project ID. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: API version used (e.g. v1beta4). + :type api_version: str + :param validate_body: Whether the body should be validated. Defaults to True. 
+ :type validate_body: bool + """ + # [START gcp_sql_db_patch_template_fields] + template_fields = ('project_id', 'instance', 'database', 'gcp_conn_id', + 'api_version') + # [END gcp_sql_db_patch_template_fields] + + @apply_defaults + def __init__(self, + instance, + database, + body, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1beta4', + validate_body=True, + *args, **kwargs): + self.database = database + self.body = body + self.validate_body = validate_body + super(CloudSqlInstanceDatabasePatchOperator, self).__init__( + project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id, + api_version=api_version, *args, **kwargs) + + def _validate_inputs(self): + super(CloudSqlInstanceDatabasePatchOperator, self)._validate_inputs() + if not self.body: + raise AirflowException("The required parameter 'body' is empty") + if not self.database: + raise AirflowException("The required parameter 'database' is empty") + + def _validate_body_fields(self): + if self.validate_body: + GcpBodyFieldValidator(CLOUD_SQL_DATABASE_PATCH_VALIDATION, + api_version=self.api_version).validate(self.body) + + def execute(self, context): + self._validate_body_fields() + if not self._check_if_db_exists(self.database): + raise AirflowException("Cloud SQL instance with ID {} does not contain " + "database '{}'. " + "Please specify another database to patch." + .format(self.instance, self.database)) + else: + return self._hook.patch_database( + project_id=self.project_id, + instance=self.instance, + database=self.database, + body=self.body) + + +class CloudSqlInstanceDatabaseDeleteOperator(CloudSqlBaseOperator): + """ + Deletes a database from a Cloud SQL instance. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstanceDatabaseDeleteOperator` + + :param instance: Database instance ID. This does not include the project ID. + :type instance: str + :param database: Name of the database to be deleted in the instance. + :type database: str + :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: API version used (e.g. v1beta4). + :type api_version: str + """ + # [START gcp_sql_db_delete_template_fields] + template_fields = ('project_id', 'instance', 'database', 'gcp_conn_id', + 'api_version') + # [END gcp_sql_db_delete_template_fields] + + @apply_defaults + def __init__(self, + instance, + database, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1beta4', + *args, **kwargs): + self.database = database + super(CloudSqlInstanceDatabaseDeleteOperator, self).__init__( + project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id, + api_version=api_version, *args, **kwargs) + + def _validate_inputs(self): + super(CloudSqlInstanceDatabaseDeleteOperator, self)._validate_inputs() + if not self.database: + raise AirflowException("The required parameter 'database' is empty") + + def execute(self, context): + if not self._check_if_db_exists(self.database): + print("Cloud SQL instance with ID {} does not contain database '{}'. " + "Aborting database delete." 
+ .format(self.instance, self.database)) + return True + else: + return self._hook.delete_database( + project_id=self.project_id, + instance=self.instance, + database=self.database) + + +class CloudSqlInstanceExportOperator(CloudSqlBaseOperator): + """ + Exports data from a Cloud SQL instance to a Cloud Storage bucket as a SQL dump + or CSV file. + + Note: This operator is idempotent. If executed multiple times with the same + export file URI, the export file in GCS will simply be overridden. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstanceImportOperator` + + :param instance: Cloud SQL instance ID. This does not include the project ID. + :type instance: str + :param body: The request body, as described in + https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/export#request-body + :type body: dict + :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing, + the default project_id from the GCP connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + :param api_version: API version used (e.g. v1beta4). + :type api_version: str + :param validate_body: Whether the body should be validated. Defaults to True. + :type validate_body: bool + """ + # [START gcp_sql_export_template_fields] + template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version') + # [END gcp_sql_export_template_fields] + + @apply_defaults + def __init__(self, + instance, + body, + project_id=None, + gcp_conn_id='google_cloud_default', + api_version='v1beta4', + validate_body=True, + *args, **kwargs): + self.body = body + self.validate_body = validate_body + super(CloudSqlInstanceExportOperator, self).__init__( + project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id, + api_version=api_version, *args, **kwargs) + + def _validate_inputs(self): + super(CloudSqlInstanceExportOperator, self)._validate_inputs() + if not self.body: + raise AirflowException("The required parameter 'body' is empty") + + def _validate_body_fields(self): + if self.validate_body: + GcpBodyFieldValidator(CLOUD_SQL_EXPORT_VALIDATION, + api_version=self.api_version).validate(self.body) + + def execute(self, context): + self._validate_body_fields() + return self._hook.export_instance( + project_id=self.project_id, + instance=self.instance, + body=self.body) + + +class CloudSqlInstanceImportOperator(CloudSqlBaseOperator): + """ + Imports data into a Cloud SQL instance from a SQL dump or CSV file in Cloud Storage. + + CSV IMPORT: + + This operator is NOT idempotent for a CSV import. If the same file is imported + multiple times, the imported data will be duplicated in the database. + Moreover, if there are any unique constraints the duplicate import may result in an + error. + + SQL IMPORT: + + This operator is idempotent for a SQL import if it was also exported by Cloud SQL. + The exported SQL contains 'DROP TABLE IF EXISTS' statements for all tables + to be imported. + + If the import file was generated in a different way, idempotence is not guaranteed. + It has to be ensured on the SQL file level. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudSqlInstanceImportOperator` + + :param instance: Cloud SQL instance ID. This does not include the project ID. 
+    :type instance: str
+    :param body: The request body, as described in
+        https://cloud.google.com/sql/docs/mysql/admin-api/v1beta4/instances/import#request-body
+    :type body: dict
+    :param project_id: Optional, Google Cloud Platform Project ID. If set to None or missing,
+        the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1beta4).
+    :type api_version: str
+    :param validate_body: Whether the body should be validated. Defaults to True.
+    :type validate_body: bool
+    """
+    # [START gcp_sql_import_template_fields]
+    template_fields = ('project_id', 'instance', 'gcp_conn_id', 'api_version')
+    # [END gcp_sql_import_template_fields]
+
+    @apply_defaults
+    def __init__(self,
+                 instance,
+                 body,
+                 project_id=None,
+                 gcp_conn_id='google_cloud_default',
+                 api_version='v1beta4',
+                 validate_body=True,
+                 *args, **kwargs):
+        self.body = body
+        self.validate_body = validate_body
+        super(CloudSqlInstanceImportOperator, self).__init__(
+            project_id=project_id, instance=instance, gcp_conn_id=gcp_conn_id,
+            api_version=api_version, *args, **kwargs)
+
+    def _validate_inputs(self):
+        super(CloudSqlInstanceImportOperator, self)._validate_inputs()
+        if not self.body:
+            raise AirflowException("The required parameter 'body' is empty")
+
+    def _validate_body_fields(self):
+        if self.validate_body:
+            GcpBodyFieldValidator(CLOUD_SQL_IMPORT_VALIDATION,
+                                  api_version=self.api_version).validate(self.body)
+
+    def execute(self, context):
+        self._validate_body_fields()
+        return self._hook.import_instance(
+            project_id=self.project_id,
+            instance=self.instance,
+            body=self.body)
+
+
+class CloudSqlQueryOperator(BaseOperator):
+    """
+    Performs a DML or DDL query on an existing Cloud SQL instance. It optionally uses
+    cloud-sql-proxy to establish a secure connection with the database.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudSqlQueryOperator`
+
+    :param sql: SQL query or list of queries to run (should be a DML or DDL query -
+        this operator does not return any data from the database,
+        so it is useless to pass it DQL queries). Note that it is the responsibility of the
+        author of the queries to make sure that the queries are idempotent. For example,
+        you can use CREATE TABLE IF NOT EXISTS to create a table.
+    :type sql: str or list[str]
+    :param parameters: (optional) the parameters to render the SQL query with.
+    :type parameters: mapping or iterable
+    :param autocommit: if True, each command is automatically committed.
+        (default value: False)
+    :type autocommit: bool
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform for
+        cloud-sql-proxy authentication.
+    :type gcp_conn_id: str
+    :param gcp_cloudsql_conn_id: The connection ID used to connect to Google Cloud SQL;
+       its schema should be gcpcloudsql://.
+       See :class:`~airflow.contrib.hooks.gcp_sql_hook.CloudSqlDatabaseHook` for
+       details on how to define gcpcloudsql:// connection.
+ :type gcp_cloudsql_conn_id: str + """ + # [START gcp_sql_query_template_fields] + template_fields = ('sql', 'gcp_cloudsql_conn_id', 'gcp_conn_id') + template_ext = ('.sql',) + # [END gcp_sql_query_template_fields] + + @apply_defaults + def __init__(self, + sql, + autocommit=False, + parameters=None, + gcp_conn_id='google_cloud_default', + gcp_cloudsql_conn_id='google_cloud_sql_default', + *args, **kwargs): + super(CloudSqlQueryOperator, self).__init__(*args, **kwargs) + self.sql = sql + self.gcp_conn_id = gcp_conn_id + self.gcp_cloudsql_conn_id = gcp_cloudsql_conn_id + self.autocommit = autocommit + self.parameters = parameters + self.gcp_connection = BaseHook.get_connection(self.gcp_conn_id) + self.cloudsql_db_hook = CloudSqlDatabaseHook( + gcp_cloudsql_conn_id=gcp_cloudsql_conn_id, + default_gcp_project_id=self.gcp_connection.extra_dejson.get( + 'extra__google_cloud_platform__project')) + self.cloud_sql_proxy_runner = None + self.database_hook = None + + def execute(self, context): + self.cloudsql_db_hook.validate_ssl_certs() + self.cloudsql_db_hook.create_connection() + try: + self.cloudsql_db_hook.validate_socket_path_length() + self.database_hook = self.cloudsql_db_hook.get_database_hook() + try: + try: + if self.cloudsql_db_hook.use_proxy: + self.cloud_sql_proxy_runner = self.cloudsql_db_hook.\ + get_sqlproxy_runner() + self.cloudsql_db_hook.free_reserved_port() + # There is very, very slim chance that the socket will + # be taken over here by another bind(0). + # It's quite unlikely to happen though! + self.cloud_sql_proxy_runner.start_proxy() + self.log.info('Executing: "%s"', self.sql) + self.database_hook.run(self.sql, self.autocommit, + parameters=self.parameters) + finally: + if self.cloud_sql_proxy_runner: + self.cloud_sql_proxy_runner.stop_proxy() + self.cloud_sql_proxy_runner = None + finally: + self.cloudsql_db_hook.cleanup_database_hook() + finally: + self.cloudsql_db_hook.delete_connection() + self.cloudsql_db_hook = None diff --git a/airflow/contrib/operators/gcp_transfer_operator.py b/airflow/contrib/operators/gcp_transfer_operator.py new file mode 100644 index 0000000000000..c961bdd97ece2 --- /dev/null +++ b/airflow/contrib/operators/gcp_transfer_operator.py @@ -0,0 +1,798 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
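# A minimal usage sketch for the CloudSqlQueryOperator defined above; this is
# illustrative only and not part of the patch. The DAG id, connection ID and SQL
# statement are placeholder assumptions; 'gcp_cloudsql_conn_id' must point to a
# gcpcloudsql:// connection as described in the operator docstring.
from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.gcp_sql_operator import CloudSqlQueryOperator

with DAG(dag_id='example_gcp_sql_query', start_date=datetime(2019, 1, 1),
         schedule_interval=None) as dag:
    create_table = CloudSqlQueryOperator(
        task_id='create_table',
        gcp_cloudsql_conn_id='my_gcpcloudsql_connection',
        # Idempotent DDL, as recommended in the operator docstring.
        sql='CREATE TABLE IF NOT EXISTS users (id INT, name VARCHAR(100))',
        autocommit=True,
    )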
+# +from copy import deepcopy +from datetime import date, time + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_transfer_hook import ( + GCPTransferServiceHook, + GcpTransferJobsStatus, + TRANSFER_OPTIONS, + OBJECT_CONDITIONS, + PROJECT_ID, + BUCKET_NAME, + GCS_DATA_SINK, + STATUS, + DESCRIPTION, + GCS_DATA_SOURCE, + HTTP_DATA_SOURCE, + SECONDS, + MINUTES, + HOURS, + YEAR, + MONTH, + DAY, + START_TIME_OF_DAY, + SCHEDULE_END_DATE, + SCHEDULE_START_DATE, + SCHEDULE, + SECRET_ACCESS_KEY, + ACCESS_KEY_ID, + AWS_ACCESS_KEY, + AWS_S3_DATA_SOURCE, + TRANSFER_SPEC, +) +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + +try: + from airflow.contrib.hooks.aws_hook import AwsHook +except ImportError: # pragma: no cover + AwsHook = None # type: ignore + + +class TransferJobPreprocessor: + def __init__(self, body, aws_conn_id='aws_default'): + self.body = body + self.aws_conn_id = aws_conn_id + + def _inject_aws_credentials(self): + if TRANSFER_SPEC not in self.body or AWS_S3_DATA_SOURCE not in self.body[TRANSFER_SPEC]: + return + + aws_hook = AwsHook(self.aws_conn_id) + aws_credentials = aws_hook.get_credentials() + aws_access_key_id = aws_credentials.access_key + aws_secret_access_key = aws_credentials.secret_key + self.body[TRANSFER_SPEC][AWS_S3_DATA_SOURCE][AWS_ACCESS_KEY] = { + ACCESS_KEY_ID: aws_access_key_id, + SECRET_ACCESS_KEY: aws_secret_access_key, + } + + def _reformat_date(self, field_key): + schedule = self.body[SCHEDULE] + if field_key not in schedule: + return + if isinstance(schedule[field_key], date): + schedule[field_key] = self._convert_date_to_dict(schedule[field_key]) + + def _reformat_time(self, field_key): + schedule = self.body[SCHEDULE] + if field_key not in schedule: + return + if isinstance(schedule[field_key], time): + schedule[field_key] = self._convert_time_to_dict(schedule[field_key]) + + def _reformat_schedule(self): + if SCHEDULE not in self.body: + return + self._reformat_date(SCHEDULE_START_DATE) + self._reformat_date(SCHEDULE_END_DATE) + self._reformat_time(START_TIME_OF_DAY) + + def process_body(self): + self._inject_aws_credentials() + self._reformat_schedule() + return self.body + + @staticmethod + def _convert_date_to_dict(field_date): + """ + Convert native python ``datetime.date`` object to a format supported by the API + """ + return {DAY: field_date.day, MONTH: field_date.month, YEAR: field_date.year} + + @staticmethod + def _convert_time_to_dict(time): + """ + Convert native python ``datetime.time`` object to a format supported by the API + """ + return {HOURS: time.hour, MINUTES: time.minute, SECONDS: time.second} + + +class TransferJobValidator: + def __init__(self, body): + self.body = body + + def _verify_data_source(self): + is_gcs = GCS_DATA_SOURCE in self.body[TRANSFER_SPEC] + is_aws_s3 = AWS_S3_DATA_SOURCE in self.body[TRANSFER_SPEC] + is_http = HTTP_DATA_SOURCE in self.body[TRANSFER_SPEC] + + sources_count = sum([is_gcs, is_aws_s3, is_http]) + if sources_count != 0 and sources_count != 1: + raise AirflowException( + "More than one data source detected. Please choose exactly one data source from: " + "gcsDataSource, awsS3DataSource and httpDataSource." + ) + + def _restrict_aws_credentials(self): + if AWS_S3_DATA_SOURCE not in self.body[TRANSFER_SPEC]: + return + + if AWS_ACCESS_KEY in self.body[TRANSFER_SPEC][AWS_S3_DATA_SOURCE]: + raise AirflowException( + "AWS credentials detected inside the body parameter (awsAccessKey). 
This is not allowed, " + "please use Airflow connections to store credentials." + ) + + def _restrict_empty_body(self): + if not self.body: + raise AirflowException("The required parameter 'body' is empty or None") + + def validate_body(self): + self._restrict_empty_body() + + if TRANSFER_SPEC not in self.body: + return + + self._restrict_aws_credentials() + self._verify_data_source() + + +class GcpTransferServiceJobCreateOperator(BaseOperator): + """ + Creates a transfer job that runs periodically. + + .. warning:: + + This operator is NOT idempotent. If you run it many times, many transfer + jobs will be created in the Google Cloud Platform. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GcpTransferServiceJobCreateOperator` + + :param body: (Required) The request body, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/create#request-body + With three additional improvements: + + * dates can be given in the form :class:`datetime.date` + * times can be given in the form :class:`datetime.time` + * credentials to Amazon Web Service should be stored in the connection and indicated by the + aws_conn_id parameter + + :type body: dict + :param aws_conn_id: The connection ID used to retrieve credentials to + Amazon Web Service. + :type aws_conn_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud + Platform. + :type gcp_conn_id: str + :param api_version: API version used (e.g. v1). + :type api_version: str + """ + + # [START gcp_transfer_job_create_template_fields] + template_fields = ('body', 'gcp_conn_id', 'aws_conn_id') + # [END gcp_transfer_job_create_template_fields] + + @apply_defaults + def __init__( + self, + body, + aws_conn_id='aws_default', + gcp_conn_id='google_cloud_default', + api_version='v1', + *args, + **kwargs + ): + super(GcpTransferServiceJobCreateOperator, self).__init__(*args, **kwargs) + self.body = deepcopy(body) + self.aws_conn_id = aws_conn_id + self.gcp_conn_id = gcp_conn_id + self.api_version = api_version + self._validate_inputs() + + def _validate_inputs(self): + TransferJobValidator(body=self.body).validate_body() + + def execute(self, context): + TransferJobPreprocessor(body=self.body, aws_conn_id=self.aws_conn_id).process_body() + hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id) + return hook.create_transfer_job(body=self.body) + + +class GcpTransferServiceJobUpdateOperator(BaseOperator): + """ + Updates a transfer job that runs periodically. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GcpTransferServiceJobUpdateOperator` + + :param job_name: (Required) Name of the job to be updated + :type job_name: str + :param body: (Required) The request body, as described in + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/patch#request-body + With three additional improvements: + + * dates can be given in the form :class:`datetime.date` + * times can be given in the form :class:`datetime.time` + * credentials to Amazon Web Service should be stored in the connection and indicated by the + aws_conn_id parameter + + :type body: dict + :param aws_conn_id: The connection ID used to retrieve credentials to + Amazon Web Service. + :type aws_conn_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud + Platform. 
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_job_update_template_fields]
+    template_fields = ('job_name', 'body', 'gcp_conn_id', 'aws_conn_id')
+    # [END gcp_transfer_job_update_template_fields]
+
+    @apply_defaults
+    def __init__(
+        self,
+        job_name,
+        body,
+        aws_conn_id='aws_default',
+        gcp_conn_id='google_cloud_default',
+        api_version='v1',
+        *args,
+        **kwargs
+    ):
+        super(GcpTransferServiceJobUpdateOperator, self).__init__(*args, **kwargs)
+        self.job_name = job_name
+        self.body = body
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self.aws_conn_id = aws_conn_id
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        TransferJobValidator(body=self.body).validate_body()
+        if not self.job_name:
+            raise AirflowException("The required parameter 'job_name' is empty or None")
+
+    def execute(self, context):
+        TransferJobPreprocessor(body=self.body, aws_conn_id=self.aws_conn_id).process_body()
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        return hook.update_transfer_job(job_name=self.job_name, body=self.body)
+
+
+class GcpTransferServiceJobDeleteOperator(BaseOperator):
+    """
+    Deletes a transfer job. This is a soft delete. After a transfer job is
+    deleted, the job and all the transfer executions are subject to garbage
+    collection. Transfer jobs become eligible for garbage collection
+    30 days after soft delete.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceJobDeleteOperator`
+
+    :param job_name: (Required) Name of the transfer job to be deleted.
+    :type job_name: str
+    :param project_id: (Optional) the ID of the project that owns the Transfer
+        Job. If set to None or missing, the default project_id from the GCP
+        connection is used.
+    :type project_id: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud
+        Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_job_delete_template_fields]
+    template_fields = ('job_name', 'project_id', 'gcp_conn_id', 'api_version')
+    # [END gcp_transfer_job_delete_template_fields]
+
+    @apply_defaults
+    def __init__(
+        self, job_name, gcp_conn_id='google_cloud_default', api_version='v1', project_id=None, *args, **kwargs
+    ):
+        super(GcpTransferServiceJobDeleteOperator, self).__init__(*args, **kwargs)
+        self.job_name = job_name
+        self.project_id = project_id
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        if not self.job_name:
+            raise AirflowException("The required parameter 'job_name' is empty or None")
+
+    def execute(self, context):
+        self._validate_inputs()
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        hook.delete_transfer_job(job_name=self.job_name, project_id=self.project_id)
+
+
+class GcpTransferServiceOperationGetOperator(BaseOperator):
+    """
+    Gets the latest state of a long-running operation in Google Storage Transfer
+    Service.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceOperationGetOperator`
+
+    :param operation_name: (Required) Name of the transfer operation.
+    :type operation_name: str
+    :param gcp_conn_id: The connection ID used to connect to Google
+        Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_operation_get_template_fields]
+    template_fields = ('operation_name', 'gcp_conn_id')
+    # [END gcp_transfer_operation_get_template_fields]
+
+    @apply_defaults
+    def __init__(self, operation_name, gcp_conn_id='google_cloud_default', api_version='v1', *args, **kwargs):
+        super(GcpTransferServiceOperationGetOperator, self).__init__(*args, **kwargs)
+        self.operation_name = operation_name
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        if not self.operation_name:
+            raise AirflowException("The required parameter 'operation_name' is empty or None")
+
+    def execute(self, context):
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        operation = hook.get_transfer_operation(operation_name=self.operation_name)
+        return operation
+
+
+class GcpTransferServiceOperationsListOperator(BaseOperator):
+    """
+    Lists long-running operations in Google Storage Transfer
+    Service that match the specified filter.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceOperationsListOperator`
+
+    :param filter: (Required) A request filter, as described in
+        https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs/list#body.QUERY_PARAMETERS.filter
+    :type filter: dict
+    :param gcp_conn_id: The connection ID used to connect to Google
+        Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_operations_list_template_fields]
+    template_fields = ('filter', 'gcp_conn_id')
+    # [END gcp_transfer_operations_list_template_fields]
+
+    @apply_defaults
+    def __init__(self, filter, gcp_conn_id='google_cloud_default', api_version='v1', *args, **kwargs):
+        super(GcpTransferServiceOperationsListOperator, self).__init__(*args, **kwargs)
+        self.filter = filter
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        if not self.filter:
+            raise AirflowException("The required parameter 'filter' is empty or None")
+
+    def execute(self, context):
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        operations_list = hook.list_transfer_operations(filter=self.filter)
+        self.log.info(operations_list)
+        return operations_list
+
+
+class GcpTransferServiceOperationPauseOperator(BaseOperator):
+    """
+    Pauses a transfer operation in Google Storage Transfer Service.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceOperationPauseOperator`
+
+    :param operation_name: (Required) Name of the transfer operation.
+    :type operation_name: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_operation_pause_template_fields]
+    template_fields = ('operation_name', 'gcp_conn_id', 'api_version')
+    # [END gcp_transfer_operation_pause_template_fields]
+
+    @apply_defaults
+    def __init__(self, operation_name, gcp_conn_id='google_cloud_default', api_version='v1', *args, **kwargs):
+        super(GcpTransferServiceOperationPauseOperator, self).__init__(*args, **kwargs)
+        self.operation_name = operation_name
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        if not self.operation_name:
+            raise AirflowException("The required parameter 'operation_name' is empty or None")
+
+    def execute(self, context):
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        hook.pause_transfer_operation(operation_name=self.operation_name)
+
+
+class GcpTransferServiceOperationResumeOperator(BaseOperator):
+    """
+    Resumes a transfer operation in Google Storage Transfer Service.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceOperationResumeOperator`
+
+    :param operation_name: (Required) Name of the transfer operation.
+    :type operation_name: str
+    :param gcp_conn_id: The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    """
+
+    # [START gcp_transfer_operation_resume_template_fields]
+    template_fields = ('operation_name', 'gcp_conn_id', 'api_version')
+    # [END gcp_transfer_operation_resume_template_fields]
+
+    @apply_defaults
+    def __init__(self, operation_name, gcp_conn_id='google_cloud_default', api_version='v1', *args, **kwargs):
+        super(GcpTransferServiceOperationResumeOperator, self).__init__(*args, **kwargs)
+        self.operation_name = operation_name
+        self.gcp_conn_id = gcp_conn_id
+        self.api_version = api_version
+        self._validate_inputs()
+
+    def _validate_inputs(self):
+        if not self.operation_name:
+            raise AirflowException("The required parameter 'operation_name' is empty or None")
+
+    def execute(self, context):
+        hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id)
+        hook.resume_transfer_operation(operation_name=self.operation_name)
+
+
+class GcpTransferServiceOperationCancelOperator(BaseOperator):
+    """
+    Cancels a transfer operation in Google Storage Transfer Service.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:GcpTransferServiceOperationCancelOperator`
+
+    :param operation_name: (Required) Name of the transfer operation.
+    :type operation_name: str
+    :param api_version: API version used (e.g. v1).
+    :type api_version: str
+    :param gcp_conn_id: The connection ID used to connect to Google
+        Cloud Platform.
+ :type gcp_conn_id: str + """ + + # [START gcp_transfer_operation_cancel_template_fields] + template_fields = ('operation_name', 'gcp_conn_id', 'api_version') + # [END gcp_transfer_operation_cancel_template_fields] + + @apply_defaults + def __init__(self, operation_name, api_version='v1', gcp_conn_id='google_cloud_default', *args, **kwargs): + super(GcpTransferServiceOperationCancelOperator, self).__init__(*args, **kwargs) + self.operation_name = operation_name + self.api_version = api_version + self.gcp_conn_id = gcp_conn_id + self._validate_inputs() + + def _validate_inputs(self): + if not self.operation_name: + raise AirflowException("The required parameter 'operation_name' is empty or None") + + def execute(self, context): + hook = GCPTransferServiceHook(api_version=self.api_version, gcp_conn_id=self.gcp_conn_id) + hook.cancel_transfer_operation(operation_name=self.operation_name) + + +class S3ToGoogleCloudStorageTransferOperator(BaseOperator): + """ + Synchronizes an S3 bucket with a Google Cloud Storage bucket using the + GCP Storage Transfer Service. + + .. warning:: + + This operator is NOT idempotent. If you run it many times, many transfer + jobs will be created in the Google Cloud Platform. + + **Example**: + + .. code-block:: python + + s3_to_gcs_transfer_op = S3ToGoogleCloudStorageTransferOperator( + task_id='s3_to_gcs_transfer_example', + s3_bucket='my-s3-bucket', + project_id='my-gcp-project', + gcs_bucket='my-gcs-bucket', + dag=my_dag) + + :param s3_bucket: The S3 bucket where to find the objects. (templated) + :type s3_bucket: str + :param gcs_bucket: The destination Google Cloud Storage bucket + where you want to store the files. (templated) + :type gcs_bucket: str + :param project_id: Optional ID of the Google Cloud Platform Console project that + owns the job + :type project_id: str + :param aws_conn_id: The source S3 connection + :type aws_conn_id: str + :param gcp_conn_id: The destination connection ID to use + when connecting to Google Cloud Storage. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + :param description: Optional transfer service job description + :type description: str + :param schedule: Optional transfer service schedule; + If not set, run transfer job once as soon as the operator runs + The format is described + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs. 
+ With two additional improvements: + + * dates they can be passed as :class:`datetime.date` + * times they can be passed as :class:`datetime.time` + + :type schedule: dict + :param object_conditions: Optional transfer service object conditions; see + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/TransferSpec + :type object_conditions: dict + :param transfer_options: Optional transfer service transfer options; see + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/TransferSpec + :type transfer_options: dict + :param wait: Wait for transfer to finish + :type wait: bool + :param timeout: Time to wait for the operation to end in seconds + :type timeout: int + """ + + template_fields = ('gcp_conn_id', 's3_bucket', 'gcs_bucket', 'description', 'object_conditions') + ui_color = '#e09411' + + @apply_defaults + def __init__( + self, + s3_bucket, + gcs_bucket, + project_id=None, + aws_conn_id='aws_default', + gcp_conn_id='google_cloud_default', + delegate_to=None, + description=None, + schedule=None, + object_conditions=None, + transfer_options=None, + wait=True, + timeout=None, + *args, + **kwargs + ): + + super(S3ToGoogleCloudStorageTransferOperator, self).__init__(*args, **kwargs) + self.s3_bucket = s3_bucket + self.gcs_bucket = gcs_bucket + self.project_id = project_id + self.aws_conn_id = aws_conn_id + self.gcp_conn_id = gcp_conn_id + self.delegate_to = delegate_to + self.description = description + self.schedule = schedule + self.object_conditions = object_conditions + self.transfer_options = transfer_options + self.wait = wait + self.timeout = timeout + + def execute(self, context): + hook = GCPTransferServiceHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) + body = self._create_body() + + TransferJobPreprocessor(body=body, aws_conn_id=self.aws_conn_id).process_body() + + job = hook.create_transfer_job(body=body) + + if self.wait: + hook.wait_for_transfer_job(job, timeout=self.timeout) + + def _create_body(self): + body = { + DESCRIPTION: self.description, + STATUS: GcpTransferJobsStatus.ENABLED, + TRANSFER_SPEC: { + AWS_S3_DATA_SOURCE: {BUCKET_NAME: self.s3_bucket}, + GCS_DATA_SINK: {BUCKET_NAME: self.gcs_bucket}, + }, + } + + if self.project_id is not None: + body[PROJECT_ID] = self.project_id + + if self.schedule is not None: + body[SCHEDULE] = self.schedule + + if self.object_conditions is not None: + body[TRANSFER_SPEC][OBJECT_CONDITIONS] = self.object_conditions + + if self.transfer_options is not None: + body[TRANSFER_SPEC][TRANSFER_OPTIONS] = self.transfer_options + + return body + + +class GoogleCloudStorageToGoogleCloudStorageTransferOperator(BaseOperator): + """ + Copies objects from a bucket to another using the GCP Storage Transfer + Service. + + .. warning:: + + This operator is NOT idempotent. If you run it many times, many transfer + jobs will be created in the Google Cloud Platform. + + **Example**: + + .. code-block:: python + + gcs_to_gcs_transfer_op = GoogleCloudStorageToGoogleCloudStorageTransferOperator( + task_id='gcs_to_gcs_transfer_example', + source_bucket='my-source-bucket', + destination_bucket='my-destination-bucket', + project_id='my-gcp-project', + dag=my_dag) + + :param source_bucket: The source Google cloud storage bucket where the + object is. (templated) + :type source_bucket: str + :param destination_bucket: The destination Google cloud storage bucket + where the object should be. 
(templated) + :type destination_bucket: str + :param project_id: The ID of the Google Cloud Platform Console project that + owns the job + :type project_id: str + :param gcp_conn_id: Optional connection ID to use when connecting to Google Cloud + Storage. + :type gcp_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must have + domain-wide delegation enabled. + :type delegate_to: str + :param description: Optional transfer service job description + :type description: str + :param schedule: Optional transfer service schedule; + If not set, run transfer job once as soon as the operator runs + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferJobs. + With two additional improvements: + + * dates they can be passed as :class:`datetime.date` + * times they can be passed as :class:`datetime.time` + + :type schedule: dict + :param object_conditions: Optional transfer service object conditions; see + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/TransferSpec#ObjectConditions + :type object_conditions: dict + :param transfer_options: Optional transfer service transfer options; see + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/TransferSpec#TransferOptions + :type transfer_options: dict + :param wait: Wait for transfer to finish; defaults to `True` + :type wait: bool + :param timeout: Time to wait for the operation to end in seconds + :type timeout: int + """ + + template_fields = ( + 'gcp_conn_id', + 'source_bucket', + 'destination_bucket', + 'description', + 'object_conditions', + ) + ui_color = '#e09411' + + @apply_defaults + def __init__( + self, + source_bucket, + destination_bucket, + project_id=None, + gcp_conn_id='google_cloud_default', + delegate_to=None, + description=None, + schedule=None, + object_conditions=None, + transfer_options=None, + wait=True, + timeout=None, + *args, + **kwargs + ): + + super(GoogleCloudStorageToGoogleCloudStorageTransferOperator, self).__init__(*args, **kwargs) + self.source_bucket = source_bucket + self.destination_bucket = destination_bucket + self.project_id = project_id + self.gcp_conn_id = gcp_conn_id + self.delegate_to = delegate_to + self.description = description + self.schedule = schedule + self.object_conditions = object_conditions + self.transfer_options = transfer_options + self.wait = wait + self.timeout = timeout + + def execute(self, context): + hook = GCPTransferServiceHook(gcp_conn_id=self.gcp_conn_id, delegate_to=self.delegate_to) + + body = self._create_body() + + TransferJobPreprocessor(body=body).process_body() + + job = hook.create_transfer_job(body=body) + + if self.wait: + hook.wait_for_transfer_job(job, timeout=self.timeout) + + def _create_body(self): + body = { + DESCRIPTION: self.description, + STATUS: GcpTransferJobsStatus.ENABLED, + TRANSFER_SPEC: { + GCS_DATA_SOURCE: {BUCKET_NAME: self.source_bucket}, + GCS_DATA_SINK: {BUCKET_NAME: self.destination_bucket}, + }, + } + + if self.project_id is not None: + body[PROJECT_ID] = self.project_id + + if self.schedule is not None: + body[SCHEDULE] = self.schedule + + if self.object_conditions is not None: + body[TRANSFER_SPEC][OBJECT_CONDITIONS] = self.object_conditions + + if self.transfer_options is not None: + body[TRANSFER_SPEC][TRANSFER_OPTIONS] = self.transfer_options + + return body diff --git a/airflow/contrib/operators/gcp_translate_operator.py b/airflow/contrib/operators/gcp_translate_operator.py new file mode 100644 index 
0000000000000..87d4aa4cfdb5c --- /dev/null +++ b/airflow/contrib/operators/gcp_translate_operator.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow import AirflowException +from airflow.contrib.hooks.gcp_translate_hook import CloudTranslateHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class CloudTranslateTextOperator(BaseOperator): + """ + Translate a string or list of strings. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudTranslateTextOperator` + + See https://cloud.google.com/translate/docs/translating-text + + Execute method returns str or list. + + This is a list of dictionaries for each queried value. Each + dictionary typically contains three keys (though not + all will be present in all cases). + + * ``detectedSourceLanguage``: The detected language (as an + ISO 639-1 language code) of the text. + * ``translatedText``: The translation of the text into the + target language. + * ``input``: The corresponding input value. + * ``model``: The model used to translate the text. + + If only a single value is passed, then only a single + dictionary is set as XCom return value. + + :type values: str or list + :param values: String or list of strings to translate. + + :type target_language: str + :param target_language: The language to translate results into. This + is required by the API and defaults to + the target language of the current instance. + + :type format_: str or None + :param format_: (Optional) One of ``text`` or ``html``, to specify + if the input text is plain text or HTML. + + :type source_language: str or None + :param source_language: (Optional) The language of the text to + be translated. + + :type model: str or None + :param model: (Optional) The model used to translate the text, such + as ``'base'`` or ``'nmt'``. 
+ + """ + + # [START translate_template_fields] + template_fields = ('values', 'target_language', 'format_', 'source_language', 'model', 'gcp_conn_id') + # [END translate_template_fields] + + @apply_defaults + def __init__( + self, + values, + target_language, + format_, + source_language, + model, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudTranslateTextOperator, self).__init__(*args, **kwargs) + self.values = values + self.target_language = target_language + self.format_ = format_ + self.source_language = source_language + self.model = model + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + _hook = CloudTranslateHook(gcp_conn_id=self.gcp_conn_id) + try: + translation = _hook.translate( + values=self.values, + target_language=self.target_language, + format_=self.format_, + source_language=self.source_language, + model=self.model, + ) + self.log.debug("Translation %s", translation) + return translation + except ValueError as e: + self.log.error('An error has been thrown from translate method:') + self.log.error(e) + raise AirflowException(e) diff --git a/airflow/contrib/operators/gcp_vision_operator.py b/airflow/contrib/operators/gcp_vision_operator.py new file mode 100644 index 0000000000000..ba1c1b5058aab --- /dev/null +++ b/airflow/contrib/operators/gcp_vision_operator.py @@ -0,0 +1,963 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from google.api_core.exceptions import AlreadyExists + +from airflow.contrib.hooks.gcp_vision_hook import CloudVisionHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class CloudVisionProductSetCreateOperator(BaseOperator): + """ + Creates a new ProductSet resource. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductSetCreateOperator` + + :param product_set: (Required) The ProductSet to create. If a dict is provided, it must be of the same + form as the protobuf message `ProductSet`. + :type product_set: dict or google.cloud.vision_v1.types.ProductSet + :param location: (Required) The region where the ProductSet should be created. Valid regions + (as of 2019-02-05) are: us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param project_id: (Optional) The project in which the ProductSet should be created. If set to None or + missing, the default project_id from the GCP connection is used. + :type project_id: str + :param product_set_id: (Optional) A user-supplied resource id for this ProductSet. + If set, the server will attempt to use this value as the resource id. If it is + already in use, an error is returned with code ALREADY_EXISTS. Must be at most + 128 characters long. 
It cannot contain the character /. + :type product_set_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START vision_productset_create_template_fields] + template_fields = ("location", "project_id", "product_set_id", "gcp_conn_id") + # [END vision_productset_create_template_fields] + + @apply_defaults + def __init__( + self, + product_set, + location, + project_id=None, + product_set_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudVisionProductSetCreateOperator, self).__init__(*args, **kwargs) + self.location = location + self.project_id = project_id + self.product_set = product_set + self.product_set_id = product_set_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + try: + return self._hook.create_product_set( + location=self.location, + project_id=self.project_id, + product_set=self.product_set, + product_set_id=self.product_set_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + except AlreadyExists: + self.log.info( + "Product set with id %s already exists. Exiting from the create operation.", + self.product_set_id, + ) + return self.product_set_id + + +class CloudVisionProductSetGetOperator(BaseOperator): + """ + Gets information associated with a ProductSet. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductSetGetOperator` + + :param location: (Required) The region where the ProductSet is located. Valid regions (as of 2019-02-05) + are: us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product_set_id: (Required) The resource id of this ProductSet. + :type product_set_id: str + :param project_id: (Optional) The project in which the ProductSet is located. If set + to None or missing, the default `project_id` from the GCP connection is used. + :type project_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + """ + + # [START vision_productset_get_template_fields] + template_fields = ('location', 'project_id', 'product_set_id', 'gcp_conn_id') + # [END vision_productset_get_template_fields] + + @apply_defaults + def __init__( + self, + location, + product_set_id, + project_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductSetGetOperator, self).__init__(*args, **kwargs) + self.location = location + self.project_id = project_id + self.product_set_id = product_set_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + return self._hook.get_product_set( + location=self.location, + product_set_id=self.product_set_id, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + +class CloudVisionProductSetUpdateOperator(BaseOperator): + """ + Makes changes to a `ProductSet` resource. Only display_name can be updated currently. + + .. note:: To locate the `ProductSet` resource, its `name` in the form + `projects/PROJECT_ID/locations/LOC_ID/productSets/PRODUCT_SET_ID` is necessary. + + You can provide the `name` directly as an attribute of the `product_set` object. + However, you can leave it blank and provide `location` and `product_set_id` instead + (and optionally `project_id` - if not present, the connection default will be used) + and the `name` will be created by the operator itself. + + This mechanism exists for your convenience, to allow leaving the `project_id` empty + and having Airflow use the connection default `project_id`. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductSetUpdateOperator` + + :param product_set: (Required) The ProductSet resource which replaces the one on the + server. If a dict is provided, it must be of the same form as the protobuf + message `ProductSet`. + :type product_set: dict or google.cloud.vision_v1.types.ProductSet + :param location: (Optional) The region where the ProductSet is located. Valid regions (as of 2019-02-05) + are: us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product_set_id: (Optional) The resource id of this ProductSet. + :type product_set_id: str + :param project_id: (Optional) The project in which the ProductSet should be created. If set to None or + missing, the default project_id from the GCP connection is used. + :type project_id: str + :param update_mask: (Optional) The `FieldMask` that specifies which fields to update. If update_mask + isn’t specified, all mutable fields are to be updated. Valid mask path is display_name. If a dict is + provided, it must be of the same form as the protobuf message `FieldMask`. + :type update_mask: dict or google.cloud.vision_v1.types.FieldMask + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. 
+ :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + + """ + + # [START vision_productset_update_template_fields] + template_fields = ('location', 'project_id', 'product_set_id', 'gcp_conn_id') + # [END vision_productset_update_template_fields] + + @apply_defaults + def __init__( + self, + product_set, + location=None, + product_set_id=None, + project_id=None, + update_mask=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductSetUpdateOperator, self).__init__(*args, **kwargs) + self.product_set = product_set + self.update_mask = update_mask + self.location = location + self.project_id = project_id + self.product_set_id = product_set_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + return self._hook.update_product_set( + location=self.location, + product_set_id=self.product_set_id, + project_id=self.project_id, + product_set=self.product_set, + update_mask=self.update_mask, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + +class CloudVisionProductSetDeleteOperator(BaseOperator): + """ + Permanently deletes a `ProductSet`. `Products` and `ReferenceImages` in the + `ProductSet` are not deleted. The actual image files are not deleted from Google + Cloud Storage. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductSetDeleteOperator` + + :param location: (Required) The region where the ProductSet is located. Valid regions (as of 2019-02-05) + are: us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product_set_id: (Required) The resource id of this ProductSet. + :type product_set_id: str + :param project_id: (Optional) The project in which the ProductSet should be created. + If set to None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + + """ + + # [START vision_productset_delete_template_fields] + template_fields = ('location', 'project_id', 'product_set_id', 'gcp_conn_id') + # [END vision_productset_delete_template_fields] + + @apply_defaults + def __init__( + self, + location, + product_set_id, + project_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductSetDeleteOperator, self).__init__(*args, **kwargs) + self.location = location + self.project_id = project_id + self.product_set_id = product_set_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + self._hook.delete_product_set( + location=self.location, + product_set_id=self.product_set_id, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + +class CloudVisionProductCreateOperator(BaseOperator): + """ + Creates and returns a new product resource. + + Possible errors regarding the `Product` object provided: + + - Returns `INVALID_ARGUMENT` if `display_name` is missing or longer than 4096 characters. + - Returns `INVALID_ARGUMENT` if `description` is longer than 4096 characters. + - Returns `INVALID_ARGUMENT` if `product_category` is missing or invalid. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductCreateOperator` + + :param location: (Required) The region where the Product should be created. Valid regions + (as of 2019-02-05) are: us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product: (Required) The product to create. If a dict is provided, it must be of the same form as + the protobuf message `Product`. + :type product: dict or google.cloud.vision_v1.types.Product + :param project_id: (Optional) The project in which the Product should be created. If set to None or + missing, the default project_id from the GCP connection is used. + :type project_id: str + :param product_id: (Optional) A user-supplied resource id for this Product. + If set, the server will attempt to use this value as the resource id. If it is + already in use, an error is returned with code ALREADY_EXISTS. Must be at most + 128 characters long. It cannot contain the character /. + :type product_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + + """ + + # [START vision_product_create_template_fields] + template_fields = ('location', 'project_id', 'product_id', 'gcp_conn_id') + # [END vision_product_create_template_fields] + + @apply_defaults + def __init__( + self, + location, + product, + project_id=None, + product_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductCreateOperator, self).__init__(*args, **kwargs) + self.location = location + self.product = product + self.project_id = project_id + self.product_id = product_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + try: + return self._hook.create_product( + location=self.location, + product=self.product, + project_id=self.project_id, + product_id=self.product_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + except AlreadyExists: + self.log.info( + 'Product with id %s already exists. Exiting from the create operation.', self.product_id + ) + return self.product_id + + +class CloudVisionProductGetOperator(BaseOperator): + """ + Gets information associated with a `Product`. + + Possible errors: + + - Returns `NOT_FOUND` if the `Product` does not exist. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductGetOperator` + + :param location: (Required) The region where the Product is located. Valid regions (as of 2019-02-05) are: + us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product_id: (Required) The resource id of this Product. + :type product_id: str + :param project_id: (Optional) The project in which the Product is located. If set to + None or missing, the default project_id from the GCP connection is used. + :type project_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. 
+    :type gcp_conn_id: str
+
+    """
+
+    # [START vision_product_get_template_fields]
+    template_fields = ('location', 'project_id', 'product_id', 'gcp_conn_id')
+    # [END vision_product_get_template_fields]
+
+    @apply_defaults
+    def __init__(
+        self,
+        location,
+        product_id,
+        project_id=None,
+        retry=None,
+        timeout=None,
+        metadata=None,
+        gcp_conn_id="google_cloud_default",
+        *args,
+        **kwargs
+    ):
+        super(CloudVisionProductGetOperator, self).__init__(*args, **kwargs)
+        self.location = location
+        self.product_id = product_id
+        self.project_id = project_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+        self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id)
+
+    def execute(self, context):
+        return self._hook.get_product(
+            location=self.location,
+            product_id=self.product_id,
+            project_id=self.project_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
+class CloudVisionProductUpdateOperator(BaseOperator):
+    """
+    Makes changes to a Product resource. Only the display_name, description, and labels fields can be
+    updated right now.
+
+    If labels are updated, the change will not be reflected in queries until the next index time.
+
+    .. note:: To locate the `Product` resource, its `name` in the form
+        `projects/PROJECT_ID/locations/LOC_ID/products/PRODUCT_ID` is necessary.
+
+        You can provide the `name` directly as an attribute of the `product` object. However, you can leave it
+        blank and provide `location` and `product_id` instead (and optionally `project_id` - if not present,
+        the connection default will be used) and the `name` will be created by the operator itself.
+
+        This mechanism exists for your convenience, to allow leaving the `project_id` empty and having Airflow
+        use the connection default `project_id`.
+
+    Possible errors related to the provided `Product`:
+
+    - Returns `NOT_FOUND` if the Product does not exist.
+    - Returns `INVALID_ARGUMENT` if `display_name` is present in update_mask but is missing from the request
+      or longer than 4096 characters.
+    - Returns `INVALID_ARGUMENT` if `description` is present in update_mask but is longer than 4096
+      characters.
+    - Returns `INVALID_ARGUMENT` if `product_category` is present in update_mask.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudVisionProductUpdateOperator`
+
+    :param product: (Required) The Product resource which replaces the one on the server. product.name is
+        immutable. If a dict is provided, it must be of the same form as the protobuf message `Product`.
+    :type product: dict or google.cloud.vision_v1.types.Product
+    :param location: (Optional) The region where the Product is located. Valid regions (as of 2019-02-05) are:
+        us-east1, us-west1, europe-west1, asia-east1
+    :type location: str
+    :param product_id: (Optional) The resource id of this Product.
+    :type product_id: str
+    :param project_id: (Optional) The project in which the Product is located. If set to None or
+        missing, the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param update_mask: (Optional) The `FieldMask` that specifies which fields to update. If update_mask
+        isn’t specified, all mutable fields are to be updated. Valid mask paths include product_labels,
+        display_name, and description. If a dict is provided, it must be of the same form as the protobuf
+        message `FieldMask`.
+ :type update_mask: dict or google.cloud.vision_v1.types.FieldMask + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START vision_product_update_template_fields] + template_fields = ('location', 'project_id', 'product_id', 'gcp_conn_id') + # [END vision_product_update_template_fields] + + @apply_defaults + def __init__( + self, + product, + location=None, + product_id=None, + project_id=None, + update_mask=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductUpdateOperator, self).__init__(*args, **kwargs) + self.product = product + self.location = location + self.product_id = product_id + self.project_id = project_id + self.update_mask = update_mask + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + return self._hook.update_product( + product=self.product, + location=self.location, + product_id=self.product_id, + project_id=self.project_id, + update_mask=self.update_mask, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + +class CloudVisionProductDeleteOperator(BaseOperator): + """ + Permanently deletes a product and its reference images. + + Metadata of the product and all its images will be deleted right away, but search queries against + ProductSets containing the product may still work until all related caches are refreshed. + + Possible errors: + + - Returns `NOT_FOUND` if the product does not exist. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionProductDeleteOperator` + + :param location: (Required) The region where the Product is located. Valid regions (as of 2019-02-05) are: + us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param product_id: (Required) The resource id of this Product. + :type product_id: str + :param project_id: (Optional) The project in which the Product is located. If set to None or + missing, the default project_id from the GCP connection is used. + :type project_id: str + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param metadata: (Optional) Additional metadata that is provided to the method. + :type metadata: sequence[tuple[str, str]] + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. 
+ :type gcp_conn_id: str + """ + + # [START vision_product_delete_template_fields] + template_fields = ('location', 'project_id', 'product_id', 'gcp_conn_id') + # [END vision_product_delete_template_fields] + + @apply_defaults + def __init__( + self, + location, + product_id, + project_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(CloudVisionProductDeleteOperator, self).__init__(*args, **kwargs) + self.location = location + self.product_id = product_id + self.project_id = project_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + self._hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + + def execute(self, context): + self._hook.delete_product( + location=self.location, + product_id=self.product_id, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) + + +class CloudVisionAnnotateImageOperator(BaseOperator): + """ + Run image detection and annotation for an image. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionAnnotateImageOperator` + + :param request: (Required) Individual file annotation requests. + If a dict is provided, it must be of the same form as the protobuf + message class:`google.cloud.vision_v1.types.AnnotateImageRequest` + :type request: dict or google.cloud.vision_v1.types.AnnotateImageRequest + :param retry: (Optional) A retry object used to retry requests. If `None` is + specified, requests will not be retried. + :type retry: google.api_core.retry.Retry + :param timeout: (Optional) The amount of time, in seconds, to wait for the request to + complete. Note that if retry is specified, the timeout applies to each individual + attempt. + :type timeout: float + :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform. + :type gcp_conn_id: str + """ + + # [START vision_annotate_image_template_fields] + template_fields = ('request', 'gcp_conn_id') + # [END vision_annotate_image_template_fields] + + @apply_defaults + def __init__( + self, request, retry=None, timeout=None, gcp_conn_id='google_cloud_default', *args, **kwargs + ): + super(CloudVisionAnnotateImageOperator, self).__init__(*args, **kwargs) + self.request = request + self.retry = retry + self.timeout = timeout + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + return hook.annotate_image(request=self.request, retry=self.retry, timeout=self.timeout) + + +class CloudVisionReferenceImageCreateOperator(BaseOperator): + """ + Creates and returns a new ReferenceImage ID resource. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:CloudVisionReferenceImageCreateOperator` + + :param location: (Required) The region where the Product is located. Valid regions (as of 2019-02-05) are: + us-east1, us-west1, europe-west1, asia-east1 + :type location: str + :param reference_image: (Required) The reference image to create. If an image ID is specified, it is + ignored. + If a dict is provided, it must be of the same form as the protobuf message + :class:`google.cloud.vision_v1.types.ReferenceImage` + :type reference_image: dict or google.cloud.vision_v1.types.ReferenceImage + :param reference_image_id: (Optional) A user-supplied resource id for the ReferenceImage to be added. 
+        If set, the server will attempt to use this value as the resource id. If it is already in use, an
+        error is returned with code ALREADY_EXISTS. Must be at most 128 characters long. It cannot contain
+        the character `/`.
+    :type reference_image_id: str
+    :param product_id: (Optional) The resource id of this Product.
+    :type product_id: str
+    :param project_id: (Optional) The project in which the Product is located. If set to None or
+        missing, the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param retry: (Optional) A retry object used to retry requests. If `None` is
+        specified, requests will not be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: (Optional) The amount of time, in seconds, to wait for the request to
+        complete. Note that if retry is specified, the timeout applies to each individual
+        attempt.
+    :type timeout: float
+    :param metadata: (Optional) Additional metadata that is provided to the method.
+    :type metadata: sequence[tuple[str, str]]
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    # [START vision_reference_image_create_template_fields]
+    template_fields = (
+        "location",
+        "reference_image",
+        "product_id",
+        "reference_image_id",
+        "project_id",
+        "gcp_conn_id",
+    )
+    # [END vision_reference_image_create_template_fields]
+
+    @apply_defaults
+    def __init__(
+        self,
+        location,
+        reference_image,
+        product_id,
+        reference_image_id=None,
+        project_id=None,
+        retry=None,
+        timeout=None,
+        metadata=None,
+        gcp_conn_id='google_cloud_default',
+        *args,
+        **kwargs
+    ):
+        super(CloudVisionReferenceImageCreateOperator, self).__init__(*args, **kwargs)
+        self.location = location
+        self.product_id = product_id
+        self.reference_image = reference_image
+        self.reference_image_id = reference_image_id
+        self.project_id = project_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+
+    def execute(self, context):
+        try:
+            hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id)
+            return hook.create_reference_image(
+                location=self.location,
+                product_id=self.product_id,
+                reference_image=self.reference_image,
+                reference_image_id=self.reference_image_id,
+                project_id=self.project_id,
+                retry=self.retry,
+                timeout=self.timeout,
+                metadata=self.metadata,
+            )
+        except AlreadyExists:
+            self.log.info(
+                "ReferenceImage with id %s already exists. Exiting from the create operation.",
+                self.reference_image_id,
+            )
+            return self.reference_image_id
+
+
+class CloudVisionAddProductToProductSetOperator(BaseOperator):
+    """
+    Adds a Product to the specified ProductSet. If the Product is already present, no change is made.
+
+    One Product can be added to at most 100 ProductSets.
+
+    Possible errors:
+
+    - Returns `NOT_FOUND` if the Product or the ProductSet doesn’t exist.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudVisionAddProductToProductSetOperator`
+
+    :param product_set_id: (Required) The resource id for the ProductSet to modify.
+    :type product_set_id: str
+    :param product_id: (Required) The resource id of this Product.
+    :type product_id: str
+    :param location: (Required) The region where the ProductSet is located. Valid regions (as of 2019-02-05)
+        are: us-east1, us-west1, europe-west1, asia-east1
+    :type location: str
+    :param project_id: (Optional) The project in which the Product is located. If set to None or
+        missing, the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param retry: (Optional) A retry object used to retry requests. If `None` is
+        specified, requests will not be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: (Optional) The amount of time, in seconds, to wait for the request to
+        complete. Note that if retry is specified, the timeout applies to each individual
+        attempt.
+    :type timeout: float
+    :param metadata: (Optional) Additional metadata that is provided to the method.
+    :type metadata: sequence[tuple[str, str]]
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+    :type gcp_conn_id: str
+    """
+
+    # [START vision_add_product_to_product_set_template_fields]
+    template_fields = ("location", "product_set_id", "product_id", "project_id", "gcp_conn_id")
+    # [END vision_add_product_to_product_set_template_fields]
+
+    @apply_defaults
+    def __init__(
+        self,
+        product_set_id,
+        product_id,
+        location,
+        project_id=None,
+        retry=None,
+        timeout=None,
+        metadata=None,
+        gcp_conn_id="google_cloud_default",
+        *args,
+        **kwargs
+    ):
+        super(CloudVisionAddProductToProductSetOperator, self).__init__(*args, **kwargs)
+        self.product_set_id = product_set_id
+        self.product_id = product_id
+        self.location = location
+        self.project_id = project_id
+        self.retry = retry
+        self.timeout = timeout
+        self.metadata = metadata
+        self.gcp_conn_id = gcp_conn_id
+
+    def execute(self, context):
+        hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id)
+        return hook.add_product_to_product_set(
+            product_set_id=self.product_set_id,
+            product_id=self.product_id,
+            location=self.location,
+            project_id=self.project_id,
+            retry=self.retry,
+            timeout=self.timeout,
+            metadata=self.metadata,
+        )
+
+
+class CloudVisionRemoveProductFromProductSetOperator(BaseOperator):
+    """
+    Removes a Product from the specified ProductSet.
+
+    .. seealso::
+        For more information on how to use this operator, take a look at the guide:
+        :ref:`howto/operator:CloudVisionRemoveProductFromProductSetOperator`
+
+    :param product_set_id: (Required) The resource id for the ProductSet to modify.
+    :type product_set_id: str
+    :param product_id: (Required) The resource id of this Product.
+    :type product_id: str
+    :param location: (Required) The region where the ProductSet is located. Valid regions (as of 2019-02-05)
+        are: us-east1, us-west1, europe-west1, asia-east1
+    :type location: str
+    :param project_id: (Optional) The project in which the Product is located. If set to None or
+        missing, the default project_id from the GCP connection is used.
+    :type project_id: str
+    :param retry: (Optional) A retry object used to retry requests. If `None` is
+        specified, requests will not be retried.
+    :type retry: google.api_core.retry.Retry
+    :param timeout: (Optional) The amount of time, in seconds, to wait for the request to
+        complete. Note that if retry is specified, the timeout applies to each individual
+        attempt.
+    :type timeout: float
+    :param metadata: (Optional) Additional metadata that is provided to the method.
+    :type metadata: sequence[tuple[str, str]]
+    :param gcp_conn_id: (Optional) The connection ID used to connect to Google Cloud Platform.
+ :type gcp_conn_id: str + """ + + # [START vision_remove_product_from_product_set_template_fields] + template_fields = ("location", "product_set_id", "product_id", "project_id", "gcp_conn_id") + # [END vision_remove_product_from_product_set_template_fields] + + @apply_defaults + def __init__( + self, + product_set_id, + product_id, + location, + project_id=None, + retry=None, + timeout=None, + metadata=None, + gcp_conn_id="google_cloud_default", + *args, + **kwargs + ): + super(CloudVisionRemoveProductFromProductSetOperator, self).__init__(*args, **kwargs) + self.product_set_id = product_set_id + self.product_id = product_id + self.location = location + self.project_id = project_id + self.retry = retry + self.timeout = timeout + self.metadata = metadata + self.gcp_conn_id = gcp_conn_id + + def execute(self, context): + hook = CloudVisionHook(gcp_conn_id=self.gcp_conn_id) + return hook.remove_product_from_product_set( + product_set_id=self.product_set_id, + product_id=self.product_id, + location=self.location, + project_id=self.project_id, + retry=self.retry, + timeout=self.timeout, + metadata=self.metadata, + ) diff --git a/airflow/contrib/operators/gcs_acl_operator.py b/airflow/contrib/operators/gcs_acl_operator.py new file mode 100644 index 0000000000000..03c72ce328c93 --- /dev/null +++ b/airflow/contrib/operators/gcs_acl_operator.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class GoogleCloudStorageBucketCreateAclEntryOperator(BaseOperator): + """ + Creates a new ACL entry on the specified bucket. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GoogleCloudStorageBucketCreateAclEntryOperator` + + :param bucket: Name of a bucket. + :type bucket: str + :param entity: The entity holding the permission, in one of the following forms: + user-userId, user-email, group-groupId, group-email, domain-domain, + project-team-projectId, allUsers, allAuthenticatedUsers + :type entity: str + :param role: The access permission for the entity. + Acceptable values are: "OWNER", "READER", "WRITER". + :type role: str + :param user_project: (Optional) The project to be billed for this request. + Required for Requester Pays buckets. + :type user_project: str + :param google_cloud_storage_conn_id: The connection ID to use when + connecting to Google Cloud Storage. 
+ :type google_cloud_storage_conn_id: str + """ + # [START gcs_bucket_create_acl_template_fields] + template_fields = ('bucket', 'entity', 'role', 'user_project') + # [END gcs_bucket_create_acl_template_fields] + + @apply_defaults + def __init__(self, bucket, entity, role, user_project=None, + google_cloud_storage_conn_id='google_cloud_default', *args, **kwargs): + super(GoogleCloudStorageBucketCreateAclEntryOperator, self).__init__(*args, + **kwargs) + self.bucket = bucket + self.entity = entity + self.role = role + self.user_project = user_project + self.google_cloud_storage_conn_id = google_cloud_storage_conn_id + + def execute(self, context): + hook = GoogleCloudStorageHook( + google_cloud_storage_conn_id=self.google_cloud_storage_conn_id + ) + hook.insert_bucket_acl(bucket=self.bucket, entity=self.entity, role=self.role, + user_project=self.user_project) + + +class GoogleCloudStorageObjectCreateAclEntryOperator(BaseOperator): + """ + Creates a new ACL entry on the specified object. + + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GoogleCloudStorageObjectCreateAclEntryOperator` + + :param bucket: Name of a bucket. + :type bucket: str + :param object_name: Name of the object. For information about how to URL encode object + names to be path safe, see: + https://cloud.google.com/storage/docs/json_api/#encoding + :type object_name: str + :param entity: The entity holding the permission, in one of the following forms: + user-userId, user-email, group-groupId, group-email, domain-domain, + project-team-projectId, allUsers, allAuthenticatedUsers + :type entity: str + :param role: The access permission for the entity. + Acceptable values are: "OWNER", "READER". + :type role: str + :param generation: (Optional) If present, selects a specific revision of this object + (as opposed to the latest version, the default). + :type generation: str + :param user_project: (Optional) The project to be billed for this request. + Required for Requester Pays buckets. + :type user_project: str + :param google_cloud_storage_conn_id: The connection ID to use when + connecting to Google Cloud Storage. 
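A usage sketch for these two ACL operators (bucket, object and entity values are illustrative, and the surrounding DAG definition is omitted)::

    from airflow.contrib.operators.gcs_acl_operator import (
        GoogleCloudStorageBucketCreateAclEntryOperator,
        GoogleCloudStorageObjectCreateAclEntryOperator,
    )

    bucket_acl = GoogleCloudStorageBucketCreateAclEntryOperator(
        task_id='gcs_bucket_create_acl_entry',
        bucket='example-bucket',          # illustrative bucket name
        entity='user-alice@example.com',  # one of the entity forms listed above
        role='READER',
    )
    object_acl = GoogleCloudStorageObjectCreateAclEntryOperator(
        task_id='gcs_object_create_acl_entry',
        bucket='example-bucket',
        object_name='reports/2019/january.csv',  # illustrative object name
        entity='user-alice@example.com',
        role='READER',
    )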
+ :type google_cloud_storage_conn_id: str + """ + # [START gcs_object_create_acl_template_fields] + template_fields = ('bucket', 'object_name', 'entity', 'role', 'generation', + 'user_project') + # [END gcs_object_create_acl_template_fields] + + @apply_defaults + def __init__(self, + bucket, + object_name, + entity, + role, + generation=None, + user_project=None, + google_cloud_storage_conn_id='google_cloud_default', + *args, **kwargs): + super(GoogleCloudStorageObjectCreateAclEntryOperator, self).__init__(*args, + **kwargs) + self.bucket = bucket + self.object_name = object_name + self.entity = entity + self.role = role + self.generation = generation + self.user_project = user_project + self.google_cloud_storage_conn_id = google_cloud_storage_conn_id + + def execute(self, context): + hook = GoogleCloudStorageHook( + google_cloud_storage_conn_id=self.google_cloud_storage_conn_id + ) + hook.insert_object_acl(bucket=self.bucket, object_name=self.object_name, + entity=self.entity, role=self.role, + generation=self.generation, user_project=self.user_project) diff --git a/airflow/contrib/operators/gcs_download_operator.py b/airflow/contrib/operators/gcs_download_operator.py index ce272aedd501c..1d168d4660722 100644 --- a/airflow/contrib/operators/gcs_download_operator.py +++ b/airflow/contrib/operators/gcs_download_operator.py @@ -29,26 +29,26 @@ class GoogleCloudStorageDownloadOperator(BaseOperator): Downloads a file from Google Cloud Storage. :param bucket: The Google cloud storage bucket where the object is. (templated) - :type bucket: string + :type bucket: str :param object: The name of the object to download in the Google cloud storage bucket. (templated) - :type object: string + :type object: str :param filename: The file path on the local file system (where the operator is being executed) that the file should be downloaded to. (templated) If no filename passed, the downloaded data will not be stored on the local file system. - :type filename: string + :type filename: str :param store_to_xcom_key: If this param is set, the operator will push the contents of the downloaded file to XCom with the key set in this parameter. If not set, the downloaded data will not be pushed to XCom. (templated) - :type store_to_xcom_key: string + :type store_to_xcom_key: str :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ('bucket', 'object', 'filename', 'store_to_xcom_key',) ui_color = '#f0eee4' diff --git a/airflow/contrib/operators/gcs_list_operator.py b/airflow/contrib/operators/gcs_list_operator.py index 6474453afabec..056b349394016 100644 --- a/airflow/contrib/operators/gcs_list_operator.py +++ b/airflow/contrib/operators/gcs_list_operator.py @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License. +from typing import Iterable + from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults @@ -30,21 +32,21 @@ class GoogleCloudStorageListOperator(BaseOperator): `xcom` in the downstream task. :param bucket: The Google cloud storage bucket to find the objects. 
(templated) - :type bucket: string + :type bucket: str :param prefix: Prefix string which filters objects whose name begin with this prefix. (templated) - :type prefix: string + :type prefix: str :param delimiter: The delimiter by which you want to filter the objects. (templated) For e.g to lists the CSV files from in a directory in GCS you would use delimiter='.csv'. - :type delimiter: string + :type delimiter: str :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str **Example**: The following Operator would list all the Avro files from ``sales/sales-2017`` @@ -58,7 +60,7 @@ class GoogleCloudStorageListOperator(BaseOperator): google_cloud_storage_conn_id=google_cloud_conn_id ) """ - template_fields = ('bucket', 'prefix', 'delimiter') + template_fields = ('bucket', 'prefix', 'delimiter') # type: Iterable[str] ui_color = '#f0eee4' @apply_defaults diff --git a/airflow/contrib/operators/gcs_operator.py b/airflow/contrib/operators/gcs_operator.py index ef5e8de5043f3..31c8f8fa0c91c 100644 --- a/airflow/contrib/operators/gcs_operator.py +++ b/airflow/contrib/operators/gcs_operator.py @@ -33,7 +33,11 @@ class GoogleCloudStorageCreateBucketOperator(BaseOperator): https://cloud.google.com/storage/docs/bucketnaming.html#requirements :param bucket_name: The name of the bucket. (templated) - :type bucket_name: string + :type bucket_name: str + :param resource: An optional dict with parameters for creating the bucket. + For information on available parameters, see Cloud Storage API doc: + https://cloud.google.com/storage/docs/json_api/v1/buckets/insert + :type resource: dict :param storage_class: This defines how objects in the bucket are stored and determines the SLA and the cost of storage (templated). Values include @@ -42,41 +46,41 @@ class GoogleCloudStorageCreateBucketOperator(BaseOperator): - ``STANDARD`` - ``NEARLINE`` - ``COLDLINE``. + If this value is not specified when the bucket is created, it will default to STANDARD. - :type storage_class: string + :type storage_class: str :param location: The location of the bucket. (templated) Object data for objects in the bucket resides in physical storage within this region. Defaults to US. - .. seealso:: - https://developers.google.com/storage/docs/bucket-locations + .. seealso:: https://developers.google.com/storage/docs/bucket-locations - :type location: string + :type location: str :param project_id: The ID of the GCP Project. (templated) - :type project_id: string + :type project_id: str :param labels: User-provided labels, in key/value pairs. :type labels: dict :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. 
- :type delegate_to: string - - **Example**: - The following Operator would create a new bucket ``test-bucket`` - with ``MULTI_REGIONAL`` storage class in ``EU`` region :: - - CreateBucket = GoogleCloudStorageCreateBucketOperator( - task_id='CreateNewBucket', - bucket_name='test-bucket', - storage_class='MULTI_REGIONAL', - location='EU', - labels={'env': 'dev', 'team': 'airflow'}, - google_cloud_storage_conn_id='airflow-service-account' - ) + :type delegate_to: str + + :Example:: + The following Operator would create a new bucket ``test-bucket`` + with ``MULTI_REGIONAL`` storage class in ``EU`` region :: + + CreateBucket = GoogleCloudStorageCreateBucketOperator( + task_id='CreateNewBucket', + bucket_name='test-bucket', + storage_class='MULTI_REGIONAL', + location='EU', + labels={'env': 'dev', 'team': 'airflow'}, + google_cloud_storage_conn_id='airflow-service-account' + ) """ template_fields = ('bucket_name', 'storage_class', @@ -86,6 +90,7 @@ class GoogleCloudStorageCreateBucketOperator(BaseOperator): @apply_defaults def __init__(self, bucket_name, + resource=None, storage_class='MULTI_REGIONAL', location='US', project_id=None, @@ -96,6 +101,7 @@ def __init__(self, **kwargs): super(GoogleCloudStorageCreateBucketOperator, self).__init__(*args, **kwargs) self.bucket_name = bucket_name + self.resource = resource self.storage_class = storage_class self.location = location self.project_id = project_id @@ -116,6 +122,7 @@ def execute(self, context): ) hook.create_bucket(bucket_name=self.bucket_name, + resource=self.resource, storage_class=self.storage_class, location=self.location, project_id=self.project_id, diff --git a/airflow/contrib/operators/gcs_to_bq.py b/airflow/contrib/operators/gcs_to_bq.py index 3a7798030cf49..4fd1f6a4ef88c 100644 --- a/airflow/contrib/operators/gcs_to_bq.py +++ b/airflow/contrib/operators/gcs_to_bq.py @@ -34,43 +34,48 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator): point the operator to a Google cloud storage object name. The object in Google cloud storage must be a JSON file with the schema fields in it. + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:GoogleCloudStorageToBigQueryOperator` + :param bucket: The bucket to load from. (templated) - :type bucket: string + :type bucket: str :param source_objects: List of Google cloud storage URIs to load from. (templated) If source_format is 'DATASTORE_BACKUP', the list must only contain a single URI. - :type object: list - :param destination_project_dataset_table: The dotted (.).
- BigQuery table to load data into. If <project> is not included, - project will be the project defined in the connection json. (templated) - :type destination_project_dataset_table: string + :type source_objects: list[str] + :param destination_project_dataset_table: The dotted + ``(<project>.|<project>:)<dataset>.<table>``
BigQuery table to load data into. + If ``<project>`` is not included, project will be the project defined in + the connection json. (templated) + :type destination_project_dataset_table: str :param schema_fields: If set, the schema field list as defined here: https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.load Should not be set when source_format is 'DATASTORE_BACKUP'. :type schema_fields: list :param schema_object: If set, a GCS object path pointing to a .json file that contains the schema for the table. (templated) - :param schema_object: string + :type schema_object: str :param source_format: File format to export. - :type source_format: string + :type source_format: str :param compression: [Optional] The compression type of the data source. Possible values include GZIP and NONE. The default value is NONE. This setting is ignored for Google Cloud Bigtable, Google Cloud Datastore backups and Avro formats. - :type compression: string + :type compression: str :param create_disposition: The create disposition if the table doesn't exist. - :type create_disposition: string + :type create_disposition: str :param skip_leading_rows: Number of rows to skip when loading from a CSV. :type skip_leading_rows: int :param write_disposition: The write disposition if the table already exists. - :type write_disposition: string + :type write_disposition: str :param field_delimiter: The delimiter to use when loading from a CSV. - :type field_delimiter: string + :type field_delimiter: str :param max_bad_records: The maximum number of bad records that BigQuery can ignore when running the job. :type max_bad_records: int :param quote_character: The value that is used to quote data sections in a CSV file. - :type quote_character: string + :type quote_character: str :param ignore_unknown_values: [Optional] Indicates if BigQuery should allow extra values that are not represented in the table schema. If true, the extra values are ignored. If false, records with extra columns @@ -78,7 +83,7 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator): invalid error is returned in the job result. :type ignore_unknown_values: bool :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false). - :type allow_quoted_newlines: boolean + :type allow_quoted_newlines: bool :param allow_jagged_rows: Accept rows that are missing trailing optional columns. The missing values are treated as nulls. If false, records with missing trailing columns are treated as bad records, and if there are too many bad records, an @@ -86,21 +91,21 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator): for other formats. :type allow_jagged_rows: bool :param max_id_key: If set, the name of a column in the BigQuery table - that's to be loaded. Thsi will be used to select the MAX value from + that's to be loaded. This will be used to select the MAX value from BigQuery after the load occurs. The results will be returned by the execute() command, which in turn gets stored in XCom for future operators to use. This can be helpful with incremental loads--during future executions, you can pick up from the max ID. - :type max_id_key: string + :type max_id_key: str :param bigquery_conn_id: Reference to a specific BigQuery hook. - :type bigquery_conn_id: string + :type bigquery_conn_id: str :param google_cloud_storage_conn_id: Reference to a specific Google cloud storage hook.
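The new ``autodetect`` flag described further below lets a load omit ``schema_fields`` and ``schema_object`` entirely; a minimal sketch (bucket, object and table names are illustrative, DAG boilerplate omitted)::

    from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

    load_sales = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq_sales',
        bucket='example-bucket',                                # illustrative bucket
        source_objects=['sales/sales-2017/*.csv'],
        destination_project_dataset_table='my_dataset.sales',   # illustrative table
        source_format='CSV',
        skip_leading_rows=1,
        autodetect=True,   # no schema_fields or schema_object supplied
        write_disposition='WRITE_TRUNCATE',
    )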
- :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param schema_update_options: Allows the schema of the destination table to be updated as a side effect of the load job. :type schema_update_options: list @@ -114,6 +119,14 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator): Note that 'field' is not available in concurrency with dataset.table$partition. :type time_partitioning: dict + :param cluster_fields: Request that the result of this load be stored sorted + by one or more columns. This is only available in conjunction with + time_partitioning. The order of columns given determines the sort order. + Not applicable for external tables. + :type cluster_fields: list[str] + :param autodetect: [Optional] Indicates if we should automatically infer the + options and schema for CSV and JSON sources. (Default: ``False``) + :type autodetect: bool """ template_fields = ('bucket', 'source_objects', 'schema_object', 'destination_project_dataset_table') @@ -143,14 +156,20 @@ def __init__(self, google_cloud_storage_conn_id='google_cloud_default', delegate_to=None, schema_update_options=(), - src_fmt_configs={}, + src_fmt_configs=None, external_table=False, - time_partitioning={}, + time_partitioning=None, + cluster_fields=None, + autodetect=False, *args, **kwargs): super(GoogleCloudStorageToBigQueryOperator, self).__init__(*args, **kwargs) # GCS config + if src_fmt_configs is None: + src_fmt_configs = {} + if time_partitioning is None: + time_partitioning = {} self.bucket = bucket self.source_objects = source_objects self.schema_object = schema_object @@ -179,20 +198,27 @@ def __init__(self, self.schema_update_options = schema_update_options self.src_fmt_configs = src_fmt_configs self.time_partitioning = time_partitioning + self.cluster_fields = cluster_fields + self.autodetect = autodetect def execute(self, context): bq_hook = BigQueryHook(bigquery_conn_id=self.bigquery_conn_id, delegate_to=self.delegate_to) - if not self.schema_fields and \ - self.schema_object and \ - self.source_format != 'DATASTORE_BACKUP': - gcs_hook = GoogleCloudStorageHook( - google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, - delegate_to=self.delegate_to) - schema_fields = json.loads(gcs_hook.download( - self.bucket, - self.schema_object).decode("utf-8")) + if not self.schema_fields: + if self.schema_object and self.source_format != 'DATASTORE_BACKUP': + gcs_hook = GoogleCloudStorageHook( + google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, + delegate_to=self.delegate_to) + schema_fields = json.loads(gcs_hook.download( + self.bucket, + self.schema_object).decode("utf-8")) + elif self.schema_object is None and self.autodetect is False: + raise ValueError('At least one of `schema_fields`, `schema_object`, ' + 'or `autodetect` must be passed.') + else: + schema_fields = None + else: schema_fields = self.schema_fields @@ -223,6 +249,7 @@ def execute(self, context): schema_fields=schema_fields, source_uris=source_uris, source_format=self.source_format, + autodetect=self.autodetect, create_disposition=self.create_disposition, skip_leading_rows=self.skip_leading_rows, write_disposition=self.write_disposition, @@ -234,7 +261,8 @@ def execute(self, context): allow_jagged_rows=self.allow_jagged_rows, schema_update_options=self.schema_update_options, 
src_fmt_configs=self.src_fmt_configs, - time_partitioning=self.time_partitioning) + time_partitioning=self.time_partitioning, + cluster_fields=self.cluster_fields) if self.max_id_key: cursor.execute('SELECT MAX({}) FROM {}'.format( diff --git a/airflow/contrib/operators/gcs_to_gcs.py b/airflow/contrib/operators/gcs_to_gcs.py index 256685f90b2e1..b694b802fc878 100644 --- a/airflow/contrib/operators/gcs_to_gcs.py +++ b/airflow/contrib/operators/gcs_to_gcs.py @@ -28,79 +28,86 @@ class GoogleCloudStorageToGoogleCloudStorageOperator(BaseOperator): :param source_bucket: The source Google cloud storage bucket where the object is. (templated) - :type source_bucket: string + :type source_bucket: str :param source_object: The source name of the object to copy in the Google cloud storage bucket. (templated) - If wildcards are used in this argument: - You can use only one wildcard for objects (filenames) within your - bucket. The wildcard can appear inside the object name or at the - end of the object name. Appending a wildcard to the bucket name is - unsupported. - :type source_object: string + You can use only one wildcard for objects (filenames) within your + bucket. The wildcard can appear inside the object name or at the + end of the object name. Appending a wildcard to the bucket name is + unsupported. + :type source_object: str :param destination_bucket: The destination Google cloud storage bucket - where the object should be. (templated) - :type destination_bucket: string + where the object should be. (templated) + :type destination_bucket: str :param destination_object: The destination name of the object in the destination Google cloud storage bucket. (templated) If a wildcard is supplied in the source_object argument, this is the prefix that will be prepended to the final destination objects' paths. Note that the source path's part before the wildcard will be removed; if it needs to be retained it should be appended to destination_object. - For example, with prefix ``foo/*`` and destination_object `'blah/``, the + For example, with prefix ``foo/*`` and destination_object ``blah/``, the file ``foo/baz`` will be copied to ``blah/baz``; to retain the prefix write the destination_object as e.g. ``blah/foo``, in which case the copied file will be named ``blah/foo/baz``. - :type destination_object: string + :type destination_object: str :param move_object: When move object is True, the object is moved instead - of copied to the new location. - This is the equivalent of a mv command as opposed to a - cp command. + of copied to the new location. This is the equivalent of a mv command + as opposed to a cp command. :type move_object: bool :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. 
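The ``last_modified_time`` parameter introduced below can be combined with the wildcard copy shown in the examples that follow; a minimal sketch (bucket names mirror the examples, the cut-off date is illustrative, DAG boilerplate omitted)::

    from datetime import datetime

    from airflow.contrib.operators.gcs_to_gcs import (
        GoogleCloudStorageToGoogleCloudStorageOperator,
    )

    copy_recent_files = GoogleCloudStorageToGoogleCloudStorageOperator(
        task_id='copy_recent_avro_files',
        source_bucket='data',
        source_object='sales/sales-2017/*.avro',
        destination_bucket='data_backup',
        destination_object='copied_sales/2017/',
        # only objects updated after this instant are copied (UTC assumed if no tzinfo)
        last_modified_time=datetime(2019, 2, 1),
    )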
- :type delegate_to: string - - **Examples**: - The following Operator would copy a single file named - ``sales/sales-2017/january.avro`` in the ``data`` bucket to the file named - ``copied_sales/2017/january-backup.avro` in the ``data_backup`` bucket :: - copy_single_file = GoogleCloudStorageToGoogleCloudStorageOperator( - task_id='copy_single_file', - source_bucket='data', - source_object='sales/sales-2017/january.avro', - destination_bucket='data_backup', - destination_object='copied_sales/2017/january-backup.avro', - google_cloud_storage_conn_id=google_cloud_conn_id - ) + :type delegate_to: str + :param last_modified_time: When specified, if the object(s) were + modified after last_modified_time, they will be copied/moved. + If tzinfo has not been set, UTC will be assumed. + :type last_modified_time: datetime.datetime - The following Operator would copy all the Avro files from ``sales/sales-2017`` - folder (i.e. with names starting with that prefix) in ``data`` bucket to the - ``copied_sales/2017`` folder in the ``data_backup`` bucket. :: - copy_files = GoogleCloudStorageToGoogleCloudStorageOperator( - task_id='copy_files', - source_bucket='data', - source_object='sales/sales-2017/*.avro', - destination_bucket='data_backup', - destination_object='copied_sales/2017/', - google_cloud_storage_conn_id=google_cloud_conn_id - ) + :Example: + + The following Operator would copy a single file named + ``sales/sales-2017/january.avro`` in the ``data`` bucket to the file named + ``copied_sales/2017/january-backup.avro`` in the ``data_backup`` bucket :: + + copy_single_file = GoogleCloudStorageToGoogleCloudStorageOperator( + task_id='copy_single_file', + source_bucket='data', + source_object='sales/sales-2017/january.avro', + destination_bucket='data_backup', + destination_object='copied_sales/2017/january-backup.avro', + google_cloud_storage_conn_id=google_cloud_conn_id + ) + + The following Operator would copy all the Avro files from ``sales/sales-2017`` + folder (i.e. with names starting with that prefix) in ``data`` bucket to the + ``copied_sales/2017`` folder in the ``data_backup`` bucket. :: + + copy_files = GoogleCloudStorageToGoogleCloudStorageOperator( + task_id='copy_files', + source_bucket='data', + source_object='sales/sales-2017/*.avro', + destination_bucket='data_backup', + destination_object='copied_sales/2017/', + google_cloud_storage_conn_id=google_cloud_conn_id + ) + + The following Operator would move all the Avro files from ``sales/sales-2017`` + folder (i.e. with names starting with that prefix) in ``data`` bucket to the + same folder in the ``data_backup`` bucket, deleting the original files in the + process. :: + + move_files = GoogleCloudStorageToGoogleCloudStorageOperator( + task_id='move_files', + source_bucket='data', + source_object='sales/sales-2017/*.avro', + destination_bucket='data_backup', + move_object=True, + google_cloud_storage_conn_id=google_cloud_conn_id + ) - The following Operator would move all the Avro files from ``sales/sales-2017`` - folder (i.e. with names starting with that prefix) in ``data`` bucket to the - same folder in the ``data_backup`` bucket, deleting the original files in the - process. 
:: - move_files = GoogleCloudStorageToGoogleCloudStorageOperator( - task_id='move_files', - source_bucket='data', - source_object='sales/sales-2017/*.avro', - destination_bucket='data_backup', - move_object=True, - google_cloud_storage_conn_id=google_cloud_conn_id - ) """ template_fields = ('source_bucket', 'source_object', 'destination_bucket', 'destination_object',) @@ -115,6 +122,7 @@ def __init__(self, move_object=False, google_cloud_storage_conn_id='google_cloud_default', delegate_to=None, + last_modified_time=None, *args, **kwargs): super(GoogleCloudStorageToGoogleCloudStorageOperator, @@ -126,6 +134,7 @@ def __init__(self, self.move_object = move_object self.google_cloud_storage_conn_id = google_cloud_storage_conn_id self.delegate_to = delegate_to + self.last_modified_time = last_modified_time self.wildcard = '*' def execute(self, context): @@ -141,6 +150,13 @@ def execute(self, context): objects = hook.list(self.source_bucket, prefix=prefix, delimiter=delimiter) for source_object in objects: + if self.last_modified_time is not None: + # Check to see if object was modified after last_modified_time + if hook.is_updated_after(self.source_bucket, source_object, + self.last_modified_time): + pass + else: + continue if self.destination_object is None: destination_object = source_object else: @@ -157,6 +173,14 @@ def execute(self, context): hook.delete(self.source_bucket, source_object) else: + if self.last_modified_time is not None: + if hook.is_updated_after(self.source_bucket, + self.source_object, + self.last_modified_time): + pass + else: + return + self.log.info( log_message.format(self.source_bucket, self.source_object, self.destination_bucket or self.source_bucket, diff --git a/airflow/contrib/operators/gcs_to_gcs_transfer_operator.py b/airflow/contrib/operators/gcs_to_gcs_transfer_operator.py new file mode 100644 index 0000000000000..41be57a1bbfed --- /dev/null +++ b/airflow/contrib/operators/gcs_to_gcs_transfer_operator.py @@ -0,0 +1,28 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import warnings + +from airflow.contrib.operators.gcp_transfer_operator import ( # noqa + GoogleCloudStorageToGoogleCloudStorageTransferOperator, +) + +warnings.warn( + "This module is deprecated. Please use `airflow.contrib.operators.gcp_transfer_operator`", + DeprecationWarning, +) diff --git a/airflow/contrib/operators/gcs_to_s3.py b/airflow/contrib/operators/gcs_to_s3.py index a87aa3af5c531..6029661f370c6 100644 --- a/airflow/contrib/operators/gcs_to_s3.py +++ b/airflow/contrib/operators/gcs_to_s3.py @@ -28,25 +28,36 @@ class GoogleCloudStorageToS3Operator(GoogleCloudStorageListOperator): Synchronizes a Google Cloud Storage bucket with an S3 bucket. :param bucket: The Google Cloud Storage bucket to find the objects. 
(templated) - :type bucket: string + :type bucket: str :param prefix: Prefix string which filters objects whose name begin with this prefix. (templated) - :type prefix: string + :type prefix: str :param delimiter: The delimiter by which you want to filter the objects. (templated) For e.g to lists the CSV files from in a directory in GCS you would use delimiter='.csv'. - :type delimiter: string + :type delimiter: str :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google Cloud Storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param dest_aws_conn_id: The destination S3 connection :type dest_aws_conn_id: str :param dest_s3_key: The base S3 key to be used to store the files. (templated) :type dest_s3_key: str + :param dest_verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type dest_verify: bool or str """ template_fields = ('bucket', 'prefix', 'delimiter', 'dest_s3_key') ui_color = '#f0eee4' @@ -60,6 +71,7 @@ def __init__(self, delegate_to=None, dest_aws_conn_id=None, dest_s3_key=None, + dest_verify=None, replace=False, *args, **kwargs): @@ -75,12 +87,13 @@ def __init__(self, ) self.dest_aws_conn_id = dest_aws_conn_id self.dest_s3_key = dest_s3_key + self.dest_verify = dest_verify self.replace = replace def execute(self, context): # use the super to list all files in an Google Cloud Storage bucket files = super(GoogleCloudStorageToS3Operator, self).execute(context) - s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id) + s3_hook = S3Hook(aws_conn_id=self.dest_aws_conn_id, verify=self.dest_verify) if not self.replace: # if we are not replacing -> list all files in the S3 bucket @@ -88,7 +101,7 @@ def execute(self, context): # Google Cloud Storage and not in S3 bucket_name, _ = S3Hook.parse_s3_url(self.dest_s3_key) existing_files = s3_hook.list_keys(bucket_name) - files = set(files) - set(existing_files) + files = list(set(files) - set(existing_files)) if files: hook = GoogleCloudStorageHook( diff --git a/airflow/contrib/operators/imap_attachment_to_s3_operator.py b/airflow/contrib/operators/imap_attachment_to_s3_operator.py new file mode 100644 index 0000000000000..3b9f143591efb --- /dev/null +++ b/airflow/contrib/operators/imap_attachment_to_s3_operator.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.imap_hook import ImapHook +from airflow.hooks.S3_hook import S3Hook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class ImapAttachmentToS3Operator(BaseOperator): + """ + Transfers a mail attachment from a mail server into s3 bucket. + + :param imap_attachment_name: The file name of the mail attachment that you want to transfer. + :type imap_attachment_name: str + :param s3_key: The destination file name in the s3 bucket for the attachment. + :type s3_key: str + :param imap_mail_folder: The folder on the mail server to look for the attachment. + :type imap_mail_folder: str + :param imap_check_regex: If set checks the `imap_attachment_name` for a regular expression. + :type imap_check_regex: bool + :param s3_overwrite: If set overwrites the s3 key if already exists. + :type s3_overwrite: bool + :param imap_conn_id: The reference to the connection details of the mail server. + :type imap_conn_id: str + :param s3_conn_id: The reference to the s3 connection details. + :type s3_conn_id: str + """ + template_fields = ('imap_attachment_name', 's3_key') + + @apply_defaults + def __init__(self, + imap_attachment_name, + s3_key, + imap_mail_folder='INBOX', + imap_check_regex=False, + s3_overwrite=False, + imap_conn_id='imap_default', + s3_conn_id='aws_default', + *args, + **kwargs): + super(ImapAttachmentToS3Operator, self).__init__(*args, **kwargs) + self.imap_attachment_name = imap_attachment_name + self.s3_key = s3_key + self.imap_mail_folder = imap_mail_folder + self.imap_check_regex = imap_check_regex + self.s3_overwrite = s3_overwrite + self.imap_conn_id = imap_conn_id + self.s3_conn_id = s3_conn_id + + def execute(self, context): + """ + This function executes the transfer from the email server (via imap) into s3. + + :param context: The context while executing. 
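A minimal usage sketch for this operator (attachment name, S3 destination and connection ids are illustrative; the surrounding DAG definition is omitted)::

    from airflow.contrib.operators.imap_attachment_to_s3_operator import ImapAttachmentToS3Operator

    transfer_invoice = ImapAttachmentToS3Operator(
        task_id='transfer_invoice_attachment',
        imap_attachment_name='invoice.pdf',                 # illustrative attachment name
        s3_key='s3://example-bucket/invoices/invoice.pdf',  # illustrative destination; a full s3:// URL is assumed here
        imap_mail_folder='INBOX',
        imap_check_regex=False,
        imap_conn_id='imap_default',
        s3_conn_id='aws_default',
    )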
+ :type context: dict + """ + self.log.info( + 'Transferring mail attachment %s from mail server via imap to s3 key %s...', + self.imap_attachment_name, self.s3_key + ) + + with ImapHook(imap_conn_id=self.imap_conn_id) as imap_hook: + imap_mail_attachments = imap_hook.retrieve_mail_attachments( + name=self.imap_attachment_name, + mail_folder=self.imap_mail_folder, + check_regex=self.imap_check_regex, + latest_only=True + ) + + s3_hook = S3Hook(aws_conn_id=self.s3_conn_id) + s3_hook.load_bytes(bytes_data=imap_mail_attachments[0][1], key=self.s3_key) diff --git a/airflow/contrib/operators/jenkins_job_trigger_operator.py b/airflow/contrib/operators/jenkins_job_trigger_operator.py index 37185f3e13f42..3af87f3f3bed7 100644 --- a/airflow/contrib/operators/jenkins_job_trigger_operator.py +++ b/airflow/contrib/operators/jenkins_job_trigger_operator.py @@ -26,36 +26,27 @@ from airflow.contrib.hooks.jenkins_hook import JenkinsHook import jenkins from jenkins import JenkinsException -from six.moves.urllib.request import Request, urlopen +from requests import Request +import six from six.moves.urllib.error import HTTPError, URLError -try: - basestring -except NameError: - basestring = str # For python3 compatibility - -# TODO Use jenkins_urlopen instead when it will be available -# in the stable python-jenkins version (> 0.4.15) -def jenkins_request_with_headers(jenkins_server, req, add_crumb=True): +def jenkins_request_with_headers(jenkins_server, req): """ We need to get the headers in addition to the body answer to get the location from them - This function is just a copy of the one present in python-jenkins library + This function uses jenkins_request method from python-jenkins library with just the return call changed + :param jenkins_server: The server to query :param req: The request to execute - :param add_crumb: Boolean to indicate if it should add crumb to the request - :return: + :return: Dict containing the response body (key body) + and the headers coming along (headers) """ try: - if jenkins_server.auth: - req.add_header('Authorization', jenkins_server.auth) - if add_crumb: - jenkins_server.maybe_add_crumb(req) - response = urlopen(req, timeout=jenkins_server.timeout) - response_body = response.read() - response_headers = response.info() + response = jenkins_server.jenkins_request(req) + response_body = response.content + response_headers = response.headers if response_body is None: raise jenkins.EmptyResponseException( "Error communicating with server[%s]: " @@ -94,14 +85,15 @@ class JenkinsJobTriggerOperator(BaseOperator): This operator depend on python-jenkins library, version >= 0.4.15 to communicate with jenkins server. You'll also need to configure a Jenkins connection in the connections screen. + :param jenkins_connection_id: The jenkins connection to use for this job - :type jenkins_connection_id: string + :type jenkins_connection_id: str :param job_name: The name of the job to trigger - :type job_name: string + :type job_name: str :param parameters: The parameters block to provide to jenkins. (templated) - :type parameters: string + :type parameters: str :param sleep_time: How long will the operator sleep between each status - request for the job (min 1, default 10) + request for the job (min 1, default 10) :type sleep_time: int :param max_try_before_job_appears: The maximum number of requests to make while waiting for the job to appears on jenkins server (default 10) @@ -135,13 +127,14 @@ def build_job(self, jenkins_server): It returned a dict with 2 keys : body and headers. 
headers contains also a dict-like object which can be queried to get the location to poll in the queue. + :param jenkins_server: The jenkins server where the job should be triggered :return: Dict containing the response body (key body) - and the headers coming along (headers) + and the headers coming along (headers) """ # Warning if the parameter is too long, the URL can be longer than # the maximum allowed size - if self.parameters and isinstance(self.parameters, basestring): + if self.parameters and isinstance(self.parameters, six.string_types): import ast self.parameters = ast.literal_eval(self.parameters) @@ -150,7 +143,7 @@ def build_job(self, jenkins_server): self.parameters = None request = Request(jenkins_server.build_job_url(self.job_name, - self.parameters, None), b'') + self.parameters, None)) return jenkins_request_with_headers(jenkins_server, request) def poll_job_in_queue(self, location, jenkins_server): @@ -163,6 +156,7 @@ def poll_job_in_queue(self, location, jenkins_server): returned by the build_job call and poll this file. When a 'executable' block appears in the json, it means the job execution started and the field 'number' then contains the build number. + :param location: Location to poll, returned in the header of the build_job call :param jenkins_server: The jenkins server to poll :return: The build_number corresponding to the triggered job diff --git a/airflow/contrib/operators/kubernetes_pod_operator.py b/airflow/contrib/operators/kubernetes_pod_operator.py index bf656f12fbb88..fcbdcfae55328 100644 --- a/airflow/contrib/operators/kubernetes_pod_operator.py +++ b/airflow/contrib/operators/kubernetes_pod_operator.py @@ -21,13 +21,6 @@ from airflow.contrib.kubernetes import kube_client, pod_generator, pod_launcher from airflow.contrib.kubernetes.pod import Resources from airflow.utils.state import State -from airflow.contrib.kubernetes.volume_mount import VolumeMount # noqa -from airflow.contrib.kubernetes.volume import Volume # noqa -from airflow.contrib.kubernetes.secret import Secret # noqa - -template_fields = ('templates_dict',) -template_ext = tuple() -ui_color = '#ffefeb' class KubernetesPodOperator(BaseOperator): @@ -37,18 +30,24 @@ class KubernetesPodOperator(BaseOperator): :param image: Docker image you wish to launch. Defaults to dockerhub.io, but fully qualified URLS will point to custom repositories :type image: str - :param: namespace: the namespace to run within kubernetes - :type: namespace: str + :param namespace: the namespace to run within kubernetes + :type namespace: str :param cmds: entrypoint of the container. (templated) The docker images's entrypoint is used if this is not provide. - :type cmds: list of str - :param arguments: arguments of to the entrypoint. (templated) + :type cmds: list[str] + :param arguments: arguments of the entrypoint. (templated) The docker image's CMD is used if this is not provided. - :type arguments: list of str + :type arguments: list[str] + :param image_pull_policy: Specify a policy to cache or always pull an image + :type image_pull_policy: str + :param image_pull_secrets: Any image pull secrets to be given to the pod. + If more than one secret is required, provide a + comma separated list: secret_a,secret_b + :type image_pull_secrets: str :param volume_mounts: volumeMounts for launched pod - :type volume_mounts: list of VolumeMount + :type volume_mounts: list[airflow.contrib.kubernetes.volume_mount.VolumeMount] :param volumes: volumes for launched pod. 
Includes ConfigMaps and PersistentVolumes - :type volumes: list of Volume + :type volumes: list[airflow.contrib.kubernetes.volume.Volume] :param labels: labels to apply to the Pod :type labels: dict :param startup_timeout_seconds: timeout in seconds to startup the pod @@ -60,22 +59,35 @@ class KubernetesPodOperator(BaseOperator): :type env_vars: dict :param secrets: Kubernetes secrets to inject in the container, They can be exposed as environment vars or files in a volume. - :type secrets: list of Secret + :type secrets: list[airflow.contrib.kubernetes.secret.Secret] :param in_cluster: run kubernetes client with in_cluster configuration :type in_cluster: bool :param cluster_context: context that points to kubernetes cluster. Ignored when in_cluster is True. If None, current-context is used. - :type cluster_context: string + :type cluster_context: str :param get_logs: get the stdout of the container as logs of the tasks :type get_logs: bool :param affinity: A dict containing a group of affinity scheduling rules :type affinity: dict + :param node_selectors: A dict containing a group of scheduling rules + :type node_selectors: dict :param config_file: The path to the Kubernetes config file :type config_file: str :param xcom_push: If xcom_push is True, the content of the file /airflow/xcom/return.json in the container will also be pushed to an XCom when the container completes. :type xcom_push: bool + :param is_delete_operator_pod: What to do when the pod reaches its final + state, or the execution is interrupted. + If False (default): do nothing, If True: delete the pod + :type is_delete_operator_pod: bool + :param hostnetwork: If True enable host networking on the pod + :type hostnetwork: bool + :param tolerations: A list of kubernetes tolerations + :type tolerations: list tolerations + :param configmaps: A list of configmap names objects that we + want mount as env variables + :type configmaps: list[str] """ template_fields = ('cmds', 'arguments', 'env_vars', 'config_file') @@ -100,19 +112,31 @@ def execute(self, context): labels=self.labels, ) + pod.service_account_name = self.service_account_name pod.secrets = self.secrets pod.envs = self.env_vars pod.image_pull_policy = self.image_pull_policy + pod.image_pull_secrets = self.image_pull_secrets pod.annotations = self.annotations pod.resources = self.resources pod.affinity = self.affinity + pod.node_selectors = self.node_selectors + pod.hostnetwork = self.hostnetwork + pod.tolerations = self.tolerations + pod.configmaps = self.configmaps + pod.security_context = self.security_context launcher = pod_launcher.PodLauncher(kube_client=client, extract_xcom=self.xcom_push) - (final_state, result) = launcher.run_pod( - pod, - startup_timeout=self.startup_timeout_seconds, - get_logs=self.get_logs) + try: + (final_state, result) = launcher.run_pod( + pod, + startup_timeout=self.startup_timeout_seconds, + get_logs=self.get_logs) + finally: + if self.is_delete_operator_pod: + launcher.delete_pod(pod) + if final_state != State.SUCCESS: raise AirflowException( 'Pod returned a failure: {state}'.format(state=final_state) @@ -144,6 +168,14 @@ def __init__(self, affinity=None, config_file=None, xcom_push=False, + node_selectors=None, + image_pull_secrets=None, + service_account_name="default", + is_delete_operator_pod=False, + hostnetwork=False, + tolerations=None, + configmaps=None, + security_context=None, *args, **kwargs): super(KubernetesPodOperator, self).__init__(*args, **kwargs) @@ -162,8 +194,16 @@ def __init__(self, self.cluster_context = 
cluster_context self.get_logs = get_logs self.image_pull_policy = image_pull_policy + self.node_selectors = node_selectors or {} self.annotations = annotations or {} self.affinity = affinity or {} self.xcom_push = xcom_push self.resources = resources or Resources() self.config_file = config_file + self.image_pull_secrets = image_pull_secrets + self.service_account_name = service_account_name + self.is_delete_operator_pod = is_delete_operator_pod + self.hostnetwork = hostnetwork + self.tolerations = tolerations or [] + self.configmaps = configmaps or [] + self.security_context = security_context or {} diff --git a/airflow/contrib/operators/mlengine_operator.py b/airflow/contrib/operators/mlengine_operator.py index 9fe966d387ab1..a5ac41911d93d 100644 --- a/airflow/contrib/operators/mlengine_operator.py +++ b/airflow/contrib/operators/mlengine_operator.py @@ -15,7 +15,7 @@ # limitations under the License. import re -from apiclient import errors +from googleapiclient.errors import HttpError from airflow.contrib.hooks.gcp_mlengine_hook import MLEngineHook from airflow.exceptions import AirflowException @@ -42,7 +42,7 @@ def _normalize_mlengine_job_id(job_id): # Add a prefix when a job_id starts with a digit or a template match = re.search(r'\d|\{{2}', job_id) - if match and match.start() is 0: + if match and match.start() == 0: job = 'z_{}'.format(job_id) else: job = job_id @@ -68,17 +68,16 @@ class MLEngineBatchPredictionOperator(BaseOperator): NOTE: For model origin, users should consider exactly one from the three options below: - 1. Populate 'uri' field only, which should be a GCS location that - points to a tensorflow savedModel directory. - 2. Populate 'model_name' field only, which refers to an existing - model, and the default version of the model will be used. - 3. Populate both 'model_name' and 'version_name' fields, which - refers to a specific version of a specific model. - In options 2 and 3, both model and version name should contain the - minimal identifier. For instance, call + 1. Populate ``uri`` field only, which should be a GCS location that + points to a tensorflow savedModel directory. + 2. Populate ``model_name`` field only, which refers to an existing + model, and the default version of the model will be used. + 3. Populate both ``model_name`` and ``version_name`` fields, which + refers to a specific version of a specific model. - :: + In options 2 and 3, both model and version name should contain the + minimal identifier. For instance, call:: MLEngineBatchPredictionOperator( ..., @@ -87,52 +86,52 @@ class MLEngineBatchPredictionOperator(BaseOperator): ...) if the desired model version is - "projects/my_project/models/my_model/versions/my_version". + ``projects/my_project/models/my_model/versions/my_version``. See https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs for further documentation on the parameters. :param project_id: The Google Cloud project name where the prediction job is submitted. (templated) - :type project_id: string + :type project_id: str :param job_id: A unique id for the prediction job on Google Cloud ML Engine. (templated) - :type job_id: string + :type job_id: str :param data_format: The format of the input data. It will default to 'DATA_FORMAT_UNSPECIFIED' if is not provided or is not one of ["TEXT", "TF_RECORD", "TF_RECORD_GZIP"]. - :type data_format: string + :type data_format: str :param input_paths: A list of GCS paths of input data for batch - prediction. Accepting wildcard operator *, but only at the end. 
(templated) - :type input_paths: list of string + prediction. Accepting wildcard operator ``*``, but only at the end. (templated) + :type input_paths: list[str] :param output_path: The GCS path where the prediction results are written to. (templated) - :type output_path: string + :type output_path: str :param region: The Google Compute Engine region to run the prediction job in. (templated) - :type region: string + :type region: str :param model_name: The Google Cloud ML Engine model to use for prediction. If version_name is not provided, the default version of this model will be used. Should not be None if version_name is provided. Should be None if uri is provided. (templated) - :type model_name: string + :type model_name: str :param version_name: The Google Cloud ML Engine model version to use for prediction. Should be None if uri is provided. (templated) - :type version_name: string + :type version_name: str :param uri: The GCS path of the saved model to use for prediction. Should be None if model_name is provided. It should be a GCS path pointing to a tensorflow SavedModel. (templated) - :type uri: string + :type uri: str :param max_worker_count: The maximum number of workers to be used for parallel processing. Defaults to 10 if not specified. @@ -140,19 +139,19 @@ class MLEngineBatchPredictionOperator(BaseOperator): :param runtime_version: The Google Cloud ML Engine runtime version to use for batch prediction. - :type runtime_version: string + :type runtime_version: str :param gcp_conn_id: The connection ID used for connection to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must - have doamin-wide delegation enabled. - :type delegate_to: string + have domain-wide delegation enabled. + :type delegate_to: str - Raises: - ``ValueError``: if a unique model/version origin cannot be determined. + :raises: ``ValueError``: if a unique model/version origin cannot be + determined. """ template_fields = [ @@ -264,12 +263,13 @@ def check_existing_job(existing_job): try: finished_prediction_job = hook.create_job( self._project_id, prediction_request, check_existing_job) - except errors.HttpError: + except HttpError: raise if finished_prediction_job['state'] != 'SUCCEEDED': - self.log.error('MLEngine batch prediction job failed: {}'.format( - str(finished_prediction_job))) + self.log.error( + 'MLEngine batch prediction job failed: %s', str(finished_prediction_job) + ) raise RuntimeError(finished_prediction_job['errorMessage']) return finished_prediction_job['predictionOutput'] @@ -281,8 +281,7 @@ class MLEngineModelOperator(BaseOperator): :param project_id: The Google Cloud project name to which MLEngine model belongs. (templated) - :type project_id: string - + :type project_id: str :param model: A dictionary containing the information about the model. If the `operation` is `create`, then the `model` parameter should contain all the information about this model such as `name`. @@ -290,19 +289,17 @@ class MLEngineModelOperator(BaseOperator): If the `operation` is `get`, the `model` parameter should contain the `name` of the model. :type model: dict - :param operation: The operation to perform. Available operations are: * ``create``: Creates a new model as provided by the `model` parameter. * ``get``: Gets a particular model where the name is specified in `model`. 
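For example, creating a model with the ``create`` operation might look like this (project id and model name are placeholders, DAG boilerplate omitted)::

    from airflow.contrib.operators.mlengine_operator import MLEngineModelOperator

    create_model = MLEngineModelOperator(
        task_id='create_model',
        project_id='my-gcp-project',   # placeholder project id
        model={'name': 'my_model'},    # minimal model dict; see the `model` parameter above
        operation='create',
        gcp_conn_id='google_cloud_default',
    )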
- + :type operation: str :param gcp_conn_id: The connection ID to use when fetching connection info. - :type gcp_conn_id: string - + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = [ @@ -342,17 +339,17 @@ class MLEngineVersionOperator(BaseOperator): :param project_id: The Google Cloud project name to which MLEngine model belongs. - :type project_id: string + :type project_id: str :param model_name: The name of the Google Cloud ML Engine model that the version belongs to. (templated) - :type model_name: string + :type model_name: str :param version_name: A name to use for the version being operated upon. If not None and the `version` argument is None or does not have a value for the `name` key, then this will be populated in the payload for the `name` key. (templated) - :type version_name: string + :type version_name: str :param version: A dictionary containing the information about the version. If the `operation` is `create`, `version` should contain all the @@ -381,15 +378,15 @@ class MLEngineVersionOperator(BaseOperator): model specified by `model_name`). The name of the version should be specified in the `version` parameter. - :type operation: string + :type operation: str :param gcp_conn_id: The connection ID to use when fetching connection info. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = [ @@ -427,7 +424,9 @@ def execute(self, context): gcp_conn_id=self._gcp_conn_id, delegate_to=self._delegate_to) if self._operation == 'create': - assert self._version is not None + if not self._version: + raise ValueError("version attribute of {} could not " + "be empty".format(self.__class__.__name__)) return hook.create_version(self._project_id, self._model_name, self._version) elif self._operation == 'set_default': @@ -448,56 +447,56 @@ class MLEngineTrainingOperator(BaseOperator): :param project_id: The Google Cloud project name within which MLEngine training job should run (templated). - :type project_id: string + :type project_id: str :param job_id: A unique templated id for the submitted Google MLEngine training job. (templated) - :type job_id: string + :type job_id: str :param package_uris: A list of package locations for MLEngine training job, which should include the main training program + any additional dependencies. (templated) - :type package_uris: string + :type package_uris: str :param training_python_module: The Python module name to run within MLEngine training job after installing 'package_uris' packages. (templated) - :type training_python_module: string + :type training_python_module: str :param training_args: A list of templated command line arguments to pass to the MLEngine training program. (templated) - :type training_args: string + :type training_args: str :param region: The Google Compute Engine region to run the MLEngine training job in (templated). - :type region: string + :type region: str :param scale_tier: Resource tier for MLEngine training job. (templated) - :type scale_tier: string + :type scale_tier: str :param runtime_version: The Google Cloud ML runtime version to use for training. 
(templated) - :type runtime_version: string + :type runtime_version: str :param python_version: The version of Python used in training. (templated) - :type python_version: string + :type python_version: str :param job_dir: A Google Cloud Storage path in which to store training outputs and other data needed for training. (templated) - :type job_dir: string + :type job_dir: str :param gcp_conn_id: The connection ID to use when fetching connection info. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param mode: Can be one of 'DRY_RUN'/'CLOUD'. In 'DRY_RUN' mode, no real training job will be launched, but the MLEngine training job request will be printed out. In 'CLOUD' mode, a real MLEngine training job creation request will be issued. - :type mode: string + :type mode: str """ template_fields = [ @@ -586,8 +585,7 @@ def execute(self, context): if self._mode == 'DRY_RUN': self.log.info('In dry_run mode.') - self.log.info('MLEngine Training job request is: {}'.format( - training_request)) + self.log.info('MLEngine Training job request is: %s', training_request) return hook = MLEngineHook( @@ -602,10 +600,9 @@ def check_existing_job(existing_job): try: finished_training_job = hook.create_job( self._project_id, training_request, check_existing_job) - except errors.HttpError: + except HttpError: raise if finished_training_job['state'] != 'SUCCEEDED': - self.log.error('MLEngine training job failed: {}'.format( - str(finished_training_job))) + self.log.error('MLEngine training job failed: %s', str(finished_training_job)) raise RuntimeError(finished_training_job['errorMessage']) diff --git a/airflow/contrib/operators/mongo_to_s3.py b/airflow/contrib/operators/mongo_to_s3.py index 43b5d8b6c357a..4a052d4b4af24 100644 --- a/airflow/contrib/operators/mongo_to_s3.py +++ b/airflow/contrib/operators/mongo_to_s3.py @@ -1,21 +1,27 @@ # -*- coding: utf-8 -*- # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
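The logging changes in the MLEngine operators above all follow the same pattern: eager ``str.format`` interpolation is replaced by lazy ``%s`` placeholders, so the message is only rendered if the record is actually emitted. A minimal sketch of the two styles, with an illustrative logger name and payload:

    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger('example')              # illustrative logger
    training_request = {'jobId': 'job_123'}         # illustrative payload

    # Eager: the string is built before the logging call is evaluated.
    log.info('MLEngine Training job request is: {}'.format(training_request))

    # Lazy: the logging framework interpolates %s only when the record is emitted.
    log.info('MLEngine Training job request is: %s', training_request)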
import json from airflow.contrib.hooks.mongo_hook import MongoHook from airflow.hooks.S3_hook import S3Hook from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults from bson import json_util @@ -34,6 +40,7 @@ class MongoToS3Operator(BaseOperator): template_fields = ['s3_key', 'mongo_query'] # pylint: disable=too-many-instance-attributes + @apply_defaults def __init__(self, mongo_conn_id, s3_conn_id, @@ -96,7 +103,8 @@ def execute(self, context): return True - def _stringify(self, iterable, joinable='\n'): + @staticmethod + def _stringify(iterable, joinable='\n'): """ Takes an iterable (pymongo Cursor or Array) containing dictionaries and returns a stringified version using python join @@ -105,7 +113,8 @@ def _stringify(self, iterable, joinable='\n'): [json.dumps(doc, default=json_util.default) for doc in iterable] ) - def transform(self, docs): + @staticmethod + def transform(docs): """ Processes pyMongo cursor and returns an iterable with each element being a JSON serializable dictionary diff --git a/airflow/contrib/operators/mysql_to_gcs.py b/airflow/contrib/operators/mysql_to_gcs.py index 4d1bb7b329e6c..c168b75e75e55 100644 --- a/airflow/contrib/operators/mysql_to_gcs.py +++ b/airflow/contrib/operators/mysql_to_gcs.py @@ -31,13 +31,56 @@ from MySQLdb.constants import FIELD_TYPE from tempfile import NamedTemporaryFile from six import string_types +import unicodecsv as csv PY3 = sys.version_info[0] == 3 class MySqlToGoogleCloudStorageOperator(BaseOperator): - """ - Copy data from MySQL to Google cloud storage in JSON format. + """Copy data from MySQL to Google cloud storage in JSON or CSV format. + + The JSON data files generated are newline-delimited to enable them to be + loaded into BigQuery. + Reference: https://cloud.google.com/bigquery/docs/ + loading-data-cloud-storage-json#limitations + + :param sql: The SQL to execute on the MySQL table. + :type sql: str + :param bucket: The bucket to upload to. + :type bucket: str + :param filename: The filename to use as the object name when uploading + to Google cloud storage. A {} should be specified in the filename + to allow the operator to inject file numbers in cases where the + file is split due to size. + :type filename: str + :param schema_filename: If set, the filename to use as the object name + when uploading a .json file containing the BigQuery schema fields + for the table that was dumped from MySQL. + :type schema_filename: str + :param approx_max_file_size_bytes: This operator supports the ability + to split large table dumps into multiple files (see notes in the + filenamed param docs above). Google cloud storage allows for files + to be a maximum of 4GB. This param allows developers to specify the + file size of the splits. + :type approx_max_file_size_bytes: long + :param mysql_conn_id: Reference to a specific MySQL hook. + :type mysql_conn_id: str + :param google_cloud_storage_conn_id: Reference to a specific Google + cloud storage hook. + :type google_cloud_storage_conn_id: str + :param schema: The schema to use, if any. Should be a list of dict or + a str. Pass a string if using Jinja template, otherwise, pass a list of + dict. Examples could be seen: https://cloud.google.com/bigquery/docs + /schemas#specifying_a_json_schema_file + :type schema: str or list + :param delegate_to: The account to impersonate, if any. For this to + work, the service account making the request must have domain-wide + delegation enabled. 
+ :type delegate_to: str + :param export_format: Desired format of files to be exported. + :type export_format: str + :param field_delimiter: The delimiter to be used for CSV files. + :type field_delimiter: str """ template_fields = ('sql', 'bucket', 'filename', 'schema_filename', 'schema') template_ext = ('.sql',) @@ -54,42 +97,10 @@ def __init__(self, google_cloud_storage_conn_id='google_cloud_default', schema=None, delegate_to=None, + export_format='json', + field_delimiter=',', *args, **kwargs): - """ - :param sql: The SQL to execute on the MySQL table. - :type sql: string - :param bucket: The bucket to upload to. - :type bucket: string - :param filename: The filename to use as the object name when uploading - to Google cloud storage. A {} should be specified in the filename - to allow the operator to inject file numbers in cases where the - file is split due to size. - :type filename: string - :param schema_filename: If set, the filename to use as the object name - when uploading a .json file containing the BigQuery schema fields - for the table that was dumped from MySQL. - :type schema_filename: string - :param approx_max_file_size_bytes: This operator supports the ability - to split large table dumps into multiple files (see notes in the - filenamed param docs above). Google cloud storage allows for files - to be a maximum of 4GB. This param allows developers to specify the - file size of the splits. - :type approx_max_file_size_bytes: long - :param mysql_conn_id: Reference to a specific MySQL hook. - :type mysql_conn_id: string - :param google_cloud_storage_conn_id: Reference to a specific Google - cloud storage hook. - :type google_cloud_storage_conn_id: string - :param schema: The schema to use, if any. Should be a list of dict or - a str. Pass a string if using Jinja template, otherwise, pass a list of - dict. Examples could be seen: https://cloud.google.com/bigquery/docs - /schemas#specifying_a_json_schema_file - :type schema: str or list - :param delegate_to: The account to impersonate, if any. For this to - work, the service account making the request must have domain-wide - delegation enabled. - """ super(MySqlToGoogleCloudStorageOperator, self).__init__(*args, **kwargs) self.sql = sql self.bucket = bucket @@ -100,6 +111,8 @@ def __init__(self, self.google_cloud_storage_conn_id = google_cloud_storage_conn_id self.schema = schema self.delegate_to = delegate_to + self.export_format = export_format.lower() + self.field_delimiter = field_delimiter def execute(self, context): cursor = self._query_mysql() @@ -107,17 +120,19 @@ def execute(self, context): # If a schema is set, create a BQ schema JSON file. if self.schema_filename: - files_to_upload.update(self._write_local_schema_file(cursor)) + files_to_upload.append(self._write_local_schema_file(cursor)) # Flush all files before uploading. - for file_handle in files_to_upload.values(): - file_handle.flush() + for tmp_file in files_to_upload: + tmp_file_handle = tmp_file.get('file_handle') + tmp_file_handle.flush() self._upload_to_gcs(files_to_upload) # Close all temp file handles. 
- for file_handle in files_to_upload.values(): - file_handle.close() + for tmp_file in files_to_upload: + tmp_file_handle = tmp_file.get('file_handle') + tmp_file_handle.close() def _query_mysql(self): """ @@ -141,41 +156,73 @@ def _write_local_data_files(self, cursor): col_type_dict = self._get_col_type_dict() file_no = 0 tmp_file_handle = NamedTemporaryFile(delete=True) - tmp_file_handles = {self.filename.format(file_no): tmp_file_handle} + if self.export_format == 'csv': + file_mime_type = 'text/csv' + else: + file_mime_type = 'application/json' + files_to_upload = [{ + 'file_name': self.filename.format(file_no), + 'file_handle': tmp_file_handle, + 'file_mime_type': file_mime_type + }] + + if self.export_format == 'csv': + csv_writer = self._configure_csv_file(tmp_file_handle, schema) for row in cursor: # Convert datetime objects to utc seconds, and decimals to floats. # Convert binary type object to string encoded with base64. row = self._convert_types(schema, col_type_dict, row) - row_dict = dict(zip(schema, row)) - # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB. - s = json.dumps(row_dict) - if PY3: - s = s.encode('utf-8') - tmp_file_handle.write(s) + if self.export_format == 'csv': + csv_writer.writerow(row) + else: + row_dict = dict(zip(schema, row)) - # Append newline to make dumps BigQuery compatible. - tmp_file_handle.write(b'\n') + # TODO validate that row isn't > 2MB. BQ enforces a hard row size of 2MB. + s = json.dumps(row_dict, sort_keys=True) + if PY3: + s = s.encode('utf-8') + tmp_file_handle.write(s) + + # Append newline to make dumps BigQuery compatible. + tmp_file_handle.write(b'\n') # Stop if the file exceeds the file size limit. if tmp_file_handle.tell() >= self.approx_max_file_size_bytes: file_no += 1 tmp_file_handle = NamedTemporaryFile(delete=True) - tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle + files_to_upload.append({ + 'file_name': self.filename.format(file_no), + 'file_handle': tmp_file_handle, + 'file_mime_type': file_mime_type + }) + + if self.export_format == 'csv': + csv_writer = self._configure_csv_file(tmp_file_handle, schema) - return tmp_file_handles + return files_to_upload + + def _configure_csv_file(self, file_handle, schema): + """Configure a csv writer with the file_handle and write schema + as headers for the new file. + """ + csv_writer = csv.writer(file_handle, encoding='utf-8', + delimiter=self.field_delimiter) + csv_writer.writerow(schema) + return csv_writer def _write_local_schema_file(self, cursor): """ - Takes a cursor, and writes the BigQuery schema for the results to a - local file system. + Takes a cursor, and writes the BigQuery schema in .json format for the + results to a local file system. :return: A dictionary where key is a filename to be used as an object name in GCS, and values are file handles to local files that contains the BigQuery schema fields in .json format. 
""" schema_str = None + schema_file_mime_type = 'application/json' tmp_schema_file_handle = NamedTemporaryFile(delete=True) if self.schema is not None and isinstance(self.schema, string_types): schema_str = self.schema @@ -199,13 +246,18 @@ def _write_local_schema_file(self, cursor): 'type': field_type, 'mode': field_mode, }) - schema_str = json.dumps(schema) + schema_str = json.dumps(schema, sort_keys=True) if PY3: schema_str = schema_str.encode('utf-8') tmp_schema_file_handle.write(schema_str) self.log.info('Using schema for %s: %s', self.schema_filename, schema_str) - return {self.schema_filename: tmp_schema_file_handle} + schema_file_to_upload = { + 'file_name': self.schema_filename, + 'file_handle': tmp_schema_file_handle, + 'file_mime_type': schema_file_mime_type + } + return schema_file_to_upload def _upload_to_gcs(self, files_to_upload): """ @@ -215,10 +267,13 @@ def _upload_to_gcs(self, files_to_upload): hook = GoogleCloudStorageHook( google_cloud_storage_conn_id=self.google_cloud_storage_conn_id, delegate_to=self.delegate_to) - for object, tmp_file_handle in files_to_upload.items(): - hook.upload(self.bucket, object, tmp_file_handle.name, 'application/json') + for tmp_file in files_to_upload: + hook.upload(self.bucket, tmp_file.get('file_name'), + tmp_file.get('file_handle').name, + mime_type=tmp_file.get('file_mime_type')) - def _convert_types(self, schema, col_type_dict, row): + @staticmethod + def _convert_types(schema, col_type_dict, row): """ Takes a value from MySQLdb, and converts it to a value that's safe for JSON/Google cloud storage/BigQuery. Dates are converted to UTC seconds. diff --git a/airflow/contrib/operators/opsgenie_alert_operator.py b/airflow/contrib/operators/opsgenie_alert_operator.py new file mode 100644 index 0000000000000..c46d234932d0d --- /dev/null +++ b/airflow/contrib/operators/opsgenie_alert_operator.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +from airflow.contrib.hooks.opsgenie_alert_hook import OpsgenieAlertHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class OpsgenieAlertOperator(BaseOperator): + """ + This operator allows you to post alerts to Opsgenie. + Accepts a connection that has an Opsgenie API key as the connection's password. + This operator sets the domain to conn_id.host, and if not set will default + to ``https://api.opsgenie.com``. + + Each Opsgenie API key can be pre-configured to a team integration. + You can override these defaults in this operator. 
+ + :param opsgenie_conn_id: The name of the Opsgenie connection to use + :type opsgenie_conn_id: str + :param message: The Message of the Opsgenie alert (templated) + :type message: str + :param alias: Client-defined identifier of the alert (templated) + :type alias: str + :param description: Description field of the alert (templated) + :type description: str + :param responders: Teams, users, escalations and schedules that + the alert will be routed to send notifications. + :type responders: list[dict] + :param visibleTo: Teams and users that the alert will become visible + to without sending any notification. + :type visibleTo: list[dict] + :param actions: Custom actions that will be available for the alert. + :type actions: list[str] + :param tags: Tags of the alert. + :type tags: list[str] + :param details: Map of key-value pairs to use as custom properties of the alert. + :type details: dict + :param entity: Entity field of the alert that is + generally used to specify which domain alert is related to. (templated) + :type entity: str + :param source: Source field of the alert. Default value is + IP address of the incoming request. + :type source: str + :param priority: Priority level of the alert. Default value is P3. (templated) + :type priority: str + :param user: Display name of the request owner. + :type user: str + :param note: Additional note that will be added while creating the alert. (templated) + :type note: str + """ + template_fields = ('message', 'alias', 'description', 'entity', 'priority', 'note') + + @apply_defaults + def __init__(self, + message, + opsgenie_conn_id='opsgenie_default', + alias=None, + description=None, + responders=None, + visibleTo=None, + actions=None, + tags=None, + details=None, + entity=None, + source=None, + priority=None, + user=None, + note=None, + *args, + **kwargs + ): + super(OpsgenieAlertOperator, self).__init__(*args, **kwargs) + + self.message = message + self.opsgenie_conn_id = opsgenie_conn_id + self.alias = alias + self.description = description + self.responders = responders + self.visibleTo = visibleTo + self.actions = actions + self.tags = tags + self.details = details + self.entity = entity + self.source = source + self.priority = priority + self.user = user + self.note = note + self.hook = None + + def _build_opsgenie_payload(self): + """ + Construct the Opsgenie JSON payload. All relevant parameters are combined here + to a valid Opsgenie JSON payload. + + :return: Opsgenie payload (dict) to send + """ + payload = {} + + for key in [ + "message", "alias", "description", "responders", + "visibleTo", "actions", "tags", "details", "entity", + "source", "priority", "user", "note" + ]: + val = getattr(self, key) + if val: + payload[key] = val + return payload + + def execute(self, context): + """ + Call the OpsgenieAlertHook to post message + """ + self.hook = OpsgenieAlertHook(self.opsgenie_conn_id) + self.hook.execute(self._build_opsgenie_payload()) diff --git a/airflow/contrib/operators/oracle_to_azure_data_lake_transfer.py b/airflow/contrib/operators/oracle_to_azure_data_lake_transfer.py new file mode 100644 index 0000000000000..967f84828d9fd --- /dev/null +++ b/airflow/contrib/operators/oracle_to_azure_data_lake_transfer.py @@ -0,0 +1,115 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.oracle_hook import OracleHook +from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.utils.file import TemporaryDirectory + +import unicodecsv as csv +import os + + +class OracleToAzureDataLakeTransfer(BaseOperator): + """ + Moves data from Oracle to Azure Data Lake. The operator runs the query against + Oracle and stores the file locally before loading it into Azure Data Lake. + + + :param filename: file name to be used by the csv file. + :type filename: str + :param azure_data_lake_conn_id: destination azure data lake connection. + :type azure_data_lake_conn_id: str + :param azure_data_lake_path: destination path in azure data lake to put the file. + :type azure_data_lake_path: str + :param oracle_conn_id: source Oracle connection. + :type oracle_conn_id: str + :param sql: SQL query to execute against the Oracle database. (templated) + :type sql: str + :param sql_params: Parameters to use in sql query. (templated) + :type sql_params: str + :param delimiter: field delimiter in the file. + :type delimiter: str + :param encoding: encoding type for the file. + :type encoding: str + :param quotechar: Character to use in quoting. + :type quotechar: str + :param quoting: Quoting strategy. See unicodecsv quoting for more information. 
+ :type quoting: str + """ + + template_fields = ('filename', 'sql', 'sql_params') + ui_color = '#e08c8c' + + @apply_defaults + def __init__( + self, + filename, + azure_data_lake_conn_id, + azure_data_lake_path, + oracle_conn_id, + sql, + sql_params=None, + delimiter=",", + encoding="utf-8", + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + *args, **kwargs): + super(OracleToAzureDataLakeTransfer, self).__init__(*args, **kwargs) + if sql_params is None: + sql_params = {} + self.filename = filename + self.oracle_conn_id = oracle_conn_id + self.sql = sql + self.sql_params = sql_params + self.azure_data_lake_conn_id = azure_data_lake_conn_id + self.azure_data_lake_path = azure_data_lake_path + self.delimiter = delimiter + self.encoding = encoding + self.quotechar = quotechar + self.quoting = quoting + + def _write_temp_file(self, cursor, path_to_save): + with open(path_to_save, 'wb') as csvfile: + csv_writer = csv.writer(csvfile, delimiter=self.delimiter, + encoding=self.encoding, quotechar=self.quotechar, + quoting=self.quoting) + csv_writer.writerow(map(lambda field: field[0], cursor.description)) + csv_writer.writerows(cursor) + csvfile.flush() + + def execute(self, context): + oracle_hook = OracleHook(oracle_conn_id=self.oracle_conn_id) + azure_data_lake_hook = AzureDataLakeHook( + azure_data_lake_conn_id=self.azure_data_lake_conn_id) + + self.log.info("Dumping Oracle query results to local file") + conn = oracle_hook.get_conn() + cursor = conn.cursor() + cursor.execute(self.sql, self.sql_params) + + with TemporaryDirectory(prefix='airflow_oracle_to_azure_op_') as temp: + self._write_temp_file(cursor, os.path.join(temp, self.filename)) + self.log.info("Uploading local file to Azure Data Lake") + azure_data_lake_hook.upload_file(os.path.join(temp, self.filename), + os.path.join(self.azure_data_lake_path, + self.filename)) + cursor.close() + conn.close() diff --git a/airflow/contrib/operators/oracle_to_oracle_transfer.py b/airflow/contrib/operators/oracle_to_oracle_transfer.py new file mode 100644 index 0000000000000..58613c4f33489 --- /dev/null +++ b/airflow/contrib/operators/oracle_to_oracle_transfer.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.oracle_hook import OracleHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class OracleToOracleTransfer(BaseOperator): + """ + Moves data from Oracle to Oracle. + + + :param oracle_destination_conn_id: destination Oracle connection. + :type oracle_destination_conn_id: str + :param destination_table: destination table to insert rows. + :type destination_table: str + :param oracle_source_conn_id: source Oracle connection. 
+ :type oracle_source_conn_id: str + :param source_sql: SQL query to execute against the source Oracle + database. (templated) + :type source_sql: str + :param source_sql_params: Parameters to use in sql query. (templated) + :type source_sql_params: dict + :param rows_chunk: number of rows per chunk to commit. + :type rows_chunk: int + """ + + template_fields = ('source_sql', 'source_sql_params') + ui_color = '#e08c8c' + + @apply_defaults + def __init__( + self, + oracle_destination_conn_id, + destination_table, + oracle_source_conn_id, + source_sql, + source_sql_params=None, + rows_chunk=5000, + *args, **kwargs): + super(OracleToOracleTransfer, self).__init__(*args, **kwargs) + if source_sql_params is None: + source_sql_params = {} + self.oracle_destination_conn_id = oracle_destination_conn_id + self.destination_table = destination_table + self.oracle_source_conn_id = oracle_source_conn_id + self.source_sql = source_sql + self.source_sql_params = source_sql_params + self.rows_chunk = rows_chunk + + def _execute(self, src_hook, dest_hook, context): + with src_hook.get_conn() as src_conn: + cursor = src_conn.cursor() + self.log.info("Querying data from source: %s", self.oracle_source_conn_id) + cursor.execute(self.source_sql, self.source_sql_params) + target_fields = list(map(lambda field: field[0], cursor.description)) + + rows_total = 0 + rows = cursor.fetchmany(self.rows_chunk) + while len(rows) > 0: + rows_total = rows_total + len(rows) + dest_hook.bulk_insert_rows(self.destination_table, rows, + target_fields=target_fields, + commit_every=self.rows_chunk) + rows = cursor.fetchmany(self.rows_chunk) + self.log.info("Total inserted: %s rows", rows_total) + + self.log.info("Finished data transfer.") + cursor.close() + + def execute(self, context): + src_hook = OracleHook(oracle_conn_id=self.oracle_source_conn_id) + dest_hook = OracleHook(oracle_conn_id=self.oracle_destination_conn_id) + self._execute(src_hook, dest_hook, context) diff --git a/airflow/contrib/operators/postgres_to_gcs_operator.py b/airflow/contrib/operators/postgres_to_gcs_operator.py index 88b4d00e39790..78da78ee2f20d 100644 --- a/airflow/contrib/operators/postgres_to_gcs_operator.py +++ b/airflow/contrib/operators/postgres_to_gcs_operator.py @@ -56,18 +56,18 @@ def __init__(self, **kwargs): """ :param sql: The SQL to execute on the Postgres table. - :type sql: string + :type sql: str :param bucket: The bucket to upload to. - :type bucket: string + :type bucket: str :param filename: The filename to use as the object name when uploading to Google Cloud Storage. A {} should be specified in the filename to allow the operator to inject file numbers in cases where the file is split due to size. - :type filename: string + :type filename: str :param schema_filename: If set, the filename to use as the object name when uploading a .json file containing the BigQuery schema fields for the table that was dumped from Postgres. - :type schema_filename: string + :type schema_filename: str :param approx_max_file_size_bytes: This operator supports the ability to split large table dumps into multiple files (see notes in the filenamed param docs above). Google Cloud Storage allows for files @@ -75,10 +75,10 @@ def __init__(self, file size of the splits. :type approx_max_file_size_bytes: long :param postgres_conn_id: Reference to a specific Postgres hook. - :type postgres_conn_id: string + :type postgres_conn_id: str :param google_cloud_storage_conn_id: Reference to a specific Google cloud storage hook. 
- :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. @@ -133,28 +133,38 @@ def _write_local_data_files(self, cursor): contain the data for the GCS objects. """ schema = list(map(lambda schema_tuple: schema_tuple[0], cursor.description)) - file_no = 0 - tmp_file_handle = NamedTemporaryFile(delete=True) - tmp_file_handles = {self.filename.format(file_no): tmp_file_handle} - - for row in cursor: - # Convert datetime objects to utc seconds, and decimals to floats - row = map(self.convert_types, row) - row_dict = dict(zip(schema, row)) - - s = json.dumps(row_dict, sort_keys=True) - if PY3: - s = s.encode('utf-8') - tmp_file_handle.write(s) - - # Append newline to make dumps BigQuery compatible. - tmp_file_handle.write(b'\n') - - # Stop if the file exceeds the file size limit. - if tmp_file_handle.tell() >= self.approx_max_file_size_bytes: - file_no += 1 - tmp_file_handle = NamedTemporaryFile(delete=True) - tmp_file_handles[self.filename.format(file_no)] = tmp_file_handle + tmp_file_handles = {} + row_no = 0 + + def _create_new_file(): + handle = NamedTemporaryFile(delete=True) + filename = self.filename.format(len(tmp_file_handles)) + tmp_file_handles[filename] = handle + return handle + + # Don't create a file if there is nothing to write + if cursor.rowcount > 0: + tmp_file_handle = _create_new_file() + + for row in cursor: + # Convert datetime objects to utc seconds, and decimals to floats + row = map(self.convert_types, row) + row_dict = dict(zip(schema, row)) + + s = json.dumps(row_dict, sort_keys=True) + if PY3: + s = s.encode('utf-8') + tmp_file_handle.write(s) + + # Append newline to make dumps BigQuery compatible. + tmp_file_handle.write(b'\n') + + # Stop if the file exceeds the file size limit. + if tmp_file_handle.tell() >= self.approx_max_file_size_bytes: + tmp_file_handle = _create_new_file() + row_no += 1 + + self.log.info('Received %s rows over %s files', row_no, len(tmp_file_handles)) return tmp_file_handles diff --git a/airflow/contrib/operators/pubsub_operator.py b/airflow/contrib/operators/pubsub_operator.py index 2d55b19f8595d..7c4d2fde5a560 100644 --- a/airflow/contrib/operators/pubsub_operator.py +++ b/airflow/contrib/operators/pubsub_operator.py @@ -67,19 +67,19 @@ def __init__( **kwargs): """ :param project: the GCP project ID where the topic will be created - :type project: string + :type project: str :param topic: the topic to create. Do not include the full topic path. In other words, instead of ``projects/{project}/topics/{topic}``, provide only ``{topic}``. (templated) - :type topic: string + :type topic: str :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubTopicCreateOperator, self).__init__(*args, **kwargs) @@ -163,28 +163,28 @@ def __init__( **kwargs): """ :param topic_project: the GCP project ID where the topic exists - :type topic_project: string + :type topic_project: str :param topic: the topic to create. Do not include the full topic path. In other words, instead of ``projects/{project}/topics/{topic}``, provide only ``{topic}``. 
(templated) - :type topic: string + :type topic: str :param subscription: the Pub/Sub subscription name. If empty, a random name will be generated using the uuid module - :type subscription: string + :type subscription: str :param subscription_project: the GCP project ID where the subscription will be created. If empty, ``topic_project`` will be used. - :type subscription_project: string + :type subscription_project: str :param ack_deadline_secs: Number of seconds that a subscriber has to acknowledge each message pulled from the subscription :type ack_deadline_secs: int :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubSubscriptionCreateOperator, self).__init__(*args, **kwargs) @@ -248,22 +248,22 @@ def __init__( **kwargs): """ :param project: the GCP project ID in which to work (templated) - :type project: string + :type project: str :param topic: the topic to delete. Do not include the full topic path. In other words, instead of ``projects/{project}/topics/{topic}``, provide only ``{topic}``. (templated) - :type topic: string + :type topic: str :param fail_if_not_exists: If True and the topic does not exist, fail the task :type fail_if_not_exists: bool :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubTopicDeleteOperator, self).__init__(*args, **kwargs) @@ -324,22 +324,22 @@ def __init__( **kwargs): """ :param project: the GCP project ID in which to work (templated) - :type project: string + :type project: str :param subscription: the subscription to delete. Do not include the full subscription path. In other words, instead of ``projects/{project}/subscription/{subscription}``, provide only ``{subscription}``. (templated) - :type subscription: string + :type subscription: str :param fail_if_not_exists: If True and the subscription does not exist, fail the task :type fail_if_not_exists: bool :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubSubscriptionDeleteOperator, self).__init__(*args, **kwargs) @@ -378,7 +378,7 @@ class PubSubPublishOperator(BaseOperator): create_topic=True, dag=dag) - ``project`` , ``topic``, and ``messages`` are templated so you can use + ``project`` , ``topic``, and ``messages`` are templated so you can use variables in them. """ template_fields = ['project', 'topic', 'messages'] @@ -396,12 +396,12 @@ def __init__( **kwargs): """ :param project: the GCP project ID in which to work (templated) - :type project: string + :type project: str :param topic: the topic to which to publish. Do not include the full topic path. In other words, instead of ``projects/{project}/topics/{topic}``, provide only ``{topic}``. 
(templated) - :type topic: string + :type topic: str :param messages: a list of messages to be published to the topic. Each message is a dict with one or more of the following keys-value mappings: @@ -413,11 +413,11 @@ def __init__( :type messages: list :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubPublishOperator, self).__init__(*args, **kwargs) diff --git a/airflow/contrib/operators/qubole_check_operator.py b/airflow/contrib/operators/qubole_check_operator.py index 0e8d75e16729e..73e3a7b15f661 100644 --- a/airflow/contrib/operators/qubole_check_operator.py +++ b/airflow/contrib/operators/qubole_check_operator.py @@ -28,7 +28,7 @@ class QuboleCheckOperator(CheckOperator, QuboleOperator): """ Performs checks against Qubole Commands. ``QuboleCheckOperator`` expects a command that will be executed on QDS. - By default, each value on first row of the result of this Qubole Commmand + By default, each value on first row of the result of this Qubole Command is evaluated using python ``bool`` casting. If any of the values return ``False``, the check is failed and errors out. @@ -69,7 +69,8 @@ class QuboleCheckOperator(CheckOperator, QuboleOperator): which the checks have to be performed. .. note:: All fields in common with template fields of - QuboleOperator and CheckOperator are template-supported. + QuboleOperator and CheckOperator are template-supported. + """ template_fields = QuboleOperator.template_fields + CheckOperator.template_fields @@ -128,13 +129,13 @@ class QuboleValueCheckOperator(ValueCheckOperator, QuboleOperator): :type qubole_conn_id: str :param pass_value: Expected value of the query results. - :type pass_value: str/int/float + :type pass_value: str or int or float :param tolerance: Defines the permissible pass_value range, for example if tolerance is 2, the Qubole command output can be anything between -2*pass_value and 2*pass_value, without the operator erring out. - :type tolerance: int/float + :type tolerance: int or float kwargs: @@ -215,11 +216,11 @@ def get_sql_from_qbol_cmd(params): def handle_airflow_exception(airflow_exception, hook): cmd = hook.cmd if cmd is not None: - if cmd.is_success: + if cmd.is_success(cmd.status): qubole_command_results = hook.get_query_results() qubole_command_id = cmd.id exception_message = '\nQubole Command Id: {qubole_command_id}' \ '\nQubole Command Results:' \ '\n{qubole_command_results}'.format(**locals()) raise AirflowException(str(airflow_exception) + exception_message) - raise AirflowException(airflow_exception.message) + raise AirflowException(str(airflow_exception)) diff --git a/airflow/contrib/operators/qubole_operator.py b/airflow/contrib/operators/qubole_operator.py index 82ee293b934bc..7da3be7d05792 100755 --- a/airflow/contrib/operators/qubole_operator.py +++ b/airflow/contrib/operators/qubole_operator.py @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License. +from typing import Iterable + from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults from airflow.contrib.hooks.qubole_hook import QuboleHook @@ -57,7 +59,7 @@ class QuboleOperator(BaseOperator): copied into the working directory where the qubole command is being executed. 
:archives: list of archives in s3 bucket as archive1,archive2 format. These - will be unarchived intothe working directory where the qubole command is + will be unarchived into the working directory where the qubole command is being executed :parameters: any extra args which need to be passed to script (only when script_location is supplied) @@ -103,15 +105,19 @@ class QuboleOperator(BaseOperator): :boundary_query: Query to be used get range of row IDs to be extracted :split_column: Column used as row ID to split data into ranges (mode 2) - .. note:: Following fields are template-supported : ``query``, ``script_location``, + .. note: + + Following fields are template-supported : ``query``, ``script_location``, ``sub_command``, ``script``, ``files``, ``archives``, ``program``, ``cmdline``, ``sql``, ``where_clause``, ``extract_query``, ``boundary_query``, ``macros``, ``tags``, ``name``, ``parameters``, ``dbtap_id``, ``hive_table``, ``db_table``, ``split_column``, ``note_id``, ``db_update_keys``, ``export_dir``, ``partition_spec``, ``qubole_conn_id``, ``arguments``, ``user_program_arguments``. - You can also use ``.txt`` files for template driven use cases. + You can also use ``.txt`` files for template driven use cases. + + .. note: - .. note:: In QuboleOperator there is a default handler for task failures and retries, + In QuboleOperator there is a default handler for task failures and retries, which generally kills the command running at QDS for the corresponding task instance. You can override this behavior by providing your own failure and retry handler in task definition. @@ -122,9 +128,9 @@ class QuboleOperator(BaseOperator): 'extract_query', 'boundary_query', 'macros', 'name', 'parameters', 'dbtap_id', 'hive_table', 'db_table', 'split_column', 'note_id', 'db_update_keys', 'export_dir', 'partition_spec', 'qubole_conn_id', - 'arguments', 'user_program_arguments', 'cluster_label') + 'arguments', 'user_program_arguments', 'cluster_label') # type: Iterable[str] - template_ext = ('.txt',) + template_ext = ('.txt',) # type: Iterable[str] ui_color = '#3064A1' ui_fgcolor = '#fff' diff --git a/airflow/contrib/operators/s3_copy_object_operator.py b/airflow/contrib/operators/s3_copy_object_operator.py new file mode 100644 index 0000000000000..43651707f8925 --- /dev/null +++ b/airflow/contrib/operators/s3_copy_object_operator.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.hooks.S3_hook import S3Hook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class S3CopyObjectOperator(BaseOperator): + """ + Creates a copy of an object that is already stored in S3. 
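A usage sketch for the new copy operator, with placeholder bucket names and keys (its parameters are documented in the docstring that continues below):

    copy_backup = S3CopyObjectOperator(
        task_id='copy_backup',
        source_bucket_name='data-lake-raw',                  # placeholder
        source_bucket_key='exports/2019-01-01/orders.csv',   # placeholder
        dest_bucket_name='data-lake-archive',                # placeholder
        dest_bucket_key='orders/2019-01-01.csv',             # placeholder
        aws_conn_id='aws_default',
    )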
+ + Note: the S3 connection used here needs to have access to both + source and destination bucket/key. + + :param source_bucket_key: The key of the source object. (templated) + + It can be either full s3:// style url or relative path from root level. + + When it's specified as a full s3:// url, please omit source_bucket_name. + :type source_bucket_key: str + :param dest_bucket_key: The key of the object to copy to. (templated) + + The convention to specify `dest_bucket_key` is the same as `source_bucket_key`. + :type dest_bucket_key: str + :param source_bucket_name: Name of the S3 bucket where the source object is in. (templated) + + It should be omitted when `source_bucket_key` is provided as a full s3:// url. + :type source_bucket_name: str + :param dest_bucket_name: Name of the S3 bucket to where the object is copied. (templated) + + It should be omitted when `dest_bucket_key` is provided as a full s3:// url. + :type dest_bucket_name: str + :param source_version_id: Version ID of the source object (OPTIONAL) + :type source_version_id: str + :param aws_conn_id: Connection id of the S3 connection to use + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + + You can provide the following values: + + - False: do not validate SSL certificates. SSL will still be used, + but SSL certificates will not be + verified. + - path/to/cert/bundle.pem: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str + """ + + template_fields = ('source_bucket_key', 'dest_bucket_key', + 'source_bucket_name', 'dest_bucket_name') + + @apply_defaults + def __init__( + self, + source_bucket_key, + dest_bucket_key, + source_bucket_name=None, + dest_bucket_name=None, + source_version_id=None, + aws_conn_id='aws_default', + verify=None, + *args, **kwargs): + super(S3CopyObjectOperator, self).__init__(*args, **kwargs) + + self.source_bucket_key = source_bucket_key + self.dest_bucket_key = dest_bucket_key + self.source_bucket_name = source_bucket_name + self.dest_bucket_name = dest_bucket_name + self.source_version_id = source_version_id + self.aws_conn_id = aws_conn_id + self.verify = verify + + def execute(self, context): + s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) + s3_hook.copy_object(self.source_bucket_key, self.dest_bucket_key, + self.source_bucket_name, self.dest_bucket_name, + self.source_version_id) diff --git a/airflow/contrib/operators/s3_delete_objects_operator.py b/airflow/contrib/operators/s3_delete_objects_operator.py new file mode 100644 index 0000000000000..926667b497195 --- /dev/null +++ b/airflow/contrib/operators/s3_delete_objects_operator.py @@ -0,0 +1,87 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.exceptions import AirflowException +from airflow.hooks.S3_hook import S3Hook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class S3DeleteObjectsOperator(BaseOperator): + """ + To enable users to delete single object or multiple objects from + a bucket using a single HTTP request. + + Users may specify up to 1000 keys to delete. + + :param bucket: Name of the bucket in which you are going to delete object(s). (templated) + :type bucket: str + :param keys: The key(s) to delete from S3 bucket. (templated) + + When ``keys`` is a string, it's supposed to be the key name of + the single object to delete. + + When ``keys`` is a list, it's supposed to be the list of the + keys to delete. + + You may specify up to 1000 keys. + :type keys: str or list + :param aws_conn_id: Connection id of the S3 connection to use + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used, + but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str + """ + + template_fields = ('keys', 'bucket') + + @apply_defaults + def __init__( + self, + bucket, + keys, + aws_conn_id='aws_default', + verify=None, + *args, **kwargs): + super(S3DeleteObjectsOperator, self).__init__(*args, **kwargs) + self.bucket = bucket + self.keys = keys + self.aws_conn_id = aws_conn_id + self.verify = verify + + def execute(self, context): + s3_hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) + + response = s3_hook.delete_objects(bucket=self.bucket, keys=self.keys) + + deleted_keys = [x['Key'] for x in response.get("Deleted", [])] + self.log.info("Deleted: %s", deleted_keys) + + if "Errors" in response: + errors_keys = [x['Key'] for x in response.get("Errors", [])] + raise AirflowException("Errors when deleting: {}".format(errors_keys)) diff --git a/airflow/contrib/operators/s3_list_operator.py b/airflow/contrib/operators/s3_list_operator.py index b85691b005fb9..9b13f0744c3de 100644 --- a/airflow/contrib/operators/s3_list_operator.py +++ b/airflow/contrib/operators/s3_list_operator.py @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License. +from typing import Iterable + from airflow.hooks.S3_hook import S3Hook from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults @@ -30,14 +32,26 @@ class S3ListOperator(BaseOperator): used by `xcom` in the downstream task. :param bucket: The S3 bucket where to find the objects. (templated) - :type bucket: string + :type bucket: str :param prefix: Prefix string to filters the objects whose name begin with such prefix. (templated) - :type prefix: string + :type prefix: str :param delimiter: the delimiter marks key hierarchy. 
(templated) - :type delimiter: string + :type delimiter: str :param aws_conn_id: The connection ID to use when connecting to S3 storage. - :type aws_conn_id: string + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str + **Example**: The following operator would list all the files @@ -52,7 +66,7 @@ class S3ListOperator(BaseOperator): aws_conn_id='aws_customers_conn' ) """ - template_fields = ('bucket', 'prefix', 'delimiter') + template_fields = ('bucket', 'prefix', 'delimiter') # type: Iterable[str] ui_color = '#ffd700' @apply_defaults @@ -61,6 +75,7 @@ def __init__(self, prefix='', delimiter='', aws_conn_id='aws_default', + verify=None, *args, **kwargs): super(S3ListOperator, self).__init__(*args, **kwargs) @@ -68,13 +83,15 @@ def __init__(self, self.prefix = prefix self.delimiter = delimiter self.aws_conn_id = aws_conn_id + self.verify = verify def execute(self, context): - hook = S3Hook(aws_conn_id=self.aws_conn_id) + hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) self.log.info( - 'Getting the list of files from bucket: {0} in prefix: {1} (Delimiter {2})'. - format(self.bucket, self.prefix, self.delimiter)) + 'Getting the list of files from bucket: %s in prefix: %s (Delimiter {%s)', + self.bucket, self.prefix, self.delimiter + ) return hook.list_keys( bucket_name=self.bucket, diff --git a/airflow/contrib/operators/s3_to_gcs_operator.py b/airflow/contrib/operators/s3_to_gcs_operator.py index 2898af1071773..ad456a5c9bae0 100644 --- a/airflow/contrib/operators/s3_to_gcs_operator.py +++ b/airflow/contrib/operators/s3_to_gcs_operator.py @@ -33,31 +33,44 @@ class S3ToGoogleCloudStorageOperator(S3ListOperator): destination path. :param bucket: The S3 bucket where to find the objects. (templated) - :type bucket: string + :type bucket: str :param prefix: Prefix string which filters objects whose name begin with such prefix. (templated) - :type prefix: string + :type prefix: str :param delimiter: the delimiter marks key hierarchy. (templated) - :type delimiter: string + :type delimiter: str :param aws_conn_id: The source S3 connection - :type aws_conn_id: string + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str :param dest_gcs_conn_id: The destination connection ID to use when connecting to Google Cloud Storage. - :type dest_gcs_conn_id: string + :type dest_gcs_conn_id: str :param dest_gcs: The destination Google Cloud Storage bucket and prefix where you want to store the files. (templated) - :type dest_gcs: string + :type dest_gcs: str :param delegate_to: The account to impersonate, if any. 
For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str :param replace: Whether you want to replace existing destination files or not. :type replace: bool **Example**: + .. code-block:: python + s3_to_gcs_op = S3ToGoogleCloudStorageOperator( task_id='s3_to_gcs_example', bucket='my-s3-bucket', @@ -80,6 +93,7 @@ def __init__(self, prefix='', delimiter='', aws_conn_id='aws_default', + verify=None, dest_gcs_conn_id=None, dest_gcs=None, delegate_to=None, @@ -98,6 +112,7 @@ def __init__(self, self.dest_gcs = dest_gcs self.delegate_to = delegate_to self.replace = replace + self.verify = verify if dest_gcs and not self._gcs_object_is_directory(self.dest_gcs): self.log.info( @@ -137,16 +152,17 @@ def execute(self, context): else: existing_files.append(f) - files = set(files) - set(existing_files) + files = list(set(files) - set(existing_files)) if len(files) > 0: - self.log.info('{0} files are going to be synced: {1}.'.format( - len(files), files)) + self.log.info( + '%s files are going to be synced: %s.', len(files), files + ) else: self.log.info( 'There are no new files to sync. Have a nice day!') if files: - hook = S3Hook(aws_conn_id=self.aws_conn_id) + hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) for file in files: # GCS hook builds its own in-memory file so we have to create @@ -184,7 +200,8 @@ def execute(self, context): # Following functionality may be better suited in # airflow/contrib/hooks/gcs_hook.py - def _gcs_object_is_directory(self, object): + @staticmethod + def _gcs_object_is_directory(object): bucket, blob = _parse_gcs_url(object) return len(blob) == 0 or blob.endswith('/') diff --git a/airflow/contrib/operators/s3_to_gcs_transfer_operator.py b/airflow/contrib/operators/s3_to_gcs_transfer_operator.py new file mode 100644 index 0000000000000..aaae9bff2111e --- /dev/null +++ b/airflow/contrib/operators/s3_to_gcs_transfer_operator.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import warnings + +from airflow.contrib.operators.gcp_transfer_operator import S3ToGoogleCloudStorageTransferOperator # noqa + +warnings.warn( + "This module is deprecated. Please use `airflow.contrib.operators.gcp_transfer_operator`", + DeprecationWarning, +) diff --git a/airflow/contrib/operators/s3_to_sftp_operator.py b/airflow/contrib/operators/s3_to_sftp_operator.py new file mode 100644 index 0000000000000..43ef269032db8 --- /dev/null +++ b/airflow/contrib/operators/s3_to_sftp_operator.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.models import BaseOperator +from airflow.hooks.S3_hook import S3Hook +from airflow.contrib.hooks.ssh_hook import SSHHook +from tempfile import NamedTemporaryFile +from urllib.parse import urlparse +from airflow.utils.decorators import apply_defaults + + +class S3ToSFTPOperator(BaseOperator): + """ + This operator enables the transferring of files from S3 to a SFTP server. + + :param sftp_conn_id: The sftp connection id. The name or identifier for + establishing a connection to the SFTP server. + :type sftp_conn_id: string + :param sftp_path: The sftp remote path. This is the specified file path for + uploading file to the SFTP server. + :type sftp_path: string + :param s3_conn_id: The s3 connection id. The name or identifier for + establishing a connection to S3 + :type s3_conn_id: string + :param s3_bucket: The targeted s3 bucket. This is the S3 bucket from + where the file is downloaded. + :type s3_bucket: string + :param s3_key: The targeted s3 key. This is the specified file path for + downloading the file from S3. + :type s3_key: string + """ + + template_fields = ('s3_key', 'sftp_path') + + @apply_defaults + def __init__(self, + s3_bucket, + s3_key, + sftp_path, + sftp_conn_id='ssh_default', + s3_conn_id='aws_default', + *args, + **kwargs): + super(S3ToSFTPOperator, self).__init__(*args, **kwargs) + self.sftp_conn_id = sftp_conn_id + self.sftp_path = sftp_path + self.s3_bucket = s3_bucket + self.s3_key = s3_key + self.s3_conn_id = s3_conn_id + + @staticmethod + def get_s3_key(s3_key): + """This parses the correct format for S3 keys + regardless of how the S3 url is passed.""" + + parsed_s3_key = urlparse(s3_key) + return parsed_s3_key.path.lstrip('/') + + def execute(self, context): + self.s3_key = self.get_s3_key(self.s3_key) + ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id) + s3_hook = S3Hook(self.s3_conn_id) + + s3_client = s3_hook.get_conn() + sftp_client = ssh_hook.get_conn().open_sftp() + + with NamedTemporaryFile("w") as f: + s3_client.download_file(self.s3_bucket, self.s3_key, f.name) + sftp_client.put(f.name, self.sftp_path) diff --git a/airflow/contrib/operators/sagemaker_base_operator.py b/airflow/contrib/operators/sagemaker_base_operator.py new file mode 100644 index 0000000000000..d6717fd6b41fc --- /dev/null +++ b/airflow/contrib/operators/sagemaker_base_operator.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json + +from typing import Iterable + +from airflow.contrib.hooks.sagemaker_hook import SageMakerHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class SageMakerBaseOperator(BaseOperator): + """ + This is the base operator for all SageMaker operators. + + :param config: The configuration necessary to start a training job (templated) + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: str + """ + + template_fields = ['config'] + template_ext = () + ui_color = '#ededed' + + integer_fields = [] # type: Iterable[Iterable[str]] + + @apply_defaults + def __init__(self, + config, + aws_conn_id='aws_default', + *args, **kwargs): + super(SageMakerBaseOperator, self).__init__(*args, **kwargs) + + self.aws_conn_id = aws_conn_id + self.config = config + self.hook = None + + def parse_integer(self, config, field): + if len(field) == 1: + if isinstance(config, list): + for sub_config in config: + self.parse_integer(sub_config, field) + return + head = field[0] + if head in config: + config[head] = int(config[head]) + return + + if isinstance(config, list): + for sub_config in config: + self.parse_integer(sub_config, field) + return + + head, tail = field[0], field[1:] + if head in config: + self.parse_integer(config[head], tail) + return + + def parse_config_integers(self): + # Parse the integer fields of training config to integers + # in case the config is rendered by Jinja and all fields are str + for field in self.integer_fields: + self.parse_integer(self.config, field) + + def expand_role(self): + pass + + def preprocess_config(self): + self.log.info( + 'Preprocessing the config and doing required s3_operations' + ) + self.hook = SageMakerHook(aws_conn_id=self.aws_conn_id) + + self.hook.configure_s3_resources(self.config) + self.parse_config_integers() + self.expand_role() + + self.log.info( + 'After preprocessing the config is:\n {}'.format( + json.dumps(self.config, sort_keys=True, indent=4, separators=(',', ': '))) + ) + + def execute(self, context): + raise NotImplementedError('Please implement execute() in sub class!') diff --git a/airflow/contrib/operators/sagemaker_endpoint_config_operator.py b/airflow/contrib/operators/sagemaker_endpoint_config_operator.py new file mode 100644 index 0000000000000..e4f8ce7c9103f --- /dev/null +++ b/airflow/contrib/operators/sagemaker_endpoint_config_operator.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
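To make the ``integer_fields`` mechanism concrete, a small illustration (the config values are invented): each entry is a key path into the config dict, and ``parse_config_integers()`` walks that path, descending into lists, and casts the leaf back to ``int`` after Jinja has rendered everything to strings:

.. code-block:: python

    config = {'ResourceConfig': {'InstanceCount': '2', 'VolumeSizeInGB': '30'}}
    integer_fields = [['ResourceConfig', 'InstanceCount'],
                      ['ResourceConfig', 'VolumeSizeInGB']]
    # after parse_config_integers() the config is:
    # {'ResourceConfig': {'InstanceCount': 2, 'VolumeSizeInGB': 30}}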
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerEndpointConfigOperator(SageMakerBaseOperator): + + """ + Create a SageMaker endpoint config. + + This operator returns The ARN of the endpoint config created in Amazon SageMaker + + :param config: The configuration necessary to create an endpoint config. + + For details of the configuration parameter see :py:meth:`SageMaker.Client.create_endpoint_config` + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: str + """ + + integer_fields = [ + ['ProductionVariants', 'InitialInstanceCount'] + ] + + @apply_defaults + def __init__(self, + config, + *args, **kwargs): + super(SageMakerEndpointConfigOperator, self).__init__(config=config, + *args, **kwargs) + + self.config = config + + def execute(self, context): + self.preprocess_config() + + self.log.info('Creating SageMaker Endpoint Config %s.', self.config['EndpointConfigName']) + response = self.hook.create_endpoint_config(self.config) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException( + 'Sagemaker endpoint config creation failed: %s' % response) + else: + return { + 'EndpointConfig': self.hook.describe_endpoint_config( + self.config['EndpointConfigName'] + ) + } diff --git a/airflow/contrib/operators/sagemaker_endpoint_operator.py b/airflow/contrib/operators/sagemaker_endpoint_operator.py new file mode 100644 index 0000000000000..45af3381cb9cd --- /dev/null +++ b/airflow/contrib/operators/sagemaker_endpoint_operator.py @@ -0,0 +1,150 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerEndpointOperator(SageMakerBaseOperator): + + """ + Create a SageMaker endpoint. + + This operator returns The ARN of the endpoint created in Amazon SageMaker + + :param config: + The configuration necessary to create an endpoint. 
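A hedged sketch of ``SageMakerEndpointConfigOperator``; the config keys follow the ``create_endpoint_config`` API, and all names, instance types and counts are placeholders:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_endpoint_config_operator import (
        SageMakerEndpointConfigOperator
    )

    endpoint_config_op = SageMakerEndpointConfigOperator(
        task_id='create_endpoint_config',
        config={
            'EndpointConfigName': 'my-endpoint-config',
            'ProductionVariants': [{
                'VariantName': 'AllTraffic',
                'ModelName': 'my-model',
                'InitialInstanceCount': '1',   # cast back to int via integer_fields
                'InstanceType': 'ml.m4.xlarge',
            }],
        },
        aws_conn_id='aws_default',
        dag=dag)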
+ + If you need to create a SageMaker endpoint based on an existed + SageMaker model and an existed SageMaker endpoint config:: + + config = endpoint_configuration; + + If you need to create all of SageMaker model, SageMaker endpoint-config and SageMaker endpoint:: + + config = { + 'Model': model_configuration, + 'EndpointConfig': endpoint_config_configuration, + 'Endpoint': endpoint_configuration + } + + For details of the configuration parameter of model_configuration see + :py:meth:`SageMaker.Client.create_model` + + For details of the configuration parameter of endpoint_config_configuration see + :py:meth:`SageMaker.Client.create_endpoint_config` + + For details of the configuration parameter of endpoint_configuration see + :py:meth:`SageMaker.Client.create_endpoint` + + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: str + :param wait_for_completion: Whether the operator should wait until the endpoint creation finishes. + :type wait_for_completion: bool + :param check_interval: If wait is set to True, this is the time interval, in seconds, that this operation + waits before polling the status of the endpoint creation. + :type check_interval: int + :param max_ingestion_time: If wait is set to True, this operation fails if the endpoint creation doesn't + finish within max_ingestion_time seconds. If you set this parameter to None it never times out. + :type max_ingestion_time: int + :param operation: Whether to create an endpoint or update an endpoint. Must be either 'create or 'update'. + :type operation: str + """ + + @apply_defaults + def __init__(self, + config, + wait_for_completion=True, + check_interval=30, + max_ingestion_time=None, + operation='create', + *args, **kwargs): + super(SageMakerEndpointOperator, self).__init__(config=config, + *args, **kwargs) + + self.config = config + self.wait_for_completion = wait_for_completion + self.check_interval = check_interval + self.max_ingestion_time = max_ingestion_time + self.operation = operation.lower() + if self.operation not in ['create', 'update']: + raise ValueError('Invalid value! Argument operation has to be one of "create" and "update"') + self.create_integer_fields() + + def create_integer_fields(self): + if 'EndpointConfig' in self.config: + self.integer_fields = [ + ['EndpointConfig', 'ProductionVariants', 'InitialInstanceCount'] + ] + + def expand_role(self): + if 'Model' not in self.config: + return + hook = AwsHook(self.aws_conn_id) + config = self.config['Model'] + if 'ExecutionRoleArn' in config: + config['ExecutionRoleArn'] = hook.expand_role(config['ExecutionRoleArn']) + + def execute(self, context): + self.preprocess_config() + + model_info = self.config.get('Model') + endpoint_config_info = self.config.get('EndpointConfig') + endpoint_info = self.config.get('Endpoint', self.config) + + if model_info: + self.log.info('Creating SageMaker model %s.', model_info['ModelName']) + self.hook.create_model(model_info) + + if endpoint_config_info: + self.log.info('Creating endpoint config %s.', endpoint_config_info['EndpointConfigName']) + self.hook.create_endpoint_config(endpoint_config_info) + + if self.operation == 'create': + sagemaker_operation = self.hook.create_endpoint + log_str = 'Creating' + elif self.operation == 'update': + sagemaker_operation = self.hook.update_endpoint + log_str = 'Updating' + else: + raise ValueError('Invalid value! 
Argument operation has to be one of "create" and "update"') + + self.log.info('%s SageMaker endpoint %s.', log_str, endpoint_info['EndpointName']) + + response = sagemaker_operation( + endpoint_info, + wait_for_completion=self.wait_for_completion, + check_interval=self.check_interval, + max_ingestion_time=self.max_ingestion_time + ) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException( + 'Sagemaker endpoint creation failed: %s' % response) + else: + return { + 'EndpointConfig': self.hook.describe_endpoint_config( + endpoint_info['EndpointConfigName'] + ), + 'Endpoint': self.hook.describe_endpoint( + endpoint_info['EndpointName'] + ) + } diff --git a/airflow/contrib/operators/sagemaker_model_operator.py b/airflow/contrib/operators/sagemaker_model_operator.py new file mode 100644 index 0000000000000..5bfc1feb3149b --- /dev/null +++ b/airflow/contrib/operators/sagemaker_model_operator.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerModelOperator(SageMakerBaseOperator): + + """ + Create a SageMaker model. + + This operator returns The ARN of the model created in Amazon SageMaker + + :param config: The configuration necessary to create a model. + + For details of the configuration parameter see :py:meth:`SageMaker.Client.create_model` + :type config: dict + :param aws_conn_id: The AWS connection ID to use. 
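A sketch of the combined form described in the docstring above, with ``model_config``, ``endpoint_config_config`` and ``endpoint_config`` standing in for dicts that follow the corresponding ``create_model`` / ``create_endpoint_config`` / ``create_endpoint`` APIs:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_endpoint_operator import SageMakerEndpointOperator

    endpoint_op = SageMakerEndpointOperator(
        task_id='deploy_endpoint',
        config={
            'Model': model_config,
            'EndpointConfig': endpoint_config_config,
            'Endpoint': endpoint_config,
        },
        operation='create',        # or 'update' for an existing endpoint
        wait_for_completion=True,
        check_interval=30,
        dag=dag)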
+ :type aws_conn_id: str + """ + + @apply_defaults + def __init__(self, + config, + *args, **kwargs): + super(SageMakerModelOperator, self).__init__(config=config, + *args, **kwargs) + + self.config = config + + def expand_role(self): + if 'ExecutionRoleArn' in self.config: + hook = AwsHook(self.aws_conn_id) + self.config['ExecutionRoleArn'] = hook.expand_role(self.config['ExecutionRoleArn']) + + def execute(self, context): + self.preprocess_config() + + self.log.info('Creating SageMaker Model %s.', self.config['ModelName']) + response = self.hook.create_model(self.config) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException('Sagemaker model creation failed: %s' % response) + else: + return { + 'Model': self.hook.describe_model( + self.config['ModelName'] + ) + } diff --git a/airflow/contrib/operators/sagemaker_training_operator.py b/airflow/contrib/operators/sagemaker_training_operator.py new file mode 100644 index 0000000000000..f10e13e9736c5 --- /dev/null +++ b/airflow/contrib/operators/sagemaker_training_operator.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerTrainingOperator(SageMakerBaseOperator): + """ + Initiate a SageMaker training job. + + This operator returns The ARN of the training job created in Amazon SageMaker. + + :param config: The configuration necessary to start a training job (templated). + + For details of the configuration parameter see :py:meth:`SageMaker.Client.create_training_job` + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: str + :param wait_for_completion: If wait is set to True, the time interval, in seconds, + that the operation waits to check the status of the training job. + :type wait_for_completion: bool + :param print_log: if the operator should print the cloudwatch log during training + :type print_log: bool + :param check_interval: if wait is set to be true, this is the time interval + in seconds which the operator will check the status of the training job + :type check_interval: int + :param max_ingestion_time: If wait is set to True, the operation fails if the training job + doesn't finish within max_ingestion_time seconds. If you set this parameter to None, + the operation does not timeout. 
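A hedged sketch of ``SageMakerModelOperator``; image URI, S3 path and role name are placeholders, and a plain role name is expanded to a full ARN through ``AwsHook.expand_role()``:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_model_operator import SageMakerModelOperator

    model_op = SageMakerModelOperator(
        task_id='create_model',
        config={
            'ModelName': 'my-model',
            'PrimaryContainer': {
                'Image': '123456789012.dkr.ecr.us-east-1.amazonaws.com/my-image:latest',
                'ModelDataUrl': 's3://my-bucket/output/model.tar.gz',
            },
            'ExecutionRoleArn': 'my-sagemaker-role',
        },
        aws_conn_id='aws_default',
        dag=dag)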
+ :type max_ingestion_time: int + """ + + integer_fields = [ + ['ResourceConfig', 'InstanceCount'], + ['ResourceConfig', 'VolumeSizeInGB'], + ['StoppingCondition', 'MaxRuntimeInSeconds'] + ] + + @apply_defaults + def __init__(self, + config, + wait_for_completion=True, + print_log=True, + check_interval=30, + max_ingestion_time=None, + *args, **kwargs): + super(SageMakerTrainingOperator, self).__init__(config=config, + *args, **kwargs) + + self.wait_for_completion = wait_for_completion + self.print_log = print_log + self.check_interval = check_interval + self.max_ingestion_time = max_ingestion_time + + def expand_role(self): + if 'RoleArn' in self.config: + hook = AwsHook(self.aws_conn_id) + self.config['RoleArn'] = hook.expand_role(self.config['RoleArn']) + + def execute(self, context): + self.preprocess_config() + + self.log.info('Creating SageMaker Training Job %s.', self.config['TrainingJobName']) + + response = self.hook.create_training_job( + self.config, + wait_for_completion=self.wait_for_completion, + print_log=self.print_log, + check_interval=self.check_interval, + max_ingestion_time=self.max_ingestion_time + ) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException('Sagemaker Training Job creation failed: %s' % response) + else: + return { + 'Training': self.hook.describe_training_job( + self.config['TrainingJobName'] + ) + } diff --git a/airflow/contrib/operators/sagemaker_transform_operator.py b/airflow/contrib/operators/sagemaker_transform_operator.py new file mode 100644 index 0000000000000..9dcf9ab32764e --- /dev/null +++ b/airflow/contrib/operators/sagemaker_transform_operator.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerTransformOperator(SageMakerBaseOperator): + """ + Initiate a SageMaker transform job. + + This operator returns The ARN of the model created in Amazon SageMaker. + + :param config: The configuration necessary to start a transform job (templated). 
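A minimal sketch of ``SageMakerTrainingOperator``; ``training_config`` stands in for a dict following ``create_training_job``, and templated string values such as ``ResourceConfig.InstanceCount`` are cast back to ``int`` through ``integer_fields``:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator

    training_op = SageMakerTrainingOperator(
        task_id='train_model',
        config=training_config,
        wait_for_completion=True,
        print_log=True,
        check_interval=30,
        max_ingestion_time=None,   # never time out
        dag=dag)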
+ + If you need to create a SageMaker transform job based on an existed SageMaker model:: + + config = transform_config + + If you need to create both SageMaker model and SageMaker Transform job:: + + config = { + 'Model': model_config, + 'Transform': transform_config + } + + For details of the configuration parameter of transform_config see + :py:meth:`SageMaker.Client.create_transform_job` + + For details of the configuration parameter of model_config, See: + :py:meth:`SageMaker.Client.create_model` + + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: string + :param wait_for_completion: Set to True to wait until the transform job finishes. + :type wait_for_completion: bool + :param check_interval: If wait is set to True, the time interval, in seconds, + that this operation waits to check the status of the transform job. + :type check_interval: int + :param max_ingestion_time: If wait is set to True, the operation fails + if the transform job doesn't finish within max_ingestion_time seconds. If you + set this parameter to None, the operation does not timeout. + :type max_ingestion_time: int + """ + + @apply_defaults + def __init__(self, + config, + wait_for_completion=True, + check_interval=30, + max_ingestion_time=None, + *args, **kwargs): + super(SageMakerTransformOperator, self).__init__(config=config, + *args, **kwargs) + self.config = config + self.wait_for_completion = wait_for_completion + self.check_interval = check_interval + self.max_ingestion_time = max_ingestion_time + self.create_integer_fields() + + def create_integer_fields(self): + self.integer_fields = [ + ['Transform', 'TransformResources', 'InstanceCount'], + ['Transform', 'MaxConcurrentTransforms'], + ['Transform', 'MaxPayloadInMB'] + ] + if 'Transform' not in self.config: + for field in self.integer_fields: + field.pop(0) + + def expand_role(self): + if 'Model' not in self.config: + return + config = self.config['Model'] + if 'ExecutionRoleArn' in config: + hook = AwsHook(self.aws_conn_id) + config['ExecutionRoleArn'] = hook.expand_role(config['ExecutionRoleArn']) + + def execute(self, context): + self.preprocess_config() + + model_config = self.config.get('Model') + transform_config = self.config.get('Transform', self.config) + + if model_config: + self.log.info('Creating SageMaker Model %s for transform job', model_config['ModelName']) + self.hook.create_model(model_config) + + self.log.info('Creating SageMaker transform Job %s.', transform_config['TransformJobName']) + response = self.hook.create_transform_job( + transform_config, + wait_for_completion=self.wait_for_completion, + check_interval=self.check_interval, + max_ingestion_time=self.max_ingestion_time) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException('Sagemaker transform Job creation failed: %s' % response) + else: + return { + 'Model': self.hook.describe_model( + transform_config['ModelName'] + ), + 'Transform': self.hook.describe_transform_job( + transform_config['TransformJobName'] + ) + } diff --git a/airflow/contrib/operators/sagemaker_tuning_operator.py b/airflow/contrib/operators/sagemaker_tuning_operator.py new file mode 100644 index 0000000000000..2a92bf1b81cf7 --- /dev/null +++ b/airflow/contrib/operators/sagemaker_tuning_operator.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
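A sketch of the combined model-plus-transform form; ``model_config`` and ``transform_config`` are placeholder dicts following ``create_model`` and ``create_transform_job``:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator

    transform_op = SageMakerTransformOperator(
        task_id='batch_transform',
        config={
            'Model': model_config,
            'Transform': transform_config,
        },
        wait_for_completion=True,
        check_interval=30,
        dag=dag)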
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_hook import AwsHook +from airflow.contrib.operators.sagemaker_base_operator import SageMakerBaseOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerTuningOperator(SageMakerBaseOperator): + """ + Initiate a SageMaker hyperparameter tuning job. + + This operator returns The ARN of the tuning job created in Amazon SageMaker. + + :param config: The configuration necessary to start a tuning job (templated). + + For details of the configuration parameter see + :py:meth:`SageMaker.Client.create_hyper_parameter_tuning_job` + :type config: dict + :param aws_conn_id: The AWS connection ID to use. + :type aws_conn_id: str + :param wait_for_completion: Set to True to wait until the tuning job finishes. + :type wait_for_completion: bool + :param check_interval: If wait is set to True, the time interval, in seconds, + that this operation waits to check the status of the tuning job. + :type check_interval: int + :param max_ingestion_time: If wait is set to True, the operation fails + if the tuning job doesn't finish within max_ingestion_time seconds. If you + set this parameter to None, the operation does not timeout. 
+ :type max_ingestion_time: int + """ + + integer_fields = [ + ['HyperParameterTuningJobConfig', 'ResourceLimits', 'MaxNumberOfTrainingJobs'], + ['HyperParameterTuningJobConfig', 'ResourceLimits', 'MaxParallelTrainingJobs'], + ['TrainingJobDefinition', 'ResourceConfig', 'InstanceCount'], + ['TrainingJobDefinition', 'ResourceConfig', 'VolumeSizeInGB'], + ['TrainingJobDefinition', 'StoppingCondition', 'MaxRuntimeInSeconds'] + ] + + @apply_defaults + def __init__(self, + config, + wait_for_completion=True, + check_interval=30, + max_ingestion_time=None, + *args, **kwargs): + super(SageMakerTuningOperator, self).__init__(config=config, + *args, **kwargs) + self.config = config + self.wait_for_completion = wait_for_completion + self.check_interval = check_interval + self.max_ingestion_time = max_ingestion_time + + def expand_role(self): + if 'TrainingJobDefinition' in self.config: + config = self.config['TrainingJobDefinition'] + if 'RoleArn' in config: + hook = AwsHook(self.aws_conn_id) + config['RoleArn'] = hook.expand_role(config['RoleArn']) + + def execute(self, context): + self.preprocess_config() + + self.log.info( + 'Creating SageMaker Hyper-Parameter Tuning Job %s', self.config['HyperParameterTuningJobName'] + ) + + response = self.hook.create_tuning_job( + self.config, + wait_for_completion=self.wait_for_completion, + check_interval=self.check_interval, + max_ingestion_time=self.max_ingestion_time + ) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise AirflowException('Sagemaker Tuning Job creation failed: %s' % response) + else: + return { + 'Tuning': self.hook.describe_tuning_job( + self.config['HyperParameterTuningJobName'] + ) + } diff --git a/airflow/contrib/operators/segment_track_event_operator.py b/airflow/contrib/operators/segment_track_event_operator.py index 04f6ae6d41d4b..5d2f2db539083 100644 --- a/airflow/contrib/operators/segment_track_event_operator.py +++ b/airflow/contrib/operators/segment_track_event_operator.py @@ -27,16 +27,16 @@ class SegmentTrackEventOperator(BaseOperator): Send Track Event to Segment for a specified user_id and event :param user_id: The ID for this user in your database. (templated) - :type user_id: string + :type user_id: str :param event: The name of the event you're tracking. (templated) - :type event: string + :type event: str :param properties: A dictionary of properties for the event. (templated) :type properties: dict :param segment_conn_id: The connection ID to use when connecting to Segment. - :type segment_conn_id: string + :type segment_conn_id: str :param segment_debug_mode: Determines whether Segment should run in debug mode. Defaults to False - :type segment_debug_mode: boolean + :type segment_debug_mode: bool """ template_fields = ('user_id', 'event', 'properties') ui_color = '#ffd700' diff --git a/airflow/contrib/operators/sftp_operator.py b/airflow/contrib/operators/sftp_operator.py index 153f440d41186..4af5fe9a59979 100644 --- a/airflow/contrib/operators/sftp_operator.py +++ b/airflow/contrib/operators/sftp_operator.py @@ -16,6 +16,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import os + from airflow.contrib.hooks.ssh_hook import SSHHook from airflow.exceptions import AirflowException from airflow.models import BaseOperator @@ -30,23 +32,48 @@ class SFTPOperation(object): class SFTPOperator(BaseOperator): """ SFTPOperator for transferring files from remote host to local or vice a versa. 
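A minimal sketch of ``SageMakerTuningOperator``; ``tuning_config`` stands in for a dict following ``create_hyper_parameter_tuning_job``, with the resource limits and instance counts cast to ``int`` via ``integer_fields``:

.. code-block:: python

    from airflow.contrib.operators.sagemaker_tuning_operator import SageMakerTuningOperator

    tuning_op = SageMakerTuningOperator(
        task_id='tune_model',
        config=tuning_config,
        wait_for_completion=True,
        check_interval=30,
        dag=dag)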
- This operator uses ssh_hook to open sftp trasport channel that serve as basis + This operator uses ssh_hook to open sftp transport channel that serve as basis for file transfer. - :param ssh_hook: predefined ssh_hook to use for remote execution - :type ssh_hook: :class:`SSHHook` - :param ssh_conn_id: connection id from airflow Connections + :param ssh_hook: predefined ssh_hook to use for remote execution. + Either `ssh_hook` or `ssh_conn_id` needs to be provided. + :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook + :param ssh_conn_id: connection id from airflow Connections. + `ssh_conn_id` will be ignored if `ssh_hook` is provided. :type ssh_conn_id: str - :param remote_host: remote host to connect + :param remote_host: remote host to connect (templated) + Nullable. If provided, it will replace the `remote_host` which was + defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`. :type remote_host: str :param local_filepath: local file path to get or put. (templated) :type local_filepath: str :param remote_filepath: remote file path to get or put. (templated) :type remote_filepath: str - :param operation: specify operation 'get' or 'put', defaults to get - :type get: bool + :param operation: specify operation 'get' or 'put', defaults to put + :type operation: str + :param confirm: specify if the SFTP operation should be confirmed, defaults to True + :type confirm: bool + :param create_intermediate_dirs: create missing intermediate directories when + copying from remote to local and vice-versa. Default is False. + + Example: The following task would copy ``file.txt`` to the remote host + at ``/tmp/tmp1/tmp2/`` while creating ``tmp``,``tmp1`` and ``tmp2`` if they + don't exist. If the parameter is not passed it would error as the directory + does not exist. :: + + put_file = SFTPOperator( + task_id="test_sftp", + ssh_conn_id="ssh_default", + local_filepath="/tmp/file.txt", + remote_filepath="/tmp/tmp1/tmp2/file.txt", + operation="put", + create_intermediate_dirs=True, + dag=dag + ) + + :type create_intermediate_dirs: bool """ - template_fields = ('local_filepath', 'remote_filepath') + template_fields = ('local_filepath', 'remote_filepath', 'remote_host') @apply_defaults def __init__(self, @@ -56,6 +83,8 @@ def __init__(self, local_filepath=None, remote_filepath=None, operation=SFTPOperation.PUT, + confirm=True, + create_intermediate_dirs=False, *args, **kwargs): super(SFTPOperator, self).__init__(*args, **kwargs) @@ -65,6 +94,8 @@ def __init__(self, self.local_filepath = local_filepath self.remote_filepath = remote_filepath self.operation = operation + self.confirm = confirm + self.create_intermediate_dirs = create_intermediate_dirs if not (self.operation.lower() == SFTPOperation.GET or self.operation.lower() == SFTPOperation.PUT): raise TypeError("unsupported operation value {0}, expected {1} or {2}" @@ -73,30 +104,77 @@ def __init__(self, def execute(self, context): file_msg = None try: - if self.ssh_conn_id and not self.ssh_hook: - self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) + if self.ssh_conn_id: + if self.ssh_hook and isinstance(self.ssh_hook, SSHHook): + self.log.info("ssh_conn_id is ignored when ssh_hook is provided.") + else: + self.log.info("ssh_hook is not provided or invalid. 
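The docstring example above covers the ``put`` direction; a complementary hedged sketch for ``get`` with ``create_intermediate_dirs`` (paths and connection id are placeholders), where missing local directories are created with ``os.makedirs()`` before the download:

.. code-block:: python

    from airflow.contrib.operators.sftp_operator import SFTPOperator

    get_file = SFTPOperator(
        task_id="sftp_get",
        ssh_conn_id="ssh_default",
        remote_filepath="/var/log/app/app.log",
        local_filepath="/tmp/downloads/2019-01-01/app.log",
        operation="get",
        create_intermediate_dirs=True,
        dag=dag)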
" + + "Trying ssh_conn_id to create SSHHook.") + self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) if not self.ssh_hook: - raise AirflowException("can not operate without ssh_hook or ssh_conn_id") + raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.") if self.remote_host is not None: + self.log.info("remote_host is provided explicitly. " + + "It will replace the remote_host which was defined " + + "in ssh_hook or predefined in connection of ssh_conn_id.") self.ssh_hook.remote_host = self.remote_host - ssh_client = self.ssh_hook.get_conn() - sftp_client = ssh_client.open_sftp() - if self.operation.lower() == SFTPOperation.GET: - file_msg = "from {0} to {1}".format(self.remote_filepath, - self.local_filepath) - self.log.debug("Starting to transfer %s", file_msg) - sftp_client.get(self.remote_filepath, self.local_filepath) - else: - file_msg = "from {0} to {1}".format(self.local_filepath, - self.remote_filepath) - self.log.debug("Starting to transfer file %s", file_msg) - sftp_client.put(self.local_filepath, self.remote_filepath) + with self.ssh_hook.get_conn() as ssh_client: + sftp_client = ssh_client.open_sftp() + if self.operation.lower() == SFTPOperation.GET: + local_folder = os.path.dirname(self.local_filepath) + if self.create_intermediate_dirs: + # Create Intermediate Directories if it doesn't exist + try: + os.makedirs(local_folder) + except OSError: + if not os.path.isdir(local_folder): + raise + file_msg = "from {0} to {1}".format(self.remote_filepath, + self.local_filepath) + self.log.debug("Starting to transfer %s", file_msg) + sftp_client.get(self.remote_filepath, self.local_filepath) + else: + remote_folder = os.path.dirname(self.remote_filepath) + if self.create_intermediate_dirs: + _make_intermediate_dirs( + sftp_client=sftp_client, + remote_directory=remote_folder, + ) + file_msg = "from {0} to {1}".format(self.local_filepath, + self.remote_filepath) + self.log.debug("Starting to transfer file %s", file_msg) + sftp_client.put(self.local_filepath, + self.remote_filepath, + confirm=self.confirm) except Exception as e: raise AirflowException("Error while transferring {0}, error: {1}" .format(file_msg, str(e))) return None + + +def _make_intermediate_dirs(sftp_client, remote_directory): + """ + Create all the intermediate directories in a remote host + + :param sftp_client: A Paramiko SFTP client. + :param remote_directory: Absolute Path of the directory containing the file + :return: + """ + if remote_directory == '/': + sftp_client.chdir('/') + return + if remote_directory == '': + return + try: + sftp_client.chdir(remote_directory) + except IOError: + dirname, basename = os.path.split(remote_directory.rstrip('/')) + _make_intermediate_dirs(sftp_client, dirname) + sftp_client.mkdir(basename) + sftp_client.chdir(basename) + return diff --git a/airflow/contrib/operators/sftp_to_s3_operator.py b/airflow/contrib/operators/sftp_to_s3_operator.py new file mode 100644 index 0000000000000..cefc838cf12d1 --- /dev/null +++ b/airflow/contrib/operators/sftp_to_s3_operator.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.models import BaseOperator +from airflow.hooks.S3_hook import S3Hook +from airflow.contrib.hooks.ssh_hook import SSHHook +from tempfile import NamedTemporaryFile +from urllib.parse import urlparse +from airflow.utils.decorators import apply_defaults + + +class SFTPToS3Operator(BaseOperator): + """ + This operator enables the transferring of files from a SFTP server to + Amazon S3. + + :param sftp_conn_id: The sftp connection id. The name or identifier for + establishing a connection to the SFTP server. + :type sftp_conn_id: string + :param sftp_path: The sftp remote path. This is the specified file path + for downloading the file from the SFTP server. + :type sftp_path: string + :param s3_conn_id: The s3 connection id. The name or identifier for + establishing a connection to S3 + :type s3_conn_id: string + :param s3_bucket: The targeted s3 bucket. This is the S3 bucket to where + the file is uploaded. + :type s3_bucket: string + :param s3_key: The targeted s3 key. This is the specified path for + uploading the file to S3. + :type s3_key: string + """ + + template_fields = ('s3_key', 'sftp_path') + + @apply_defaults + def __init__(self, + s3_bucket, + s3_key, + sftp_path, + sftp_conn_id='ssh_default', + s3_conn_id='aws_default', + *args, + **kwargs): + super(SFTPToS3Operator, self).__init__(*args, **kwargs) + self.sftp_conn_id = sftp_conn_id + self.sftp_path = sftp_path + self.s3_bucket = s3_bucket + self.s3_key = s3_key + self.s3_conn_id = s3_conn_id + + @staticmethod + def get_s3_key(s3_key): + """This parses the correct format for S3 keys + regardless of how the S3 url is passed.""" + + parsed_s3_key = urlparse(s3_key) + return parsed_s3_key.path.lstrip('/') + + def execute(self, context): + self.s3_key = self.get_s3_key(self.s3_key) + ssh_hook = SSHHook(ssh_conn_id=self.sftp_conn_id) + s3_hook = S3Hook(self.s3_conn_id) + + sftp_client = ssh_hook.get_conn().open_sftp() + + with NamedTemporaryFile("w") as f: + sftp_client.get(self.sftp_path, f.name) + + s3_hook.load_file( + filename=f.name, + key=self.s3_key, + bucket_name=self.s3_bucket, + replace=True + ) diff --git a/airflow/contrib/operators/slack_webhook_operator.py b/airflow/contrib/operators/slack_webhook_operator.py index bab3e90856d34..8950c69b5a38e 100644 --- a/airflow/contrib/operators/slack_webhook_operator.py +++ b/airflow/contrib/operators/slack_webhook_operator.py @@ -31,12 +31,15 @@ class SlackWebhookOperator(SimpleHttpOperator): Each Slack webhook token can be pre-configured to use a specific channel, username and icon. You can override these defaults in this hook. - :param conn_id: connection that has Slack webhook token in the extra field - :type conn_id: str + :param http_conn_id: connection that has Slack webhook token in the extra field + :type http_conn_id: str :param webhook_token: Slack webhook token :type webhook_token: str :param message: The message you want to send on Slack :type message: str + :param attachments: The attachments to send on Slack. Should be a list of + dictionaries representing Slack attachments. 
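A minimal usage sketch for the new ``SFTPToS3Operator`` (paths, bucket and connection ids are placeholders); note the upload always replaces an existing object:

.. code-block:: python

    from airflow.contrib.operators.sftp_to_s3_operator import SFTPToS3Operator

    sftp_to_s3 = SFTPToS3Operator(
        task_id='sftp_to_s3',
        sftp_path='/tmp/report.csv',
        s3_bucket='my-bucket',
        s3_key='incoming/report.csv',
        sftp_conn_id='ssh_default',
        s3_conn_id='aws_default',
        dag=dag)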
+ :type attachments: list :param channel: The channel the message should be posted to :type channel: str :param username: The username to post to slack with @@ -55,6 +58,7 @@ def __init__(self, http_conn_id=None, webhook_token=None, message="", + attachments=None, channel=None, username=None, icon_emoji=None, @@ -68,6 +72,7 @@ def __init__(self, self.http_conn_id = http_conn_id self.webhook_token = webhook_token self.message = message + self.attachments = attachments self.channel = channel self.username = username self.icon_emoji = icon_emoji @@ -77,12 +82,13 @@ def __init__(self, def execute(self, context): """ - Call the SparkSqlHook to run the provided sql query + Call the SlackWebhookHook to post the provided Slack message """ self.hook = SlackWebhookHook( self.http_conn_id, self.webhook_token, self.message, + self.attachments, self.channel, self.username, self.icon_emoji, diff --git a/airflow/contrib/operators/snowflake_operator.py b/airflow/contrib/operators/snowflake_operator.py index 39d7d496ea740..6c3f403912134 100644 --- a/airflow/contrib/operators/snowflake_operator.py +++ b/airflow/contrib/operators/snowflake_operator.py @@ -26,16 +26,16 @@ class SnowflakeOperator(BaseOperator): Executes sql code in a Snowflake database :param snowflake_conn_id: reference to specific snowflake connection id - :type snowflake_conn_id: string + :type snowflake_conn_id: str :param sql: the sql code to be executed. (templated) :type sql: Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' :param warehouse: name of warehouse which overwrite defined one in connection - :type warehouse: string + :type warehouse: str :param database: name of database which overwrite defined one in connection - :type database: string + :type database: str """ template_fields = ('sql',) diff --git a/airflow/contrib/operators/sns_publish_operator.py b/airflow/contrib/operators/sns_publish_operator.py new file mode 100644 index 0000000000000..d1a042be55e03 --- /dev/null +++ b/airflow/contrib/operators/sns_publish_operator.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.aws_sns_hook import AwsSnsHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class SnsPublishOperator(BaseOperator): + """ + Publish a message to Amazon SNS. 
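A hedged sketch of the new ``attachments`` parameter (connection id, channel and attachment content are placeholders); the list is passed straight through to ``SlackWebhookHook``:

.. code-block:: python

    from airflow.contrib.operators.slack_webhook_operator import SlackWebhookOperator

    slack_alert = SlackWebhookOperator(
        task_id='slack_alert',
        http_conn_id='slack_webhook_default',
        message='Nightly load finished',
        attachments=[{'color': '#36a64f', 'text': 'All partitions loaded.'}],
        channel='#data-alerts',
        dag=dag)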
+ + :param aws_conn_id: aws connection to use + :type aws_conn_id: str + :param target_arn: either a TopicArn or an EndpointArn + :type target_arn: str + :param message: the default message you want to send (templated) + :type message: str + """ + template_fields = ['message'] + template_ext = () + + @apply_defaults + def __init__( + self, + target_arn, + message, + aws_conn_id='aws_default', + *args, **kwargs): + super(SnsPublishOperator, self).__init__(*args, **kwargs) + self.target_arn = target_arn + self.message = message + self.aws_conn_id = aws_conn_id + + def execute(self, context): + sns = AwsSnsHook(aws_conn_id=self.aws_conn_id) + + self.log.info( + 'Sending SNS notification to {} using {}:\n{}'.format( + self.target_arn, + self.aws_conn_id, + self.message + ) + ) + + return sns.publish_to_target( + target_arn=self.target_arn, + message=self.message + ) diff --git a/airflow/contrib/operators/spark_jdbc_operator.py b/airflow/contrib/operators/spark_jdbc_operator.py index 42f9dd5676f1b..b1c2a138b9ea2 100644 --- a/airflow/contrib/operators/spark_jdbc_operator.py +++ b/airflow/contrib/operators/spark_jdbc_operator.py @@ -64,7 +64,7 @@ class SparkJDBCOperator(SparkSubmitOperator): :param jdbc_table: The name of the JDBC table :type jdbc_table: str :param jdbc_conn_id: Connection id used for connection to JDBC database - :type: jdbc_conn_id: str + :type jdbc_conn_id: str :param jdbc_driver: Name of the JDBC driver to use for the JDBC connection. This driver (usually a jar) should be passed in the 'jars' parameter :type jdbc_driver: str diff --git a/airflow/contrib/operators/spark_submit_operator.py b/airflow/contrib/operators/spark_submit_operator.py index 3c3cab0ceb350..45a70b64ea827 100644 --- a/airflow/contrib/operators/spark_submit_operator.py +++ b/airflow/contrib/operators/spark_submit_operator.py @@ -29,31 +29,30 @@ class SparkSubmitOperator(BaseOperator): It requires that the "spark-submit" binary is in the PATH or the spark-home is set in the extra on the connection. - :param application: The application that submitted as a job, either jar or - py file. (templated) + :param application: The application that submitted as a job, either jar or py file. (templated) :type application: str - :param conf: Arbitrary Spark configuration properties + :param conf: Arbitrary Spark configuration properties (templated) :type conf: dict :param conn_id: The connection id as configured in Airflow administration. When an invalid connection_id is supplied, it will default to yarn. :type conn_id: str :param files: Upload additional files to the executor running the job, separated by a comma. Files will be placed in the working directory of each executor. - For example, serialized objects. + For example, serialized objects. (templated) :type files: str - :param py_files: Additional python files used by the job, can be .zip, .egg or .py. + :param py_files: Additional python files used by the job, can be .zip, .egg or .py. (templated) :type py_files: str - :param jars: Submit additional jars to upload and place them in executor classpath. - :param driver_classpath: Additional, driver-specific, classpath settings. - :type driver_classpath: str + :param jars: Submit additional jars to upload and place them in executor classpath. (templated) :type jars: str + :param driver_class_path: Additional, driver-specific, classpath settings. 
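A minimal sketch of ``SnsPublishOperator`` (ARN and connection id are placeholders); ``message`` is a templated field:

.. code-block:: python

    from airflow.contrib.operators.sns_publish_operator import SnsPublishOperator

    notify = SnsPublishOperator(
        task_id='notify_success',
        target_arn='arn:aws:sns:us-east-1:123456789012:my-topic',
        message='Run for {{ ds }} finished successfully.',
        aws_conn_id='aws_default',
        dag=dag)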
(templated) + :type driver_class_path: str :param java_class: the main class of the Java application :type java_class: str :param packages: Comma-separated list of maven coordinates of jars to include on the driver and executor classpaths. (templated) :type packages: str :param exclude_packages: Comma-separated list of maven coordinates of jars to exclude - while resolving the dependencies provided in 'packages' + while resolving the dependencies provided in 'packages' (templated) :type exclude_packages: str :param repositories: Comma-separated list of additional remote repositories to search for the maven coordinates given with 'packages' @@ -61,30 +60,33 @@ class SparkSubmitOperator(BaseOperator): :param total_executor_cores: (Standalone & Mesos only) Total cores for all executors (Default: all the available cores on the worker) :type total_executor_cores: int - :param executor_cores: (Standalone & YARN only) Number of cores per executor - (Default: 2) + :param executor_cores: (Standalone & YARN only) Number of cores per executor (Default: 2) :type executor_cores: int :param executor_memory: Memory per executor (e.g. 1000M, 2G) (Default: 1G) :type executor_memory: str :param driver_memory: Memory allocated to the driver (e.g. 1000M, 2G) (Default: 1G) :type driver_memory: str - :param keytab: Full path to the file that contains the keytab + :param keytab: Full path to the file that contains the keytab (templated) :type keytab: str - :param principal: The name of the kerberos principal used for keytab + :param principal: The name of the kerberos principal used for keytab (templated) :type principal: str :param name: Name of the job (default airflow-spark). (templated) :type name: str :param num_executors: Number of executors to launch :type num_executors: int - :param application_args: Arguments for the application being submitted + :param application_args: Arguments for the application being submitted (templated) :type application_args: list - :param env_vars: Environment variables for spark-submit. It - supports yarn and k8s mode too. + :param env_vars: Environment variables for spark-submit. It supports yarn and k8s mode too. (templated) :type env_vars: dict :param verbose: Whether to pass the verbose flag to spark-submit process for debugging :type verbose: bool + :param spark_binary: The command to use for spark submit. + Some distros may use spark2-submit. 
+ :type spark_binary: string """ - template_fields = ('_name', '_application_args', '_packages') + template_fields = ('_application', '_conf', '_files', '_py_files', '_jars', '_driver_class_path', + '_packages', '_exclude_packages', '_keytab', '_principal', '_name', + '_application_args', '_env_vars') ui_color = WEB_COLORS['LIGHTORANGE'] @apply_defaults @@ -94,7 +96,8 @@ def __init__(self, conn_id='spark_default', files=None, py_files=None, - driver_classpath=None, + archives=None, + driver_class_path=None, jars=None, java_class=None, packages=None, @@ -111,6 +114,7 @@ def __init__(self, application_args=None, env_vars=None, verbose=False, + spark_binary="spark-submit", *args, **kwargs): super(SparkSubmitOperator, self).__init__(*args, **kwargs) @@ -118,7 +122,8 @@ def __init__(self, self._conf = conf self._files = files self._py_files = py_files - self._driver_classpath = driver_classpath + self._archives = archives + self._driver_class_path = driver_class_path self._jars = jars self._java_class = java_class self._packages = packages @@ -135,6 +140,7 @@ def __init__(self, self._application_args = application_args self._env_vars = env_vars self._verbose = verbose + self._spark_binary = spark_binary self._hook = None self._conn_id = conn_id @@ -147,7 +153,8 @@ def execute(self, context): conn_id=self._conn_id, files=self._files, py_files=self._py_files, - driver_classpath=self._driver_classpath, + archives=self._archives, + driver_class_path=self._driver_class_path, jars=self._jars, java_class=self._java_class, packages=self._packages, @@ -163,7 +170,8 @@ def execute(self, context): num_executors=self._num_executors, application_args=self._application_args, env_vars=self._env_vars, - verbose=self._verbose + verbose=self._verbose, + spark_binary=self._spark_binary ) self._hook.submit(self._application) diff --git a/airflow/contrib/operators/sqoop_operator.py b/airflow/contrib/operators/sqoop_operator.py index fa61ca14cac45..c7ebcf4b3a2d4 100644 --- a/airflow/contrib/operators/sqoop_operator.py +++ b/airflow/contrib/operators/sqoop_operator.py @@ -34,7 +34,7 @@ class SqoopOperator(BaseOperator): """ Execute a Sqoop job. Documentation for Apache Sqoop can be found here: - https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide.html. + https://sqoop.apache.org/docs/1.4.2/SqoopUserGuide.html """ template_fields = ('conn_id', 'cmd_type', 'table', 'query', 'target_dir', 'file_type', 'columns', 'split_by', diff --git a/airflow/contrib/operators/ssh_operator.py b/airflow/contrib/operators/ssh_operator.py index d246800953341..5686792b35d62 100644 --- a/airflow/contrib/operators/ssh_operator.py +++ b/airflow/contrib/operators/ssh_operator.py @@ -31,11 +31,15 @@ class SSHOperator(BaseOperator): """ SSHOperator to execute commands on given remote host using the ssh_hook. - :param ssh_hook: predefined ssh_hook to use for remote execution - :type ssh_hook: :class:`SSHHook` - :param ssh_conn_id: connection id from airflow Connections + :param ssh_hook: predefined ssh_hook to use for remote execution. + Either `ssh_hook` or `ssh_conn_id` needs to be provided. + :type ssh_hook: airflow.contrib.hooks.ssh_hook.SSHHook + :param ssh_conn_id: connection id from airflow Connections. + `ssh_conn_id` will be ignored if `ssh_hook` is provided. :type ssh_conn_id: str - :param remote_host: remote host to connect + :param remote_host: remote host to connect (templated) + Nullable. If provided, it will replace the `remote_host` which was + defined in `ssh_hook` or predefined in the connection of `ssh_conn_id`. 
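A hedged sketch showing the renamed ``driver_class_path`` argument and the new ``spark_binary`` override (application path, jar and connection id are placeholders):

.. code-block:: python

    from airflow.contrib.operators.spark_submit_operator import SparkSubmitOperator

    spark_job = SparkSubmitOperator(
        task_id='spark_etl',
        application='/jobs/etl.py',
        conn_id='spark_default',
        driver_class_path='/opt/drivers/postgresql.jar',
        spark_binary='spark2-submit',   # for distros that ship spark2-submit
        dag=dag)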
:type remote_host: str :param command: command to execute on remote host. (templated) :type command: str @@ -45,7 +49,7 @@ class SSHOperator(BaseOperator): :type do_xcom_push: bool """ - template_fields = ('command',) + template_fields = ('command', 'remote_host') template_ext = ('.sh',) @apply_defaults @@ -68,88 +72,96 @@ def __init__(self, def execute(self, context): try: - if self.ssh_conn_id and not self.ssh_hook: - self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id) + if self.ssh_conn_id: + if self.ssh_hook and isinstance(self.ssh_hook, SSHHook): + self.log.info("ssh_conn_id is ignored when ssh_hook is provided.") + else: + self.log.info("ssh_hook is not provided or invalid. " + + "Trying ssh_conn_id to create SSHHook.") + self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id, + timeout=self.timeout) if not self.ssh_hook: - raise AirflowException("can not operate without ssh_hook or ssh_conn_id") + raise AirflowException("Cannot operate without ssh_hook or ssh_conn_id.") if self.remote_host is not None: + self.log.info("remote_host is provided explicitly. " + + "It will replace the remote_host which was defined " + + "in ssh_hook or predefined in connection of ssh_conn_id.") self.ssh_hook.remote_host = self.remote_host - ssh_client = self.ssh_hook.get_conn() - if not self.command: - raise AirflowException("no command specified so nothing to execute here.") - - # Auto apply tty when its required in case of sudo - get_pty = False - if self.command.startswith('sudo'): - get_pty = True - - # set timeout taken as params - stdin, stdout, stderr = ssh_client.exec_command(command=self.command, - get_pty=get_pty, - timeout=self.timeout - ) - # get channels - channel = stdout.channel - - # closing stdin - stdin.close() - channel.shutdown_write() - - agg_stdout = b'' - agg_stderr = b'' - - # capture any initial output in case channel is closed already - stdout_buffer_length = len(stdout.channel.in_buffer) - - if stdout_buffer_length > 0: - agg_stdout += stdout.channel.recv(stdout_buffer_length) - - # read from both stdout and stderr - while not channel.closed or \ - channel.recv_ready() or \ - channel.recv_stderr_ready(): - readq, _, _ = select([channel], [], [], self.timeout) - for c in readq: - if c.recv_ready(): - line = stdout.channel.recv(len(c.in_buffer)) - line = line - agg_stdout += line - self.log.info(line.decode('utf-8').strip('\n')) - if c.recv_stderr_ready(): - line = stderr.channel.recv_stderr(len(c.in_stderr_buffer)) - line = line - agg_stderr += line - self.log.warning(line.decode('utf-8').strip('\n')) - if stdout.channel.exit_status_ready()\ - and not stderr.channel.recv_stderr_ready()\ - and not stdout.channel.recv_ready(): - stdout.channel.shutdown_read() - stdout.channel.close() - break - - stdout.close() - stderr.close() - - exit_status = stdout.channel.recv_exit_status() - if exit_status is 0: - # returning output if do_xcom_push is set - if self.do_xcom_push: - enable_pickling = configuration.conf.getboolean( - 'core', 'enable_xcom_pickling' - ) - if enable_pickling: - return agg_stdout - else: - return b64encode(agg_stdout).decode('utf-8') - - else: - error_msg = agg_stderr.decode('utf-8') - raise AirflowException("error running cmd: {0}, error: {1}" - .format(self.command, error_msg)) + raise AirflowException("SSH command not specified. 
Aborting.") + + with self.ssh_hook.get_conn() as ssh_client: + # Auto apply tty when its required in case of sudo + get_pty = False + if self.command.startswith('sudo'): + get_pty = True + + # set timeout taken as params + stdin, stdout, stderr = ssh_client.exec_command(command=self.command, + get_pty=get_pty, + timeout=self.timeout + ) + # get channels + channel = stdout.channel + + # closing stdin + stdin.close() + channel.shutdown_write() + + agg_stdout = b'' + agg_stderr = b'' + + # capture any initial output in case channel is closed already + stdout_buffer_length = len(stdout.channel.in_buffer) + + if stdout_buffer_length > 0: + agg_stdout += stdout.channel.recv(stdout_buffer_length) + + # read from both stdout and stderr + while not channel.closed or \ + channel.recv_ready() or \ + channel.recv_stderr_ready(): + readq, _, _ = select([channel], [], [], self.timeout) + for c in readq: + if c.recv_ready(): + line = stdout.channel.recv(len(c.in_buffer)) + line = line + agg_stdout += line + self.log.info(line.decode('utf-8').strip('\n')) + if c.recv_stderr_ready(): + line = stderr.channel.recv_stderr(len(c.in_stderr_buffer)) + line = line + agg_stderr += line + self.log.warning(line.decode('utf-8').strip('\n')) + if stdout.channel.exit_status_ready()\ + and not stderr.channel.recv_stderr_ready()\ + and not stdout.channel.recv_ready(): + stdout.channel.shutdown_read() + stdout.channel.close() + break + + stdout.close() + stderr.close() + + exit_status = stdout.channel.recv_exit_status() + if exit_status == 0: + # returning output if do_xcom_push is set + if self.do_xcom_push: + enable_pickling = configuration.conf.getboolean( + 'core', 'enable_xcom_pickling' + ) + if enable_pickling: + return agg_stdout + else: + return b64encode(agg_stdout).decode('utf-8') + + else: + error_msg = agg_stderr.decode('utf-8') + raise AirflowException("error running cmd: {0}, error: {1}" + .format(self.command, error_msg)) except Exception as e: raise AirflowException("SSH operator error: {0}".format(str(e))) diff --git a/airflow/contrib/operators/vertica_operator.py b/airflow/contrib/operators/vertica_operator.py index 41072ff82bdf0..37ccd134e84ac 100644 --- a/airflow/contrib/operators/vertica_operator.py +++ b/airflow/contrib/operators/vertica_operator.py @@ -26,7 +26,7 @@ class VerticaOperator(BaseOperator): Executes sql code in a specific Vertica database :param vertica_conn_id: reference to a specific Vertica database - :type vertica_conn_id: string + :type vertica_conn_id: str :param sql: the sql code to be executed. (templated) :type sql: Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. diff --git a/airflow/contrib/operators/vertica_to_mysql.py b/airflow/contrib/operators/vertica_to_mysql.py index 9d3fe721e4cbb..99b55e61bc6e0 100644 --- a/airflow/contrib/operators/vertica_to_mysql.py +++ b/airflow/contrib/operators/vertica_to_mysql.py @@ -17,7 +17,6 @@ # specific language governing permissions and limitations # under the License. 
-import logging import MySQLdb from airflow.contrib.hooks.vertica_hook import VerticaHook @@ -103,10 +102,10 @@ def execute(self, context): if self.bulk_load: tmpfile = NamedTemporaryFile("w") - logging.info( - "Selecting rows from Vertica to local file " + str( - tmpfile.name) + "...") - logging.info(self.sql) + self.log.info( + "Selecting rows from Vertica to local file %s...", + tmpfile.name) + self.log.info(self.sql) csv_writer = csv.writer(tmpfile, delimiter='\t', encoding='utf-8') for row in cursor.iterate(): @@ -115,21 +114,21 @@ def execute(self, context): tmpfile.flush() else: - logging.info("Selecting rows from Vertica...") - logging.info(self.sql) + self.log.info("Selecting rows from Vertica...") + self.log.info(self.sql) result = cursor.fetchall() count = len(result) - logging.info("Selected rows from Vertica " + str(count)) + self.log.info("Selected rows from Vertica %s", count) if self.mysql_preoperator: - logging.info("Running MySQL preoperator...") + self.log.info("Running MySQL preoperator...") mysql.run(self.mysql_preoperator) try: if self.bulk_load: - logging.info("Bulk inserting rows into MySQL...") + self.log.info("Bulk inserting rows into MySQL...") with closing(mysql.get_conn()) as conn: with closing(conn.cursor()) as cursor: cursor.execute("LOAD DATA LOCAL INFILE '%s' INTO " @@ -140,17 +139,17 @@ def execute(self, context): conn.commit() tmpfile.close() else: - logging.info("Inserting rows into MySQL...") + self.log.info("Inserting rows into MySQL...") mysql.insert_rows(table=self.mysql_table, rows=result, target_fields=selected_columns) - logging.info("Inserted rows into MySQL " + str(count)) + self.log.info("Inserted rows into MySQL %s", count) except (MySQLdb.Error, MySQLdb.Warning): - logging.error("Inserted rows into MySQL 0") + self.log.info("Inserted rows into MySQL 0") raise if self.mysql_postoperator: - logging.info("Running MySQL postoperator...") + self.log.info("Running MySQL postoperator...") mysql.run(self.mysql_postoperator) - logging.info("Done") + self.log.info("Done") diff --git a/airflow/contrib/operators/wasb_delete_blob_operator.py b/airflow/contrib/operators/wasb_delete_blob_operator.py new file mode 100644 index 0000000000000..4173d7e7a8a5d --- /dev/null +++ b/airflow/contrib/operators/wasb_delete_blob_operator.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +from airflow.contrib.hooks.wasb_hook import WasbHook +from airflow.models import BaseOperator +from airflow.utils.decorators import apply_defaults + + +class WasbDeleteBlobOperator(BaseOperator): + """ + Deletes blob(s) on Azure Blob Storage. + + :param container_name: Name of the container. (templated) + :type container_name: str + :param blob_name: Name of the blob. 
(templated) + :type blob_name: str + :param wasb_conn_id: Reference to the wasb connection. + :type wasb_conn_id: str + :param check_options: Optional keyword arguments that + `WasbHook.check_for_blob()` takes. + :param is_prefix: If blob_name is a prefix, delete all files matching prefix. + :type is_prefix: bool + :param ignore_if_missing: if True, then return success even if the + blob does not exist. + :type ignore_if_missing: bool + """ + + template_fields = ('container_name', 'blob_name') + + @apply_defaults + def __init__(self, container_name, blob_name, + wasb_conn_id='wasb_default', check_options=None, + is_prefix=False, ignore_if_missing=False, + *args, + **kwargs): + super(WasbDeleteBlobOperator, self).__init__(*args, **kwargs) + if check_options is None: + check_options = {} + self.wasb_conn_id = wasb_conn_id + self.container_name = container_name + self.blob_name = blob_name + self.check_options = check_options + self.is_prefix = is_prefix + self.ignore_if_missing = ignore_if_missing + + def execute(self, context): + self.log.info( + 'Deleting blob: {self.blob_name}\n' + 'in wasb://{self.container_name}'.format(**locals()) + ) + hook = WasbHook(wasb_conn_id=self.wasb_conn_id) + + hook.delete_file(self.container_name, self.blob_name, + self.is_prefix, self.ignore_if_missing, + **self.check_options) diff --git a/airflow/contrib/operators/winrm_operator.py b/airflow/contrib/operators/winrm_operator.py index fcd2328d9ad3a..e7738a471fd92 100644 --- a/airflow/contrib/operators/winrm_operator.py +++ b/airflow/contrib/operators/winrm_operator.py @@ -17,19 +17,29 @@ # specific language governing permissions and limitations # under the License. +from base64 import b64encode +import logging + +from winrm.exceptions import WinRMOperationTimeoutError + +from airflow import configuration from airflow.contrib.hooks.winrm_hook import WinRMHook from airflow.exceptions import AirflowException from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults +# Hide the following error message in urllib3 when making WinRM connections: +# requests.packages.urllib3.exceptions.HeaderParsingError: [StartBoundaryNotFoundDefect(), +# MultipartInvariantViolationDefect()], unparsed data: '' +logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.CRITICAL) -class WinRMOperator(BaseOperator): +class WinRMOperator(BaseOperator): """ WinRMOperator to execute commands on given remote host using the winrm_hook. 
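A short, hypothetical sketch of the new WasbDeleteBlobOperator added above, reusing the `dag` object from the SSHOperator sketch; the container and blob names are placeholders.

from airflow.contrib.operators.wasb_delete_blob_operator import WasbDeleteBlobOperator

delete_old_logs = WasbDeleteBlobOperator(
    task_id='delete_old_logs',
    container_name='logs',        # templated
    blob_name='{{ ds }}/',        # templated; treated as a prefix because is_prefix=True
    is_prefix=True,               # delete every blob matching the prefix
    ignore_if_missing=True,       # succeed even if no matching blob exists
    wasb_conn_id='wasb_default',
    dag=dag,
)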
:param winrm_hook: predefined ssh_hook to use for remote execution - :type winrm_hook: :class:`WinRMHook` + :type winrm_hook: airflow.contrib.hooks.winrm_hook.WinRMHook :param ssh_conn_id: connection id from airflow Connections :type ssh_conn_id: str :param remote_host: remote host to connect @@ -41,7 +51,6 @@ class WinRMOperator(BaseOperator): :param do_xcom_push: return the stdout which also get set in xcom by airflow platform :type do_xcom_push: bool """ - template_fields = ('command',) @apply_defaults @@ -63,48 +72,78 @@ def __init__(self, self.do_xcom_push = do_xcom_push def execute(self, context): - try: - if self.ssh_conn_id and not self.winrm_hook: - self.log.info("hook not found, creating") - self.winrm_hook = WinRMHook(ssh_conn_id=self.ssh_conn_id) + if self.ssh_conn_id and not self.winrm_hook: + self.log.info("Hook not found, creating...") + self.winrm_hook = WinRMHook(ssh_conn_id=self.ssh_conn_id) - if not self.winrm_hook: - raise AirflowException("can not operate without ssh_hook or ssh_conn_id") + if not self.winrm_hook: + raise AirflowException("Cannot operate without winrm_hook or ssh_conn_id.") - if self.remote_host is not None: - self.winrm_hook.remote_host = self.remote_host + if self.remote_host is not None: + self.winrm_hook.remote_host = self.remote_host - winrm_client = self.winrm_hook.get_conn() - self.log.info("Established WinRM connection") + if not self.command: + raise AirflowException("No command specified so nothing to execute here.") - if not self.command: - raise AirflowException("no command specified so nothing to execute here.") + winrm_client = self.winrm_hook.get_conn() - self.log.info( - "Starting command: '{command}' on remote host: {remotehost}". - format(command=self.command, remotehost=self.winrm_hook.remote_host) + try: + self.log.info("Running command: '%s'...", self.command) + command_id = self.winrm_hook.winrm_protocol.run_command( + winrm_client, + self.command ) - command_id = self.winrm_hook.winrm_protocol. \ - run_command(winrm_client, self.command) - std_out, std_err, status_code = self.winrm_hook.winrm_protocol. \ - get_command_output(winrm_client, command_id) - - self.log.info("std out: " + std_out.decode()) - self.log.info("std err: " + std_err.decode()) - self.log.info("exit code: " + str(status_code)) - self.log.info("Cleaning up WinRM command") + + # See: https://github.com/diyan/pywinrm/blob/master/winrm/protocol.py + stdout_buffer = [] + stderr_buffer = [] + command_done = False + while not command_done: + try: + stdout, stderr, return_code, command_done = \ + self.winrm_hook.winrm_protocol._raw_get_command_output( + winrm_client, + command_id + ) + + # Only buffer stdout if we need to so that we minimize memory usage. 
+ if self.do_xcom_push: + stdout_buffer.append(stdout) + stderr_buffer.append(stderr) + + for line in stdout.decode('utf-8').splitlines(): + self.log.info(line) + for line in stderr.decode('utf-8').splitlines(): + self.log.warning(line) + except WinRMOperationTimeoutError: + # this is an expected error when waiting for a + # long-running process, just silently retry + pass + self.winrm_hook.winrm_protocol.cleanup_command(winrm_client, command_id) - self.log.info("Cleaning up WinRM protocol shell") self.winrm_hook.winrm_protocol.close_shell(winrm_client) - if status_code is 0: - return std_out.decode() - - else: - error_msg = std_err.decode() - raise AirflowException("error running cmd: {0}, error: {1}" - .format(self.command, error_msg)) except Exception as e: raise AirflowException("WinRM operator error: {0}".format(str(e))) + if return_code == 0: + # returning output if do_xcom_push is set + if self.do_xcom_push: + enable_pickling = configuration.conf.getboolean( + 'core', 'enable_xcom_pickling' + ) + if enable_pickling: + return stdout_buffer + else: + return b64encode(b''.join(stdout_buffer)).decode('utf-8') + else: + error_msg = "Error running cmd: {0}, return code: {1}, error: {2}".format( + self.command, + return_code, + b''.join(stderr_buffer).decode('utf-8') + ) + raise AirflowException(error_msg) + + self.log.info("Finished!") + return True diff --git a/airflow/contrib/plugins/metastore_browser/main.py b/airflow/contrib/plugins/metastore_browser/main.py index 836e53191ac71..33a8ca5f661ce 100644 --- a/airflow/contrib/plugins/metastore_browser/main.py +++ b/airflow/contrib/plugins/metastore_browser/main.py @@ -164,7 +164,7 @@ def ddl(self): v = MetastoreBrowserView(category="Plugins", name="Hive Metadata Browser") -# Creating a flask blueprint to intergrate the templates and static folder +# Creating a flask blueprint to integrate the templates and static folder bp = Blueprint( "metastore_browser", __name__, template_folder='templates', diff --git a/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/table.html b/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/table.html index 3a9d7ca671ad2..cbb7acff6da37 100644 --- a/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/table.html +++ b/airflow/contrib/plugins/metastore_browser/templates/metastore_browser/table.html @@ -28,7 +28,7 @@

  • Sample Data
  • Partitions
-  • Atributes
+  • Attributes
  • Parameters
  • DDL
  • diff --git a/airflow/contrib/sensors/aws_athena_sensor.py b/airflow/contrib/sensors/aws_athena_sensor.py new file mode 100644 index 0000000000000..af758b93c1335 --- /dev/null +++ b/airflow/contrib/sensors/aws_athena_sensor.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +from airflow.exceptions import AirflowException +from airflow.utils.decorators import apply_defaults +from airflow.contrib.hooks.aws_athena_hook import AWSAthenaHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator + + +class AthenaSensor(BaseSensorOperator): + """ + Asks for the state of the Query until it reaches a failure state or success state. + If it fails, failing the task. + + :param query_execution_id: query_execution_id to check the state of + :type query_execution_id: str + :param max_retires: Number of times to poll for query state before + returning the current state, defaults to None + :type max_retires: int + :param aws_conn_id: aws connection to use, defaults to 'aws_default' + :type aws_conn_id: str + :param sleep_time: Time to wait between two consecutive call to + check query status on athena, defaults to 10 + :type sleep_time: int + """ + + INTERMEDIATE_STATES = ('QUEUED', 'RUNNING',) + FAILURE_STATES = ('FAILED', 'CANCELLED',) + SUCCESS_STATES = ('SUCCEEDED',) + + template_fields = ['query_execution_id'] + template_ext = () + ui_color = '#66c3ff' + + @apply_defaults + def __init__(self, + query_execution_id, + max_retires=None, + aws_conn_id='aws_default', + sleep_time=10, + *args, **kwargs): + super(AthenaSensor, self).__init__(*args, **kwargs) + self.aws_conn_id = aws_conn_id + self.query_execution_id = query_execution_id + self.hook = None + self.sleep_time = sleep_time + self.max_retires = max_retires + + def poke(self, context): + self.hook = self.get_hook() + self.hook.get_conn() + state = self.hook.poll_query_status(self.query_execution_id, self.max_retires) + + if state in self.FAILURE_STATES: + raise AirflowException('Athena sensor failed') + + if state in self.INTERMEDIATE_STATES: + return False + return True + + def get_hook(self): + return AWSAthenaHook(self.aws_conn_id, self.sleep_time) diff --git a/airflow/contrib/sensors/aws_glue_catalog_partition_sensor.py b/airflow/contrib/sensors/aws_glue_catalog_partition_sensor.py new file mode 100644 index 0000000000000..d4c1d3a884e72 --- /dev/null +++ b/airflow/contrib/sensors/aws_glue_catalog_partition_sensor.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
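A hypothetical sketch of the new AthenaSensor in use; the upstream task id pulled from XCom and the `dag` object are assumptions.

from airflow.contrib.sensors.aws_athena_sensor import AthenaSensor

wait_for_query = AthenaSensor(
    task_id='wait_for_query',
    # assumed to be pushed to XCom by an upstream task named 'run_athena_query'
    query_execution_id="{{ task_instance.xcom_pull(task_ids='run_athena_query') }}",
    aws_conn_id='aws_default',
    sleep_time=30,      # seconds between Athena status polls
    max_retires=None,   # keep polling until a terminal state is reached
    dag=dag,
)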
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class AwsGlueCatalogPartitionSensor(BaseSensorOperator): + """ + Waits for a partition to show up in AWS Glue Catalog. + + :param table_name: The name of the table to wait for, supports the dot + notation (my_database.my_table) + :type table_name: str + :param expression: The partition clause to wait for. This is passed as + is to the AWS Glue Catalog API's get_partitions function, + and supports SQL like notation as in ``ds='2015-01-01' + AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"``. + See https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-partitions.html + #aws-glue-api-catalog-partitions-GetPartitions + :type expression: str + :param aws_conn_id: ID of the Airflow connection where + credentials and extra configuration are stored + :type aws_conn_id: str + :param region_name: Optional aws region name (example: us-east-1). Uses region from connection + if not specified. + :type region_name: str + :param database_name: The name of the catalog database where the partitions reside. + :type database_name: str + :param poke_interval: Time in seconds that the job should wait in + between each tries + :type poke_interval: int + """ + template_fields = ('database_name', 'table_name', 'expression',) + ui_color = '#C5CAE9' + + @apply_defaults + def __init__(self, + table_name, expression="ds='{{ ds }}'", + aws_conn_id='aws_default', + region_name=None, + database_name='default', + poke_interval=60 * 3, + *args, + **kwargs): + super(AwsGlueCatalogPartitionSensor, self).__init__( + poke_interval=poke_interval, *args, **kwargs) + self.aws_conn_id = aws_conn_id + self.region_name = region_name + self.table_name = table_name + self.expression = expression + self.database_name = database_name + + def poke(self, context): + """ + Checks for existence of the partition in the AWS Glue Catalog table + """ + if '.' in self.table_name: + self.database_name, self.table_name = self.table_name.split('.') + self.log.info( + 'Poking for table %s. 
%s, expression %s', self.database_name, self.table_name, self.expression + ) + + return self.get_hook().check_for_partition( + self.database_name, self.table_name, self.expression) + + def get_hook(self): + """ + Gets the AwsGlueCatalogHook + """ + if not hasattr(self, 'hook'): + from airflow.contrib.hooks.aws_glue_catalog_hook import AwsGlueCatalogHook + self.hook = AwsGlueCatalogHook( + aws_conn_id=self.aws_conn_id, + region_name=self.region_name) + + return self.hook diff --git a/airflow/contrib/sensors/azure_cosmos_sensor.py b/airflow/contrib/sensors/azure_cosmos_sensor.py new file mode 100644 index 0000000000000..78b340d4efe53 --- /dev/null +++ b/airflow/contrib/sensors/azure_cosmos_sensor.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class AzureCosmosDocumentSensor(BaseSensorOperator): + """ + Checks for the existence of a document which + matches the given query in CosmosDB. Example: + + >>> azure_cosmos_sensor = AzureCosmosDocumentSensor(database_name="somedatabase_name", + ... collection_name="somecollection_name", + ... document_id="unique-doc-id", + ... azure_cosmos_conn_id="azure_cosmos_default", + ... task_id="azure_cosmos_sensor") + """ + template_fields = ('database_name', 'collection_name', 'document_id') + + @apply_defaults + def __init__( + self, + database_name, + collection_name, + document_id, + azure_cosmos_conn_id="azure_cosmos_default", + *args, + **kwargs): + """ + Create a new AzureCosmosDocumentSensor + + :param database_name: Target CosmosDB database_name. + :type database_name: str + :param collection_name: Target CosmosDB collection_name. + :type collection_name: str + :param document_id: The ID of the target document. + :type query: str + :param azure_cosmos_conn_id: Reference to the Azure CosmosDB connection. 
+ :type azure_cosmos_conn_id: str + """ + super(AzureCosmosDocumentSensor, self).__init__(*args, **kwargs) + self.azure_cosmos_conn_id = azure_cosmos_conn_id + self.database_name = database_name + self.collection_name = collection_name + self.document_id = document_id + + def poke(self, context): + self.log.info("*** Intering poke") + hook = AzureCosmosDBHook(self.azure_cosmos_conn_id) + return hook.get_document(self.document_id, self.database_name, self.collection_name) is not None diff --git a/airflow/contrib/sensors/bash_sensor.py b/airflow/contrib/sensors/bash_sensor.py index 26fbb06c18d81..fec48c5778ef1 100644 --- a/airflow/contrib/sensors/bash_sensor.py +++ b/airflow/contrib/sensors/bash_sensor.py @@ -33,7 +33,7 @@ class BashSensor(BaseSensorOperator): :param bash_command: The command, set of commands or reference to a bash script (must be '.sh') to be executed. - :type bash_command: string + :type bash_command: str :param env: If env is not None, it must be a mapping that defines the environment variables for the new process; these are used instead @@ -41,7 +41,7 @@ class BashSensor(BaseSensorOperator): behavior. (templated) :type env: dict :param output_encoding: output encoding of bash command. - :type output_encoding: string + :type output_encoding: str """ template_fields = ('bash_command', 'env') @@ -70,10 +70,7 @@ def poke(self, context): f.flush() fname = f.name script_location = tmp_dir + "/" + fname - self.log.info( - "Temporary script location: %s", - script_location - ) + self.log.info("Temporary script location: %s", script_location) self.log.info("Running command: %s", bash_command) sp = Popen( ['bash', fname], @@ -89,7 +86,6 @@ def poke(self, context): line = line.decode(self.output_encoding).strip() self.log.info(line) sp.wait() - self.log.info("Command exited with " - "return code {0}".format(sp.returncode)) + self.log.info("Command exited with return code %s", sp.returncode) return not sp.returncode diff --git a/airflow/contrib/sensors/bigquery_sensor.py b/airflow/contrib/sensors/bigquery_sensor.py index 2e496f68897dd..fe8bd2ed6acc9 100644 --- a/airflow/contrib/sensors/bigquery_sensor.py +++ b/airflow/contrib/sensors/bigquery_sensor.py @@ -25,22 +25,22 @@ class BigQueryTableSensor(BaseSensorOperator): """ Checks for the existence of a table in Google Bigquery. - :param project_id: The Google cloud project in which to look for the table. - The connection supplied to the hook must provide - access to the specified project. - :type project_id: string - :param dataset_id: The name of the dataset in which to look for the table. - storage bucket. - :type dataset_id: string - :param table_id: The name of the table to check the existence of. - :type table_id: string - :param bigquery_conn_id: The connection ID to use when connecting to - Google BigQuery. - :type bigquery_conn_id: string - :param delegate_to: The account to impersonate, if any. - For this to work, the service account making the request must - have domain-wide delegation enabled. - :type delegate_to: string + :param project_id: The Google cloud project in which to look for the table. + The connection supplied to the hook must provide + access to the specified project. + :type project_id: str + :param dataset_id: The name of the dataset in which to look for the table. + storage bucket. + :type dataset_id: str + :param table_id: The name of the table to check the existence of. + :type table_id: str + :param bigquery_conn_id: The connection ID to use when connecting to + Google BigQuery. 
+ :type bigquery_conn_id: str + :param delegate_to: The account to impersonate, if any. + For this to work, the service account making the request must + have domain-wide delegation enabled. + :type delegate_to: str """ template_fields = ('project_id', 'dataset_id', 'table_id',) ui_color = '#f0eee4' diff --git a/airflow/contrib/sensors/cassandra_sensor.py b/airflow/contrib/sensors/cassandra_record_sensor.py similarity index 85% rename from airflow/contrib/sensors/cassandra_sensor.py rename to airflow/contrib/sensors/cassandra_record_sensor.py index aef66122e90c2..bbb2f5099030e 100644 --- a/airflow/contrib/sensors/cassandra_sensor.py +++ b/airflow/contrib/sensors/cassandra_record_sensor.py @@ -29,9 +29,10 @@ class CassandraRecordSensor(BaseSensorOperator): primary keys 'p1' and 'p2' to be populated in keyspace 'k' and table 't', instantiate it as follows: - >>> CassandraRecordSensor(table="k.t", keys={"p1": "v1", "p2": "v2"}, - ... cassandra_conn_id="cassandra_default", task_id="cassandra_sensor") - + >>> cassandra_sensor = CassandraRecordSensor(table="k.t", + ... keys={"p1": "v1", "p2": "v2"}, + ... cassandra_conn_id="cassandra_default", + ... task_id="cassandra_sensor") """ template_fields = ('table', 'keys') @@ -42,12 +43,12 @@ def __init__(self, table, keys, cassandra_conn_id, *args, **kwargs): :param table: Target Cassandra table. Use dot notation to target a specific keyspace. - :type table: string + :type table: str :param keys: The keys and their values to be monitored :type keys: dict :param cassandra_conn_id: The connection ID to use when connecting to Cassandra cluster - :type cassandra_conn_id: string + :type cassandra_conn_id: str """ super(CassandraRecordSensor, self).__init__(*args, **kwargs) self.cassandra_conn_id = cassandra_conn_id diff --git a/airflow/contrib/sensors/cassandra_table_sensor.py b/airflow/contrib/sensors/cassandra_table_sensor.py new file mode 100644 index 0000000000000..088c82bdd3060 --- /dev/null +++ b/airflow/contrib/sensors/cassandra_table_sensor.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from airflow.contrib.hooks.cassandra_hook import CassandraHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class CassandraTableSensor(BaseSensorOperator): + """ + Checks for the existence of a table in a Cassandra cluster. + + For example, if you want to wait for a table called 't' to be created + in a keyspace 'k', instantiate it as follows: + + >>> cassandra_sensor = CassandraTableSensor(table="k.t", + ... cassandra_conn_id="cassandra_default", + ... 
task_id="cassandra_sensor") + """ + template_fields = ('table',) + + @apply_defaults + def __init__(self, table, cassandra_conn_id, *args, **kwargs): + """ + Create a new CassandraTableSensor + + :param table: Target Cassandra table. + Use dot notation to target a specific keyspace. + :type table: str + :param cassandra_conn_id: The connection ID to use + when connecting to Cassandra cluster + :type cassandra_conn_id: str + """ + super(CassandraTableSensor, self).__init__(*args, **kwargs) + self.cassandra_conn_id = cassandra_conn_id + self.table = table + + def poke(self, context): + self.log.info('Sensor check existence of table: %s', self.table) + hook = CassandraHook(self.cassandra_conn_id) + return hook.table_exists(self.table) diff --git a/airflow/contrib/sensors/datadog_sensor.py b/airflow/contrib/sensors/datadog_sensor.py index fb455d6cd77eb..df0fb0976fb36 100644 --- a/airflow/contrib/sensors/datadog_sensor.py +++ b/airflow/contrib/sensors/datadog_sensor.py @@ -32,7 +32,7 @@ class DatadogSensor(BaseSensorOperator): Airflow runs. :param datadog_conn_id: The connection to datadog, containing metadata for api keys. - :param datadog_conn_id: string + :param datadog_conn_id: str """ ui_color = '#66c3dd' diff --git a/airflow/contrib/sensors/emr_job_flow_sensor.py b/airflow/contrib/sensors/emr_job_flow_sensor.py index 31d16a066b241..2c7514b3926f3 100644 --- a/airflow/contrib/sensors/emr_job_flow_sensor.py +++ b/airflow/contrib/sensors/emr_job_flow_sensor.py @@ -27,7 +27,7 @@ class EmrJobFlowSensor(EmrBaseSensor): If it fails the sensor errors, failing the task. :param job_flow_id: job_flow_id to check the state of - :type job_flow_id: string + :type job_flow_id: str """ NON_TERMINAL_STATES = ['STARTING', 'BOOTSTRAPPING', 'RUNNING', @@ -50,5 +50,6 @@ def get_emr_response(self): self.log.info('Poking cluster %s', self.job_flow_id) return emr.describe_cluster(ClusterId=self.job_flow_id) - def state_from_response(self, response): + @staticmethod + def state_from_response(response): return response['Cluster']['Status']['State'] diff --git a/airflow/contrib/sensors/emr_step_sensor.py b/airflow/contrib/sensors/emr_step_sensor.py index 3dddf01bd83a5..8440acbbb314a 100644 --- a/airflow/contrib/sensors/emr_step_sensor.py +++ b/airflow/contrib/sensors/emr_step_sensor.py @@ -27,13 +27,13 @@ class EmrStepSensor(EmrBaseSensor): If it fails the sensor errors, failing the task. 
:param job_flow_id: job_flow_id which contains the step check the state of - :type job_flow_id: string + :type job_flow_id: str :param step_id: step to check the state of - :type step_id: string + :type step_id: str """ - NON_TERMINAL_STATES = ['PENDING', 'RUNNING', 'CONTINUE'] - FAILED_STATE = ['CANCELLED', 'FAILED'] + NON_TERMINAL_STATES = ['PENDING', 'RUNNING', 'CONTINUE', 'CANCEL_PENDING'] + FAILED_STATE = ['CANCELLED', 'FAILED', 'INTERRUPTED'] template_fields = ['job_flow_id', 'step_id'] template_ext = () @@ -53,5 +53,6 @@ def get_emr_response(self): self.log.info('Poking step %s on cluster %s', self.step_id, self.job_flow_id) return emr.describe_step(ClusterId=self.job_flow_id, StepId=self.step_id) - def state_from_response(self, response): + @staticmethod + def state_from_response(response): return response['Step']['Status']['State'] diff --git a/airflow/contrib/sensors/file_sensor.py b/airflow/contrib/sensors/file_sensor.py index 3f7bb24e0837f..570faac86a610 100644 --- a/airflow/contrib/sensors/file_sensor.py +++ b/airflow/contrib/sensors/file_sensor.py @@ -35,10 +35,10 @@ class FileSensor(BaseSensorOperator): :param fs_conn_id: reference to the File (path) connection id - :type fs_conn_id: string + :type fs_conn_id: str :param filepath: File or folder name (relative to the base path set within the connection) - :type fs_conn_id: string + :type fs_conn_id: str """ template_fields = ('filepath',) ui_color = '#91818a' diff --git a/airflow/contrib/sensors/ftp_sensor.py b/airflow/contrib/sensors/ftp_sensor.py index f0a49285d031a..4773b15989299 100644 --- a/airflow/contrib/sensors/ftp_sensor.py +++ b/airflow/contrib/sensors/ftp_sensor.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. import ftplib +import re from airflow.contrib.hooks.ftp_hook import FTPHook, FTPSHook from airflow.sensors.base_sensor_operator import BaseSensorOperator @@ -26,33 +27,65 @@ class FTPSensor(BaseSensorOperator): """ Waits for a file or directory to be present on FTP. - - :param path: Remote file or directory path - :type path: str - :param ftp_conn_id: The connection to run the sensor against - :type ftp_conn_id: str """ + template_fields = ('path',) + """Errors that are transient in nature, and where action can be retried""" + transient_errors = [421, 425, 426, 434, 450, 451, 452] + + error_code_pattern = re.compile(r"([\d]+)") + @apply_defaults - def __init__(self, path, ftp_conn_id='ftp_default', *args, **kwargs): + def __init__( + self, + path, + ftp_conn_id='ftp_default', + fail_on_transient_errors=True, + *args, + **kwargs): + """ + Create a new FTP sensor + + :param path: Remote file or directory path + :type path: str + :param fail_on_transient_errors: Fail on all errors, + including 4xx transient errors. Default True. 
+ :type fail_on_transient_errors: bool + :param ftp_conn_id: The connection to run the sensor against + :type ftp_conn_id: str + """ + super(FTPSensor, self).__init__(*args, **kwargs) self.path = path self.ftp_conn_id = ftp_conn_id + self.fail_on_transient_errors = fail_on_transient_errors def _create_hook(self): """Return connection hook.""" return FTPHook(ftp_conn_id=self.ftp_conn_id) + def _get_error_code(self, e): + """Extract error code from ftp exception""" + try: + matches = self.error_code_pattern.match(str(e)) + code = int(matches.group(0)) + return code + except ValueError: + return e + def poke(self, context): with self._create_hook() as hook: self.log.info('Poking for %s', self.path) try: hook.get_mod_time(self.path) except ftplib.error_perm as e: - error = str(e).split(None, 1) - if error[1] != "Can't check for file existence": + self.log.info('Ftp error encountered: %s', str(e)) + error_code = self._get_error_code(e) + if ((error_code != 550) and + (self.fail_on_transient_errors or + (error_code not in self.transient_errors))): raise e return False diff --git a/airflow/contrib/sensors/gcp_transfer_sensor.py b/airflow/contrib/sensors/gcp_transfer_sensor.py new file mode 100644 index 0000000000000..a6a1c73c39dea --- /dev/null +++ b/airflow/contrib/sensors/gcp_transfer_sensor.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import six + +from airflow.contrib.hooks.gcp_transfer_hook import GCPTransferServiceHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class GCPTransferServiceWaitForJobStatusSensor(BaseSensorOperator): + """ + Waits for at least one operation belonging to the job to have the + expected status. + + :param job_name: The name of the transfer job + :type job_name: str + :param expected_statuses: The expected state of the operation. + See: + https://cloud.google.com/storage-transfer/docs/reference/rest/v1/transferOperations#Status + :type expected_statuses: set[str] or string + :param project_id: (Optional) the ID of the project that owns the Transfer + Job. If set to None or missing, the default project_id from the GCP + connection is used. + :type project_id: str + :param gcp_conn_id: The connection ID used to connect to Google Cloud + Platform. 
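To illustrate the new `fail_on_transient_errors` flag on FTPSensor above, a minimal sketch; the path, connection id and `dag` object are placeholders.

from airflow.contrib.sensors.ftp_sensor import FTPSensor

wait_for_feed = FTPSensor(
    task_id='wait_for_feed',
    path='/incoming/feed_{{ ds_nodash }}.csv',   # templated
    ftp_conn_id='ftp_default',
    fail_on_transient_errors=False,              # keep poking on 4xx transient FTP errors
    poke_interval=60,
    timeout=60 * 60,
    dag=dag,
)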
+ :type gcp_conn_id: str + """ + + # [START gcp_transfer_job_sensor_template_fields] + template_fields = ('job_name',) + # [END gcp_transfer_job_sensor_template_fields] + + @apply_defaults + def __init__( + self, + job_name, + expected_statuses, + project_id=None, + gcp_conn_id='google_cloud_default', + *args, + **kwargs + ): + super(GCPTransferServiceWaitForJobStatusSensor, self).__init__(*args, **kwargs) + self.job_name = job_name + self.expected_statuses = ( + {expected_statuses} if isinstance(expected_statuses, six.string_types) else expected_statuses + ) + self.project_id = project_id + self.gcp_cloud_conn_id = gcp_conn_id + + def poke(self, context): + hook = GCPTransferServiceHook(gcp_conn_id=self.gcp_cloud_conn_id) + operations = hook.list_transfer_operations( + filter={'project_id': self.project_id, 'job_names': [self.job_name]} + ) + + check = GCPTransferServiceHook.operations_contain_expected_statuses( + operations=operations, expected_statuses=self.expected_statuses + ) + if check: + self.xcom_push(key="sensed_operations", value=operations, context=context) + + return check diff --git a/airflow/contrib/sensors/gcs_sensor.py b/airflow/contrib/sensors/gcs_sensor.py index 23cd760a41f19..5a4f73e61e822 100644 --- a/airflow/contrib/sensors/gcs_sensor.py +++ b/airflow/contrib/sensors/gcs_sensor.py @@ -27,17 +27,17 @@ class GoogleCloudStorageObjectSensor(BaseSensorOperator): Create a new GoogleCloudStorageObjectSensor. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to check in the Google cloud storage bucket. - :type object: string + :type object: str :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ('bucket', 'object') ui_color = '#f0eee4' @@ -79,21 +79,21 @@ class GoogleCloudStorageObjectUpdatedSensor(BaseSensorOperator): Create a new GoogleCloudStorageObjectUpdatedSensor. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param object: The name of the object to download in the Google cloud storage bucket. - :type object: string + :type object: str :param ts_func: Callback for defining the update condition. The default callback returns execution_date + schedule_interval. The callback takes the context as parameter. :type ts_func: function :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ('bucket', 'object') template_ext = ('.sql',) @@ -129,17 +129,17 @@ class GoogleCloudStoragePrefixSensor(BaseSensorOperator): Create a new GoogleCloudStorageObjectSensor. :param bucket: The Google cloud storage bucket where the object is. - :type bucket: string + :type bucket: str :param prefix: The name of the prefix to check in the Google cloud storage bucket. 
- :type prefix: string + :type prefix: str :param google_cloud_storage_conn_id: The connection ID to use when connecting to Google cloud storage. - :type google_cloud_storage_conn_id: string + :type google_cloud_storage_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ template_fields = ('bucket', 'prefix') ui_color = '#f0eee4' diff --git a/airflow/contrib/sensors/hdfs_sensor.py b/airflow/contrib/sensors/hdfs_sensor.py index 832b81b8e5f25..fe9e5cfc6f4df 100644 --- a/airflow/contrib/sensors/hdfs_sensor.py +++ b/airflow/contrib/sensors/hdfs_sensor.py @@ -35,9 +35,7 @@ def poke(self, context): """ sb = self.hook(self.hdfs_conn_id).get_conn() self.log.info( - 'Poking for {self.filepath} to be a directory ' - 'with files matching {self.regex.pattern}'. - format(**locals()) + 'Poking for %s to be a directory with files matching %s', self.filepath, self.regex.pattern ) result = [f for f in sb.ls([self.filepath], include_toplevel=False) if f['file_type'] == 'f' and @@ -68,11 +66,9 @@ def poke(self, context): self.ignore_copying) result = self.filter_for_filesize(result, self.file_size) if self.be_empty: - self.log.info('Poking for filepath {self.filepath} to a empty directory' - .format(**locals())) + self.log.info('Poking for filepath %s to a empty directory', self.filepath) return len(result) == 1 and result[0]['path'] == self.filepath else: - self.log.info('Poking for filepath {self.filepath} to a non empty directory' - .format(**locals())) + self.log.info('Poking for filepath %s to a non empty directory', self.filepath) result.pop(0) return bool(result) and result[0]['file_type'] == 'f' diff --git a/airflow/contrib/sensors/imap_attachment_sensor.py b/airflow/contrib/sensors/imap_attachment_sensor.py new file mode 100644 index 0000000000000..c0eb9b6cd22c3 --- /dev/null +++ b/airflow/contrib/sensors/imap_attachment_sensor.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.imap_hook import ImapHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class ImapAttachmentSensor(BaseSensorOperator): + """ + Waits for a specific attachment on a mail server. + + :param attachment_name: The name of the attachment that will be checked. + :type attachment_name: str + :param check_regex: If set to True the attachment's name will be parsed as regular expression. + Through this you can get a broader set of attachments + that it will look for than just only the equality of the attachment name. + The default value is False. 
+ :type check_regex: bool + :param mail_folder: The mail folder in where to search for the attachment. + The default value is 'INBOX'. + :type mail_folder: str + :param conn_id: The connection to run the sensor against. + The default value is 'imap_default'. + :type conn_id: str + """ + template_fields = ('attachment_name',) + + @apply_defaults + def __init__(self, + attachment_name, + mail_folder='INBOX', + check_regex=False, + conn_id='imap_default', + *args, + **kwargs): + super(ImapAttachmentSensor, self).__init__(*args, **kwargs) + + self.attachment_name = attachment_name + self.mail_folder = mail_folder + self.check_regex = check_regex + self.conn_id = conn_id + + def poke(self, context): + """ + Pokes for a mail attachment on the mail server. + + :param context: The context that is being provided when poking. + :type context: dict + :return: True if attachment with the given name is present and False if not. + :rtype: bool + """ + self.log.info('Poking for %s', self.attachment_name) + + with ImapHook(imap_conn_id=self.conn_id) as imap_hook: + return imap_hook.has_mail_attachment( + name=self.attachment_name, + mail_folder=self.mail_folder, + check_regex=self.check_regex + ) diff --git a/airflow/contrib/sensors/mongo_sensor.py b/airflow/contrib/sensors/mongo_sensor.py new file mode 100644 index 0000000000000..8fd32a0775feb --- /dev/null +++ b/airflow/contrib/sensors/mongo_sensor.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from airflow.contrib.hooks.mongo_hook import MongoHook +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class MongoSensor(BaseSensorOperator): + """ + Checks for the existence of a document which + matches the given query in MongoDB. Example: + + >>> mongo_sensor = MongoSensor(collection="coll", + ... query={"key": "value"}, + ... mongo_conn_id="mongo_default", + ... task_id="mongo_sensor") + """ + template_fields = ('collection', 'query') + + @apply_defaults + def __init__(self, collection, query, mongo_conn_id="mongo_default", *args, **kwargs): + """ + Create a new MongoSensor + + :param collection: Target MongoDB collection. + :type collection: str + :param query: The query to find the target document. + :type query: dict + :param mongo_conn_id: The connection ID to use + when connecting to MongoDB. 
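A minimal sketch of the new ImapAttachmentSensor above; the attachment pattern, connection id and `dag` object are assumptions.

from airflow.contrib.sensors.imap_attachment_sensor import ImapAttachmentSensor

wait_for_report = ImapAttachmentSensor(
    task_id='wait_for_report',
    attachment_name=r'report_\d{8}\.csv',   # parsed as a regular expression because check_regex=True
    check_regex=True,
    mail_folder='INBOX',
    conn_id='imap_default',
    dag=dag,
)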
+ :type mongo_conn_id: str + """ + super(MongoSensor, self).__init__(*args, **kwargs) + self.mongo_conn_id = mongo_conn_id + self.collection = collection + self.query = query + + def poke(self, context): + self.log.info("Sensor check existence of the document " + "that matches the following query: %s", self.query) + hook = MongoHook(self.mongo_conn_id) + return hook.find(self.collection, self.query, find_one=True) is not None diff --git a/airflow/contrib/sensors/pubsub_sensor.py b/airflow/contrib/sensors/pubsub_sensor.py index 7d1721570dea4..8104fb56997b2 100644 --- a/airflow/contrib/sensors/pubsub_sensor.py +++ b/airflow/contrib/sensors/pubsub_sensor.py @@ -54,10 +54,10 @@ def __init__( **kwargs): """ :param project: the GCP project ID for the subscription (templated) - :type project: string + :type project: str :param subscription: the Pub/Sub subscription name. Do not include the full subscription path. - :type subscription: string + :type subscription: str :param max_messages: The maximum number of messages to retrieve per PubSub pull request :type max_messages: int @@ -69,11 +69,11 @@ def __init__( :type ack_messages: bool :param gcp_conn_id: The connection ID to use connecting to Google Cloud Platform. - :type gcp_conn_id: string + :type gcp_conn_id: str :param delegate_to: The account to impersonate, if any. For this to work, the service account making the request must have domain-wide delegation enabled. - :type delegate_to: string + :type delegate_to: str """ super(PubSubPullSensor, self).__init__(*args, **kwargs) diff --git a/airflow/contrib/sensors/python_sensor.py b/airflow/contrib/sensors/python_sensor.py new file mode 100644 index 0000000000000..7c4579a891fbb --- /dev/null +++ b/airflow/contrib/sensors/python_sensor.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults + + +class PythonSensor(BaseSensorOperator): + """ + Waits for a Python callable to return True. + + User could put input argument in templates_dict + e.g ``templates_dict = {'start_ds': 1970}`` + and access the argument by calling ``kwargs['templates_dict']['start_ds']`` + in the the callable + + :param python_callable: A reference to an object that is callable + :type python_callable: python callable + :param op_kwargs: a dictionary of keyword arguments that will get unpacked + in your function + :type op_kwargs: dict + :param op_args: a list of positional arguments that will get unpacked when + calling your callable + :type op_args: list + :param provide_context: if set to true, Airflow will pass a set of + keyword arguments that can be used in your function. 
This set of + kwargs correspond exactly to what you can use in your jinja + templates. For this to work, you need to define `**kwargs` in your + function header. + :type provide_context: bool + :param templates_dict: a dictionary where the values are templates that + will get templated by the Airflow engine sometime between + ``__init__`` and ``execute`` takes place and are made available + in your callable's context after the template has been applied. + :type templates_dict: dict of str + """ + + template_fields = ('templates_dict',) + + @apply_defaults + def __init__( + self, + python_callable, + op_args=None, + op_kwargs=None, + provide_context=False, + templates_dict=None, + *args, **kwargs): + super(PythonSensor, self).__init__(*args, **kwargs) + self.python_callable = python_callable + self.op_args = op_args or [] + self.op_kwargs = op_kwargs or {} + self.provide_context = provide_context + self.templates_dict = templates_dict + + def poke(self, context): + if self.provide_context: + context.update(self.op_kwargs) + context['templates_dict'] = self.templates_dict + self.op_kwargs = context + + self.log.info("Poking callable: %s", str(self.python_callable)) + return_value = self.python_callable(*self.op_args, **self.op_kwargs) + return bool(return_value) diff --git a/airflow/contrib/sensors/qubole_sensor.py b/airflow/contrib/sensors/qubole_sensor.py index d67fa8557e7ad..08e4290570fe0 100644 --- a/airflow/contrib/sensors/qubole_sensor.py +++ b/airflow/contrib/sensors/qubole_sensor.py @@ -29,14 +29,6 @@ class QuboleSensor(BaseSensorOperator): """ Base class for all Qubole Sensors - - :param qubole_conn_id: The qubole connection to run the sensor against - :type qubole_conn_id: string - :param data: a JSON object containing payload, whose presence needs to be checked - :type data: a JSON object - - .. note:: Both ``data`` and ``qubole_conn_id`` fields are template-supported. You can - also use ``.txt`` files for template driven use cases. """ template_fields = ('data', 'qubole_conn_id') @@ -75,6 +67,22 @@ def poke(self, context): class QuboleFileSensor(QuboleSensor): + """ + Wait for a file or folder to be present in cloud storage + and check for its presence via QDS APIs + + :param qubole_conn_id: Connection id which consists of qds auth_token + :type qubole_conn_id: str + :param data: a JSON object containing payload, whose presence needs to be checked + Check this `example `_ for sample payload + structure. + :type data: a JSON object + + .. note:: Both ``data`` and ``qubole_conn_id`` fields support templating. You can + also use ``.txt`` files for template-driven use cases. + """ + @apply_defaults def __init__(self, *args, **kwargs): self.sensor_class = FileSensor @@ -82,6 +90,22 @@ def __init__(self, *args, **kwargs): class QubolePartitionSensor(QuboleSensor): + """ + Wait for a Hive partition to show up in QHS (Qubole Hive Service) + and check for its presence via QDS APIs + + :param qubole_conn_id: Connection id which consists of qds auth_token + :type qubole_conn_id: str + :param data: a JSON object containing payload, whose presence needs to be checked. + Check this `example `_ for sample payload + structure. + :type data: a JSON object + + .. note:: Both ``data`` and ``qubole_conn_id`` fields support templating. You can + also use ``.txt`` files for template-driven use cases. 
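A small sketch of the new PythonSensor above, showing how `templates_dict` and `provide_context` reach the callable; the threshold value and the `dag` object are arbitrary assumptions.

from airflow.contrib.sensors.python_sensor import PythonSensor

def _partition_is_ready(threshold, **kwargs):
    # templates_dict values arrive already rendered by the Airflow engine
    return int(kwargs['templates_dict']['ds_nodash']) >= threshold

wait_for_partition = PythonSensor(
    task_id='wait_for_partition',
    python_callable=_partition_is_ready,
    op_kwargs={'threshold': 20190101},   # merged into the kwargs passed to the callable
    provide_context=True,
    templates_dict={'ds_nodash': '{{ ds_nodash }}'},
    dag=dag,
)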
+ """ + @apply_defaults def __init__(self, *args, **kwargs): self.sensor_class = PartitionSensor diff --git a/airflow/contrib/sensors/redis_key_sensor.py b/airflow/contrib/sensors/redis_key_sensor.py index baf3e161f5365..4c0ac68840b50 100644 --- a/airflow/contrib/sensors/redis_key_sensor.py +++ b/airflow/contrib/sensors/redis_key_sensor.py @@ -23,25 +23,17 @@ class RedisKeySensor(BaseSensorOperator): """ - Checks for the existence of a key in a Redis database + Checks for the existence of a key in a Redis """ template_fields = ('key',) ui_color = '#f0eee4' @apply_defaults def __init__(self, key, redis_conn_id, *args, **kwargs): - """ - Create a new RedisKeySensor - - :param key: The key to be monitored - :type key: string - :param redis_conn_id: The connection ID to use when connecting to Redis DB. - :type redis_conn_id: string - """ super(RedisKeySensor, self).__init__(*args, **kwargs) self.redis_conn_id = redis_conn_id self.key = key def poke(self, context): - self.log.info('Sensor check existence of key: %s', self.key) - return RedisHook(self.redis_conn_id).key_exists(self.key) + self.log.info('Sensor checks for existence of key: %s', self.key) + return RedisHook(self.redis_conn_id).get_conn().exists(self.key) diff --git a/airflow/contrib/sensors/redis_pub_sub_sensor.py b/airflow/contrib/sensors/redis_pub_sub_sensor.py new file mode 100644 index 0000000000000..474a8d21d61ef --- /dev/null +++ b/airflow/contrib/sensors/redis_pub_sub_sensor.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from airflow.contrib.hooks.redis_hook import RedisHook + + +class RedisPubSubSensor(BaseSensorOperator): + + """ + Redis sensor for reading a message from pub sub channels + """ + template_fields = ('channels',) + ui_color = '#f0eee4' + + @apply_defaults + def __init__(self, channels, redis_conn_id, *args, **kwargs): + """ + Create a new RedisPubSubSensor and subscribe to the channels + + :param channels: The channels to be subscribed to (templated) + :type channels: str or list of str + :param redis_conn_id: the redis connection id + :type redis_conn_id: str + """ + + super(RedisPubSubSensor, self).__init__(*args, **kwargs) + self.channels = channels + self.redis_conn_id = redis_conn_id + self.pubsub = RedisHook(redis_conn_id=self.redis_conn_id).get_conn().pubsub() + self.pubsub.subscribe(self.channels) + + def poke(self, context): + """ + Check for message on subscribed channels and write to xcom the message with key ``message`` + + An example of message ``{'type': 'message', 'pattern': None, 'channel': b'test', 'data': b'hello'}`` + + :param context: the context object + :type context: dict + :return: ``True`` if message (with type 'message') is available or ``False`` if not + """ + self.log.info('RedisPubSubSensor checking for message on channels: %s', self.channels) + + message = self.pubsub.get_message() + self.log.info('Message %s from channel %s', message, self.channels) + + # Process only message types + if message and message['type'] == 'message': + + context['ti'].xcom_push(key='message', value=message) + self.pubsub.unsubscribe(self.channels) + + return True + + return False diff --git a/airflow/contrib/sensors/sagemaker_base_sensor.py b/airflow/contrib/sensors/sagemaker_base_sensor.py new file mode 100644 index 0000000000000..10dd6b2357a66 --- /dev/null +++ b/airflow/contrib/sensors/sagemaker_base_sensor.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils.decorators import apply_defaults +from airflow.exceptions import AirflowException + + +class SageMakerBaseSensor(BaseSensorOperator): + """ + Contains general sensor behavior for SageMaker. + Subclasses should implement get_sagemaker_response() + and state_from_response() methods. + Subclasses should also implement NON_TERMINAL_STATES and FAILED_STATE methods. 
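To make the required overrides concrete, a purely hypothetical SageMakerBaseSensor subclass is sketched below; `describe_foo_job` and the state names are invented placeholders, not a real SageMaker API.

from airflow.contrib.hooks.sagemaker_hook import SageMakerHook
from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor
from airflow.utils.decorators import apply_defaults


class SageMakerFooJobSensor(SageMakerBaseSensor):
    # Hypothetical subclass illustrating the overridable hooks of the base sensor.

    @apply_defaults
    def __init__(self, job_name, *args, **kwargs):
        super(SageMakerFooJobSensor, self).__init__(*args, **kwargs)
        self.job_name = job_name

    def non_terminal_states(self):
        return {'InProgress', 'Stopping'}

    def failed_states(self):
        return {'Failed'}

    def get_sagemaker_response(self):
        hook = SageMakerHook(aws_conn_id=self.aws_conn_id)
        # describe_foo_job stands in for whichever describe_* call the real job type needs
        return hook.get_conn().describe_foo_job(JobName=self.job_name)

    def get_failed_reason_from_response(self, response):
        return response.get('FailureReason', 'Unknown')

    def state_from_response(self, response):
        return response['FooJobStatus']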
+ """ + ui_color = '#ededed' + + @apply_defaults + def __init__( + self, + aws_conn_id='aws_default', + *args, **kwargs): + super(SageMakerBaseSensor, self).__init__(*args, **kwargs) + self.aws_conn_id = aws_conn_id + + def poke(self, context): + response = self.get_sagemaker_response() + + if not response['ResponseMetadata']['HTTPStatusCode'] == 200: + self.log.info('Bad HTTP response: %s', response) + return False + + state = self.state_from_response(response) + + self.log.info('Job currently %s', state) + + if state in self.non_terminal_states(): + return False + + if state in self.failed_states(): + failed_reason = self.get_failed_reason_from_response(response) + raise AirflowException('Sagemaker job failed for the following reason: %s' + % failed_reason) + return True + + def non_terminal_states(self): + raise NotImplementedError('Please implement non_terminal_states() in subclass') + + def failed_states(self): + raise NotImplementedError('Please implement failed_states() in subclass') + + def get_sagemaker_response(self): + raise NotImplementedError('Please implement get_sagemaker_response() in subclass') + + def get_failed_reason_from_response(self, response): + return 'Unknown' + + def state_from_response(self, response): + raise NotImplementedError('Please implement state_from_response() in subclass') diff --git a/airflow/contrib/sensors/sagemaker_endpoint_sensor.py b/airflow/contrib/sensors/sagemaker_endpoint_sensor.py new file mode 100644 index 0000000000000..ceed9c10097d9 --- /dev/null +++ b/airflow/contrib/sensors/sagemaker_endpoint_sensor.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.contrib.hooks.sagemaker_hook import SageMakerHook +from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor +from airflow.utils.decorators import apply_defaults + + +class SageMakerEndpointSensor(SageMakerBaseSensor): + """ + Asks for the state of the endpoint state until it reaches a terminal state. + If it fails the sensor errors, the task fails. 
+
+    :param endpoint_name: name of the endpoint instance to check the state of
+    :type endpoint_name: str
+    """
+
+    template_fields = ['endpoint_name']
+    template_ext = ()
+
+    @apply_defaults
+    def __init__(self,
+                 endpoint_name,
+                 *args,
+                 **kwargs):
+        super(SageMakerEndpointSensor, self).__init__(*args, **kwargs)
+        self.endpoint_name = endpoint_name
+
+    def non_terminal_states(self):
+        return SageMakerHook.endpoint_non_terminal_states
+
+    def failed_states(self):
+        return SageMakerHook.failed_states
+
+    def get_sagemaker_response(self):
+        sagemaker = SageMakerHook(aws_conn_id=self.aws_conn_id)
+
+        self.log.info('Poking Sagemaker Endpoint %s', self.endpoint_name)
+        return sagemaker.describe_endpoint(self.endpoint_name)
+
+    def get_failed_reason_from_response(self, response):
+        return response['FailureReason']
+
+    def state_from_response(self, response):
+        return response['EndpointStatus']
diff --git a/airflow/contrib/sensors/sagemaker_training_sensor.py b/airflow/contrib/sensors/sagemaker_training_sensor.py
new file mode 100644
index 0000000000000..d550a6c434bfb
--- /dev/null
+++ b/airflow/contrib/sensors/sagemaker_training_sensor.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import time
+
+from airflow.contrib.hooks.sagemaker_hook import SageMakerHook, LogState
+from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor
+from airflow.utils.decorators import apply_defaults
+
+
+class SageMakerTrainingSensor(SageMakerBaseSensor):
+    """
+    Polls the state of the training job until it reaches a terminal state.
+    If the job fails, the sensor raises an error and the task fails.
+ + :param job_name: name of the SageMaker training job to check the state of + :type job_name: str + :param print_log: if the operator should print the cloudwatch log + :type print_log: bool + """ + + template_fields = ['job_name'] + template_ext = () + + @apply_defaults + def __init__(self, + job_name, + print_log=True, + *args, + **kwargs): + super(SageMakerTrainingSensor, self).__init__(*args, **kwargs) + self.job_name = job_name + self.print_log = print_log + self.positions = {} + self.stream_names = [] + self.instance_count = None + self.state = None + self.last_description = None + self.last_describe_job_call = None + self.log_resource_inited = False + + def init_log_resource(self, hook): + description = hook.describe_training_job(self.job_name) + self.instance_count = description['ResourceConfig']['InstanceCount'] + + status = description['TrainingJobStatus'] + job_already_completed = status not in self.non_terminal_states() + self.state = LogState.TAILING if not job_already_completed else LogState.COMPLETE + self.last_description = description + self.last_describe_job_call = time.time() + self.log_resource_inited = True + + def non_terminal_states(self): + return SageMakerHook.non_terminal_states + + def failed_states(self): + return SageMakerHook.failed_states + + def get_sagemaker_response(self): + sagemaker_hook = SageMakerHook(aws_conn_id=self.aws_conn_id) + if self.print_log: + if not self.log_resource_inited: + self.init_log_resource(sagemaker_hook) + self.state, self.last_description, self.last_describe_job_call = \ + sagemaker_hook.describe_training_job_with_log(self.job_name, + self.positions, self.stream_names, + self.instance_count, self.state, + self.last_description, + self.last_describe_job_call) + else: + self.last_description = sagemaker_hook.describe_training_job(self.job_name) + + status = self.state_from_response(self.last_description) + if status not in self.non_terminal_states() and status not in self.failed_states(): + billable_time = \ + (self.last_description['TrainingEndTime'] - self.last_description['TrainingStartTime']) * \ + self.last_description['ResourceConfig']['InstanceCount'] + self.log.info('Billable seconds: %s', int(billable_time.total_seconds()) + 1) + + return self.last_description + + def get_failed_reason_from_response(self, response): + return response['FailureReason'] + + def state_from_response(self, response): + return response['TrainingJobStatus'] diff --git a/airflow/contrib/sensors/sagemaker_transform_sensor.py b/airflow/contrib/sensors/sagemaker_transform_sensor.py new file mode 100644 index 0000000000000..f64724bde9b24 --- /dev/null +++ b/airflow/contrib/sensors/sagemaker_transform_sensor.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
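The SageMaker sensors in this change all follow the SageMakerBaseSensor contract shown above. A minimal sketch of a hypothetical subclass; the class name, state names and canned response dictionary are invented purely for illustration::

    from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor


    class ExampleSageMakerJobSensor(SageMakerBaseSensor):
        def non_terminal_states(self):
            return {'InProgress', 'Stopping'}

        def failed_states(self):
            return {'Failed'}

        def get_sagemaker_response(self):
            # A real subclass would call a SageMakerHook describe_* method here.
            return {'ResponseMetadata': {'HTTPStatusCode': 200},
                    'JobStatus': 'Completed'}

        def state_from_response(self, response):
            return response['JobStatus']

    # poke() returns False while the state is non-terminal, raises AirflowException
    # for a failed state, and returns True for any other (terminal) state.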
+
+from airflow.contrib.hooks.sagemaker_hook import SageMakerHook
+from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor
+from airflow.utils.decorators import apply_defaults
+
+
+class SageMakerTransformSensor(SageMakerBaseSensor):
+    """
+    Polls the state of the transform job until it reaches a terminal state.
+    The sensor will error if the job errors, throwing an AirflowException
+    containing the failure reason.
+
+    :param job_name: name of the transform job to check the state of
+    :type job_name: str
+    """
+
+    template_fields = ['job_name']
+    template_ext = ()
+
+    @apply_defaults
+    def __init__(self,
+                 job_name,
+                 *args,
+                 **kwargs):
+        super(SageMakerTransformSensor, self).__init__(*args, **kwargs)
+        self.job_name = job_name
+
+    def non_terminal_states(self):
+        return SageMakerHook.non_terminal_states
+
+    def failed_states(self):
+        return SageMakerHook.failed_states
+
+    def get_sagemaker_response(self):
+        sagemaker = SageMakerHook(aws_conn_id=self.aws_conn_id)
+
+        self.log.info('Poking Sagemaker Transform Job %s', self.job_name)
+        return sagemaker.describe_transform_job(self.job_name)
+
+    def get_failed_reason_from_response(self, response):
+        return response['FailureReason']
+
+    def state_from_response(self, response):
+        return response['TransformJobStatus']
diff --git a/airflow/contrib/sensors/sagemaker_tuning_sensor.py b/airflow/contrib/sensors/sagemaker_tuning_sensor.py
new file mode 100644
index 0000000000000..8c835216d6bb3
--- /dev/null
+++ b/airflow/contrib/sensors/sagemaker_tuning_sensor.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from airflow.contrib.hooks.sagemaker_hook import SageMakerHook
+from airflow.contrib.sensors.sagemaker_base_sensor import SageMakerBaseSensor
+from airflow.utils.decorators import apply_defaults
+
+
+class SageMakerTuningSensor(SageMakerBaseSensor):
+    """
+    Polls the state of the tuning job until it reaches a terminal state.
+    The sensor will error if the job errors, throwing an AirflowException
+    containing the failure reason.
+ + :param job_name: job_name of the tuning instance to check the state of + :type job_name: str + """ + + template_fields = ['job_name'] + template_ext = () + + @apply_defaults + def __init__(self, + job_name, + *args, + **kwargs): + super(SageMakerTuningSensor, self).__init__(*args, **kwargs) + self.job_name = job_name + + def non_terminal_states(self): + return SageMakerHook.non_terminal_states + + def failed_states(self): + return SageMakerHook.failed_states + + def get_sagemaker_response(self): + sagemaker = SageMakerHook(aws_conn_id=self.aws_conn_id) + + self.log.info('Poking Sagemaker Tuning Job %s', self.job_name) + return sagemaker.describe_tuning_job(self.job_name) + + def get_failed_reason_from_response(self, response): + return response['FailureReason'] + + def state_from_response(self, response): + return response['HyperParameterTuningJobStatus'] diff --git a/airflow/contrib/sensors/sftp_sensor.py b/airflow/contrib/sensors/sftp_sensor.py index 51ad11ddbf68b..0b61c801514cc 100644 --- a/airflow/contrib/sensors/sftp_sensor.py +++ b/airflow/contrib/sensors/sftp_sensor.py @@ -17,7 +17,6 @@ # specific language governing permissions and limitations # under the License. -import logging from paramiko import SFTP_NO_SUCH_FILE from airflow.contrib.hooks.sftp_hook import SFTPHook from airflow.operators.sensors import BaseSensorOperator @@ -27,6 +26,7 @@ class SFTPSensor(BaseSensorOperator): """ Waits for a file or directory to be present on SFTP. + :param path: Remote file or directory path :type path: str :param sftp_conn_id: The connection to run the sensor against @@ -41,7 +41,7 @@ def __init__(self, path, sftp_conn_id='sftp_default', *args, **kwargs): self.hook = SFTPHook(sftp_conn_id) def poke(self, context): - logging.info('Poking for %s', self.path) + self.log.info('Poking for %s', self.path) try: self.hook.get_mod_time(self.path) except IOError as e: diff --git a/airflow/contrib/sensors/wasb_sensor.py b/airflow/contrib/sensors/wasb_sensor.py index ec6a63bf3d633..8b32d3fa221d9 100644 --- a/airflow/contrib/sensors/wasb_sensor.py +++ b/airflow/contrib/sensors/wasb_sensor.py @@ -90,10 +90,7 @@ def __init__(self, container_name, prefix, wasb_conn_id='wasb_default', self.check_options = check_options def poke(self, context): - self.log.info( - 'Poking for prefix: {self.prefix}\n' - 'in wasb://{self.container_name}'.format(**locals()) - ) + self.log.info('Poking for prefix: %s in wasb://%s', self.prefix, self.container_name) hook = WasbHook(wasb_conn_id=self.wasb_conn_id) return hook.check_for_prefix(self.container_name, self.prefix, **self.check_options) diff --git a/airflow/contrib/sensors/weekday_sensor.py b/airflow/contrib/sensors/weekday_sensor.py new file mode 100644 index 0000000000000..8d99e11fb459f --- /dev/null +++ b/airflow/contrib/sensors/weekday_sensor.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import six +from airflow.contrib.utils.weekday import WeekDay +from airflow.sensors.base_sensor_operator import BaseSensorOperator +from airflow.utils import timezone +from airflow.utils.decorators import apply_defaults + + +class DayOfWeekSensor(BaseSensorOperator): + """ + Waits until the first specified day of the week. For example, if the execution + day of the task is '2018-12-22' (Saturday) and you pass 'FRIDAY', the task will wait + until next Friday. + + **Example** (with single day): :: + + weekend_check = DayOfWeekSensor( + task_id='weekend_check', + week_day='Saturday', + use_task_execution_day=True, + dag=dag) + + **Example** (with multiple day using set): :: + + weekend_check = DayOfWeekSensor( + task_id='weekend_check', + week_day={'Saturday', 'Sunday'}, + use_task_execution_day=True, + dag=dag) + + **Example** (with :class:`~airflow.contrib.utils.weekday.WeekDay` enum): :: + + # import WeekDay Enum + from airflow.contrib.utils.weekday import WeekDay + + weekend_check = DayOfWeekSensor( + task_id='weekend_check', + week_day={WeekDay.SATURDAY, WeekDay.SUNDAY}, + use_task_execution_day=True, + dag=dag) + + :param week_day: Day of the week to check (full name). Optionally, a set + of days can also be provided using a set. + Example values: + + * ``"MONDAY"``, + * ``{"Saturday", "Sunday"}`` + * ``{WeekDay.TUESDAY}`` + * ``{WeekDay.SATURDAY, WeekDay.SUNDAY}`` + + :type week_day: set or str or airflow.contrib.utils.weekday.WeekDay + :param use_task_execution_day: If ``True``, uses task's execution day to compare + with week_day. Execution Date is Useful for backfilling. + If ``False``, uses system's day of the week. Useful when you + don't want to run anything on weekdays on the system. + :type use_task_execution_day: bool + """ + + @apply_defaults + def __init__(self, week_day, + use_task_execution_day=False, + *args, **kwargs): + super(DayOfWeekSensor, self).__init__(*args, **kwargs) + self.week_day = week_day + self.use_task_execution_day = use_task_execution_day + if isinstance(self.week_day, six.string_types): + self._week_day_num = {WeekDay.get_weekday_number(week_day_str=self.week_day)} + elif isinstance(self.week_day, WeekDay): + self._week_day_num = {self.week_day} + elif isinstance(self.week_day, set): + if all(isinstance(day, six.string_types) for day in self.week_day): + self._week_day_num = {WeekDay.get_weekday_number(day) for day in week_day} + elif all(isinstance(day, WeekDay) for day in self.week_day): + self._week_day_num = self.week_day + else: + raise TypeError( + 'Unsupported Type for week_day parameter: {}. 
It should be one of str' + ', set or Weekday enum type'.format(type(week_day))) + + def poke(self, context): + self.log.info('Poking until weekday is in %s, Today is %s', + self.week_day, + WeekDay(timezone.utcnow().isoweekday()).name) + if self.use_task_execution_day: + return context['execution_date'].isoweekday() in self._week_day_num + else: + return timezone.utcnow().isoweekday() in self._week_day_num diff --git a/airflow/contrib/task_runner/cgroup_task_runner.py b/airflow/contrib/task_runner/cgroup_task_runner.py index a97eac2af88ef..4662b0fe82f5a 100644 --- a/airflow/contrib/task_runner/cgroup_task_runner.py +++ b/airflow/contrib/task_runner/cgroup_task_runner.py @@ -117,13 +117,13 @@ def start(self): "creating another one", cgroups.get("cpu"), cgroups.get("memory") ) - self.process = self.run_command(['bash', '-c'], join_args=True) + self.process = self.run_command() return # Create a unique cgroup name cgroup_name = "airflow/{}/{}".format(datetime.datetime.utcnow(). strftime("%Y-%m-%d"), - str(uuid.uuid1())) + str(uuid.uuid4())) self.mem_cgroup_name = "memory/{}".format(cgroup_name) self.cpu_cgroup_name = "cpu/{}".format(cgroup_name) @@ -193,7 +193,8 @@ def on_finish(self): if self._created_cpu_cgroup: self._delete_cgroup(self.cpu_cgroup_name) - def _get_cgroup_names(self): + @staticmethod + def _get_cgroup_names(): """ :return: a mapping between the subsystem name to the cgroup name :rtype: dict[str, str] diff --git a/airflow/contrib/utils/gcp_field_sanitizer.py b/airflow/contrib/utils/gcp_field_sanitizer.py new file mode 100644 index 0000000000000..8103e6a7f0644 --- /dev/null +++ b/airflow/contrib/utils/gcp_field_sanitizer.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Sanitizer for body fields sent via GCP API. + +The sanitizer removes fields specified from the body. + +Context +------- +In some cases where GCP operation requires modification of existing resources (such +as instances or instance templates) we need to sanitize body of the resources returned +via GCP APIs. This is in the case when we retrieve information from GCP first, +modify the body and either update the existing resource or create a new one with the +modified body. Usually when you retrieve resource from GCP you get some extra fields which +are Output-only, and we need to delete those fields if we want to use +the body as input for subsequent create/insert type operation. + + +Field specification +------------------- + +Specification of fields is an array of strings which denote names of fields to be removed. +The field can be either direct field name to remove from the body or the full +specification of the path you should delete - separated with '.' 
+ + +>>> FIELDS_TO_SANITIZE = [ +>>> "kind", +>>> "properties.disks.kind", +>>> "properties.metadata.kind", +>>>] +>>> body = { +>>> "kind": "compute#instanceTemplate", +>>> "name": "instance", +>>> "properties": { +>>> "disks": [ +>>> { +>>> "name": "a", +>>> "kind": "compute#attachedDisk", +>>> "type": "PERSISTENT", +>>> "mode": "READ_WRITE", +>>> }, +>>> { +>>> "name": "b", +>>> "kind": "compute#attachedDisk", +>>> "type": "PERSISTENT", +>>> "mode": "READ_WRITE", +>>> } +>>> ], +>>> "metadata": { +>>> "kind": "compute#metadata", +>>> "fingerprint": "GDPUYxlwHe4=" +>>> }, +>>> } +>>> } +>>> sanitizer=GcpBodyFieldSanitizer(FIELDS_TO_SANITIZE) +>>> SANITIZED_BODY = sanitizer.sanitize(body) +>>> json.dumps(SANITIZED_BODY, indent=2) +{ + "name": "instance", + "properties": { + "disks": [ + { + "name": "a", + "type": "PERSISTENT", + "mode": "READ_WRITE", + }, + { + "name": "b", + "type": "PERSISTENT", + "mode": "READ_WRITE", + } + ], + "metadata": { + "fingerprint": "GDPUYxlwHe4=" + }, + } +} + +Note that the components of the path can be either dictionaries or arrays of dictionaries. +In case they are dictionaries, subsequent component names key of the field, in case of +arrays - the sanitizer iterates through all dictionaries in the array and searches +components in all elements of the array. +""" + +from typing import List + +from airflow import LoggingMixin, AirflowException + + +class GcpFieldSanitizerException(AirflowException): + """Thrown when sanitizer finds unexpected field type in the path + (other than dict or array). + """ + + def __init__(self, message): + super(GcpFieldSanitizerException, self).__init__(message) + + +class GcpBodyFieldSanitizer(LoggingMixin): + """Sanitizes the body according to specification. + + :param sanitize_specs: array of strings that specifies which fields to remove + :type sanitize_specs: list[str] + + """ + def __init__(self, sanitize_specs): + # type: (List[str]) -> None + super(GcpBodyFieldSanitizer, self).__init__() + self._sanitize_specs = sanitize_specs + + def _sanitize(self, dictionary, remaining_field_spec, current_path): + field_split = remaining_field_spec.split(".", 1) + if len(field_split) == 1: + field_name = field_split[0] + if field_name in dictionary: + self.log.info("Deleted %s [%s]", field_name, current_path) + del dictionary[field_name] + else: + self.log.debug( + "The field %s is missing in %s at the path %s.", field_name, dictionary, current_path + ) + else: + field_name = field_split[0] + remaining_path = field_split[1] + child = dictionary.get(field_name) + if child is None: + self.log.debug( + "The field %s is missing in %s at the path %s. ", field_name, dictionary, current_path + ) + elif isinstance(child, dict): + self._sanitize(child, remaining_path, "{}.{}".format( + current_path, field_name)) + elif isinstance(child, list): + for index, elem in enumerate(child): + if not isinstance(elem, dict): + self.log.warn( + "The field %s element at index %s is of wrong type. " + "It should be dict and is %s. Skipping it.", + current_path, index, elem) + self._sanitize(elem, remaining_path, "{}.{}[{}]".format( + current_path, field_name, index)) + else: + self.log.warn( + "The field %s is of wrong type. It should be dict or list and it is %s. 
Skipping it.", + current_path, child + ) + + def sanitize(self, body): + for elem in self._sanitize_specs: + self._sanitize(body, elem, "") diff --git a/airflow/contrib/utils/gcp_field_validator.py b/airflow/contrib/utils/gcp_field_validator.py new file mode 100644 index 0000000000000..73e37f3e41be9 --- /dev/null +++ b/airflow/contrib/utils/gcp_field_validator.py @@ -0,0 +1,444 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Validator for body fields sent via GCP API. + +The validator performs validation of the body (being dictionary of fields) that +is sent in the API request to Google Cloud (via googleclient API usually). + +Context +------- +The specification mostly focuses on helping Airflow DAG developers in the development +phase. You can build your own GCP operator (such as GcfDeployOperator for example) which +can have built-in validation specification for the particular API. It's super helpful +when developer plays with different fields and their values at the initial phase of +DAG development. Most of the Google Cloud APIs perform their own validation on the +server side, but most of the requests are asynchronous and you need to wait for result +of the operation. This takes precious times and slows +down iteration over the API. BodyFieldValidator is meant to be used on the client side +and it should therefore provide an instant feedback to the developer on misspelled or +wrong type of parameters. + +The validation should be performed in "execute()" method call in order to allow +template parameters to be expanded before validation is performed. + +Types of fields +--------------- + +Specification is an array of dictionaries - each dictionary describes field, its type, +validation, optionality, api_version supported and nested fields (for unions and dicts). + +Typically (for clarity and in order to aid syntax highlighting) the array of +dicts should be defined as series of dict() executions. Fragment of example +specification might look as follows:: + + SPECIFICATION =[ + dict(name="an_union", type="union", optional=True, fields=[ + dict(name="variant_1", type="dict"), + dict(name="variant_2", regexp=r'^.+$', api_version='v1beta2'), + ), + dict(name="an_union", type="dict", fields=[ + dict(name="field_1", type="dict"), + dict(name="field_2", regexp=r'^.+$'), + ), + ... + ] + + +Each field should have key = "name" indicating field name. The field can be of one of the +following types: + +* Dict fields: (key = "type", value="dict"): + Field of this type should contain nested fields in form of an array of dicts. + Each of the fields in the array is then expected (unless marked as optional) + and validated recursively. 
If an extra field is present in the dictionary, a warning is
+  printed in the log file (but the validation succeeds - see the Forward-compatibility notes)
+* List fields: (key = "type", value="list"):
+  Field of this type should be a list. Only the type correctness is validated.
+  The contents of a list are not subject to validation.
+* Union fields (key = "type", value="union"): field of this type should contain nested
+  fields in form of an array of dicts. One of the fields (and only one) should be
+  present (unless the union is marked as optional). If more than one union field is
+  present, FieldValidationException is raised. If none of the union fields is
+  present - a warning is printed in the log (see the Forward-compatibility notes below).
+* Fields validated for non-emptiness: (key = "allow_empty") - this applies only to
+  fields the value of which is a string, and it allows checking for non-emptiness of
+  the field (allow_empty=False).
+* Regexp-validated fields: (key = "regexp") - fields of this type are assumed to be
+  strings and they are validated with the regexp specified. Remember that the regexps
+  should ideally contain ^ at the beginning and $ at the end to make sure that
+  the whole field content is validated. Typically such regexp
+  validations should be used carefully and sparingly (see the Forward-compatibility
+  notes below).
+* Custom-validated fields: (key = "custom_validation") - fields of this type are validated
+  using the method specified via the custom_validation field. Any exception thrown in the
+  custom validation will be turned into FieldValidationException and will cause validation
+  to fail. Such custom validations might be used to check numeric fields (including
+  ranges of values), booleans or any other types of fields.
+* API version: (key="api_version") if an API version is specified, then the field will only
+  be validated when the api_version used at field validator initialization matches exactly
+  the version specified. If you want to declare fields that are available in several
+  versions of the APIs, you should specify the field once per API version that should
+  be supported (each time with a different API version).
+* If none of the keys ("type", "regexp", "custom_validation") is present, the field is
+  not validated.
+
+You can see some of the field examples in EXAMPLE_VALIDATION_SPECIFICATION.
+
+
+Forward-compatibility notes
+---------------------------
+Certain decisions are crucial to allow the client APIs to work also with future API
+versions. Since the attached body is passed to the API call as-is, it is entirely
+possible to pass through any new fields in the body (for future API versions) -
+albeit without validation on the client side - they can and will still be validated
+on the server side, usually.
+
+Here are the guidelines that you should follow to make validation forward-compatible:
+
+* most of the fields are not validated for their content. It's possible to use regexp
+  in some specific cases that are guaranteed not to change in the future, but for most
+  fields regexp validation should be r'^.+$' indicating a check for non-emptiness.
+* api_version is not validated - the user can pass any future version of the API here.
+  The API version is only used to filter parameters that are marked as present in that
+  API version.
+* any new (not present in the specification) fields in the body are allowed (not verified).
+  For dictionaries, new fields can be added to dictionaries by future calls. However if an
However if an + unknown field in dictionary is added, a warning is logged by the client (but validation + remains successful). This is very nice feature to protect against typos in names. +* For unions, newly added union variants can be added by future calls and they will + pass validation, however the content or presence of those fields will not be validated. + This means that it’s possible to send a new non-validated union field together with an + old validated field and this problem will not be detected by the client. In such case + warning will be printed. +* When you add validator to an operator, you should also add ``validate_body`` parameter + (default = True) to __init__ of such operators - when it is set to False, + no validation should be performed. This is a safeguard for totally unpredicted and + backwards-incompatible changes that might sometimes occur in the APIs. + +""" + +import re +from typing import Sequence, Dict, Callable + +from airflow import LoggingMixin, AirflowException + +COMPOSITE_FIELD_TYPES = ['union', 'dict', 'list'] + + +class GcpFieldValidationException(AirflowException): + """Thrown when validation finds dictionary field not valid according to specification. + """ + + def __init__(self, message): + super(GcpFieldValidationException, self).__init__(message) + + +class GcpValidationSpecificationException(AirflowException): + """Thrown when validation specification is wrong. + + This should only happen during development as ideally + specification itself should not be invalid ;) . + """ + + def __init__(self, message): + super(GcpValidationSpecificationException, self).__init__(message) + + +def _int_greater_than_zero(value): + if int(value) <= 0: + raise GcpFieldValidationException("The available memory has to be greater than 0") + + +EXAMPLE_VALIDATION_SPECIFICATION = [ + dict(name="name", allow_empty=False), + dict(name="description", allow_empty=False, optional=True), + dict(name="availableMemoryMb", custom_validation=_int_greater_than_zero, + optional=True), + dict(name="labels", optional=True, type="dict"), + dict(name="an_union", type="union", fields=[ + dict(name="variant_1", regexp=r'^.+$'), + dict(name="variant_2", regexp=r'^.+$', api_version='v1beta2'), + dict(name="variant_3", type="dict", fields=[ + dict(name="url", regexp=r'^.+$') + ]), + dict(name="variant_4") + ]), +] + + +class GcpBodyFieldValidator(LoggingMixin): + """Validates correctness of request body according to specification. + + The specification can describe various type of + fields including custom validation, and union of fields. This validator is + to be reusable by various operators. See the EXAMPLE_VALIDATION_SPECIFICATION + for some examples and explanations of how to create specification. + + :param validation_specs: dictionary describing validation specification + :type validation_specs: list[dict] + :param api_version: Version of the api used (for example v1) + :type api_version: str + + """ + def __init__(self, validation_specs, api_version): + # type: (Sequence[Dict], str) -> None + super(GcpBodyFieldValidator, self).__init__() + self._validation_specs = validation_specs + self._api_version = api_version + + @staticmethod + def _get_field_name_with_parent(field_name, parent): + if parent: + return parent + '.' 
+ field_name + return field_name + + @staticmethod + def _sanity_checks(children_validation_specs, field_type, full_field_path, + regexp, allow_empty, custom_validation, value): + # type: (dict, str, str, str, Callable, object) -> None + if value is None and field_type != 'union': + raise GcpFieldValidationException( + "The required body field '{}' is missing. Please add it.". + format(full_field_path)) + if regexp and field_type: + raise GcpValidationSpecificationException( + "The validation specification entry '{}' has both type and regexp. " + "The regexp is only allowed without type (i.e. assume type is 'str' " + "that can be validated with regexp)".format(full_field_path)) + if allow_empty is not None and field_type: + raise GcpValidationSpecificationException( + "The validation specification entry '{}' has both type and allow_empty. " + "The allow_empty is only allowed without type (i.e. assume type is 'str' " + "that can be validated with allow_empty)".format(full_field_path)) + if children_validation_specs and field_type not in COMPOSITE_FIELD_TYPES: + raise GcpValidationSpecificationException( + "Nested fields are specified in field '{}' of type '{}'. " + "Nested fields are only allowed for fields of those types: ('{}').". + format(full_field_path, field_type, COMPOSITE_FIELD_TYPES)) + if custom_validation and field_type: + raise GcpValidationSpecificationException( + "The validation specification field '{}' has both type and " + "custom_validation. Custom validation is only allowed without type.". + format(full_field_path)) + + @staticmethod + def _validate_regexp(full_field_path, regexp, value): + # type: (str, str, str) -> None + if not re.match(regexp, value): + # Note matching of only the beginning as we assume the regexps all-or-nothing + raise GcpFieldValidationException( + "The body field '{}' of value '{}' does not match the field " + "specification regexp: '{}'.". + format(full_field_path, value, regexp)) + + @staticmethod + def _validate_is_empty(full_field_path, value): + # type: (str, str) -> None + if not value: + raise GcpFieldValidationException( + "The body field '{}' can't be empty. Please provide a value." + .format(full_field_path, value)) + + def _validate_dict(self, children_validation_specs, full_field_path, value): + # type: (dict, str, dict) -> None + for child_validation_spec in children_validation_specs: + self._validate_field(validation_spec=child_validation_spec, + dictionary_to_validate=value, + parent=full_field_path) + all_dict_keys = [spec['name'] for spec in children_validation_specs] + for field_name in value.keys(): + if field_name not in all_dict_keys: + self.log.warning( + "The field '%s' is in the body, but is not specified in the " + "validation specification '%s'. " + "This might be because you are using newer API version and " + "new field names defined for that version. 
Then the warning " + "can be safely ignored, or you might want to upgrade the operator" + "to the version that supports the new API version.", + self._get_field_name_with_parent(field_name, full_field_path), + children_validation_specs) + + def _validate_union(self, children_validation_specs, full_field_path, + dictionary_to_validate): + # type: (dict, str, dict) -> None + field_found = False + found_field_name = None + for child_validation_spec in children_validation_specs: + # Forcing optional so that we do not have to type optional = True + # in specification for all union fields + new_field_found = self._validate_field( + validation_spec=child_validation_spec, + dictionary_to_validate=dictionary_to_validate, + parent=full_field_path, + force_optional=True) + field_name = child_validation_spec['name'] + if new_field_found and field_found: + raise GcpFieldValidationException( + "The mutually exclusive fields '{}' and '{}' belonging to the " + "union '{}' are both present. Please remove one". + format(field_name, found_field_name, full_field_path)) + if new_field_found: + field_found = True + found_field_name = field_name + if not field_found: + self.log.warning( + "There is no '%s' union defined in the body %s. " + "Validation expected one of '%s' but could not find any. It's possible " + "that you are using newer API version and there is another union variant " + "defined for that version. Then the warning can be safely ignored, " + "or you might want to upgrade the operator to the version that " + "supports the new API version.", + full_field_path, dictionary_to_validate, + [field['name'] for field in children_validation_specs]) + + def _validate_field(self, validation_spec, dictionary_to_validate, parent=None, + force_optional=False): + """ + Validates if field is OK. + + :param validation_spec: specification of the field + :type validation_spec: dict + :param dictionary_to_validate: dictionary where the field should be present + :type dictionary_to_validate: dict + :param parent: full path of parent field + :type parent: str + :param force_optional: forces the field to be optional + (all union fields have force_optional set to True) + :type force_optional: bool + :return: True if the field is present + """ + field_name = validation_spec['name'] + field_type = validation_spec.get('type') + optional = validation_spec.get('optional') + regexp = validation_spec.get('regexp') + allow_empty = validation_spec.get('allow_empty') + children_validation_specs = validation_spec.get('fields') + required_api_version = validation_spec.get('api_version') + custom_validation = validation_spec.get('custom_validation') + + full_field_path = self._get_field_name_with_parent(field_name=field_name, + parent=parent) + if required_api_version and required_api_version != self._api_version: + self.log.debug( + "Skipping validation of the field '%s' for API version '%s' " + "as it is only valid for API version '%s'", + field_name, self._api_version, required_api_version) + return False + value = dictionary_to_validate.get(field_name) + + if (optional or force_optional) and value is None: + self.log.debug("The optional field '%s' is missing. 
That's perfectly OK.", full_field_path)
+            return False
+
+        # Certainly down from here the field is present (value is not None)
+        # so we should only return True from now on
+
+        self._sanity_checks(children_validation_specs=children_validation_specs,
+                            field_type=field_type,
+                            full_field_path=full_field_path,
+                            regexp=regexp,
+                            allow_empty=allow_empty,
+                            custom_validation=custom_validation,
+                            value=value)
+
+        if allow_empty is False:
+            self._validate_is_empty(full_field_path, value)
+        if regexp:
+            self._validate_regexp(full_field_path, regexp, value)
+        elif field_type == 'dict':
+            if not isinstance(value, dict):
+                raise GcpFieldValidationException(
+                    "The field '{}' should be of dictionary type according to the "
+                    "specification '{}' but it is '{}'".
+                    format(full_field_path, validation_spec, value))
+            if children_validation_specs is None:
+                self.log.debug(
+                    "The dict field '%s' has no nested fields defined in the "
+                    "specification '%s'. That's perfectly ok - its content will "
+                    "not be validated.", full_field_path, validation_spec)
+            else:
+                self._validate_dict(children_validation_specs, full_field_path, value)
+        elif field_type == 'union':
+            if not children_validation_specs:
+                raise GcpValidationSpecificationException(
+                    "The union field '{}' has no nested fields "
+                    "defined in specification '{}'. Unions should have at least one "
+                    "nested field defined.".format(full_field_path, validation_spec))
+            self._validate_union(children_validation_specs, full_field_path,
+                                 dictionary_to_validate)
+        elif field_type == 'list':
+            if not isinstance(value, list):
+                raise GcpFieldValidationException(
+                    "The field '{}' should be of list type according to the "
+                    "specification '{}' but it is '{}'".
+                    format(full_field_path, validation_spec, value))
+        elif custom_validation:
+            try:
+                custom_validation(value)
+            except Exception as e:
+                raise GcpFieldValidationException(
+                    "Error while validating custom field '{}' specified by '{}': '{}'".
+                    format(full_field_path, validation_spec, e))
+        elif field_type is None:
+            self.log.debug("The type of field '%s' is not specified in '%s'. "
+                           "Not validating its content.", full_field_path, validation_spec)
+        else:
+            raise GcpValidationSpecificationException(
+                "The field '{}' is of type '{}' in specification '{}'. "
+                "This type is unknown to validation!".format(
+                    full_field_path, field_type, validation_spec))
+        return True
+
+    def validate(self, body_to_validate):
+        """
+        Validates if the body (dictionary) follows the specification that the validator
+        was instantiated with. Raises GcpValidationSpecificationException or
+        GcpFieldValidationException in case of problems with the specification or the
+        body not conforming to the specification respectively.
+
+        :param body_to_validate: body that must follow the specification
+        :type body_to_validate: dict
+        :return: None
+        """
+        try:
+            for validation_spec in self._validation_specs:
+                self._validate_field(validation_spec=validation_spec,
+                                     dictionary_to_validate=body_to_validate)
+        except GcpFieldValidationException as e:
+            raise GcpFieldValidationException(
+                "There was an error when validating: body '{}': '{}'".
+ format(body_to_validate, e)) + all_field_names = [spec['name'] for spec in self._validation_specs + if spec.get('type') != 'union' and + spec.get('api_version') != self._api_version] + all_union_fields = [spec for spec in self._validation_specs + if spec.get('type') == 'union'] + for union_field in all_union_fields: + all_field_names.extend( + [nested_union_spec['name'] for nested_union_spec in union_field['fields'] + if nested_union_spec.get('type') != 'union' and + nested_union_spec.get('api_version') != self._api_version]) + for field_name in body_to_validate.keys(): + if field_name not in all_field_names: + self.log.warning( + "The field '%s' is in the body, but is not specified in the " + "validation specification '%s'. " + "This might be because you are using newer API version and " + "new field names defined for that version. Then the warning " + "can be safely ignored, or you might want to upgrade the operator" + "to the version that supports the new API version.", + field_name, self._validation_specs) diff --git a/airflow/contrib/operators/mlengine_operator_utils.py b/airflow/contrib/utils/mlengine_operator_utils.py similarity index 95% rename from airflow/contrib/operators/mlengine_operator_utils.py rename to airflow/contrib/utils/mlengine_operator_utils.py index 7ce784ebb4ae3..c3ca8530dee6e 100644 --- a/airflow/contrib/operators/mlengine_operator_utils.py +++ b/airflow/contrib/utils/mlengine_operator_utils.py @@ -108,22 +108,22 @@ def validate_err_and_count(summary): :param task_prefix: a prefix for the tasks. Only alphanumeric characters and hyphen are allowed (no underscores), since this will be used as dataflow job name, which doesn't allow other characters. - :type task_prefix: string + :type task_prefix: str :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP' - :type data_format: string + :type data_format: str :param input_paths: a list of input paths to be sent to BatchPrediction. - :type input_paths: list of strings + :type input_paths: list[str] :param prediction_path: GCS path to put the prediction results in. - :type prediction_path: string + :type prediction_path: str :param metric_fn_and_keys: a tuple of metric_fn and metric_keys: - metric_fn is a function that accepts a dictionary (for an instance), and returns a tuple of metric(s) that it calculates. - metric_keys is a list of strings to denote the key of each metric. - :type metric_fn_and_keys: tuple of a function and a list of strings + :type metric_fn_and_keys: tuple of a function and a list[str] :param validate_fn: a function to validate whether the averaged metric(s) is good enough to push the model. @@ -132,17 +132,17 @@ def validate_err_and_count(summary): :param batch_prediction_job_id: the id to use for the Cloud ML Batch prediction job. Passed directly to the MLEngineBatchPredictionOperator as the job_id argument. - :type batch_prediction_job_id: string + :type batch_prediction_job_id: str :param project_id: the Google Cloud Platform project id in which to execute Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s `default_args['project_id']` will be used. - :type project_id: string + :type project_id: str :param region: the Google Cloud Platform region in which to execute Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s `default_args['region']` will be used. - :type region: string + :type region: str :param dataflow_options: options to run Dataflow jobs. If None, then the `dag`'s `default_args['dataflow_default_options']` will be used. 
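A rough usage sketch for the GcpBodyFieldValidator introduced above, run against the module's own EXAMPLE_VALIDATION_SPECIFICATION. The body values are made up; note that union variants ('variant_1' here) are looked up at the top level of the body, because _validate_union() inspects the parent dictionary::

    from airflow.contrib.utils.gcp_field_validator import (
        EXAMPLE_VALIDATION_SPECIFICATION, GcpBodyFieldValidator)

    validator = GcpBodyFieldValidator(EXAMPLE_VALIDATION_SPECIFICATION, api_version='v1')
    validator.validate({
        'name': 'example-function',   # allow_empty=False, so it must be a non-empty string
        'availableMemoryMb': 256,     # checked by the custom _int_greater_than_zero validator
        'variant_1': 'some-value',    # exactly one variant of "an_union" may be present
    })  # raises GcpFieldValidationException if the body violates the specification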
@@ -152,22 +152,22 @@ def validate_err_and_count(summary): tensorflow.estimator.export_savedmodel(). It cannot be used with model_name or version_name below. See MLEngineBatchPredictionOperator for more detail. - :type model_uri: string + :type model_uri: str :param model_name: Used to indicate a model to use for prediction. Can be used in combination with version_name, but cannot be used together with model_uri. See MLEngineBatchPredictionOperator for more detail. If None, then the `dag`'s `default_args['model_name']` will be used. - :type model_name: string + :type model_name: str - :param version_name: Used to indicate a model version to use for prediciton, + :param version_name: Used to indicate a model version to use for prediction, in combination with model_name. Cannot be used together with model_uri. See MLEngineBatchPredictionOperator for more detail. If None, then the `dag`'s `default_args['version_name']` will be used. - :type version_name: string + :type version_name: str :param dag: The `DAG` to use for all Operators. - :type dag: airflow.DAG + :type dag: airflow.models.DAG :returns: a tuple of three operators, (prediction, summary, validation) :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator, @@ -213,7 +213,7 @@ def validate_err_and_count(summary): evaluate_summary = DataFlowPythonOperator( task_id=(task_prefix + "-summary"), py_options=["-m"], - py_file="airflow.contrib.operators.mlengine_prediction_summary", + py_file="airflow.contrib.utils.mlengine_prediction_summary", dataflow_default_options=dataflow_options, options={ "prediction_path": prediction_path, diff --git a/airflow/contrib/operators/mlengine_prediction_summary.py b/airflow/contrib/utils/mlengine_prediction_summary.py similarity index 93% rename from airflow/contrib/operators/mlengine_prediction_summary.py rename to airflow/contrib/utils/mlengine_prediction_summary.py index 17fc2c090379e..def793c1be001 100644 --- a/airflow/contrib/operators/mlengine_prediction_summary.py +++ b/airflow/contrib/utils/mlengine_prediction_summary.py @@ -1,3 +1,4 @@ +# flake8: noqa: F841 # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. 
See the NOTICE file distributed with @@ -102,24 +103,26 @@ def metric_fn(inst): class JsonCoder(object): - def encode(self, x): + @staticmethod + def encode(x): return json.dumps(x) - def decode(self, x): + @staticmethod + def decode(x): return json.loads(x) @beam.ptransform_fn def MakeSummary(pcoll, metric_fn, metric_keys): # pylint: disable=invalid-name return ( - pcoll - | "ApplyMetricFnPerInstance" >> beam.Map(metric_fn) - | "PairWith1" >> beam.Map(lambda tup: tup + (1,)) - | "SumTuple" >> beam.CombineGlobally(beam.combiners.TupleCombineFn( - *([sum] * (len(metric_keys) + 1)))) - | "AverageAndMakeDict" >> beam.Map( + pcoll | + "ApplyMetricFnPerInstance" >> beam.Map(metric_fn) | + "PairWith1" >> beam.Map(lambda tup: tup + (1,)) | + "SumTuple" >> beam.CombineGlobally(beam.combiners.TupleCombineFn( + *([sum] * (len(metric_keys) + 1)))) | + "AverageAndMakeDict" >> beam.Map( lambda tup: dict( - [(name, tup[i]/tup[-1]) for i, name in enumerate(metric_keys)] + + [(name, tup[i] / tup[-1]) for i, name in enumerate(metric_keys)] + [("count", tup[-1])]))) diff --git a/airflow/contrib/utils/sendgrid.py b/airflow/contrib/utils/sendgrid.py index 9055c97879e17..5082c7a3328bf 100644 --- a/airflow/contrib/utils/sendgrid.py +++ b/airflow/contrib/utils/sendgrid.py @@ -42,7 +42,7 @@ def send_email(to, subject, html_content, files=None, To use this plugin: 0. include sendgrid subpackage as part of your Airflow installation, e.g., - pip install airflow[sendgrid] + pip install 'apache-airflow[sendgrid]' 1. update [email] backend in airflow.cfg, i.e., [email] email_backend = airflow.contrib.utils.sendgrid.send_email @@ -88,7 +88,7 @@ def send_email(to, subject, html_content, files=None, basename = os.path.basename(fname) attachment = Attachment() with open(fname, "rb") as f: - attachment.content = base64.b64encode(f.read()) + attachment.content = str(base64.b64encode(f.read()), 'utf-8') attachment.type = mimetypes.guess_type(basename)[0] attachment.filename = basename attachment.disposition = "attachment" diff --git a/airflow/contrib/utils/weekday.py b/airflow/contrib/utils/weekday.py new file mode 100644 index 0000000000000..8630e5a3dca7c --- /dev/null +++ b/airflow/contrib/utils/weekday.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import enum + + +@enum.unique +class WeekDay(enum.IntEnum): + """ + Python Enum containing Days of the Week + """ + MONDAY = 1 + TUESDAY = 2 + WEDNESDAY = 3 + THURSDAY = 4 + FRIDAY = 5 + SATURDAY = 6 + SUNDAY = 7 + + @classmethod + def get_weekday_number(cls, week_day_str): + """ + Return the ISO Week Day Number for a Week Day + + :param week_day_str: Full Name of the Week Day. 
Example: "Sunday" + :type week_day_str: str + :return: ISO Week Day Number corresponding to the provided Weekday + """ + sanitized_week_day_str = week_day_str.upper() + + if sanitized_week_day_str not in cls.__members__: + raise AttributeError( + 'Invalid Week Day passed: "{}"'.format(week_day_str) + ) + + return cls[sanitized_week_day_str] diff --git a/airflow/dag/base_dag.py b/airflow/dag/base_dag.py index 5719f572e9b4c..0e65775d41424 100644 --- a/airflow/dag/base_dag.py +++ b/airflow/dag/base_dag.py @@ -96,6 +96,6 @@ def dag_ids(self): def get_dag(self, dag_id): """ :return: whether the task exists in this bag - :rtype: BaseDag + :rtype: airflow.dag.base_dag.BaseDag """ raise NotImplementedError() diff --git a/airflow/default_login.py b/airflow/default_login.py index d44dbf39ea9b3..e423199fe01a9 100644 --- a/airflow/default_login.py +++ b/airflow/default_login.py @@ -25,11 +25,11 @@ """ import flask_login -from flask_login import login_required, current_user, logout_user +from flask_login import login_required, current_user, logout_user # noqa: F401 from flask import url_for, redirect -from airflow import settings +from airflow import settings # noqa: F401 from airflow import models from airflow.utils.db import provide_session @@ -44,14 +44,17 @@ class DefaultUser(object): def __init__(self, user): self.user = user + @property def is_active(self): """Required by flask_login""" return True + @property def is_authenticated(self): """Required by flask_login""" return True + @property def is_anonymous(self): """Required by flask_login""" return False @@ -64,9 +67,6 @@ def is_superuser(self): """Access all the things""" return True -# models.User = User # hack! -# del User - @login_manager.user_loader @provide_session diff --git a/airflow/example_dags/example_bash_operator.py b/airflow/example_dags/example_bash_operator.py index b2d9d14610dfc..68accc6317bbf 100644 --- a/airflow/example_dags/example_bash_operator.py +++ b/airflow/example_dags/example_bash_operator.py @@ -17,44 +17,57 @@ # specific language governing permissions and limitations # under the License. 
-import airflow from builtins import range -from airflow.operators.bash_operator import BashOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.models import DAG from datetime import timedelta +import airflow +from airflow.models import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.operators.dummy_operator import DummyOperator args = { 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) + 'start_date': airflow.utils.dates.days_ago(2), } dag = DAG( - dag_id='example_bash_operator', default_args=args, + dag_id='example_bash_operator', + default_args=args, schedule_interval='0 0 * * *', - dagrun_timeout=timedelta(minutes=60)) + dagrun_timeout=timedelta(minutes=60), +) -cmd = 'ls -l' -run_this_last = DummyOperator(task_id='run_this_last', dag=dag) +run_this_last = DummyOperator( + task_id='run_this_last', + dag=dag, +) +# [START howto_operator_bash] run_this = BashOperator( - task_id='run_after_loop', bash_command='echo 1', dag=dag) -run_this.set_downstream(run_this_last) + task_id='run_after_loop', + bash_command='echo 1', + dag=dag, +) +# [END howto_operator_bash] + +run_this >> run_this_last for i in range(3): - i = str(i) task = BashOperator( - task_id='runme_' + i, + task_id='runme_' + str(i), bash_command='echo "{{ task_instance_key_str }}" && sleep 1', - dag=dag) - task.set_downstream(run_this) + dag=dag, + ) + task >> run_this -task = BashOperator( +# [START howto_operator_bash_template] +also_run_this = BashOperator( task_id='also_run_this', bash_command='echo "run_id={{ run_id }} | dag_run={{ dag_run }}"', - dag=dag) -task.set_downstream(run_this_last) + dag=dag, +) +# [END howto_operator_bash_template] +also_run_this >> run_this_last if __name__ == "__main__": dag.cli() diff --git a/airflow/example_dags/example_branch_operator.py b/airflow/example_dags/example_branch_operator.py index 45bf11f30139c..197d7d7a73bad 100644 --- a/airflow/example_dags/example_branch_operator.py +++ b/airflow/example_dags/example_branch_operator.py @@ -17,43 +17,53 @@ # specific language governing permissions and limitations # under the License. 
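The example DAG rewrites in this change replace set_upstream/set_downstream calls with the bitshift composition operators. A minimal sketch of the equivalence; the DAG id and task ids are placeholders::

    import airflow
    from airflow.models import DAG
    from airflow.operators.dummy_operator import DummyOperator

    dag = DAG(
        dag_id='bitshift_sketch',
        start_date=airflow.utils.dates.days_ago(1),
        schedule_interval=None,
    )

    start = DummyOperator(task_id='start', dag=dag)
    work = DummyOperator(task_id='work', dag=dag)
    finish = DummyOperator(task_id='finish', dag=dag)

    start >> work >> finish
    # ...builds the same dependency graph as:
    # work.set_upstream(start); finish.set_upstream(work)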
-import airflow -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.models import DAG import random +import airflow +from airflow.models import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.python_operator import BranchPythonOperator args = { 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) + 'start_date': airflow.utils.dates.days_ago(2), } dag = DAG( dag_id='example_branch_operator', default_args=args, - schedule_interval="@daily") + schedule_interval="@daily", +) -cmd = 'ls -l' -run_this_first = DummyOperator(task_id='run_this_first', dag=dag) +run_this_first = DummyOperator( + task_id='run_this_first', + dag=dag, +) options = ['branch_a', 'branch_b', 'branch_c', 'branch_d'] branching = BranchPythonOperator( task_id='branching', python_callable=lambda: random.choice(options), - dag=dag) -branching.set_upstream(run_this_first) + dag=dag, +) +run_this_first >> branching join = DummyOperator( task_id='join', trigger_rule='one_success', - dag=dag + dag=dag, ) for option in options: - t = DummyOperator(task_id=option, dag=dag) - t.set_upstream(branching) - dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag) - t.set_downstream(dummy_follow) - dummy_follow.set_downstream(join) + t = DummyOperator( + task_id=option, + dag=dag, + ) + + dummy_follow = DummyOperator( + task_id='follow_' + option, + dag=dag, + ) + + branching >> t >> dummy_follow >> join diff --git a/airflow/example_dags/example_branch_python_dop_operator_3.py b/airflow/example_dags/example_branch_python_dop_operator_3.py index 7be55a5f36915..36edb27317344 100644 --- a/airflow/example_dags/example_branch_python_dop_operator_3.py +++ b/airflow/example_dags/example_branch_python_dop_operator_3.py @@ -18,9 +18,9 @@ # under the License. import airflow -from airflow.operators.python_operator import BranchPythonOperator -from airflow.operators.dummy_operator import DummyOperator from airflow.models import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.python_operator import BranchPythonOperator args = { 'owner': 'airflow', @@ -31,32 +31,29 @@ # BranchPython operator that depends on past # and where tasks may run or be skipped on # alternating runs -dag = DAG(dag_id='example_branch_dop_operator_v3', - schedule_interval='*/1 * * * *', default_args=args) - +dag = DAG( + dag_id='example_branch_dop_operator_v3', + schedule_interval='*/1 * * * *', + default_args=args, +) -def should_run(ds, **kwargs): +def should_run(**kwargs): print('------------- exec dttm = {} and minute = {}'. 
format(kwargs['execution_date'], kwargs['execution_date'].minute)) if kwargs['execution_date'].minute % 2 == 0: - return "oper_1" + return "dummy_task_1" else: - return "oper_2" + return "dummy_task_2" cond = BranchPythonOperator( task_id='condition', provide_context=True, python_callable=should_run, - dag=dag) - -oper_1 = DummyOperator( - task_id='oper_1', - dag=dag) -oper_1.set_upstream(cond) + dag=dag, +) -oper_2 = DummyOperator( - task_id='oper_2', - dag=dag) -oper_2.set_upstream(cond) +dummy_task_1 = DummyOperator(task_id='dummy_task_1', dag=dag) +dummy_task_2 = DummyOperator(task_id='dummy_task_2', dag=dag) +cond >> [dummy_task_1, dummy_task_2] diff --git a/airflow/example_dags/example_http_operator.py b/airflow/example_dags/example_http_operator.py index da7ea3f2187e3..4a469795a21f7 100644 --- a/airflow/example_dags/example_http_operator.py +++ b/airflow/example_dags/example_http_operator.py @@ -49,14 +49,16 @@ data=json.dumps({"priority": 5}), headers={"Content-Type": "application/json"}, response_check=lambda response: True if len(response.json()) == 0 else False, - dag=dag) + dag=dag, +) t5 = SimpleHttpOperator( task_id='post_op_formenc', endpoint='nodes/url', data="name=Joe", headers={"Content-Type": "application/x-www-form-urlencoded"}, - dag=dag) + dag=dag, +) t2 = SimpleHttpOperator( task_id='get_op', @@ -64,7 +66,8 @@ endpoint='api/v1.0/nodes', data={"param1": "value1", "param2": "value2"}, headers={}, - dag=dag) + dag=dag, +) t3 = SimpleHttpOperator( task_id='put_op', @@ -72,7 +75,8 @@ endpoint='api/v1.0/nodes', data=json.dumps({"priority": 5}), headers={"Content-Type": "application/json"}, - dag=dag) + dag=dag, +) t4 = SimpleHttpOperator( task_id='del_op', @@ -80,19 +84,17 @@ endpoint='api/v1.0/nodes', data="some=data", headers={"Content-Type": "application/x-www-form-urlencoded"}, - dag=dag) + dag=dag, +) sensor = HttpSensor( task_id='http_sensor_check', http_conn_id='http_default', endpoint='', request_params={}, - response_check=lambda response: True if "Google" in response.content else False, + response_check=lambda response: True if "Google" in response.text else False, poke_interval=5, - dag=dag) + dag=dag, +) -t1.set_upstream(sensor) -t2.set_upstream(t1) -t3.set_upstream(t2) -t4.set_upstream(t3) -t5.set_upstream(t4) +sensor >> t1 >> t2 >> t3 >> t4 >> t5 diff --git a/airflow/example_dags/example_latest_only.py b/airflow/example_dags/example_latest_only.py index fdb2dca490fbf..635a7641983c0 100644 --- a/airflow/example_dags/example_latest_only.py +++ b/airflow/example_dags/example_latest_only.py @@ -33,6 +33,6 @@ ) latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag) - task1 = DummyOperator(task_id='task1', dag=dag) -task1.set_upstream(latest_only) + +latest_only >> task1 diff --git a/airflow/example_dags/example_latest_only_with_trigger.py b/airflow/example_dags/example_latest_only_with_trigger.py index b8f4811c1a529..3559afb0c85b9 100644 --- a/airflow/example_dags/example_latest_only_with_trigger.py +++ b/airflow/example_dags/example_latest_only_with_trigger.py @@ -34,15 +34,10 @@ ) latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag) - task1 = DummyOperator(task_id='task1', dag=dag) -task1.set_upstream(latest_only) - task2 = DummyOperator(task_id='task2', dag=dag) - task3 = DummyOperator(task_id='task3', dag=dag) -task3.set_upstream([task1, task2]) +task4 = DummyOperator(task_id='task4', dag=dag, trigger_rule=TriggerRule.ALL_DONE) -task4 = DummyOperator(task_id='task4', dag=dag, - trigger_rule=TriggerRule.ALL_DONE) 
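# Editor's note -- illustrative, not part of the patch. Two details of the branch
# examples above are easy to miss. First, a BranchPythonOperator callable returns
# the task_id of the branch to follow, and every other direct downstream task is
# skipped. Second, in example_branch_operator the 'join' task uses
# trigger_rule='one_success' because with the default 'all_success' it would be
# skipped together with the unchosen branches. A minimal check of the callable's
# contract, reusing the even/odd-minute logic from example_branch_dop_operator_v3:
from datetime import datetime


def should_run(**kwargs):
    """Return the task_id to follow based on the execution minute."""
    if kwargs['execution_date'].minute % 2 == 0:
        return "dummy_task_1"
    return "dummy_task_2"


assert should_run(execution_date=datetime(2019, 1, 1, 0, 10)) == "dummy_task_1"
assert should_run(execution_date=datetime(2019, 1, 1, 0, 11)) == "dummy_task_2"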
-task4.set_upstream([task1, task2]) +latest_only >> task1 >> [task3, task4] +task2 >> [task3, task4] diff --git a/airflow/example_dags/example_passing_params_via_test_command.py b/airflow/example_dags/example_passing_params_via_test_command.py index 7efca2f3b0ef2..2aef593abfd8b 100644 --- a/airflow/example_dags/example_passing_params_via_test_command.py +++ b/airflow/example_dags/example_passing_params_via_test_command.py @@ -18,18 +18,21 @@ # under the License. from datetime import timedelta + import airflow from airflow import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator - -dag = DAG("example_passing_params_via_test_command", - default_args={"owner": "airflow", - "start_date": airflow.utils.dates.days_ago(1)}, - schedule_interval='*/1 * * * *', - dagrun_timeout=timedelta(minutes=4) - ) +dag = DAG( + "example_passing_params_via_test_command", + default_args={ + "owner": "airflow", + "start_date": airflow.utils.dates.days_ago(1), + }, + schedule_interval='*/1 * * * *', + dagrun_timeout=timedelta(minutes=4), +) def my_py_command(ds, **kwargs): @@ -54,12 +57,14 @@ def my_py_command(ds, **kwargs): provide_context=True, python_callable=my_py_command, params={"miff": "agg"}, - dag=dag) - + dag=dag, +) also_run_this = BashOperator( task_id='also_run_this', bash_command=my_templated_command, params={"miff": "agg"}, - dag=dag) -also_run_this.set_upstream(run_this) + dag=dag, +) + +run_this >> also_run_this diff --git a/airflow/example_dags/example_python_operator.py b/airflow/example_dags/example_python_operator.py index 0ecf099e7b01c..84705181a8b01 100644 --- a/airflow/example_dags/example_python_operator.py +++ b/airflow/example_dags/example_python_operator.py @@ -18,29 +18,28 @@ # under the License. 
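# Editor's note -- not part of the patch. In example_latest_only_with_trigger above,
# LatestOnlyOperator skips its downstream tasks on every run that is not the most
# recent one. task3 keeps the default trigger rule ('all_success') and is therefore
# skipped whenever task1 is skipped, while task4 is declared with
# trigger_rule=TriggerRule.ALL_DONE and runs once its upstream tasks have finished
# in any state (success, failed or skipped). The only difference in the wiring is
# that one keyword argument, e.g.:
#
#     task4 = DummyOperator(task_id='task4', dag=dag, trigger_rule=TriggerRule.ALL_DONE)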
from __future__ import print_function -from builtins import range -import airflow -from airflow.operators.python_operator import PythonOperator -from airflow.models import DAG import time +from builtins import range from pprint import pprint +import airflow +from airflow.models import DAG +from airflow.operators.python_operator import PythonOperator + args = { 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) + 'start_date': airflow.utils.dates.days_ago(2), } dag = DAG( - dag_id='example_python_operator', default_args=args, - schedule_interval=None) - - -def my_sleeping_function(random_base): - """This is a function that will run within the DAG execution""" - time.sleep(random_base) + dag_id='example_python_operator', + default_args=args, + schedule_interval=None, +) +# [START howto_operator_python] def print_context(ds, **kwargs): pprint(kwargs) print(ds) @@ -51,14 +50,25 @@ def print_context(ds, **kwargs): task_id='print_the_context', provide_context=True, python_callable=print_context, - dag=dag) + dag=dag, +) +# [END howto_operator_python] + + +# [START howto_operator_python_kwargs] +def my_sleeping_function(random_base): + """This is a function that will run within the DAG execution""" + time.sleep(random_base) + -# Generate 10 sleeping tasks, sleeping from 0 to 4 seconds respectively +# Generate 5 sleeping tasks, sleeping from 0.0 to 0.4 seconds respectively for i in range(5): task = PythonOperator( task_id='sleep_for_' + str(i), python_callable=my_sleeping_function, op_kwargs={'random_base': float(i) / 10}, - dag=dag) + dag=dag, + ) - task.set_upstream(run_this) + run_this >> task +# [END howto_operator_python_kwargs] diff --git a/airflow/example_dags/example_short_circuit_operator.py b/airflow/example_dags/example_short_circuit_operator.py index 8e9565df917da..1093dab616779 100644 --- a/airflow/example_dags/example_short_circuit_operator.py +++ b/airflow/example_dags/example_short_circuit_operator.py @@ -17,25 +17,29 @@ # specific language governing permissions and limitations # under the License. -import airflow -from airflow.operators.python_operator import ShortCircuitOperator -from airflow.operators.dummy_operator import DummyOperator -from airflow.models import DAG import airflow.utils.helpers - +from airflow.models import DAG +from airflow.operators.dummy_operator import DummyOperator +from airflow.operators.python_operator import ShortCircuitOperator args = { 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) + 'start_date': airflow.utils.dates.days_ago(2), } dag = DAG(dag_id='example_short_circuit_operator', default_args=args) cond_true = ShortCircuitOperator( - task_id='condition_is_True', python_callable=lambda: True, dag=dag) + task_id='condition_is_True', + python_callable=lambda: True, + dag=dag, +) cond_false = ShortCircuitOperator( - task_id='condition_is_False', python_callable=lambda: False, dag=dag) + task_id='condition_is_False', + python_callable=lambda: False, + dag=dag, +) ds_true = [DummyOperator(task_id='true_' + str(i), dag=dag) for i in [1, 2]] ds_false = [DummyOperator(task_id='false_' + str(i), dag=dag) for i in [1, 2]] diff --git a/airflow/example_dags/example_skip_dag.py b/airflow/example_dags/example_skip_dag.py index f11ca59338a0a..456eb911dcaa1 100644 --- a/airflow/example_dags/example_skip_dag.py +++ b/airflow/example_dags/example_skip_dag.py @@ -18,14 +18,13 @@ # under the License. 
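# Editor's note -- illustrative sketch, not part of the patch. The python-operator
# example above combines the two usual ways of feeding a callable: op_kwargs passes
# user-defined keyword arguments (random_base), while provide_context=True also
# passes the Airflow template context (ds, execution_date, ti, ...) as keyword
# arguments. A condensed sketch with made-up dag_id, task_id and values:
import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

py_demo_dag = DAG(
    dag_id='python_operator_kwargs_demo',
    default_args={
        'owner': 'airflow',
        'start_date': airflow.utils.dates.days_ago(2),
    },
    schedule_interval=None,
)


def report(label, **kwargs):
    """'label' arrives via op_kwargs; 'ds' is injected because provide_context=True."""
    print(label, kwargs.get('ds'))


report_task = PythonOperator(
    task_id='report',
    python_callable=report,
    provide_context=True,
    op_kwargs={'label': 'demo run'},
    dag=py_demo_dag,
)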
import airflow -from airflow.operators.dummy_operator import DummyOperator -from airflow.models import DAG from airflow.exceptions import AirflowSkipException - +from airflow.models import DAG +from airflow.operators.dummy_operator import DummyOperator args = { 'owner': 'airflow', - 'start_date': airflow.utils.dates.days_ago(2) + 'start_date': airflow.utils.dates.days_ago(2), } @@ -37,23 +36,17 @@ def execute(self, context): raise AirflowSkipException -dag = DAG(dag_id='example_skip_dag', default_args=args) - - def create_test_pipeline(suffix, trigger_rule, dag): - skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag) - always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag) - join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule) - - join.set_upstream(skip_operator) - join.set_upstream(always_true) - final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag) - final.set_upstream(join) + + skip_operator >> join + always_true >> join + join >> final +dag = DAG(dag_id='example_skip_dag', default_args=args) create_test_pipeline('1', 'all_success', dag) create_test_pipeline('2', 'one_success', dag) diff --git a/airflow/example_dags/example_subdag_operator.py b/airflow/example_dags/example_subdag_operator.py index ffd254b19a845..98386ba4545e2 100644 --- a/airflow/example_dags/example_subdag_operator.py +++ b/airflow/example_dags/example_subdag_operator.py @@ -18,14 +18,11 @@ # under the License. import airflow - +from airflow.example_dags.subdags.subdag import subdag from airflow.models import DAG from airflow.operators.dummy_operator import DummyOperator from airflow.operators.subdag_operator import SubDagOperator -from airflow.example_dags.subdags.subdag import subdag - - DAG_NAME = 'example_subdag_operator' args = { @@ -71,7 +68,4 @@ dag=dag, ) -start.set_downstream(section_1) -section_1.set_downstream(some_other_task) -some_other_task.set_downstream(section_2) -section_2.set_downstream(end) +start >> section_1 >> some_other_task >> section_2 >> end diff --git a/airflow/example_dags/example_trigger_controller_dag.py b/airflow/example_dags/example_trigger_controller_dag.py index f5c7218239150..35e7184f76cf0 100644 --- a/airflow/example_dags/example_trigger_controller_dag.py +++ b/airflow/example_dags/example_trigger_controller_dag.py @@ -33,11 +33,11 @@ 2. A Target DAG : c.f. 
example_trigger_target_dag.py """ -from airflow import DAG -from airflow.operators.dagrun_operator import TriggerDagRunOperator +import pprint from datetime import datetime -import pprint +from airflow import DAG +from airflow.operators.dagrun_operator import TriggerDagRunOperator pp = pprint.PrettyPrinter(indent=4) @@ -53,16 +53,20 @@ def conditionally_trigger(context, dag_run_obj): # Define the DAG -dag = DAG(dag_id='example_trigger_controller_dag', - default_args={"owner": "airflow", - "start_date": datetime.utcnow()}, - schedule_interval='@once') - +dag = DAG( + dag_id='example_trigger_controller_dag', + default_args={ + "owner": "airflow", + "start_date": datetime.utcnow(), + }, + schedule_interval='@once', +) # Define the single task in this controller example DAG -trigger = TriggerDagRunOperator(task_id='test_trigger_dagrun', - trigger_dag_id="example_trigger_target_dag", - python_callable=conditionally_trigger, - params={'condition_param': True, - 'message': 'Hello World'}, - dag=dag) +trigger = TriggerDagRunOperator( + task_id='test_trigger_dagrun', + trigger_dag_id="example_trigger_target_dag", + python_callable=conditionally_trigger, + params={'condition_param': True, 'message': 'Hello World'}, + dag=dag, +) diff --git a/airflow/example_dags/example_trigger_target_dag.py b/airflow/example_dags/example_trigger_target_dag.py index 7a656f285964e..c1403a60e1c00 100644 --- a/airflow/example_dags/example_trigger_target_dag.py +++ b/airflow/example_dags/example_trigger_target_dag.py @@ -17,12 +17,13 @@ # specific language governing permissions and limitations # under the License. +import pprint +from datetime import datetime + +from airflow.models import DAG from airflow.operators.bash_operator import BashOperator from airflow.operators.python_operator import PythonOperator -from airflow.models import DAG -from datetime import datetime -import pprint pp = pprint.PrettyPrinter(indent=4) # This example illustrates the use of the TriggerDagRunOperator. There are 2 @@ -50,7 +51,8 @@ dag = DAG( dag_id='example_trigger_target_dag', default_args=args, - schedule_interval=None) + schedule_interval=None, +) def run_this_func(ds, **kwargs): @@ -62,12 +64,13 @@ def run_this_func(ds, **kwargs): task_id='run_this', provide_context=True, python_callable=run_this_func, - dag=dag) - + dag=dag, +) # You can also access the DagRun object in templates bash_task = BashOperator( task_id="bash_task", bash_command='echo "Here is the message: ' '{{ dag_run.conf["message"] if dag_run else "" }}" ', - dag=dag) + dag=dag, +) diff --git a/airflow/example_dags/example_xcom.py b/airflow/example_dags/example_xcom.py index 66bec9a780285..f2b7627aca2f7 100644 --- a/airflow/example_dags/example_xcom.py +++ b/airflow/example_dags/example_xcom.py @@ -17,6 +17,7 @@ # specific language governing permissions and limitations # under the License. 
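# Editor's note -- hedged sketch, not part of the patch. The controller DAG above
# passes python_callable=conditionally_trigger to TriggerDagRunOperator; the body of
# that callable is elided in this hunk. In the 1.10-era API the callable receives
# the template context and a DagRunOrder-style object: returning the object
# (optionally with a payload) creates the target DAG run, returning None skips it.
# The parameter names below mirror the example; the payload contents are made up:
def conditionally_trigger(context, dag_run_obj):
    """Trigger the target DAG only when 'condition_param' is truthy."""
    if context['params']['condition_param']:
        dag_run_obj.payload = {'message': context['params']['message']}
        return dag_run_obj
    return None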
from __future__ import print_function + import airflow from airflow import DAG from airflow.operators.python_operator import PythonOperator @@ -24,25 +25,22 @@ args = { 'owner': 'airflow', 'start_date': airflow.utils.dates.days_ago(2), - 'provide_context': True + 'provide_context': True, } -dag = DAG( - 'example_xcom', - schedule_interval="@once", - default_args=args) +dag = DAG('example_xcom', schedule_interval="@once", default_args=args) value_1 = [1, 2, 3] value_2 = {'a': 'b'} def push(**kwargs): - # pushes an XCom without a specific target + """Pushes an XCom without a specific target""" kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1) def push_by_returning(**kwargs): - # pushes an XCom without a specific target, just by returning it + """Pushes an XCom without a specific target, just by returning it""" return value_2 @@ -63,12 +61,21 @@ def puller(**kwargs): push1 = PythonOperator( - task_id='push', dag=dag, python_callable=push) + task_id='push', + dag=dag, + python_callable=push, +) push2 = PythonOperator( - task_id='push_by_returning', dag=dag, python_callable=push_by_returning) + task_id='push_by_returning', + dag=dag, + python_callable=push_by_returning, +) pull = PythonOperator( - task_id='puller', dag=dag, python_callable=puller) + task_id='puller', + dag=dag, + python_callable=puller, +) -pull.set_upstream([push1, push2]) +pull << [push1, push2] diff --git a/airflow/example_dags/test_utils.py b/airflow/example_dags/test_utils.py index fb8792a1bf2bc..3fc8af1dada06 100644 --- a/airflow/example_dags/test_utils.py +++ b/airflow/example_dags/test_utils.py @@ -18,17 +18,15 @@ # under the License. """Used for unit tests""" import airflow -from airflow.operators.bash_operator import BashOperator from airflow.models import DAG +from airflow.operators.bash_operator import BashOperator -dag = DAG( - dag_id='test_utils', - schedule_interval=None, -) +dag = DAG(dag_id='test_utils', schedule_interval=None) task = BashOperator( task_id='sleeps_forever', dag=dag, bash_command="sleep 10000000000", start_date=airflow.utils.dates.days_ago(2), - owner='airflow') + owner='airflow', +) diff --git a/airflow/example_dags/tutorial.py b/airflow/example_dags/tutorial.py index ad817338ef2aa..f87df8b2bca95 100644 --- a/airflow/example_dags/tutorial.py +++ b/airflow/example_dags/tutorial.py @@ -20,16 +20,16 @@ """ ### Tutorial Documentation Documentation that goes along with the Airflow tutorial located -[here](https://airflow.incubator.apache.org/tutorial.html) +[here](https://airflow.apache.org/tutorial.html) """ +from datetime import timedelta + import airflow from airflow import DAG from airflow.operators.bash_operator import BashOperator -from datetime import timedelta - -# these args will get passed on to each operator -# you can override them on a per-task basis during operator initialization +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization default_args = { 'owner': 'airflow', 'depends_on_past': False, @@ -45,7 +45,6 @@ # 'end_date': datetime(2016, 1, 1), # 'wait_for_downstream': False, # 'dag': dag, - # 'adhoc':False, # 'sla': timedelta(hours=2), # 'execution_timeout': timedelta(seconds=300), # 'on_failure_callback': some_function, @@ -58,13 +57,15 @@ 'tutorial', default_args=default_args, description='A simple tutorial DAG', - schedule_interval=timedelta(days=1)) + schedule_interval=timedelta(days=1), +) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator( 
task_id='print_date', bash_command='date', - dag=dag) + dag=dag, +) t1.doc_md = """\ #### Task Documentation @@ -80,7 +81,8 @@ task_id='sleep', depends_on_past=False, bash_command='sleep 5', - dag=dag) + dag=dag, +) templated_command = """ {% for i in range(5) %} @@ -95,7 +97,7 @@ depends_on_past=False, bash_command=templated_command, params={'my_param': 'Parameter I passed in'}, - dag=dag) + dag=dag, +) -t2.set_upstream(t1) -t3.set_upstream(t1) +t1 >> [t2, t3] diff --git a/airflow/exceptions.py b/airflow/exceptions.py index 89f3d0e048da2..41f0a3dda8e38 100644 --- a/airflow/exceptions.py +++ b/airflow/exceptions.py @@ -47,6 +47,17 @@ class AirflowSensorTimeout(AirflowException): pass +class AirflowRescheduleException(AirflowException): + """ + Raise when the task should be re-scheduled at a later time. + + :param reschedule_date: The date when the task should be rescheduled + :type reschedule: datetime.datetime + """ + def __init__(self, reschedule_date): + self.reschedule_date = reschedule_date + + class AirflowTaskTimeout(AirflowException): pass diff --git a/airflow/executors/base_executor.py b/airflow/executors/base_executor.py index 04c90884012f7..f88f34591ec4b 100644 --- a/airflow/executors/base_executor.py +++ b/airflow/executors/base_executor.py @@ -18,12 +18,15 @@ # under the License. from builtins import range +from collections import OrderedDict +# To avoid circular imports +import airflow.utils.dag_processing from airflow import configuration +from airflow.settings import Stats from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State - PARALLELISM = configuration.conf.getint('core', 'PARALLELISM') @@ -39,7 +42,7 @@ def __init__(self, parallelism=PARALLELISM): :type parallelism: int """ self.parallelism = parallelism - self.queued_tasks = {} + self.queued_tasks = OrderedDict() self.running = {} self.event_buffer = {} @@ -50,13 +53,13 @@ def start(self): # pragma: no cover """ pass - def queue_command(self, task_instance, command, priority=1, queue=None): - key = task_instance.key + def queue_command(self, simple_task_instance, command, priority=1, queue=None): + key = simple_task_instance.key if key not in self.queued_tasks and key not in self.running: self.log.info("Adding to queue: %s", command) - self.queued_tasks[key] = (command, priority, queue, task_instance) + self.queued_tasks[key] = (command, priority, queue, simple_task_instance) else: - self.log.info("could not queue task {}".format(key)) + self.log.info("could not queue task %s", key) def queue_task_instance( self, @@ -75,7 +78,7 @@ def queue_task_instance( # cfg_path is needed to propagate the config values if using impersonation # (run_as_user), given that there are different code paths running tasks. 
# For a long term solution we need to address AIRFLOW-1986 - command = task_instance.command( + command = task_instance.command_as_list( local=True, mark_success=mark_success, ignore_all_deps=ignore_all_deps, @@ -86,7 +89,7 @@ def queue_task_instance( pickle_id=pickle_id, cfg_path=cfg_path) self.queue_command( - task_instance, + airflow.utils.dag_processing.SimpleTaskInstance(task_instance), command, priority=task_instance.task.priority_weight_total, queue=task_instance.task.queue) @@ -115,43 +118,37 @@ def heartbeat(self): else: open_slots = self.parallelism - len(self.running) - self.log.debug("%s running task instances", len(self.running)) - self.log.debug("%s in queue", len(self.queued_tasks)) + num_running_tasks = len(self.running) + num_queued_tasks = len(self.queued_tasks) + + self.log.debug("%s running task instances", num_running_tasks) + self.log.debug("%s in queue", num_queued_tasks) self.log.debug("%s open slots", open_slots) + Stats.gauge('executor.open_slots', open_slots) + Stats.gauge('executor.queued_tasks', num_queued_tasks) + Stats.gauge('executor.running_tasks', num_running_tasks) + sorted_queue = sorted( [(k, v) for k, v in self.queued_tasks.items()], key=lambda x: x[1][1], reverse=True) for i in range(min((open_slots, len(self.queued_tasks)))): - key, (command, _, queue, ti) = sorted_queue.pop(0) - # TODO(jlowin) without a way to know what Job ran which tasks, - # there is a danger that another Job started running a task - # that was also queued to this executor. This is the last chance - # to check if that happened. The most probable way is that a - # Scheduler tried to run a task that was originally queued by a - # Backfill. This fix reduces the probability of a collision but - # does NOT eliminate it. + key, (command, _, queue, simple_ti) = sorted_queue.pop(0) self.queued_tasks.pop(key) - ti.refresh_from_db() - if ti.state != State.RUNNING: - self.running[key] = command - self.execute_async(key=key, - command=command, - queue=queue, - executor_config=ti.executor_config) - else: - self.logger.info( - 'Task is already running, not sending to ' - 'executor: {}'.format(key)) + self.running[key] = command + self.execute_async(key=key, + command=command, + queue=queue, + executor_config=simple_ti.executor_config) # Calling child class sync method self.log.debug("Calling the %s sync method", self.__class__) self.sync() def change_state(self, key, state): - print("popping: {}".format(key)) - self.running.pop(key) + self.log.debug("Changing state: %s", key) + self.running.pop(key, None) self.event_buffer[key] = state def fail(self, key): @@ -175,7 +172,7 @@ def get_event_buffer(self, dag_ids=None): self.event_buffer = dict() else: for key in list(self.event_buffer.keys()): - dag_id, _, _ = key + dag_id, _, _, _ = key if dag_id in dag_ids: cleared_events[key] = self.event_buffer.pop(key) diff --git a/airflow/executors/celery_executor.py b/airflow/executors/celery_executor.py index 6cfd2d3769893..09ed425547e12 100644 --- a/airflow/executors/celery_executor.py +++ b/airflow/executors/celery_executor.py @@ -17,21 +17,28 @@ # specific language governing permissions and limitations # under the License. 
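# Editor's note -- illustrative sketch, not part of the patch. The heartbeat above
# keeps queued_tasks as an OrderedDict of key -> (command, priority, queue,
# simple_task_instance) and drains it highest-priority-first, up to the number of
# open slots. The selection logic in isolation, with made-up data:
from collections import OrderedDict

queued_tasks = OrderedDict([
    ('ti_a', ('airflow run dag_a ...', 1, 'default', None)),
    ('ti_b', ('airflow run dag_b ...', 5, 'default', None)),
    ('ti_c', ('airflow run dag_c ...', 3, 'default', None)),
])
open_slots = 2

sorted_queue = sorted(queued_tasks.items(), key=lambda x: x[1][1], reverse=True)
for key, (command, priority, queue, simple_ti) in sorted_queue[:min(open_slots, len(queued_tasks))]:
    queued_tasks.pop(key)
    # the real executor would call execute_async(key=key, command=command, queue=queue, ...)
    print('would run', key, 'with priority', priority)

# 'ti_b' (priority 5) and 'ti_c' (priority 3) are sent; 'ti_a' stays queued.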
+import math +import os import subprocess import time -import os +import traceback +from multiprocessing import Pool, cpu_count from celery import Celery from celery import states as celery_states +from airflow import configuration from airflow.config_templates.default_celery import DEFAULT_CELERY_CONFIG from airflow.exceptions import AirflowException from airflow.executors.base_executor import BaseExecutor -from airflow import configuration from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.module_loading import import_string +from airflow.utils.timeout import timeout -PARALLELISM = configuration.conf.get('core', 'PARALLELISM') +# Make it constant for unit test. +CELERY_FETCH_ERR_MSG_HEADER = 'Error fetching Celery task state' + +CELERY_SEND_ERR_MSG_HEADER = 'Error sending Celery task' ''' To start the celery worker, run the command: @@ -51,12 +58,12 @@ @app.task -def execute_command(command): +def execute_command(command_to_exec): log = LoggingMixin().log - log.info("Executing command in Celery: %s", command) + log.info("Executing command in Celery: %s", command_to_exec) env = os.environ.copy() try: - subprocess.check_call(command, shell=True, stderr=subprocess.STDOUT, + subprocess.check_call(command_to_exec, stderr=subprocess.STDOUT, close_fds=True, env=env) except subprocess.CalledProcessError as e: log.exception('execute_command encountered a CalledProcessError') @@ -65,6 +72,58 @@ def execute_command(command): raise AirflowException('Celery command failed') +class ExceptionWithTraceback(object): + """ + Wrapper class used to propagate exceptions to parent processes from subprocesses. + + :param exception: The exception to wrap + :type exception: Exception + :param exception_traceback: The stacktrace to wrap + :type exception_traceback: str + """ + + def __init__(self, exception, exception_traceback): + self.exception = exception + self.traceback = exception_traceback + + +def fetch_celery_task_state(celery_task): + """ + Fetch and return the state of the given celery task. The scope of this function is + global so that it can be called by subprocesses in the pool. + + :param celery_task: a tuple of the Celery task key and the async Celery object used + to fetch the task's state + :type celery_task: tuple(str, celery.result.AsyncResult) + :return: a tuple of the Celery task key and the Celery state of the task + :rtype: tuple[str, str] + """ + + try: + with timeout(seconds=2): + # Accessing state property of celery task will make actual network request + # to get the current state of the task. + res = (celery_task[0], celery_task[1].state) + except Exception as e: + exception_traceback = "Celery Task ID: {}\n{}".format(celery_task[0], + traceback.format_exc()) + res = ExceptionWithTraceback(e, exception_traceback) + return res + + +def send_task_to_executor(task_tuple): + key, simple_ti, command, queue, task = task_tuple + try: + with timeout(seconds=2): + result = task.apply_async(args=[command], queue=queue) + except Exception as e: + exception_traceback = "Celery Task ID: {}\n{}".format(key, + traceback.format_exc()) + result = ExceptionWithTraceback(e, exception_traceback) + + return key, command, result + + class CeleryExecutor(BaseExecutor): """ CeleryExecutor is recommended for production use of Airflow. It allows @@ -74,24 +133,147 @@ class CeleryExecutor(BaseExecutor): vast amounts of messages, while providing operations with the tools required to maintain such a system. 
""" - def start(self): + + def __init__(self): + super(CeleryExecutor, self).__init__() + + # Celery doesn't support querying the state of multiple tasks in parallel + # (which can become a bottleneck on bigger clusters) so we use + # a multiprocessing pool to speed this up. + # How many worker processes are created for checking celery task state. + self._sync_parallelism = configuration.getint('celery', 'SYNC_PARALLELISM') + if self._sync_parallelism == 0: + self._sync_parallelism = max(1, cpu_count() - 1) + + self._sync_pool = None self.tasks = {} self.last_state = {} - def execute_async(self, key, command, - queue=DEFAULT_CELERY_CONFIG['task_default_queue'], - executor_config=None): - self.log.info("[celery] queuing {key} through celery, " - "queue={queue}".format(**locals())) - self.tasks[key] = execute_command.apply_async( - args=[command], queue=queue) - self.last_state[key] = celery_states.PENDING + def start(self): + self.log.debug( + 'Starting Celery Executor using %s processes for syncing', + self._sync_parallelism + ) + + def _num_tasks_per_send_process(self, to_send_count): + """ + How many Celery tasks should each worker process send. + + :return: Number of tasks that should be sent per process + :rtype: int + """ + return max(1, + int(math.ceil(1.0 * to_send_count / self._sync_parallelism))) + + def _num_tasks_per_fetch_process(self): + """ + How many Celery tasks should be sent to each worker process. + + :return: Number of tasks that should be used per process + :rtype: int + """ + return max(1, + int(math.ceil(1.0 * len(self.tasks) / self._sync_parallelism))) + + def heartbeat(self): + # Triggering new jobs + if not self.parallelism: + open_slots = len(self.queued_tasks) + else: + open_slots = self.parallelism - len(self.running) + + self.log.debug("%s running task instances", len(self.running)) + self.log.debug("%s in queue", len(self.queued_tasks)) + self.log.debug("%s open slots", open_slots) + + sorted_queue = sorted( + [(k, v) for k, v in self.queued_tasks.items()], + key=lambda x: x[1][1], + reverse=True) + + task_tuples_to_send = [] + + for i in range(min((open_slots, len(self.queued_tasks)))): + key, (command, _, queue, simple_ti) = sorted_queue.pop(0) + task_tuples_to_send.append((key, simple_ti, command, queue, + execute_command)) + + cached_celery_backend = None + if task_tuples_to_send: + tasks = [t[4] for t in task_tuples_to_send] + + # Celery state queries will stuck if we do not use one same backend + # for all tasks. + cached_celery_backend = tasks[0].backend + + if task_tuples_to_send: + # Use chunking instead of a work queue to reduce context switching + # since tasks are roughly uniform in size + chunksize = self._num_tasks_per_send_process(len(task_tuples_to_send)) + num_processes = min(len(task_tuples_to_send), self._sync_parallelism) + + send_pool = Pool(processes=num_processes) + key_and_async_results = send_pool.map( + send_task_to_executor, + task_tuples_to_send, + chunksize=chunksize) + + send_pool.close() + send_pool.join() + self.log.debug('Sent all tasks.') + + for key, command, result in key_and_async_results: + if isinstance(result, ExceptionWithTraceback): + self.log.error( + CELERY_SEND_ERR_MSG_HEADER + ":%s\n%s\n", result.exception, result.traceback + ) + elif result is not None: + # Only pops when enqueued successfully, otherwise keep it + # and expect scheduler loop to deal with it. 
+ self.queued_tasks.pop(key) + result.backend = cached_celery_backend + self.running[key] = command + self.tasks[key] = result + self.last_state[key] = celery_states.PENDING + + # Calling child class sync method + self.log.debug("Calling the %s sync method", self.__class__) + self.sync() def sync(self): - self.log.debug("Inquiring about %s celery task(s)", len(self.tasks)) - for key, async in list(self.tasks.items()): + num_processes = min(len(self.tasks), self._sync_parallelism) + if num_processes == 0: + self.log.debug("No task to query celery, skipping sync") + return + + self.log.debug("Inquiring about %s celery task(s) using %s processes", + len(self.tasks), num_processes) + + # Recreate the process pool each sync in case processes in the pool die + self._sync_pool = Pool(processes=num_processes) + + # Use chunking instead of a work queue to reduce context switching since tasks are + # roughly uniform in size + chunksize = self._num_tasks_per_fetch_process() + + self.log.debug("Waiting for inquiries to complete...") + task_keys_to_states = self._sync_pool.map( + fetch_celery_task_state, + self.tasks.items(), + chunksize=chunksize) + self._sync_pool.close() + self._sync_pool.join() + self.log.debug("Inquiries completed.") + + for key_and_state in task_keys_to_states: + if isinstance(key_and_state, ExceptionWithTraceback): + self.log.error( + CELERY_FETCH_ERR_MSG_HEADER + ", ignoring it:%s\n%s\n", + repr(key_and_state.exception), key_and_state.traceback + ) + continue + key, state = key_and_state try: - state = async.state if self.last_state[key] != state: if state == celery_states.SUCCESS: self.success(key) @@ -106,16 +288,15 @@ def sync(self): del self.tasks[key] del self.last_state[key] else: - self.log.info("Unexpected state: %s", async.state) - self.last_state[key] = async.state - except Exception as e: - self.log.error("Error syncing the celery executor, ignoring it:") - self.log.exception(e) + self.log.info("Unexpected state: %s", state) + self.last_state[key] = state + except Exception: + self.log.exception("Error syncing the Celery executor, ignoring it.") def end(self, synchronous=False): if synchronous: while any([ - async.state not in celery_states.READY_STATES - for async in self.tasks.values()]): + task.state not in celery_states.READY_STATES + for task in self.tasks.values()]): time.sleep(5) self.sync() diff --git a/airflow/executors/dask_executor.py b/airflow/executors/dask_executor.py index a6ba677f8bd7c..80527a2512545 100644 --- a/airflow/executors/dask_executor.py +++ b/airflow/executors/dask_executor.py @@ -43,12 +43,13 @@ def __init__(self, cluster_address=None): super(DaskExecutor, self).__init__(parallelism=0) def start(self): - if (self.tls_ca) or (self.tls_key) or (self.tls_cert): + if self.tls_ca or self.tls_key or self.tls_cert: from distributed.security import Security security = Security( tls_client_key=self.tls_key, tls_client_cert=self.tls_cert, tls_ca_file=self.tls_ca, + require_encryption=True, ) else: security = None diff --git a/airflow/executors/local_executor.py b/airflow/executors/local_executor.py index 0c85262324283..c9454a677f832 100644 --- a/airflow/executors/local_executor.py +++ b/airflow/executors/local_executor.py @@ -75,16 +75,15 @@ def execute_work(self, key, command): """ Executes command received and stores result state in queue. 
:param key: the key to identify the TI - :type key: Tuple(dag_id, task_id, execution_date) + :type key: tuple(dag_id, task_id, execution_date) :param command: the command to execute - :type command: string + :type command: str """ if key is None: return self.log.info("%s running %s", self.__class__.__name__, command) - command = "exec bash -c '{0}'".format(command) try: - subprocess.check_call(command, shell=True, close_fds=True) + subprocess.check_call(command, close_fds=True) state = State.SUCCESS except subprocess.CalledProcessError as e: state = State.FAILED @@ -145,9 +144,9 @@ def start(self): def execute_async(self, key, command): """ :param key: the key to identify the TI - :type key: Tuple(dag_id, task_id, execution_date) + :type key: tuple(dag_id, task_id, execution_date) :param command: the command to execute - :type command: string + :type command: str """ local_worker = LocalWorker(self.executor.result_queue) local_worker.key = key @@ -190,9 +189,9 @@ def start(self): def execute_async(self, key, command): """ :param key: the key to identify the TI - :type key: Tuple(dag_id, task_id, execution_date) + :type key: tuple(dag_id, task_id, execution_date) :param command: the command to execute - :type command: string + :type command: str """ self.executor.queue.put((key, command)) diff --git a/airflow/executors/sequential_executor.py b/airflow/executors/sequential_executor.py index 9c0d8ecf0ca6f..1542e3318eb28 100644 --- a/airflow/executors/sequential_executor.py +++ b/airflow/executors/sequential_executor.py @@ -45,7 +45,7 @@ def sync(self): self.log.info("Executing command: %s", command) try: - subprocess.check_call(command, shell=True, close_fds=True) + subprocess.check_call(command, close_fds=True) self.change_state(key, State.SUCCESS) except subprocess.CalledProcessError as e: self.change_state(key, State.FAILED) diff --git a/airflow/hooks/S3_hook.py b/airflow/hooks/S3_hook.py index b4f3ac3b030e8..a8e4d865e649a 100644 --- a/airflow/hooks/S3_hook.py +++ b/airflow/hooks/S3_hook.py @@ -16,6 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +from botocore.exceptions import ClientError from airflow.exceptions import AirflowException from airflow.contrib.hooks.aws_hook import AwsHook @@ -42,7 +43,7 @@ def parse_s3_url(s3url): else: bucket_name = parsed_url.netloc key = parsed_url.path.strip('/') - return (bucket_name, key) + return bucket_name, key def check_for_bucket(self, bucket_name): """ @@ -54,7 +55,8 @@ def check_for_bucket(self, bucket_name): try: self.get_conn().head_bucket(Bucket=bucket_name) return True - except: + except ClientError as e: + self.log.info(e.response["Error"]["Message"]) return False def get_bucket(self, bucket_name): @@ -67,9 +69,36 @@ def get_bucket(self, bucket_name): s3 = self.get_resource_type('s3') return s3.Bucket(bucket_name) + def create_bucket(self, bucket_name, region_name=None): + """ + Creates an Amazon S3 bucket. + + :param bucket_name: The name of the bucket + :type bucket_name: str + :param region_name: The name of the aws region in which to create the bucket. 
+ :type region_name: str + """ + s3_conn = self.get_conn() + if not region_name: + region_name = s3_conn.meta.region_name + if region_name == 'us-east-1': + self.get_conn().create_bucket(Bucket=bucket_name) + else: + self.get_conn().create_bucket(Bucket=bucket_name, + CreateBucketConfiguration={ + 'LocationConstraint': region_name + }) + def check_for_prefix(self, bucket_name, prefix, delimiter): """ Checks that a prefix exists in a bucket + + :param bucket_name: the name of the bucket + :type bucket_name: str + :param prefix: a key prefix + :type prefix: str + :param delimiter: the delimiter marks key hierarchy. + :type delimiter: str """ prefix = prefix + delimiter if prefix[-1] != delimiter else prefix prefix_split = re.split(r'(\w+[{d}])$'.format(d=delimiter), prefix, 1) @@ -168,7 +197,8 @@ def check_for_key(self, key, bucket_name=None): try: self.get_conn().head_object(Bucket=bucket_name, Key=key) return True - except: + except ClientError as e: + self.log.info(e.response["Error"]["Message"]) return False def get_key(self, key, bucket_name=None): @@ -203,8 +233,8 @@ def read_key(self, key, bucket_name=None): def select_key(self, key, bucket_name=None, expression='SELECT * FROM S3Object', expression_type='SQL', - input_serialization={'CSV': {}}, - output_serialization={'CSV': {}}): + input_serialization=None, + output_serialization=None): """ Reads a key with S3 Select. @@ -227,6 +257,10 @@ def select_key(self, key, bucket_name=None, For more details about S3 Select parameters: http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client.select_object_content """ + if input_serialization is None: + input_serialization = {'CSV': {}} + if output_serialization is None: + output_serialization = {'CSV': {}} if not bucket_name: (bucket_name, key) = self.parse_s3_url(key) @@ -238,7 +272,7 @@ def select_key(self, key, bucket_name=None, InputSerialization=input_serialization, OutputSerialization=output_serialization) - return ''.join(event['Records']['Payload'] + return ''.join(event['Records']['Payload'].decode('utf-8') for event in response['Payload'] if 'Records' in event) @@ -246,6 +280,13 @@ def check_for_wildcard_key(self, wildcard_key, bucket_name=None, delimiter=''): """ Checks that a key matching a wildcard expression exists in a bucket + + :param wildcard_key: the path to the key + :type wildcard_key: str + :param bucket_name: the name of the bucket + :type bucket_name: str + :param delimiter: the delimiter marks key hierarchy + :type delimiter: str """ return self.get_wildcard_key(wildcard_key=wildcard_key, bucket_name=bucket_name, @@ -259,6 +300,8 @@ def get_wildcard_key(self, wildcard_key, bucket_name=None, delimiter=''): :type wildcard_key: str :param bucket_name: the name of the bucket :type bucket_name: str + :param delimiter: the delimiter marks key hierarchy + :type delimiter: str """ if not bucket_name: (bucket_name, wildcard_key) = self.parse_s3_url(wildcard_key) @@ -319,7 +362,7 @@ def load_string(self, This is provided as a convenience to drop a string in S3. It uses the boto infrastructure to ship a file to s3. - :param string_data: string to set as content for the key. + :param string_data: str to set as content for the key. 
:type string_data: str :param key: S3 key that will point to the file :type key: str @@ -377,3 +420,124 @@ def load_bytes(self, client = self.get_conn() client.upload_fileobj(filelike_buffer, bucket_name, key, ExtraArgs=extra_args) + + def load_file_obj(self, + file_obj, + key, + bucket_name=None, + replace=False, + encrypt=False): + """ + Loads a file object to S3 + + :param file_obj: The file-like object to set as the content for the S3 key. + :type file_obj: file-like object + :param key: S3 key that will point to the file + :type key: str + :param bucket_name: Name of the bucket in which to store the file + :type bucket_name: str + :param replace: A flag that indicates whether to overwrite the key + if it already exists. + :type replace: bool + :param encrypt: If True, S3 encrypts the file on the server, + and the file is stored in encrypted form at rest in S3. + :type encrypt: bool + """ + if not bucket_name: + (bucket_name, key) = self.parse_s3_url(key) + + if not replace and self.check_for_key(key, bucket_name): + raise ValueError("The key {key} already exists.".format(key=key)) + + extra_args = {} + if encrypt: + extra_args['ServerSideEncryption'] = "AES256" + + client = self.get_conn() + client.upload_fileobj(file_obj, bucket_name, key, ExtraArgs=extra_args) + + def copy_object(self, + source_bucket_key, + dest_bucket_key, + source_bucket_name=None, + dest_bucket_name=None, + source_version_id=None): + """ + Creates a copy of an object that is already stored in S3. + + Note: the S3 connection used here needs to have access to both + source and destination bucket/key. + + :param source_bucket_key: The key of the source object. + + It can be either full s3:// style url or relative path from root level. + + When it's specified as a full s3:// url, please omit source_bucket_name. + :type source_bucket_key: str + :param dest_bucket_key: The key of the object to copy to. + + The convention to specify `dest_bucket_key` is the same + as `source_bucket_key`. + :type dest_bucket_key: str + :param source_bucket_name: Name of the S3 bucket where the source object is in. + + It should be omitted when `source_bucket_key` is provided as a full s3:// url. + :type source_bucket_name: str + :param dest_bucket_name: Name of the S3 bucket to where the object is copied. + + It should be omitted when `dest_bucket_key` is provided as a full s3:// url. 
+ :type dest_bucket_name: str + :param source_version_id: Version ID of the source object (OPTIONAL) + :type source_version_id: str + """ + + if dest_bucket_name is None: + dest_bucket_name, dest_bucket_key = self.parse_s3_url(dest_bucket_key) + else: + parsed_url = urlparse(dest_bucket_key) + if parsed_url.scheme != '' or parsed_url.netloc != '': + raise AirflowException('If dest_bucket_name is provided, ' + + 'dest_bucket_key should be relative path ' + + 'from root level, rather than a full s3:// url') + + if source_bucket_name is None: + source_bucket_name, source_bucket_key = self.parse_s3_url(source_bucket_key) + else: + parsed_url = urlparse(source_bucket_key) + if parsed_url.scheme != '' or parsed_url.netloc != '': + raise AirflowException('If source_bucket_name is provided, ' + + 'source_bucket_key should be relative path ' + + 'from root level, rather than a full s3:// url') + + CopySource = {'Bucket': source_bucket_name, + 'Key': source_bucket_key, + 'VersionId': source_version_id} + response = self.get_conn().copy_object(Bucket=dest_bucket_name, + Key=dest_bucket_key, + CopySource=CopySource) + return response + + def delete_objects(self, + bucket, + keys): + """ + :param bucket: Name of the bucket in which you are going to delete object(s) + :type bucket: str + :param keys: The key(s) to delete from S3 bucket. + + When ``keys`` is a string, it's supposed to be the key name of + the single object to delete. + + When ``keys`` is a list, it's supposed to be the list of the + keys to delete. + :type keys: str or list + """ + if isinstance(keys, list): + keys = keys + else: + keys = [keys] + + delete_dict = {"Objects": [{"Key": k} for k in keys]} + response = self.get_conn().delete_objects(Bucket=bucket, + Delete=delete_dict) + return response diff --git a/airflow/hooks/base_hook.py b/airflow/hooks/base_hook.py index 103fa6260b564..c1283e3fb4ceb 100644 --- a/airflow/hooks/base_hook.py +++ b/airflow/hooks/base_hook.py @@ -25,7 +25,7 @@ import os import random -from airflow.models import Connection +from airflow.models.connection import Connection from airflow.exceptions import AirflowException from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin @@ -80,7 +80,7 @@ def get_connection(cls, conn_id): conn = random.choice(cls.get_connections(conn_id)) if conn.host: log = LoggingMixin().log - log.info("Using connection to: %s", conn.host) + log.info("Using connection to: %s", conn.debug_info()) return conn @classmethod diff --git a/airflow/hooks/dbapi_hook.py b/airflow/hooks/dbapi_hook.py index 5b50ade34e486..9f09b6ccc477a 100644 --- a/airflow/hooks/dbapi_hook.py +++ b/airflow/hooks/dbapi_hook.py @@ -22,6 +22,7 @@ from datetime import datetime from contextlib import closing import sys +from typing import Optional from sqlalchemy import create_engine @@ -34,7 +35,7 @@ class DbApiHook(BaseHook): Abstract base class for sql hooks. """ # Override to provide the connection name. - conn_name_attr = None + conn_name_attr = None # type: Optional[str] # Override to have a default connection id for a particular dbHook default_conn_name = 'default_conn_id' # Override if this db supports autocommit. 
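# Editor's note -- hedged usage sketch, not part of the patch. The S3_hook changes
# above add create_bucket, load_file_obj, copy_object and delete_objects. The calls
# below follow the signatures introduced in this diff; the connection id, bucket and
# key names are placeholders and a working AWS connection is assumed:
from io import BytesIO

from airflow.hooks.S3_hook import S3Hook

s3 = S3Hook(aws_conn_id='aws_default')

s3.create_bucket(bucket_name='my-demo-bucket', region_name='eu-west-1')
s3.load_file_obj(BytesIO(b'hello'), key='incoming/hello.txt',
                 bucket_name='my-demo-bucket', replace=True)
s3.copy_object(source_bucket_key='incoming/hello.txt',
               dest_bucket_key='archive/hello.txt',
               source_bucket_name='my-demo-bucket',
               dest_bucket_name='my-demo-bucket')
s3.delete_objects(bucket='my-demo-bucket', keys=['incoming/hello.txt'])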
@@ -163,10 +164,11 @@ def run(self, sql, autocommit=False, parameters=None): for s in sql: if sys.version_info[0] < 3: s = s.encode('utf-8') - self.log.info(s) if parameters is not None: + self.log.info("{} with parameters {}".format(s, parameters)) cur.execute(s, parameters) else: + self.log.info(s) cur.execute(s) # If autocommit was set to False for db that supports autocommit, @@ -191,10 +193,11 @@ def get_autocommit(self, conn): Return True if conn.autocommit is set to True. Return False if conn.autocommit is not set or set to False or conn does not support autocommit. + :param conn: Connection to get autocommit setting from. :type conn: connection object. :return: connection autocommit setting. - :rtype bool. + :rtype: bool """ return getattr(conn, 'autocommit', False) and self.supports_autocommit @@ -254,12 +257,11 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000, if commit_every and i % commit_every == 0: conn.commit() self.log.info( - "Loaded {i} into {table} rows so far".format(**locals()) + "Loaded %s into %s rows so far", i, table ) conn.commit() - self.log.info( - "Done loading. Loaded a total of {i} rows".format(**locals())) + self.log.info("Done loading. Loaded a total of %s rows", i) @staticmethod def _serialize_cell(cell, conn=None): diff --git a/airflow/hooks/druid_hook.py b/airflow/hooks/druid_hook.py index ef4f2338d29dc..5e2f94f1258dd 100644 --- a/airflow/hooks/druid_hook.py +++ b/airflow/hooks/druid_hook.py @@ -35,9 +35,10 @@ class DruidHook(BaseHook): :param druid_ingest_conn_id: The connection id to the Druid overlord machine which accepts index jobs - :type druid_ingest_conn_id: string + :type druid_ingest_conn_id: str :param timeout: The interval between polling - the Druid job for the status of the ingestion job + the Druid job for the status of the ingestion job. 
+ Must be greater than or equal to 1 :type timeout: int :param max_ingestion_time: The maximum ingestion time before assuming the job failed :type max_ingestion_time: int @@ -53,6 +54,9 @@ def __init__( self.max_ingestion_time = max_ingestion_time self.header = {'content-type': 'application/json'} + if self.timeout < 1: + raise ValueError("Druid timeout should be equal or greater than 1") + def get_conn_url(self): conn = self.get_connection(self.druid_ingest_conn_id) host = conn.host @@ -64,14 +68,16 @@ def get_conn_url(self): def submit_indexing_job(self, json_index_spec): url = self.get_conn_url() - req_index = requests.post(url, data=json_index_spec, headers=self.header) - if (req_index.status_code != 200): + self.log.info("Druid ingestion spec: %s", json_index_spec) + req_index = requests.post(url, json=json_index_spec, headers=self.header) + if req_index.status_code != 200: raise AirflowException('Did not get 200 when ' 'submitting the Druid job to {}'.format(url)) req_json = req_index.json() # Wait until the job is completed druid_task_id = req_json['task'] + self.log.info("Druid indexing task-id: %s", druid_task_id) running = True @@ -81,8 +87,6 @@ def submit_indexing_job(self, json_index_spec): self.log.info("Job still running for %s seconds...", sec) - sec = sec + 1 - if self.max_ingestion_time and sec > self.max_ingestion_time: # ensure that the job gets killed if the max ingestion time is exceeded requests.post("{0}/{1}/shutdown".format(url, druid_task_id)) @@ -91,6 +95,8 @@ def submit_indexing_job(self, json_index_spec): time.sleep(self.timeout) + sec = sec + self.timeout + status = req_status.json()['status']['status'] if status == 'RUNNING': running = True @@ -130,8 +136,7 @@ def get_conn(self): path=conn.extra_dejson.get('endpoint', '/druid/v2/sql'), scheme=conn.extra_dejson.get('schema', 'http') ) - self.log.info('Get the connection to druid ' - 'broker on {host}'.format(host=conn.host)) + self.log.info('Get the connection to druid broker on %s', conn.host) return druid_broker_conn def get_uri(self): diff --git a/airflow/hooks/hdfs_hook.py b/airflow/hooks/hdfs_hook.py index 3c9136b1fd165..31b2f11501a01 100644 --- a/airflow/hooks/hdfs_hook.py +++ b/airflow/hooks/hdfs_hook.py @@ -16,18 +16,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -from six import PY2 - from airflow import configuration from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook - -snakebite_imported = False -if PY2: +try: from snakebite.client import Client, HAClient, Namenode, AutoConfigClient - snakebite_imported = True + snakebite_loaded = True +except ImportError: + snakebite_loaded = False class HDFSHookException(AirflowException): @@ -39,15 +36,15 @@ class HDFSHook(BaseHook): Interact with HDFS. This class is a wrapper around the snakebite library. 
:param hdfs_conn_id: Connection id to fetch connection info - :type conn_id: string + :type hdfs_conn_id: str :param proxy_user: effective user for HDFS operations - :type proxy_user: string + :type proxy_user: str :param autoconfig: use snakebite's automatically configured client :type autoconfig: bool """ def __init__(self, hdfs_conn_id='hdfs_default', proxy_user=None, autoconfig=False): - if not snakebite_imported: + if not snakebite_loaded: raise ImportError( 'This HDFSHook implementation requires snakebite, but ' 'snakebite is not compatible with Python 3 ' diff --git a/airflow/hooks/hive_hooks.py b/airflow/hooks/hive_hooks.py index 93e1f45fe6402..2cf22a36498aa 100644 --- a/airflow/hooks/hive_hooks.py +++ b/airflow/hooks/hive_hooks.py @@ -21,30 +21,39 @@ import contextlib import os - -from six.moves import zip -from past.builtins import basestring, unicode - -import unicodecsv as csv import re -import six import subprocess import time from collections import OrderedDict from tempfile import NamedTemporaryFile -import hmsclient -from airflow import configuration as conf +import six +import unicodecsv as csv +from past.builtins import basestring +from past.builtins import unicode +from six.moves import zip + +import airflow.security.utils as utils +from airflow import configuration from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook -from airflow.utils.helpers import as_flattened_list from airflow.utils.file import TemporaryDirectory -from airflow import configuration -import airflow.security.utils as utils +from airflow.utils.helpers import as_flattened_list +from airflow.utils.operator_helpers import AIRFLOW_VAR_NAME_FORMAT_MAPPING HIVE_QUEUE_PRIORITIES = ['VERY_HIGH', 'HIGH', 'NORMAL', 'LOW', 'VERY_LOW'] +def get_context_from_env_var(): + """ + Extract context from env variable, e.g. dag_id, task_id and execution_date, + so that they can be used inside BashOperator and PythonOperator. + :return: The context of interest. + """ + return {format_map['default']: os.environ.get(format_map['env_var_format'], '') + for format_map in AIRFLOW_VAR_NAME_FORMAT_MAPPING.values()} + + class HiveCliHook(BaseHook): """Simple wrapper around the hive CLI. @@ -62,13 +71,13 @@ class HiveCliHook(BaseHook): connection string as is. :param mapred_queue: queue used by the Hadoop Scheduler (Capacity or Fair) - :type mapred_queue: string + :type mapred_queue: str :param mapred_queue_priority: priority within the job queue. Possible settings include: VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW - :type mapred_queue_priority: string + :type mapred_queue_priority: str :param mapred_job_name: This name will appear in the jobtracker. This can make monitoring easier. - :type mapred_job_name: string + :type mapred_job_name: str """ def __init__( @@ -92,8 +101,8 @@ def __init__( "Invalid Mapred Queue Priority. 
Valid values are: " "{}".format(', '.join(HIVE_QUEUE_PRIORITIES))) - self.mapred_queue = mapred_queue or conf.get('hive', - 'default_hive_mapred_queue') + self.mapred_queue = mapred_queue or configuration.get('hive', + 'default_hive_mapred_queue') self.mapred_queue_priority = mapred_queue_priority self.mapred_job_name = mapred_job_name @@ -126,6 +135,7 @@ def _prepare_cli_cmd(self): jdbc_url += ";auth=" + self.auth jdbc_url = jdbc_url.format(**locals()) + jdbc_url = '"{}"'.format(jdbc_url) cmd_extra += ['-u', jdbc_url] if conn.login: @@ -137,7 +147,8 @@ def _prepare_cli_cmd(self): return [hive_bin] + cmd_extra + hive_params_list - def _prepare_hiveconf(self, d): + @staticmethod + def _prepare_hiveconf(d): """ This function prepares a list of hiveconf params from a dictionary of key value pairs. @@ -184,10 +195,15 @@ def run_cli(self, hql, schema=None, verbose=True, hive_conf=None): with TemporaryDirectory(prefix='airflow_hiveop_') as tmp_dir: with NamedTemporaryFile(dir=tmp_dir) as f: + hql = hql + '\n' f.write(hql.encode('UTF-8')) f.flush() hive_cmd = self._prepare_cli_cmd() - hive_conf_params = self._prepare_hiveconf(hive_conf) + env_context = get_context_from_env_var() + # Only extend the hive_conf if it is defined. + if hive_conf: + env_context.update(hive_conf) + hive_conf_params = self._prepare_hiveconf(env_context) if self.mapred_queue: hive_conf_params.extend( ['-hiveconf', @@ -217,7 +233,7 @@ def run_cli(self, hql, schema=None, verbose=True, hive_conf=None): hive_cmd.extend(['-f', f.name]) if verbose: - self.log.info(" ".join(hive_cmd)) + self.log.info("%s", " ".join(hive_cmd)) sp = subprocess.Popen( hive_cmd, stdout=subprocess.PIPE, @@ -273,7 +289,7 @@ def test_hql(self, hql): except AirflowException as e: message = e.args[0].split('\n')[-2] self.log.info(message) - error_loc = re.search('(\d+):(\d+)', message) + error_loc = re.search(r'(\d+):(\d+)', message) if error_loc and error_loc.group(1).isdigit(): lst = int(error_loc.group(1)) begin = max(lst - 2, 0) @@ -298,16 +314,16 @@ def load_df( not be sanitized. :param df: DataFrame to load into a Hive table - :type df: DataFrame + :type df: pandas.DataFrame :param table: target Hive table, use dot notation to target a specific database :type table: str :param field_dict: mapping from column name to hive data type. Note that it must be OrderedDict so as to keep columns' order. - :type field_dict: OrderedDict + :type field_dict: collections.OrderedDict :param delimiter: field delimiter in the file :type delimiter: str - :param encoding: string encoding to use when writing DataFrame to file + :param encoding: str encoding to use when writing DataFrame to file :type encoding: str :param pandas_kwargs: passed to DataFrame.to_csv :type pandas_kwargs: dict @@ -390,7 +406,7 @@ def load_file( :param field_dict: A dictionary of the fields name in the file as keys and their Hive types as values. Note that it must be OrderedDict so as to keep columns' order. - :type field_dict: OrderedDict + :type field_dict: collections.OrderedDict :param create: whether to create the table if it doesn't exist :type create: bool :param overwrite: whether to overwrite the data in table or partition @@ -435,11 +451,11 @@ def load_file( if partition: pvals = ", ".join( ["{0}='{1}'".format(k, v) for k, v in partition.items()]) - hql += "PARTITION ({pvals});" + hql += "PARTITION ({pvals})" # As a workaround for HIVE-10541, add a newline character # at the end of hql (AIRFLOW-2412). 
- hql += '\n' + hql += ';\n' hql = hql.format(**locals()) self.log.info(hql) @@ -479,6 +495,7 @@ def get_metastore_client(self): """ Returns a Hive thrift client. """ + import hmsclient from thrift.transport import TSocket, TTransport from thrift.protocol import TBinaryProtocol ms = self.metastore_conn @@ -519,13 +536,13 @@ def check_for_partition(self, schema, table, partition): Checks whether a partition exists :param schema: Name of hive schema (database) @table belongs to - :type schema: string + :type schema: str :param table: Name of hive table @partition belongs to - :type schema: string + :type schema: str :partition: Expression that matches the partitions to check for (eg `a = 'b' AND c = 'd'`) - :type schema: string - :rtype: boolean + :type schema: str + :rtype: bool >>> hh = HiveMetastoreHook() >>> t = 'static_babynames_partitioned' @@ -546,12 +563,12 @@ def check_for_named_partition(self, schema, table, partition_name): Checks whether a partition with a given name exists :param schema: Name of hive schema (database) @table belongs to - :type schema: string + :type schema: str :param table: Name of hive table @partition belongs to - :type schema: string + :type schema: str :partition: Name of the partitions to check for (eg `a=b/c=d`) - :type schema: string - :rtype: boolean + :type schema: str + :rtype: bool >>> hh = HiveMetastoreHook() >>> t = 'static_babynames_partitioned' @@ -635,7 +652,7 @@ def _get_max_partition_from_part_specs(part_specs, partition_key, filter_map): :param part_specs: list of partition specs. :type part_specs: list :param partition_key: partition key name. - :type partition_key: string + :type partition_key: str :param filter_map: partition_key:partition_value map used for partition filtering, e.g. {'key1': 'value1', 'key2': 'value2'}. Only partitions matching all partition_key:partition_value @@ -675,11 +692,11 @@ def max_partition(self, schema, table_name, field=None, filter_map=None): filter out partitions. :param schema: schema name. - :type schema: string + :type schema: str :param table_name: table name. - :type table_name: string + :type table_name: str :param field: partition key to get max partition from. - :type field: string + :type field: str :param filter_map: partition_key:partition_value map used for partition filtering. :type filter_map: map @@ -744,6 +761,9 @@ def __init__(self, hiveserver2_conn_id='hiveserver2_default'): self.hiveserver2_conn_id = hiveserver2_conn_id def get_conn(self, schema=None): + """ + Returns a Hive connection object. 
+ """ db = self.get_connection(self.hiveserver2_conn_id) auth_mechanism = db.extra_dejson.get('authMechanism', 'NONE') if auth_mechanism == 'NONE' and db.login is None: @@ -772,7 +792,7 @@ def get_conn(self, schema=None): username=db.login or username, database=schema or db.schema or 'default') - def _get_results(self, hql, schema='default', fetch_size=None): + def _get_results(self, hql, schema='default', fetch_size=None, hive_conf=None): from pyhive.exc import ProgrammingError if isinstance(hql, basestring): hql = [hql] @@ -780,12 +800,21 @@ def _get_results(self, hql, schema='default', fetch_size=None): with contextlib.closing(self.get_conn(schema)) as conn, \ contextlib.closing(conn.cursor()) as cur: cur.arraysize = fetch_size or 1000 + + env_context = get_context_from_env_var() + if hive_conf: + env_context.update(hive_conf) + for k, v in env_context.items(): + cur.execute("set {}={}".format(k, v)) + for statement in hql: cur.execute(statement) # we only get results of statements that returns lowered_statement = statement.lower().strip() if (lowered_statement.startswith('select') or - lowered_statement.startswith('with')): + lowered_statement.startswith('with') or + (lowered_statement.startswith('set') and + '=' not in lowered_statement)): description = [c for c in cur.description] if previous_description and previous_description != description: message = '''The statements are producing different descriptions: @@ -805,8 +834,23 @@ def _get_results(self, hql, schema='default', fetch_size=None): except ProgrammingError: self.log.debug("get_results returned no records") - def get_results(self, hql, schema='default', fetch_size=None): - results_iter = self._get_results(hql, schema, fetch_size=fetch_size) + def get_results(self, hql, schema='default', fetch_size=None, hive_conf=None): + """ + Get results of the provided hql in target schema. + + :param hql: hql to be executed. + :type hql: str or list + :param schema: target schema, default to 'default'. + :type schema: str + :param fetch_size: max size of result to fetch. + :type fetch_size: int + :param hive_conf: hive_conf to execute alone with the hql. + :type hive_conf: dict + :return: results of hql execution, dict with data (list of results) and header + :rtype: dict + """ + results_iter = self._get_results(hql, schema, + fetch_size=fetch_size, hive_conf=hive_conf) header = next(results_iter) results = { 'data': list(results_iter), @@ -822,12 +866,36 @@ def to_csv( delimiter=',', lineterminator='\r\n', output_header=True, - fetch_size=1000): + fetch_size=1000, + hive_conf=None): + """ + Execute hql in target schema and write results to a csv file. + + :param hql: hql to be executed. + :type hql: str or list + :param csv_filepath: filepath of csv to write results into. + :type csv_filepath: str + :param schema: target schema, default to 'default'. + :type schema: str + :param delimiter: delimiter of the csv file, default to ','. + :type delimiter: str + :param lineterminator: lineterminator of the csv file. + :type lineterminator: str + :param output_header: header of the csv file, default to True. + :type output_header: bool + :param fetch_size: number of result rows to write into the csv file, default to 1000. + :type fetch_size: int + :param hive_conf: hive_conf to execute alone with the hql. 
+ :type hive_conf: dict + + """ - results_iter = self._get_results(hql, schema, fetch_size=fetch_size) + results_iter = self._get_results(hql, schema, + fetch_size=fetch_size, hive_conf=hive_conf) header = next(results_iter) message = None + i = 0 with open(csv_filepath, 'wb') as f: writer = csv.writer(f, delimiter=delimiter, @@ -838,7 +906,7 @@ def to_csv( self.log.debug('Cursor description is %s', header) writer.writerow([c[0] for c in header]) - for i, row in enumerate(results_iter): + for i, row in enumerate(results_iter, 1): writer.writerow(row) if i % fetch_size == 0: self.log.info("Written %s rows so far.", i) @@ -856,6 +924,15 @@ def get_records(self, hql, schema='default'): """ Get a set of records from a Hive query. + :param hql: hql to be executed. + :type hql: str or list + :param schema: target schema, default to 'default'. + :type schema: str + :param hive_conf: hive_conf to execute alone with the hql. + :type hive_conf: dict + :return: result of hive execution + :rtype: list + >>> hh = HiveServer2Hook() >>> sql = "SELECT * FROM airflow.static_babynames LIMIT 100" >>> len(hh.get_records(sql)) @@ -867,11 +944,20 @@ def get_pandas_df(self, hql, schema='default'): """ Get a pandas dataframe from a Hive query + :param hql: hql to be executed. + :type hql: str or list + :param schema: target schema, default to 'default'. + :type schema: str + :return: result of hql execution + :rtype: DataFrame + >>> hh = HiveServer2Hook() >>> sql = "SELECT * FROM airflow.static_babynames LIMIT 100" >>> df = hh.get_pandas_df(sql) >>> len(df.index) 100 + + :return: pandas.DateFrame """ import pandas as pd res = self.get_results(hql, schema=schema) diff --git a/airflow/hooks/http_hook.py b/airflow/hooks/http_hook.py index c449fe0c15ffb..9c8c3854f2200 100644 --- a/airflow/hooks/http_hook.py +++ b/airflow/hooks/http_hook.py @@ -29,6 +29,7 @@ class HttpHook(BaseHook): """ Interact with HTTP servers. + :param http_conn_id: connection that has the base API url i.e https://www.google.com/ and optional authentication credentials. Default headers can also be specified in the Extra field in json format. @@ -52,6 +53,7 @@ def __init__( def get_conn(self, headers=None): """ Returns http session for use with requests + :param headers: additional headers to be passed through as a dictionary :type headers: dict """ @@ -70,7 +72,10 @@ def get_conn(self, headers=None): if conn.login: session.auth = (conn.login, conn.password) if conn.extra: - session.headers.update(conn.extra_dejson) + try: + session.headers.update(conn.extra_dejson) + except TypeError: + self.log.warn('Connection to %s has invalid extra field.', conn.host) if headers: session.headers.update(headers) @@ -79,6 +84,7 @@ def get_conn(self, headers=None): def run(self, endpoint, data=None, headers=None, extra_options=None): """ Performs the request + :param endpoint: the endpoint to be called i.e. resource/v1/query? 
:type endpoint: str :param data: payload to be uploaded or request parameters @@ -94,7 +100,11 @@ def run(self, endpoint, data=None, headers=None, extra_options=None): session = self.get_conn(headers) - url = self.base_url + endpoint + if not self.base_url.endswith('/') and not endpoint.startswith('/'): + url = self.base_url + '/' + endpoint + else: + url = self.base_url + endpoint + req = None if self.method == 'GET': # GET uses params @@ -122,6 +132,7 @@ def check_response(self, response): """ Checks the status code and raise an AirflowException exception on non 2XX or 3XX status codes + :param response: A requests response object :type response: requests.response """ @@ -137,6 +148,7 @@ def run_and_check(self, session, prepped_request, extra_options): """ Grabs extra options like timeout and actually runs the request, checking for the result + :param session: the session to be used to execute the request :type session: requests.Session :param prepped_request: the prepared request generated in run() @@ -171,12 +183,14 @@ def run_with_advanced_retry(self, _retry_args, *args, **kwargs): Runs Hook.run() with a Tenacity decorator attached to it. This is useful for connectors which might be disturbed by intermittent issues and should not instantly fail. + :param _retry_args: Arguments which define the retry behaviour. See Tenacity documentation at https://github.com/jd/tenacity :type _retry_args: dict - Example: :: + :Example:: + hook = HttpHook(http_conn_id='my_conn',method='GET') retry_args = dict( wait=tenacity.wait_exponential(), diff --git a/airflow/hooks/mssql_hook.py b/airflow/hooks/mssql_hook.py index 7ba34892a7d56..6dc3980fd20e2 100644 --- a/airflow/hooks/mssql_hook.py +++ b/airflow/hooks/mssql_hook.py @@ -50,3 +50,6 @@ def get_conn(self): def set_autocommit(self, conn, autocommit): conn.autocommit(autocommit) + + def get_autocommit(self, conn): + return conn.autocommit_state diff --git a/airflow/hooks/mysql_hook.py b/airflow/hooks/mysql_hook.py index c02c0f43b5e37..3e839527ee9a5 100644 --- a/airflow/hooks/mysql_hook.py +++ b/airflow/hooks/mysql_hook.py @@ -19,6 +19,8 @@ import MySQLdb import MySQLdb.cursors +import json +import six from airflow.hooks.dbapi_hook import DbApiHook @@ -49,10 +51,11 @@ def set_autocommit(self, conn, autocommit): def get_autocommit(self, conn): """ MySql connection gets autocommit in a different way. + :param conn: connection to get autocommit setting from. :type conn: connection object. :return: connection autocommit setting - :rtype bool + :rtype: bool """ return conn.get_autocommit() @@ -87,7 +90,15 @@ def get_conn(self): conn_config["cursorclass"] = MySQLdb.cursors.SSDictCursor local_infile = conn.extra_dejson.get('local_infile', False) if conn.extra_dejson.get('ssl', False): - conn_config['ssl'] = conn.extra_dejson['ssl'] + # SSL parameter for MySQL has to be a dictionary and in case + # of extra/dejson we can get string if extra is passed via + # URL parameters + dejson_ssl = conn.extra_dejson['ssl'] + if isinstance(dejson_ssl, six.string_types): + dejson_ssl = json.loads(dejson_ssl) + conn_config['ssl'] = dejson_ssl + if conn.extra_dejson.get('unix_socket'): + conn_config['unix_socket'] = conn.extra_dejson['unix_socket'] if local_infile: conn_config["local_infile"] = 1 conn = MySQLdb.connect(**conn_config) @@ -121,7 +132,7 @@ def bulk_dump(self, table, tmp_file): def _serialize_cell(cell, conn): """ MySQLdb converts an argument to a literal - when passing those seperately to execute. Hence, this method does nothing. 
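# Aside (illustrative, not part of the patch): the base_url/endpoint joining rule
# added to HttpHook.run() above inserts a '/' only when neither side supplies one.
def _join_url(base_url, endpoint):
    if not base_url.endswith('/') and not endpoint.startswith('/'):
        return base_url + '/' + endpoint
    return base_url + endpoint

assert _join_url('https://www.google.com', 'search') == 'https://www.google.com/search'
assert _join_url('https://www.google.com/', 'search') == 'https://www.google.com/search'
assert _join_url('https://www.google.com', '/search') == 'https://www.google.com/search'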
+ when passing those separately to execute. Hence, this method does nothing. :param cell: The cell to insert into the table :type cell: object diff --git a/airflow/hooks/oracle_hook.py b/airflow/hooks/oracle_hook.py index 39e447d010793..5d617c9a87073 100644 --- a/airflow/hooks/oracle_hook.py +++ b/airflow/hooks/oracle_hook.py @@ -46,24 +46,69 @@ def get_conn(self): :param dsn: the host address for the Oracle server :param service_name: the db_unique_name of the database that you are connecting to (CONNECT_DATA part of TNS) + You can set these parameters in the extra fields of your connection as in ``{ "dsn":"some.host.address" , "service_name":"some.service.name" }`` + see more param detail in + `cx_Oracle.connect `_ """ conn = self.get_connection(self.oracle_conn_id) + conn_config = { + 'user': conn.login, + 'password': conn.password + } dsn = conn.extra_dejson.get('dsn', None) sid = conn.extra_dejson.get('sid', None) mod = conn.extra_dejson.get('module', None) service_name = conn.extra_dejson.get('service_name', None) + port = conn.port if conn.port else 1521 if dsn and sid and not service_name: - dsn = cx_Oracle.makedsn(dsn, conn.port, sid) - conn = cx_Oracle.connect(conn.login, conn.password, dsn=dsn) + conn_config['dsn'] = cx_Oracle.makedsn(dsn, port, sid) elif dsn and service_name and not sid: - dsn = cx_Oracle.makedsn(dsn, conn.port, service_name=service_name) - conn = cx_Oracle.connect(conn.login, conn.password, dsn=dsn) + conn_config['dsn'] = cx_Oracle.makedsn(dsn, port, service_name=service_name) else: - conn = cx_Oracle.connect(conn.login, conn.password, conn.host) + conn_config['dsn'] = conn.host + + if 'encoding' in conn.extra_dejson: + conn_config['encoding'] = conn.extra_dejson.get('encoding') + # if `encoding` is specific but `nencoding` is not + # `nencoding` should use same values as `encoding` to set encoding, inspired by + # https://github.com/oracle/python-cx_Oracle/issues/157#issuecomment-371877993 + if 'nencoding' not in conn.extra_dejson: + conn_config['nencoding'] = conn.extra_dejson.get('encoding') + if 'nencoding' in conn.extra_dejson: + conn_config['nencoding'] = conn.extra_dejson.get('nencoding') + if 'threaded' in conn.extra_dejson: + conn_config['threaded'] = conn.extra_dejson.get('threaded') + if 'events' in conn.extra_dejson: + conn_config['events'] = conn.extra_dejson.get('events') + + mode = conn.extra_dejson.get('mode', '').lower() + if mode == 'sysdba': + conn_config['mode'] = cx_Oracle.SYSDBA + elif mode == 'sysasm': + conn_config['mode'] = cx_Oracle.SYSASM + elif mode == 'sysoper': + conn_config['mode'] = cx_Oracle.SYSOPER + elif mode == 'sysbkp': + conn_config['mode'] = cx_Oracle.SYSBKP + elif mode == 'sysdgd': + conn_config['mode'] = cx_Oracle.SYSDGD + elif mode == 'syskmt': + conn_config['mode'] = cx_Oracle.SYSKMT + elif mode == 'sysrac': + conn_config['mode'] = cx_Oracle.SYSRAC + + purity = conn.extra_dejson.get('purity', '').lower() + if purity == 'new': + conn_config['purity'] = cx_Oracle.ATTR_PURITY_NEW + elif purity == 'self': + conn_config['purity'] = cx_Oracle.ATTR_PURITY_SELF + elif purity == 'default': + conn_config['purity'] = cx_Oracle.ATTR_PURITY_DEFAULT + conn = cx_Oracle.connect(**conn_config) if mod is not None: conn.module = mod @@ -74,10 +119,23 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000): A generic way to insert a set of tuples into a table, the whole set of inserts is treated as one transaction Changes from standard DbApiHook implementation: - - Oracle SQL queries in cx_Oracle can not be 
terminated with a semicolon (';') - - Replace NaN values with NULL using numpy.nan_to_num (not using is_nan() - because of input types error for strings) + + - Oracle SQL queries in cx_Oracle can not be terminated with a semicolon (`;`) + - Replace NaN values with NULL using `numpy.nan_to_num` (not using + `is_nan()` because of input types error for strings) - Coerce datetime cells to Oracle DATETIME format during insert + + :param table: target Oracle table, use dot notation to target a + specific database + :type table: str + :param rows: the rows to insert into the table + :type rows: iterable of tuples + :param target_fields: the names of the columns to fill in the table + :type target_fields: iterable of str + :param commit_every: the maximum number of rows to insert in one transaction + Default 1000, Set greater than 0. + Set 1 to insert each row in each single transaction + :type commit_every: int """ if target_fields: target_fields = ', '.join(target_fields) @@ -117,25 +175,39 @@ def insert_rows(self, table, rows, target_fields=None, commit_every=1000): cur.execute(sql) if i % commit_every == 0: conn.commit() - self.log.info('Loaded {i} into {table} rows so far'.format(**locals())) + self.log.info('Loaded %s into %s rows so far', i, table) conn.commit() cur.close() conn.close() - self.log.info('Done loading. Loaded a total of {i} rows'.format(**locals())) + self.log.info('Done loading. Loaded a total of %s rows', i) def bulk_insert_rows(self, table, rows, target_fields=None, commit_every=5000): """ A performant bulk insert for cx_Oracle that uses prepared statements via `executemany()`. For best performance, pass in `rows` as an iterator. + + :param table: target Oracle table, use dot notation to target a + specific database + :type table: str + :param rows: the rows to insert into the table + :type rows: iterable of tuples + :param target_fields: the names of the columns to fill in the table, default None. + If None, each rows should have some order as table columns name + :type target_fields: iterable of str Or None + :param commit_every: the maximum number of rows to insert in one transaction + Default 5000. Set greater than 0. 
Set 1 to insert each row in each transaction + :type commit_every: int """ + if not rows: + raise ValueError("parameter rows could not be None or empty iterable") conn = self.get_conn() cursor = conn.cursor() - values = ', '.join(':%s' % i for i in range(1, len(target_fields) + 1)) - prepared_stm = 'insert into {tablename} ({columns}) values ({values})'.format( + values_base = target_fields if target_fields else rows[0] + prepared_stm = 'insert into {tablename} {columns} values ({values})'.format( tablename=table, - columns=', '.join(target_fields), - values=values, + columns='({})'.format(', '.join(target_fields)) if target_fields else '', + values=', '.join(':%s' % i for i in range(1, len(values_base) + 1)), ) row_count = 0 # Chunk the rows diff --git a/airflow/hooks/pig_hook.py b/airflow/hooks/pig_hook.py index a3836b1ba321f..7cb7d70ca413e 100644 --- a/airflow/hooks/pig_hook.py +++ b/airflow/hooks/pig_hook.py @@ -55,7 +55,7 @@ def run_cli(self, pig, verbose=True): with TemporaryDirectory(prefix='airflow_pigop_') as tmp_dir: with NamedTemporaryFile(dir=tmp_dir) as f: - f.write(pig) + f.write(pig.encode('utf-8')) f.flush() fname = f.name pig_bin = 'pig' @@ -67,7 +67,7 @@ def run_cli(self, pig, verbose=True): pig_properties_list = self.pig_properties.split() pig_cmd.extend(pig_properties_list) if verbose: - self.log.info(" ".join(pig_cmd)) + self.log.info("%s", " ".join(pig_cmd)) sp = subprocess.Popen( pig_cmd, stdout=subprocess.PIPE, @@ -76,8 +76,8 @@ def run_cli(self, pig, verbose=True): close_fds=True) self.sp = sp stdout = '' - for line in iter(sp.stdout.readline, ''): - stdout += line + for line in iter(sp.stdout.readline, b''): + stdout += line.decode('utf-8') if verbose: self.log.info(line.strip()) sp.wait() diff --git a/airflow/hooks/presto_hook.py b/airflow/hooks/presto_hook.py index d6b5293fc0f97..20c918c1a4f49 100644 --- a/airflow/hooks/presto_hook.py +++ b/airflow/hooks/presto_hook.py @@ -21,6 +21,7 @@ from pyhive import presto from pyhive.exc import DatabaseError +from requests.auth import HTTPBasicAuth from airflow.hooks.dbapi_hook import DbApiHook @@ -45,18 +46,25 @@ class PrestoHook(DbApiHook): def get_conn(self): """Returns a connection object""" db = self.get_connection(self.presto_conn_id) + reqkwargs = None + if db.password is not None: + reqkwargs = {'auth': HTTPBasicAuth(db.login, db.password)} return presto.connect( host=db.host, port=db.port, username=db.login, + source=db.extra_dejson.get('source', 'airflow'), + protocol=db.extra_dejson.get('protocol', 'http'), catalog=db.extra_dejson.get('catalog', 'hive'), + requests_kwargs=reqkwargs, schema=db.schema) @staticmethod def _strip_sql(sql): return sql.strip().rstrip(';') - def _get_pretty_exception_message(self, e): + @staticmethod + def _get_pretty_exception_message(e): """ Parses some DatabaseError to provide a better error message """ diff --git a/airflow/hooks/slack_hook.py b/airflow/hooks/slack_hook.py index 17006fc9a5cc9..953ce319b1eee 100644 --- a/airflow/hooks/slack_hook.py +++ b/airflow/hooks/slack_hook.py @@ -34,9 +34,9 @@ def __init__(self, token=None, slack_conn_id=None): If both supplied, Slack API token will be used. 
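# Aside (illustrative, not part of the patch): the statement that the reworked
# OracleHook.bulk_insert_rows() above prepares, shown with hypothetical table and
# column names. The column list is omitted entirely when target_fields is None.
target_fields = ['id', 'name']
rows = [(1, 'alice'), (2, 'bob')]
values_base = target_fields if target_fields else rows[0]
prepared_stm = 'insert into {tablename} {columns} values ({values})'.format(
    tablename='my_table',
    columns='({})'.format(', '.join(target_fields)) if target_fields else '',
    values=', '.join(':%s' % i for i in range(1, len(values_base) + 1)),
)
assert prepared_stm == 'insert into my_table (id, name) values (:1, :2)'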
:param token: Slack API token - :type token: string + :type token: str :param slack_conn_id: connection that has Slack API token in the password field - :type slack_conn_id: string + :type slack_conn_id: str """ self.token = self.__get_token(token, slack_conn_id) diff --git a/airflow/hooks/webhdfs_hook.py b/airflow/hooks/webhdfs_hook.py index c4dbe8be6505a..58289ba40653a 100644 --- a/airflow/hooks/webhdfs_hook.py +++ b/airflow/hooks/webhdfs_hook.py @@ -66,8 +66,8 @@ def get_conn(self): return client except HdfsError as e: self.log.debug( - "Read operation on namenode {nn.host} " - "failed with error: {e}".format(**locals()) + "Read operation on namenode %s " + "failed with error: %s", nn.host, e ) nn_hosts = [c.host for c in nn_connections] no_nn_error = "Read operations failed " \ @@ -83,7 +83,7 @@ def check_for_path(self, hdfs_path): def load_file(self, source, destination, overwrite=True, parallelism=1, **kwargs): - """ + r""" Uploads a file to HDFS :param source: Local path to file or folder. If a folder, all the files diff --git a/airflow/hooks/zendesk_hook.py b/airflow/hooks/zendesk_hook.py index 3cf8353344f11..a6f758345018b 100644 --- a/airflow/hooks/zendesk_hook.py +++ b/airflow/hooks/zendesk_hook.py @@ -33,7 +33,8 @@ def __init__(self, zendesk_conn_id): def get_conn(self): conn = self.get_connection(self.__zendesk_conn_id) self.__url = "https://" + conn.host - return Zendesk(self.__url, conn.login, conn.password, True) + return Zendesk(zdesk_url=self.__url, zdesk_email=conn.login, zdesk_password=conn.password, + zdesk_token=True) def __handle_rate_limit_exception(self, rate_limit_exception): """ diff --git a/airflow/jobs.py b/airflow/jobs.py index ad114abda3d6f..0a403b0818662 100644 --- a/airflow/jobs.py +++ b/airflow/jobs.py @@ -26,46 +26,47 @@ import logging import multiprocessing import os -import psutil import signal -import six import sys import threading import time -import datetime +from collections import defaultdict, OrderedDict +from datetime import timedelta +from time import sleep +from typing import Any -from collections import defaultdict +import six from past.builtins import basestring -from sqlalchemy import ( - Column, Integer, String, func, Index, or_, and_, not_) +from sqlalchemy import (Column, Index, Integer, String, and_, func, not_, or_) from sqlalchemy.exc import OperationalError from sqlalchemy.orm.session import make_transient -from sqlalchemy_utc import UtcDateTime -from tabulate import tabulate -from time import sleep from airflow import configuration as conf from airflow import executors, models, settings from airflow.exceptions import AirflowException -from airflow.models import DAG, DagRun +from airflow.models import DAG, DagRun, errors +from airflow.models.dagpickle import DagPickle +from airflow.models.slamiss import SlaMiss from airflow.settings import Stats from airflow.task.task_runner import get_task_runner from airflow.ti_deps.dep_context import DepContext, QUEUE_DEPS, RUN_DEPS from airflow.utils import asciiart, helpers, timezone +from airflow.utils.configuration import tmp_configuration_copy from airflow.utils.dag_processing import (AbstractDagFileProcessor, - DagFileProcessorManager, + DagFileProcessorAgent, SimpleDag, SimpleDagBag, + SimpleTaskInstance, list_py_file_paths) from airflow.utils.db import create_session, provide_session -from airflow.utils.email import send_email -from airflow.utils.log.logging_mixin import LoggingMixin, set_context, StreamLogWriter -from airflow.utils.state import State -from airflow.utils.configuration 
import tmp_configuration_copy +from airflow.utils.email import get_email_address_list, send_email +from airflow.utils.log.logging_mixin import LoggingMixin, StreamLogWriter, set_context from airflow.utils.net import get_hostname +from airflow.utils.sqlalchemy import UtcDateTime +from airflow.utils.state import State -Base = models.Base -ID_LEN = models.ID_LEN +Base = models.base.Base # type: Any +ID_LEN = models.base.ID_LEN class BaseJob(Base, LoggingMixin): @@ -96,6 +97,7 @@ class BaseJob(Base, LoggingMixin): __table_args__ = ( Index('job_type_heart', job_type, latest_heartbeat), + Index('idx_job_state_heartbeat', state, latest_heartbeat), ) def __init__( @@ -126,7 +128,7 @@ def kill(self, session=None): try: self.on_kill() except Exception as e: - self.log.error('on_kill() method failed: {}'.format(e)) + self.log.error('on_kill() method failed: %s', str(e)) session.merge(job) session.commit() raise AirflowException("Job shut down externally.") @@ -159,32 +161,38 @@ def heartbeat(self): heart rate. If you go over 60 seconds before calling it, it won't sleep at all. """ - with create_session() as session: - job = session.query(BaseJob).filter_by(id=self.id).one() - make_transient(job) - session.commit() - - if job.state == State.SHUTDOWN: - self.kill() - - # Figure out how long to sleep for - sleep_for = 0 - if job.latest_heartbeat: - sleep_for = max( - 0, - self.heartrate - (timezone.utcnow() - job.latest_heartbeat).total_seconds()) - - sleep(sleep_for) - - # Update last heartbeat time - with create_session() as session: - job = session.query(BaseJob).filter(BaseJob.id == self.id).first() - job.latest_heartbeat = timezone.utcnow() - session.merge(job) - session.commit() - - self.heartbeat_callback(session=session) - self.log.debug('[heartbeat]') + try: + with create_session() as session: + job = session.query(BaseJob).filter_by(id=self.id).one() + make_transient(job) + session.commit() + + if job.state == State.SHUTDOWN: + self.kill() + + is_unit_test = conf.getboolean('core', 'unit_test_mode') + if not is_unit_test: + # Figure out how long to sleep for + sleep_for = 0 + if job.latest_heartbeat: + seconds_remaining = self.heartrate - \ + (timezone.utcnow() - job.latest_heartbeat)\ + .total_seconds() + sleep_for = max(0, seconds_remaining) + + sleep(sleep_for) + + # Update last heartbeat time + with create_session() as session: + job = session.query(BaseJob).filter(BaseJob.id == self.id).first() + job.latest_heartbeat = timezone.utcnow() + session.merge(job) + session.commit() + + self.heartbeat_callback(session=session) + self.log.debug('[heartbeat]') + except OperationalError as e: + self.log.error("Scheduler heartbeat got an exception: %s", str(e)) def run(self): Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1) @@ -197,14 +205,20 @@ def run(self): make_transient(self) self.id = id_ - # Run - self._execute() - - # Marking the success in the DB - self.end_date = timezone.utcnow() - self.state = State.SUCCESS - session.merge(self) - session.commit() + try: + self._execute() + # In case of max runs or max duration + self.state = State.SUCCESS + except SystemExit: + # In case of ^C or SIGTERM + self.state = State.SUCCESS + except Exception: + self.state = State.FAILED + raise + finally: + self.end_date = timezone.utcnow() + session.merge(self) + session.commit() Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1) @@ -222,9 +236,9 @@ def reset_state_for_orphaned_tasks(self, filter_by_dag_run=None, session=None): sequence. 
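# Aside (illustrative, not part of the patch): the back-off computed in
# BaseJob.heartbeat() above never sleeps longer than the configured heartrate and
# skips sleeping once the last heartbeat is already older than that. The real code
# uses timezone-aware datetimes from airflow.utils.timezone; naive ones are used
# here only to keep the sketch self-contained.
from datetime import datetime, timedelta

def seconds_to_sleep(heartrate, latest_heartbeat, now):
    seconds_remaining = heartrate - (now - latest_heartbeat).total_seconds()
    return max(0, seconds_remaining)

now = datetime.utcnow()
assert seconds_to_sleep(5, now - timedelta(seconds=2), now) == 3.0
assert seconds_to_sleep(5, now - timedelta(seconds=60), now) == 0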
:param filter_by_dag_run: the dag_run we want to process, None if all - :type filter_by_dag_run: models.DagRun + :type filter_by_dag_run: airflow.models.DagRun :return: the TIs reset (in expired SQLAlchemy state) - :rtype: List(TaskInstance) + :rtype: list[airflow.models.TaskInstance] """ queued_tis = self.executor.queued_tasks # also consider running as the state might not have changed in the db yet @@ -280,7 +294,7 @@ def query(result, items): self.max_tis_per_query) task_instance_str = '\n\t'.join( - ["{}".format(x) for x in reset_tis]) + [repr(x) for x in reset_tis]) session.commit() self.log.info( @@ -296,7 +310,7 @@ class DagFileProcessor(AbstractDagFileProcessor, LoggingMixin): # Counter that increments everytime an instance of this class is created class_creation_counter = 0 - def __init__(self, file_path, pickle_dags, dag_id_white_list): + def __init__(self, file_path, pickle_dags, dag_id_white_list, zombies): """ :param file_path: a Python file containing Airflow DAG definitions :type file_path: unicode @@ -304,6 +318,8 @@ def __init__(self, file_path, pickle_dags, dag_id_white_list): :type pickle_dags: bool :param dag_id_whitelist: If specified, only look at these DAG ID's :type dag_id_whitelist: list[unicode] + :param zombies: zombie task instances to kill + :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance] """ self._file_path = file_path # Queue that's used to pass results from the child process. @@ -312,6 +328,7 @@ def __init__(self, file_path, pickle_dags, dag_id_white_list): self._process = None self._dag_id_white_list = dag_id_white_list self._pickle_dags = pickle_dags + self._zombies = zombies # The result of Scheduler.process_file(file_path). self._result = None # Whether the process is done running. @@ -332,7 +349,8 @@ def _launch_process(result_queue, file_path, pickle_dags, dag_id_white_list, - thread_name): + thread_name, + zombies): """ Launch a process to process the given file. @@ -341,15 +359,17 @@ def _launch_process(result_queue, :param file_path: the file to process :type file_path: unicode :param pickle_dags: whether to pickle the DAGs found in the file and - save them to the DB + save them to the DB :type pickle_dags: bool :param dag_id_white_list: if specified, only examine DAG ID's that are - in this list + in this list :type dag_id_white_list: list[unicode] :param thread_name: the name to use for the process that is launched :type thread_name: unicode :return: the process that was launched :rtype: multiprocessing.Process + :param zombies: zombie task instances to kill + :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance] """ def helper(): # This helper runs in the newly created process @@ -378,13 +398,14 @@ def helper(): os.getpid(), file_path) scheduler_job = SchedulerJob(dag_ids=dag_id_white_list, log=log) result = scheduler_job.process_file(file_path, + zombies, pickle_dags) result_queue.put(result) end_time = time.time() log.info( "Processing %s took %.3f seconds", file_path, end_time - start_time ) - except: + except Exception: # Log exceptions through the logging framework. log.exception("Got an exception! Propagating...") raise @@ -410,12 +431,14 @@ def start(self): self.file_path, self._pickle_dags, self._dag_id_white_list, - "DagFileProcessor{}".format(self._instance_id)) + "DagFileProcessor{}".format(self._instance_id), + self._zombies) self._start_time = timezone.utcnow() def terminate(self, sigkill=False): """ Terminate (and then kill) the process launched to process the file. 
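# Aside (hypothetical sketch, not part of the patch): the launch-and-collect
# pattern used by _launch_process() above, with a stand-in worker instead of
# SchedulerJob.process_file(file_path, zombies, pickle_dags).
import multiprocessing

def _helper(result_queue, file_path):
    result_queue.put('processed {}'.format(file_path))   # stand-in for the real work

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    proc = multiprocessing.Process(target=_helper,
                                   args=(queue, '/dags/example_dag.py'),
                                   name='DagFileProcessor0')
    proc.start()
    proc.join()
    print(queue.get(timeout=10))   # -> processed /dags/example_dag.py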
+ :param sigkill: whether to issue a SIGKILL if SIGTERM doesn't work. :type sigkill: bool """ @@ -444,6 +467,7 @@ def pid(self): def exit_code(self): """ After the process is finished, this can be called to get the return code + :return: the exit code of the process :rtype: int """ @@ -455,6 +479,7 @@ def exit_code(self): def done(self): """ Check if the process launched to process this file is done. + :return: whether the process is finished running :rtype: bool """ @@ -464,7 +489,8 @@ def done(self): if self._done: return True - if not self._result_queue.empty(): + # In case result queue is corrupted. + if self._result_queue and not self._result_queue.empty(): self._result = self._result_queue.get_nowait() self._done = True self.log.debug("Waiting for %s", self._process) @@ -472,7 +498,7 @@ def done(self): return True # Potential error case when process dies - if not self._process.is_alive(): + if self._result_queue and not self._process.is_alive(): self._done = True # Get the object from the queue or else join() can hang. if not self._result_queue.empty(): @@ -487,7 +513,7 @@ def done(self): def result(self): """ :return: result of running SchedulerJob.process_file() - :rtype: SimpleDag + :rtype: airflow.utils.dag_processing.SimpleDag """ if not self.done: raise AirflowException("Tried to get the result before it's done!") @@ -523,10 +549,7 @@ def __init__( dag_ids=None, subdir=settings.DAGS_FOLDER, num_runs=-1, - file_process_interval=conf.getint('scheduler', - 'min_file_process_interval'), - min_file_parsing_loop_time=conf.getint('scheduler', - 'min_file_parsing_loop_time'), + processor_poll_interval=1.0, run_duration=None, do_pickle=False, log=None, @@ -537,14 +560,18 @@ def __init__( :param dag_ids: if specified, only schedule tasks with these DAG IDs :type dag_ids: list[unicode] :param subdir: directory containing Python files with Airflow DAG - definitions, or a specific path to a file + definitions, or a specific path to a file :type subdir: unicode :param num_runs: The number of times to try to schedule each DAG file. - -1 for unlimited within the run_duration. + -1 for unlimited within the run_duration. + :type num_runs: int + :param processor_poll_interval: The number of seconds to wait between + polls of running processors + :type processor_poll_interval: int :param run_duration: how long to run (in seconds) before exiting :type run_duration: int :param do_pickle: once a DAG object is obtained by executing the Python - file, whether to serialize the DAG object to the DB + file, whether to serialize the DAG object to the DB :type do_pickle: bool """ # for BaseJob compatibility @@ -557,6 +584,7 @@ def __init__( self.num_runs = num_runs self.run_duration = run_duration + self._processor_poll_interval = processor_poll_interval self.do_pickle = do_pickle super(SchedulerJob, self).__init__(*args, **kwargs) @@ -569,30 +597,28 @@ def __init__( self.using_sqlite = False if 'sqlite' in conf.get('core', 'sql_alchemy_conn'): - if self.max_threads > 1: - self.log.error("Cannot use more than 1 thread when using sqlite. Setting max_threads to 1") - self.max_threads = 1 self.using_sqlite = True - # How often to scan the DAGs directory for new files. Default to 5 minutes. - self.dag_dir_list_interval = conf.getint('scheduler', - 'dag_dir_list_interval') - # How often to print out DAG file processing stats to the log. Default to - # 30 seconds. - self.print_stats_interval = conf.getint('scheduler', - 'print_stats_interval') - # Parse and schedule each file no faster than this interval. 
Default - # to 3 minutes. - self.file_process_interval = file_process_interval - - # Wait until at least this many seconds have passed before parsing files once all - # files have finished parsing. - self.min_file_parsing_loop_time = min_file_parsing_loop_time - + self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query') if run_duration is None: self.run_duration = conf.getint('scheduler', 'run_duration') + self.processor_agent = None + self._last_loop = False + + signal.signal(signal.SIGINT, self._exit_gracefully) + signal.signal(signal.SIGTERM, self._exit_gracefully) + + def _exit_gracefully(self, signum, frame): + """ + Helper method to clean up processor_agent to avoid leaving orphan processes. + """ + self.log.info("Exiting gracefully upon receiving signal %s", signum) + if self.processor_agent: + self.processor_agent.end() + sys.exit(os.EX_OK) + @provide_session def manage_slas(self, dag, session=None): """ @@ -602,11 +628,8 @@ def manage_slas(self, dag, session=None): Where assuming that the scheduler runs often, so we only check for tasks that should have succeeded in the past hour. """ - if not any([ti.sla for ti in dag.tasks]): - self.log.info( - "Skipping SLA check for %s because no tasks in DAG have SLAs", - dag - ) + if not any([isinstance(ti.sla, timedelta) for ti in dag.tasks]): + self.log.info("Skipping SLA check for %s because no tasks in DAG have SLAs", dag) return TI = models.TaskInstance @@ -631,16 +654,15 @@ def manage_slas(self, dag, session=None): ).all() ts = timezone.utcnow() - SlaMiss = models.SlaMiss for ti in max_tis: task = dag.get_task(ti.task_id) dttm = ti.execution_date - if task.sla: + if isinstance(task.sla, timedelta): dttm = dag.following_schedule(dttm) while dttm < timezone.utcnow(): following_schedule = dag.following_schedule(dttm) if following_schedule + task.sla < timezone.utcnow(): - session.merge(models.SlaMiss( + session.merge(SlaMiss( task_id=ti.task_id, dag_id=ti.dag_id, execution_date=dttm, @@ -651,7 +673,7 @@ def manage_slas(self, dag, session=None): slas = ( session .query(SlaMiss) - .filter(SlaMiss.notification_sent == False) + .filter(SlaMiss.notification_sent == False) # noqa: E712 .filter(SlaMiss.dag_id == dag.dag_id) .all() ) @@ -701,16 +723,13 @@ def manage_slas(self, dag, session=None): Blocking tasks:
    {blocking_task_list}\n{bug}
    """.format(bug=asciiart.bug, **locals()) - emails = [] - for t in dag.tasks: - if t.email: - if isinstance(t.email, basestring): - l = [t.email] - elif isinstance(t.email, (list, tuple)): - l = t.email - for email in l: - if email not in emails: - emails.append(email) + emails = set() + for task in dag.tasks: + if task.email: + if isinstance(task.email, basestring): + emails |= set(get_email_address_list(task.email)) + elif isinstance(task.email, (list, tuple)): + emails |= set(task.email) if emails and len(slas): try: send_email( @@ -731,25 +750,6 @@ def manage_slas(self, dag, session=None): session.merge(sla) session.commit() - @staticmethod - @provide_session - def clear_nonexistent_import_errors(session, known_file_paths): - """ - Clears import errors for files that no longer exist. - - :param session: session for ORM operations - :type session: sqlalchemy.orm.session.Session - :param known_file_paths: The list of existing files that are parsed for DAGs - :type known_file_paths: list[unicode] - """ - query = session.query(models.ImportError) - if known_file_paths: - query = query.filter( - ~models.ImportError.filename.in_(known_file_paths) - ) - query.delete(synchronize_session='fetch') - session.commit() - @staticmethod def update_import_errors(session, dagbag): """ @@ -760,17 +760,17 @@ def update_import_errors(session, dagbag): :param session: session for ORM operations :type session: sqlalchemy.orm.session.Session :param dagbag: DagBag containing DAGs with import errors - :type dagbag: models.Dagbag + :type dagbag: airflow.models.DagBag """ # Clear the errors of the processed files for dagbag_file in dagbag.file_last_changed: - session.query(models.ImportError).filter( - models.ImportError.filename == dagbag_file + session.query(errors.ImportError).filter( + errors.ImportError.filename == dagbag_file ).delete() # Add the errors of the processed files for filename, stacktrace in six.iteritems(dagbag.import_errors): - session.add(models.ImportError( + session.add(errors.ImportError( filename=filename, stacktrace=stacktrace)) session.commit() @@ -779,10 +779,10 @@ def update_import_errors(session, dagbag): def create_dag_run(self, dag, session=None): """ This method checks whether a new DagRun needs to be created - for a DAG based on scheduling interval + for a DAG based on scheduling interval. Returns DagRun if one is scheduled. Otherwise returns None. 
""" - if dag.schedule_interval: + if dag.schedule_interval and conf.getboolean('scheduler', 'USE_JOB_SCHEDULE'): active_runs = DagRun.find( dag_id=dag.dag_id, state=State.RUNNING, @@ -811,7 +811,7 @@ def create_dag_run(self, dag, session=None): session.query(func.max(DagRun.execution_date)) .filter_by(dag_id=dag.dag_id) .filter(or_( - DagRun.external_trigger == False, + DagRun.external_trigger == False, # noqa: E712 # add % as a wildcard for the like query DagRun.run_id.like(DagRun.ID_PREFIX + '%') )) @@ -872,8 +872,8 @@ def create_dag_run(self, dag, session=None): dag.start_date, next_run_date ) - # don't ever schedule in the future - if next_run_date > timezone.utcnow(): + # don't ever schedule in the future or if next_run_date is None + if not next_run_date or next_run_date > timezone.utcnow(): return # this structure is necessary to avoid a TypeError from concatenating @@ -948,7 +948,8 @@ def _process_task_instances(self, dag, queue, session=None): self.log.debug("Examining active DAG run: %s", run) # this needs a fresh session sometimes tis get detached tis = run.get_task_instances(state=(State.NONE, - State.UP_FOR_RETRY)) + State.UP_FOR_RETRY, + State.UP_FOR_RESCHEDULE)) # this loop is quite slow as it uses are_dependencies_met for # every task (in ti.is_runnable). This is also called in @@ -959,10 +960,6 @@ def _process_task_instances(self, dag, queue, session=None): # fixme: ti.task is transient but needs to be set ti.task = task - # future: remove adhoc - if task.adhoc: - continue - if ti.are_dependencies_met( dep_context=DepContext(flag_upstream_failed=True), session=session): @@ -983,12 +980,12 @@ def _change_state_for_tis_without_dagrun(self, changed manually. :param old_states: examine TaskInstances in this state - :type old_state: list[State] + :type old_state: list[airflow.utils.state.State] :param new_state: set TaskInstances to this state - :type new_state: State + :type new_state: airflow.utils.state.State :param simple_dag_bag: TaskInstances associated with DAGs in the - simple_dag_bag and with states in the old_state will be examined - :type simple_dag_bag: SimpleDagBag + simple_dag_bag and with states in the old_state will be examined + :type simple_dag_bag: airflow.utils.dag_processing.SimpleDagBag """ tis_changed = 0 query = session \ @@ -1033,9 +1030,9 @@ def __get_task_concurrency_map(self, states, session=None): Returns a map from tasks to number in the states list given. :param states: List of states to query for - :type states: List[State] + :type states: list[airflow.utils.state.State] :return: A map from (dag_id, task_id) to count of tasks in states - :rtype: Dict[[String, String], Int] + :rtype: dict[tuple[str, str], int] """ TI = models.TaskInstance @@ -1058,17 +1055,14 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): dag concurrency, executor state, and priority. 
:param simple_dag_bag: TaskInstances associated with DAGs in the - simple_dag_bag will be fetched from the DB and executed - :type simple_dag_bag: SimpleDagBag + simple_dag_bag will be fetched from the DB and executed + :type simple_dag_bag: airflow.utils.dag_processing.SimpleDagBag :param executor: the executor that runs task instances :type executor: BaseExecutor :param states: Execute TaskInstances in these states - :type states: Tuple[State] - :return: List[TaskInstance] + :type states: tuple[airflow.utils.state.State] + :return: list[airflow.models.TaskInstance] """ - # TODO(saguziel): Change this to include QUEUED, for concurrency - # purposes we may want to count queued tasks - states_to_count_as_running = [State.RUNNING] executable_tis = [] # Get all the queued task instances from associated with scheduled @@ -1081,30 +1075,36 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): session .query(TI) .filter(TI.dag_id.in_(simple_dag_bag.dag_ids)) - .outerjoin(DR, - and_(DR.dag_id == TI.dag_id, - DR.execution_date == TI.execution_date)) - .filter(or_(DR.run_id == None, + .outerjoin( + DR, + and_(DR.dag_id == TI.dag_id, DR.execution_date == TI.execution_date) + ) + .filter(or_(DR.run_id == None, # noqa: E711 not_(DR.run_id.like(BackfillJob.ID_PREFIX + '%')))) - .outerjoin(DM, DM.dag_id==TI.dag_id) - .filter(or_(DM.dag_id == None, + .outerjoin(DM, DM.dag_id == TI.dag_id) + .filter(or_(DM.dag_id == None, # noqa: E711 not_(DM.is_paused))) ) if None in states: - ti_query = ti_query.filter(or_(TI.state == None, TI.state.in_(states))) + ti_query = ti_query.filter( + or_(TI.state == None, TI.state.in_(states)) # noqa: E711 + ) else: ti_query = ti_query.filter(TI.state.in_(states)) task_instances_to_examine = ti_query.all() if len(task_instances_to_examine) == 0: - self.log.info("No tasks to consider for execution.") + self.log.debug("No tasks to consider for execution.") return executable_tis # Put one task instance on each line task_instance_str = "\n\t".join( - ["{}".format(x) for x in task_instances_to_examine]) - self.log.info("Tasks up for execution:\n\t%s", task_instance_str) + [repr(x) for x in task_instances_to_examine]) + self.log.info( + "%s tasks up for execution:\n\t%s", len(task_instances_to_examine), + task_instance_str + ) # Get the pool settings pools = {p.pool: p for p in session.query(models.Pool).all()} @@ -1113,16 +1113,20 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): for task_instance in task_instances_to_examine: pool_to_task_instances[task_instance.pool].append(task_instance) - task_concurrency_map = self.__get_task_concurrency_map(states=states_to_count_as_running, session=session) + states_to_count_as_running = [State.RUNNING, State.QUEUED] + task_concurrency_map = self.__get_task_concurrency_map( + states=states_to_count_as_running, session=session) # Go through each pool, and queue up a task for execution if there are # any open slots in the pool. 
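# Aside (illustrative, not part of the patch): the pool bucketing built just above.
# Task instances are grouped by their pool (None for unpooled tasks) before the
# per-pool slot accounting in the loop that follows; FakeTI is a hypothetical
# stand-in for a TaskInstance.
from collections import defaultdict, namedtuple

FakeTI = namedtuple('FakeTI', ['task_id', 'pool'])
task_instances_to_examine = [FakeTI('t1', None), FakeTI('t2', 'etl'), FakeTI('t3', 'etl')]

pool_to_task_instances = defaultdict(list)
for task_instance in task_instances_to_examine:
    pool_to_task_instances[task_instance.pool].append(task_instance)

assert len(pool_to_task_instances[None]) == 1
assert len(pool_to_task_instances['etl']) == 2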
for pool, task_instances in pool_to_task_instances.items(): + pool_name = pool if not pool: # Arbitrary: # If queued outside of a pool, trigger no more than # non_pooled_task_slot_count per run open_slots = conf.getint('core', 'non_pooled_task_slot_count') + pool_name = 'not_pooled' else: if pool not in pools: self.log.warning( @@ -1135,10 +1139,9 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): num_queued = len(task_instances) self.log.info( - "Figuring out tasks to run in Pool(name={pool}) with {open_slots} " - "open slots and {num_queued} task instances in queue".format( - **locals() - ) + "Figuring out tasks to run in Pool(name=%s) with %s open slots " + "and %s task instances in queue", + pool, open_slots, num_queued ) priority_sorted_task_instances = sorted( @@ -1147,13 +1150,16 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): # DAG IDs with running tasks that equal the concurrency limit of the dag dag_id_to_possibly_running_task_count = {} - for task_instance in priority_sorted_task_instances: + # Number of tasks that cannot be scheduled because of no open slot in pool + num_starving_tasks = 0 + for current_index, task_instance in enumerate(priority_sorted_task_instances): if open_slots <= 0: self.log.info( "Not scheduling since there are %s open slots in pool %s", open_slots, pool ) # Can't schedule any more since there are no more open slots. + num_starving_tasks = len(priority_sorted_task_instances) - current_index break # Check to make sure that the task concurrency of the DAG hasn't been @@ -1162,7 +1168,6 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): simple_dag = simple_dag_bag.get_dag(dag_id) if dag_id not in dag_id_to_possibly_running_task_count: - # TODO(saguziel): also check against QUEUED state, see AIRFLOW-1104 dag_id_to_possibly_running_task_count[dag_id] = \ DAG.get_num_task_instances( dag_id, @@ -1178,15 +1183,20 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): ) if current_task_concurrency >= task_concurrency_limit: self.log.info( - "Not executing %s since the number of tasks running or queued from DAG %s" - " is >= to the DAG's task concurrency limit of %s", + "Not executing %s since the number of tasks running or queued " + "from DAG %s is >= to the DAG's task concurrency limit of %s", task_instance, dag_id, task_concurrency_limit ) continue - task_concurrency = simple_dag.get_task_special_arg(task_instance.task_id, 'task_concurrency') + task_concurrency = simple_dag.get_task_special_arg( + task_instance.task_id, + 'task_concurrency') if task_concurrency is not None: - num_running = task_concurrency_map[((task_instance.dag_id, task_instance.task_id))] + num_running = task_concurrency_map[ + (task_instance.dag_id, task_instance.task_id) + ] + if num_running >= task_concurrency: self.log.info("Not executing %s since the task concurrency for" " this task has been reached.", task_instance) @@ -1204,9 +1214,13 @@ def _find_executable_task_instances(self, simple_dag_bag, states, session=None): open_slots -= 1 dag_id_to_possibly_running_task_count[dag_id] += 1 + Stats.gauge('pool.starving_tasks.{pool_name}'.format(pool_name=pool_name), + num_starving_tasks) + task_instance_str = "\n\t".join( - ["{}".format(x) for x in executable_tis]) - self.log.info("Setting the follow tasks to queued state:\n\t%s", task_instance_str) + [repr(x) for x in executable_tis]) + self.log.info( + "Setting the follow tasks to queued state:\n\t%s", 
task_instance_str) # so these dont expire on commit for ti in executable_tis: copy_dag_id = ti.dag_id @@ -1223,13 +1237,13 @@ def _change_state_for_executable_task_instances(self, task_instances, acceptable_states, session=None): """ Changes the state of task instances in the list with one of the given states - to QUEUED atomically, and returns the TIs changed. + to QUEUED atomically, and returns the TIs changed in SimpleTaskInstance format. :param task_instances: TaskInstances to change the state of - :type task_instances: List[TaskInstance] + :type task_instances: list[airflow.models.TaskInstance] :param acceptable_states: Filters the TaskInstances updated to be in these states :type acceptable_states: Iterable[State] - :return: List[TaskInstance] + :rtype: list[airflow.utils.dag_processing.SimpleTaskInstance] """ if len(task_instances) == 0: session.commit() @@ -1248,7 +1262,9 @@ def _change_state_for_executable_task_instances(self, task_instances, .filter(or_(*filter_for_ti_state_change))) if None in acceptable_states: - ti_query = ti_query.filter(or_(TI.state == None, TI.state.in_(acceptable_states))) + ti_query = ti_query.filter( + or_(TI.state == None, TI.state.in_(acceptable_states)) # noqa: E711 + ) else: ti_query = ti_query.filter(TI.state.in_(acceptable_states)) @@ -1269,80 +1285,57 @@ def _change_state_for_executable_task_instances(self, task_instances, else task_instance.queued_dttm) session.merge(task_instance) - # save which TIs we set before session expires them - filter_for_ti_enqueue = ([and_(TI.dag_id == ti.dag_id, - TI.task_id == ti.task_id, - TI.execution_date == ti.execution_date) - for ti in tis_to_set_to_queued]) - session.commit() - - # requery in batches since above was expired by commit + # Generate a list of SimpleTaskInstance for the use of queuing + # them in the executor. + simple_task_instances = [SimpleTaskInstance(ti) for ti in + tis_to_set_to_queued] - def query(result, items): - tis_to_be_queued = ( - session - .query(TI) - .filter(or_(*items)) - .all()) - task_instance_str = "\n\t".join( - ["{}".format(x) for x in tis_to_be_queued]) - self.log.info("Setting the follow tasks to queued state:\n\t%s", - task_instance_str) - return result + tis_to_be_queued - - tis_to_be_queued = helpers.reduce_in_chunks(query, - filter_for_ti_enqueue, - [], - self.max_tis_per_query) + task_instance_str = "\n\t".join( + [repr(x) for x in tis_to_set_to_queued]) - return tis_to_be_queued + session.commit() + self.log.info("Setting the following %s tasks to queued state:\n\t%s", + len(tis_to_set_to_queued), task_instance_str) + return simple_task_instances - def _enqueue_task_instances_with_queued_state(self, simple_dag_bag, task_instances): + def _enqueue_task_instances_with_queued_state(self, simple_dag_bag, + simple_task_instances): """ Takes task_instances, which should have been set to queued, and enqueues them with the executor. 
- :param task_instances: TaskInstances to enqueue - :type task_instances: List[TaskInstance] + :param simple_task_instances: TaskInstances to enqueue + :type simple_task_instances: list[SimpleTaskInstance] :param simple_dag_bag: Should contains all of the task_instances' dags - :type simple_dag_bag: SimpleDagBag + :type simple_dag_bag: airflow.utils.dag_processing.SimpleDagBag """ TI = models.TaskInstance # actually enqueue them - for task_instance in task_instances: - simple_dag = simple_dag_bag.get_dag(task_instance.dag_id) - command = " ".join(TI.generate_command( - task_instance.dag_id, - task_instance.task_id, - task_instance.execution_date, + for simple_task_instance in simple_task_instances: + simple_dag = simple_dag_bag.get_dag(simple_task_instance.dag_id) + command = TI.generate_command( + simple_task_instance.dag_id, + simple_task_instance.task_id, + simple_task_instance.execution_date, local=True, mark_success=False, ignore_all_deps=False, ignore_depends_on_past=False, ignore_task_deps=False, ignore_ti_state=False, - pool=task_instance.pool, + pool=simple_task_instance.pool, file_path=simple_dag.full_filepath, - pickle_id=simple_dag.pickle_id)) + pickle_id=simple_dag.pickle_id) - priority = task_instance.priority_weight - queue = task_instance.queue + priority = simple_task_instance.priority_weight + queue = simple_task_instance.queue self.log.info( "Sending %s to executor with priority %s and queue %s", - task_instance.key, priority, queue + simple_task_instance.key, priority, queue ) - # save attributes so sqlalchemy doesnt expire them - copy_dag_id = task_instance.dag_id - copy_task_id = task_instance.task_id - copy_execution_date = task_instance.execution_date - make_transient(task_instance) - task_instance.dag_id = copy_dag_id - task_instance.task_id = copy_task_id - task_instance.execution_date = copy_execution_date - self.executor.queue_command( - task_instance, + simple_task_instance, command, priority=priority, queue=queue) @@ -1362,28 +1355,68 @@ def _execute_task_instances(self, 3. Enqueue the TIs in the executor. :param simple_dag_bag: TaskInstances associated with DAGs in the - simple_dag_bag will be fetched from the DB and executed - :type simple_dag_bag: SimpleDagBag + simple_dag_bag will be fetched from the DB and executed + :type simple_dag_bag: airflow.utils.dag_processing.SimpleDagBag :param states: Execute TaskInstances in these states - :type states: Tuple[State] - :return: None + :type states: tuple[airflow.utils.state.State] + :return: Number of task instance with state changed. """ executable_tis = self._find_executable_task_instances(simple_dag_bag, states, session=session) def query(result, items): - tis_with_state_changed = self._change_state_for_executable_task_instances( - items, - states, - session=session) + simple_tis_with_state_changed = \ + self._change_state_for_executable_task_instances(items, + states, + session=session) self._enqueue_task_instances_with_queued_state( simple_dag_bag, - tis_with_state_changed) + simple_tis_with_state_changed) session.commit() - return result + len(tis_with_state_changed) + return result + len(simple_tis_with_state_changed) return helpers.reduce_in_chunks(query, executable_tis, 0, self.max_tis_per_query) + @provide_session + def _change_state_for_tasks_failed_to_execute(self, session): + """ + If there are tasks left over in the executor, + we set them back to SCHEDULED to avoid creating hanging tasks. 
+ + :param session: session for ORM operations + """ + if self.executor.queued_tasks: + TI = models.TaskInstance + filter_for_ti_state_change = ( + [and_( + TI.dag_id == dag_id, + TI.task_id == task_id, + TI.execution_date == execution_date, + # The TI.try_number will return raw try_number+1 since the + # ti is not running. And we need to -1 to match the DB record. + TI._try_number == try_number - 1, + TI.state == State.QUEUED) + for dag_id, task_id, execution_date, try_number + in self.executor.queued_tasks.keys()]) + ti_query = (session.query(TI) + .filter(or_(*filter_for_ti_state_change))) + tis_to_set_to_scheduled = (ti_query + .with_for_update() + .all()) + if len(tis_to_set_to_scheduled) == 0: + session.commit() + return + + # set TIs to queued state + for task_instance in tis_to_set_to_scheduled: + task_instance.state = State.SCHEDULED + + task_instance_str = "\n\t".join( + [repr(x) for x in tis_to_set_to_scheduled]) + + session.commit() + self.log.info("Set the following tasks to scheduled state:\n\t%s", task_instance_str) + def _process_dags(self, dagbag, dags, tis_out): """ Iterates over the dags and processes them. Processing includes: @@ -1393,12 +1426,12 @@ def _process_dags(self, dagbag, dags, tis_out): 3. Send emails for tasks that have missed SLAs. :param dagbag: a collection of DAGs to process - :type dagbag: models.DagBag + :type dagbag: airflow.models.DagBag :param dags: the DAGs from the DagBag to process - :type dags: DAG + :type dags: airflow.models.DAG :param tis_out: A queue to add generated TaskInstance objects :type tis_out: multiprocessing.Queue[TaskInstance] - :return: None + :rtype: None """ for dag in dags: dag = dagbag.get_dag(dag.dag_id) @@ -1418,8 +1451,6 @@ def _process_dags(self, dagbag, dags, tis_out): self._process_task_instances(dag, tis_out) self.manage_slas(dag) - models.DagStat.update([d.dag_id for d in dags]) - @provide_session def _process_executor_events(self, simple_dag_bag, session=None): """ @@ -1430,10 +1461,11 @@ def _process_executor_events(self, simple_dag_bag, session=None): TI = models.TaskInstance for key, state in list(self.executor.get_event_buffer(simple_dag_bag.dag_ids) .items()): - dag_id, task_id, execution_date = key + dag_id, task_id, execution_date, try_number = key self.log.info( - "Executor reports %s.%s execution_date=%s as %s", - dag_id, task_id, execution_date, state + "Executor reports execution of %s.%s execution_date=%s " + "exited with status %s for try_number %s", + dag_id, task_id, execution_date, state, try_number ) if state == State.FAILED or state == State.SUCCESS: qry = session.query(TI).filter(TI.dag_id == dag_id, @@ -1445,7 +1477,7 @@ def _process_executor_events(self, simple_dag_bag, session=None): continue # TODO: should we fail RUNNING as well, as we do in Backfills? - if ti.state == State.QUEUED: + if ti.try_number == try_number and ti.state == State.QUEUED: msg = ("Executor reports task instance {} finished ({}) " "although the task says its {}. Was the task " "killed externally?".format(ti, state, ti.state)) @@ -1464,72 +1496,6 @@ def _process_executor_events(self, simple_dag_bag, session=None): session.merge(ti) session.commit() - def _log_file_processing_stats(self, - known_file_paths, - processor_manager): - """ - Print out stats about how files are getting processed. 
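# Aside (illustrative, not part of the patch): why the filter in
# _change_state_for_tasks_failed_to_execute() above compares
# TI._try_number == try_number - 1. For a task instance that is not currently
# running, the public try_number property reports the next try, i.e.
# _try_number + 1, and that is the value recorded in the executor key.
class _TIStub(object):   # hypothetical stand-in for TaskInstance
    def __init__(self, _try_number, running=False):
        self._try_number = _try_number
        self.running = running

    @property
    def try_number(self):
        return self._try_number if self.running else self._try_number + 1

ti = _TIStub(_try_number=1)        # value stored in the DB column
key_try_number = ti.try_number     # 2, as seen in the executor's queued_tasks key
assert ti._try_number == key_try_number - 1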
- - :param known_file_paths: a list of file paths that may contain Airflow - DAG definitions - :type known_file_paths: list[unicode] - :param processor_manager: manager for the file processors - :type stats: DagFileProcessorManager - :return: None - """ - - # File Path: Path to the file containing the DAG definition - # PID: PID associated with the process that's processing the file. May - # be empty. - # Runtime: If the process is currently running, how long it's been - # running for in seconds. - # Last Runtime: If the process ran before, how long did it take to - # finish in seconds - # Last Run: When the file finished processing in the previous run. - headers = ["File Path", - "PID", - "Runtime", - "Last Runtime", - "Last Run"] - - rows = [] - for file_path in known_file_paths: - last_runtime = processor_manager.get_last_runtime(file_path) - processor_pid = processor_manager.get_pid(file_path) - processor_start_time = processor_manager.get_start_time(file_path) - runtime = ((timezone.utcnow() - processor_start_time).total_seconds() - if processor_start_time else None) - last_run = processor_manager.get_last_finish_time(file_path) - - rows.append((file_path, - processor_pid, - runtime, - last_runtime, - last_run)) - - # Sort by longest last runtime. (Can't sort None values in python3) - rows = sorted(rows, key=lambda x: x[3] or 0.0) - - formatted_rows = [] - for file_path, pid, runtime, last_runtime, last_run in rows: - formatted_rows.append((file_path, - pid, - "{:.2f}s".format(runtime) - if runtime else None, - "{:.2f}s".format(last_runtime) - if last_runtime else None, - last_run.strftime("%Y-%m-%dT%H:%M:%S") - if last_run else None)) - log_str = ("\n" + - "=" * 80 + - "\n" + - "DAG File Processing Stats\n\n" + - tabulate(formatted_rows, headers=headers) + - "\n" + - "=" * 80) - - self.log.info(log_str) - def _execute(self): self.log.info("Starting the scheduler") @@ -1539,100 +1505,67 @@ def _execute(self): (executors.LocalExecutor, executors.SequentialExecutor): pickle_dags = True - # Use multiple processes to parse and generate tasks for the - # DAGs in parallel. By processing them in separate processes, - # we can get parallelism and isolation from potentially harmful - # user code. - self.log.info("Processing files using up to %s processes at a time", - self.max_threads) self.log.info("Running execute loop for %s seconds", self.run_duration) self.log.info("Processing each file at most %s times", self.num_runs) - self.log.info("Process each file at most once every %s seconds", - self.file_process_interval) - self.log.info("Wait until at least %s seconds have passed between file parsing " - "loops", self.min_file_parsing_loop_time) - self.log.info("Checking for new files in %s every %s seconds", - self.subdir, self.dag_dir_list_interval) # Build up a list of Python files that could contain DAGs self.log.info("Searching for files in %s", self.subdir) known_file_paths = list_py_file_paths(self.subdir) self.log.info("There are %s files in %s", len(known_file_paths), self.subdir) - def processor_factory(file_path): + def processor_factory(file_path, zombies): return DagFileProcessor(file_path, pickle_dags, - self.dag_ids) + self.dag_ids, + zombies) + + # When using sqlite, we do not use async_mode + # so the scheduler job and DAG parser don't access the DB at the same time. 
+ async_mode = not self.using_sqlite - processor_manager = DagFileProcessorManager(self.subdir, - known_file_paths, - self.max_threads, - self.file_process_interval, - self.min_file_parsing_loop_time, - self.num_runs, - processor_factory) + self.processor_agent = DagFileProcessorAgent(self.subdir, + known_file_paths, + self.num_runs, + processor_factory, + async_mode) try: - self._execute_helper(processor_manager) + self._execute_helper() + except Exception: + self.log.exception("Exception when executing execute_helper") finally: + self.processor_agent.end() self.log.info("Exited execute loop") - # Kill all child processes on exit since we don't want to leave - # them as orphaned. - pids_to_kill = processor_manager.get_all_pids() - if len(pids_to_kill) > 0: - # First try SIGTERM - this_process = psutil.Process(os.getpid()) - # Only check child processes to ensure that we don't have a case - # where we kill the wrong process because a child process died - # but the PID got reused. - child_processes = [x for x in this_process.children(recursive=True) - if x.is_running() and x.pid in pids_to_kill] - for child in child_processes: - self.log.info("Terminating child PID: %s", child.pid) - child.terminate() - # TODO: Remove magic number - timeout = 5 - self.log.info("Waiting up to %s seconds for processes to exit...", timeout) - try: - psutil.wait_procs( - child_processes, timeout=timeout, - callback=lambda x: self.log.info('Terminated PID %s', x.pid)) - except psutil.TimeoutExpired: - self.log.debug("Ran out of time while waiting for processes to exit") - - # Then SIGKILL - child_processes = [x for x in this_process.children(recursive=True) - if x.is_running() and x.pid in pids_to_kill] - if len(child_processes) > 0: - self.log.info("SIGKILL processes that did not terminate gracefully") - for child in child_processes: - self.log.info("Killing child PID: %s", child.pid) - child.kill() - child.wait() - - def _execute_helper(self, processor_manager): - """ - :param processor_manager: manager to use - :type processor_manager: DagFileProcessorManager - :return: None + def _execute_helper(self): + """ + The actual scheduler loop. The main steps in the loop are: + #. Harvest DAG parsing results through DagFileProcessorAgent + #. Find and queue executable tasks + #. Change task instance state in DB + #. Queue tasks in executor + #. Heartbeat executor + #. Execute queued tasks in executor asynchronously + #. Sync on the states of running tasks + + Following is a graphic representation of these steps. + + .. image:: ../docs/img/scheduler_loop.jpg + + :rtype: None """ self.executor.start() self.log.info("Resetting orphaned tasks for active dag runs") self.reset_state_for_orphaned_tasks() + # Start after resetting orphaned tasks to avoid stressing out DB. + self.processor_agent.start() + execute_start_time = timezone.utcnow() - # Last time stats were printed - last_stat_print_time = datetime.datetime(2000, 1, 1, tzinfo=timezone.utc) # Last time that self.heartbeat() was called. 
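As a rough illustration of why async_mode is switched off for SQLite just above (SQLite tolerates only one writer at a time, so the DAG parser and the scheduler should not hit the DB concurrently), something along these lines could derive the flag from the connection string. The helper name is made up for this sketch and is not part of the patch.

    def allow_async_dag_parsing(sql_alchemy_conn):
        """Return True when DAG parsing may run concurrently with the scheduler."""
        using_sqlite = sql_alchemy_conn.strip().lower().startswith("sqlite")
        return not using_sqlite

    assert allow_async_dag_parsing("sqlite:////tmp/airflow.db") is False
    assert allow_async_dag_parsing("postgresql+psycopg2://airflow@localhost/airflow") is True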
last_self_heartbeat_time = timezone.utcnow() - # Last time that the DAG dir was traversed to look for files - last_dag_dir_refresh_time = timezone.utcnow() - - # Use this value initially - known_file_paths = processor_manager.file_paths # For the execute duration, parse and schedule DAGs while (timezone.utcnow() - execute_start_time).total_seconds() < \ @@ -1640,61 +1573,56 @@ def _execute_helper(self, processor_manager): self.log.debug("Starting Loop...") loop_start_time = time.time() - # Traverse the DAG directory for Python files containing DAGs - # periodically - elapsed_time_since_refresh = (timezone.utcnow() - - last_dag_dir_refresh_time).total_seconds() - - if elapsed_time_since_refresh > self.dag_dir_list_interval: - # Build up a list of Python files that could contain DAGs - self.log.info("Searching for files in %s", self.subdir) - known_file_paths = list_py_file_paths(self.subdir) - last_dag_dir_refresh_time = timezone.utcnow() - self.log.info("There are %s files in %s", len(known_file_paths), self.subdir) - processor_manager.set_file_paths(known_file_paths) - - self.log.debug("Removing old import errors") - self.clear_nonexistent_import_errors(known_file_paths=known_file_paths) - - # Kick of new processes and collect results from finished ones - self.log.debug("Heartbeating the process manager") - simple_dags = processor_manager.heartbeat() - if self.using_sqlite: + self.processor_agent.heartbeat() # For the sqlite case w/ 1 thread, wait until the processor # is finished to avoid concurrent access to the DB. - self.log.debug("Waiting for processors to finish since we're using sqlite") - processor_manager.wait_until_finished() + self.log.debug( + "Waiting for processors to finish since we're using sqlite") + self.processor_agent.wait_until_finished() + + self.log.debug("Harvesting DAG parsing results") + simple_dags = self.processor_agent.harvest_simple_dags() + self.log.debug("Harvested {} SimpleDAGs".format(len(simple_dags))) # Send tasks for execution if available simple_dag_bag = SimpleDagBag(simple_dags) if len(simple_dags) > 0: - - # Handle cases where a DAG run state is set (perhaps manually) to - # a non-running state. Handle task instances that belong to - # DAG runs in those states - - # If a task instance is up for retry but the corresponding DAG run - # isn't running, mark the task instance as FAILED so we don't try - # to re-run it. - self._change_state_for_tis_without_dagrun(simple_dag_bag, - [State.UP_FOR_RETRY], - State.FAILED) - # If a task instance is scheduled or queued, but the corresponding - # DAG run isn't running, set the state to NONE so we don't try to - # re-run it. - self._change_state_for_tis_without_dagrun(simple_dag_bag, - [State.QUEUED, - State.SCHEDULED], - State.NONE) - - self._execute_task_instances(simple_dag_bag, - (State.SCHEDULED,)) + try: + simple_dag_bag = SimpleDagBag(simple_dags) + + # Handle cases where a DAG run state is set (perhaps manually) to + # a non-running state. Handle task instances that belong to + # DAG runs in those states + + # If a task instance is up for retry but the corresponding DAG run + # isn't running, mark the task instance as FAILED so we don't try + # to re-run it. + self._change_state_for_tis_without_dagrun(simple_dag_bag, + [State.UP_FOR_RETRY], + State.FAILED) + # If a task instance is scheduled or queued or up for reschedule, + # but the corresponding DAG run isn't running, set the state to + # NONE so we don't try to re-run it. 
+ self._change_state_for_tis_without_dagrun(simple_dag_bag, + [State.QUEUED, + State.SCHEDULED, + State.UP_FOR_RESCHEDULE], + State.NONE) + + self._execute_task_instances(simple_dag_bag, + (State.SCHEDULED,)) + except Exception as e: + self.log.error("Error queuing tasks") + self.log.exception(e) + continue # Call heartbeats self.log.debug("Heartbeating the executor") self.executor.heartbeat() + self._change_state_for_tasks_failed_to_execute() + # Process events from the executor self._process_executor_events(simple_dag_bag) @@ -1706,36 +1634,42 @@ def _execute_helper(self, processor_manager): self.heartbeat() last_self_heartbeat_time = timezone.utcnow() - # Occasionally print out stats about how fast the files are getting processed - if ((timezone.utcnow() - last_stat_print_time).total_seconds() > - self.print_stats_interval): - if len(known_file_paths) > 0: - self._log_file_processing_stats(known_file_paths, - processor_manager) - last_stat_print_time = timezone.utcnow() - + is_unit_test = conf.getboolean('core', 'unit_test_mode') loop_end_time = time.time() - self.log.debug("Ran scheduling loop in %.2f seconds", - loop_end_time - loop_start_time) - - # Exit early for a test mode - if processor_manager.max_runs_reached(): - self.log.info("Exiting loop as all files have been processed %s times", - self.num_runs) + loop_duration = loop_end_time - loop_start_time + self.log.debug( + "Ran scheduling loop in %.2f seconds", + loop_duration) + + if not is_unit_test: + self.log.debug("Sleeping for %.2f seconds", self._processor_poll_interval) + time.sleep(self._processor_poll_interval) + + # Exit early for a test mode, run one additional scheduler loop + # to reduce the possibility that parsed DAG was put into the queue + # by the DAG manager but not yet received by DAG agent. + if self.processor_agent.done: + self._last_loop = True + + if self._last_loop: + self.log.info("Exiting scheduler loop as all files" + " have been processed {} times".format(self.num_runs)) break + if loop_duration < 1 and not is_unit_test: + sleep_length = 1 - loop_duration + self.log.debug( + "Sleeping for {0:.2f} seconds to prevent excessive logging" + .format(sleep_length)) + sleep(sleep_length) + # Stop any processors - processor_manager.terminate() + self.processor_agent.terminate() # Verify that all files were processed, and if so, deactivate DAGs that # haven't been touched by the scheduler as they likely have been # deleted. - all_files_processed = True - for file_path in known_file_paths: - if processor_manager.get_last_finish_time(file_path) is None: - all_files_processed = False - break - if all_files_processed: + if self.processor_agent.all_files_processed: self.log.info( "Deactivating DAGs that haven't been touched since %s", execute_start_time.isoformat() @@ -1747,7 +1681,7 @@ def _execute_helper(self, processor_manager): settings.Session.remove() @provide_session - def process_file(self, file_path, pickle_dags=False, session=None): + def process_file(self, file_path, zombies, pickle_dags=False, session=None): """ Process a Python file containing Airflow DAGs. @@ -1766,18 +1700,20 @@ def process_file(self, file_path, pickle_dags=False, session=None): :param file_path: the path to the Python file that should be executed :type file_path: unicode + :param zombies: zombie task instances to kill. 
+ :type zombies: list[airflow.utils.dag_processing.SimpleTaskInstance] :param pickle_dags: whether serialize the DAGs found in the file and - save them to the db + save them to the db :type pickle_dags: bool :return: a list of SimpleDags made from the Dags found in the file - :rtype: list[SimpleDag] + :rtype: list[airflow.utils.dag_processing.SimpleDagBag] """ self.log.info("Processing file %s for tasks to queue", file_path) # As DAGs are parsed from this file, they will be converted into SimpleDags simple_dags = [] try: - dagbag = models.DagBag(file_path) + dagbag = models.DagBag(file_path, include_examples=False) except Exception: self.log.exception("Failed at reloading the DAG file %s", file_path) Stats.incr('dag_file_refresh_error', 1, 1) @@ -1799,13 +1735,12 @@ def process_file(self, file_path, pickle_dags=False, session=None): # Pickle the DAGs (if necessary) and put them into a SimpleDag for dag_id in dagbag.dags: - dag = dagbag.get_dag(dag_id) - pickle_id = None - if pickle_dags: - pickle_id = dag.pickle(session).id - # Only return DAGs that are not paused if dag_id not in paused_dag_ids: + dag = dagbag.get_dag(dag_id) + pickle_id = None + if pickle_dags: + pickle_id = dag.pickle(session).id simple_dags.append(SimpleDag(dag, pickle_id=pickle_id)) if len(self.dag_ids) > 0: @@ -1860,7 +1795,7 @@ def process_file(self, file_path, pickle_dags=False, session=None): except Exception: self.log.exception("Error logging import errors!") try: - dagbag.kill_zombies() + dagbag.kill_zombies(zombies) except Exception: self.log.exception("Error killing zombies!") @@ -1868,7 +1803,7 @@ def process_file(self, file_path, pickle_dags=False, session=None): @provide_session def heartbeat_callback(self, session=None): - Stats.gauge('scheduler_heartbeat', 1, 1) + Stats.incr('scheduler_heartbeat', 1, 1) class BackfillJob(BaseJob): @@ -1910,29 +1845,29 @@ def __init__(self, ): """ :param to_run: Tasks to run in the backfill - :type to_run: dict[Tuple[String, String, DateTime], TaskInstance] + :type to_run: dict[tuple[string, string, datetime.datetime], airflow.models.TaskInstance] :param running: Maps running task instance key to task instance object - :type running: dict[Tuple[String, String, DateTime], TaskInstance] + :type running: dict[tuple[string, string, datetime.datetime], airflow.models.TaskInstance] :param skipped: Tasks that have been skipped - :type skipped: set[Tuple[String, String, DateTime]] + :type skipped: set[tuple[string, string, datetime.datetime]] :param succeeded: Tasks that have succeeded so far - :type succeeded: set[Tuple[String, String, DateTime]] + :type succeeded: set[tuple[string, string, datetime.datetime]] :param failed: Tasks that have failed - :type failed: set[Tuple[String, String, DateTime]] + :type failed: set[tuple[string, string, datetime.datetime]] :param not_ready: Tasks not ready for execution - :type not_ready: set[Tuple[String, String, DateTime]] + :type not_ready: set[tuple[string, string, datetime.datetime]] :param deadlocked: Deadlocked tasks - :type deadlocked: set[Tuple[String, String, DateTime]] + :type deadlocked: set[tuple[string, string, datetime.datetime]] :param active_runs: Active dag runs at a certain point in time :type active_runs: list[DagRun] :param executed_dag_run_dates: Datetime objects for the executed dag runs - :type executed_dag_run_dates: set[Datetime] + :type executed_dag_run_dates: set[datetime.datetime] :param finished_runs: Number of finished runs so far :type finished_runs: int :param total_runs: Number of total dag runs able to run 
:type total_runs: int """ - self.to_run = to_run or dict() + self.to_run = to_run or OrderedDict() self.running = running or dict() self.skipped = skipped or set() self.succeeded = succeeded or set() @@ -1958,14 +1893,15 @@ def __init__( verbose=False, conf=None, rerun_failed_tasks=False, + run_backwards=False, *args, **kwargs): """ :param dag: DAG object. - :type dag: `class DAG`. + :type dag: airflow.models.DAG :param start_date: start date for the backfill date range. - :type start_date: datetime. + :type start_date: datetime.datetime :param end_date: end date for the backfill date range. - :type end_date: datetime + :type end_date: datetime.datetime :param mark_success: flag whether to mark the task auto success. :type mark_success: bool :param donot_pickle: whether pickle @@ -1984,6 +1920,8 @@ def __init__( :param rerun_failed_tasks: flag to whether to auto rerun the failed task in backfill :type rerun_failed_tasks: bool + :param run_backwards: Whether to process the dates from most to least recent + :type run_backwards bool :param args: :param kwargs: """ @@ -2000,12 +1938,14 @@ def __init__( self.verbose = verbose self.conf = conf self.rerun_failed_tasks = rerun_failed_tasks + self.run_backwards = run_backwards super(BackfillJob, self).__init__(*args, **kwargs) def _update_counters(self, ti_status): """ Updates the counters per state of the tasks that were running. Can re-add to tasks to run in case required. + :param ti_status: the internal status of the backfill job tasks :type ti_status: BackfillJob._DagRunTaskStatus """ @@ -2031,6 +1971,11 @@ def _update_counters(self, ti_status): self.log.warning("Task instance %s is up for retry", ti) ti_status.running.pop(key) ti_status.to_run[key] = ti + # special case: if the task needs to be rescheduled put it back + elif ti.state == State.UP_FOR_RESCHEDULE: + self.log.warning("Task instance %s is up for reschedule", ti) + ti_status.running.pop(key) + ti_status.to_run[key] = ti # special case: The state of the task can be set to NONE by the task itself # when it reaches concurrency limits. It could also happen when the state # is changed externally, e.g. by clearing tasks from the ui. We need to cover @@ -2050,6 +1995,7 @@ def _manage_executor_state(self, running): """ Checks if the executor agrees with the state of task instances that are running + :param running: dict of key, task to verify """ executor = self.executor @@ -2081,10 +2027,11 @@ def _get_dag_run(self, run_date, session=None): Returns a dag run for the given run date, which will be matched to an existing dag run if available or create a new dag run otherwise. If the max_active_runs limit is reached, this function will return None. + :param run_date: the execution date for the dag run - :type run_date: datetime + :type run_date: datetime.datetime :param session: the database session object - :type session: Session + :type session: sqlalchemy.orm.session.Session :return: a DagRun in state RUNNING or None """ run_id = BackfillJob.ID_FORMAT_PREFIX.format(run_date.isoformat()) @@ -2140,10 +2087,11 @@ def _task_instances_for_dag_run(self, dag_run, session=None): """ Returns a map of task instance key to task instance object for the tasks to run in the given dag run. 
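The new run_backwards flag documented above only makes sense when no task sets depends_on_past, and the guard added later in _execute raises before reversing the run dates. The following is a standalone sketch of that idea under that assumption, using an invented helper name rather than the real method.

    from datetime import datetime, timedelta

    def order_run_dates(run_dates, run_backwards, tasks_that_depend_on_past):
        # Refuse newest-first backfills when any task depends on its previous run,
        # because those runs would be scheduled before the runs they depend on.
        if run_backwards:
            if tasks_that_depend_on_past:
                raise ValueError(
                    "Cannot backfill backwards, tasks depend_on_past: %s"
                    % ", ".join(tasks_that_depend_on_past))
            return list(reversed(run_dates))
        return list(run_dates)

    dates = [datetime(2019, 1, 1) + timedelta(days=i) for i in range(3)]
    print(order_run_dates(dates, run_backwards=True, tasks_that_depend_on_past=[]))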
+ :param dag_run: the dag run to get the tasks from - :type dag_run: models.DagRun + :type dag_run: airflow.models.DagRun :param session: the database session object - :type session: Session + :type session: sqlalchemy.orm.session.Session """ tasks_to_run = {} @@ -2168,27 +2116,13 @@ def _task_instances_for_dag_run(self, dag_run, session=None): return tasks_to_run def _log_progress(self, ti_status): - msg = ' | '.join([ - "[backfill progress]", - "finished run {0} of {1}", - "tasks waiting: {2}", - "succeeded: {3}", - "running: {4}", - "failed: {5}", - "skipped: {6}", - "deadlocked: {7}", - "not ready: {8}" - ]).format( - ti_status.finished_runs, - ti_status.total_runs, - len(ti_status.to_run), - len(ti_status.succeeded), - len(ti_status.running), - len(ti_status.failed), - len(ti_status.skipped), - len(ti_status.deadlocked), - len(ti_status.not_ready)) - self.log.info(msg) + self.log.info( + '[backfill progress] | finished run %s of %s | tasks waiting: %s | succeeded: %s | ' + 'running: %s | failed: %s | skipped: %s | deadlocked: %s | not ready: %s', + ti_status.finished_runs, ti_status.total_runs, len(ti_status.to_run), len(ti_status.succeeded), + len(ti_status.running), len(ti_status.failed), len(ti_status.skipped), len(ti_status.deadlocked), + len(ti_status.not_ready) + ) self.log.debug( "Finished dag run loop iteration. Remaining tasks %s", @@ -2205,6 +2139,7 @@ def _process_backfill_task_instances(self, Process a set of task instances from a set of dag runs. Special handling is done to account for different task instance states that could be present when running them in a backfill process. + :param ti_status: the internal status of the job :type ti_status: BackfillJob._DagRunTaskStatus :param executor: the executor to run the task instances @@ -2212,9 +2147,9 @@ def _process_backfill_task_instances(self, :param pickle_id: the pickle_id if dag is pickled, None otherwise :type pickle_id: int :param start_date: the start date of the backfill job - :type start_date: datetime + :type start_date: datetime.datetime :param session: the current session object - :type session: Session + :type session: sqlalchemy.orm.session.Session :return: the list of execution_dates for the finished dag runs :rtype: list """ @@ -2230,134 +2165,149 @@ def _process_backfill_task_instances(self, # or leaf to root, as otherwise tasks might be # determined deadlocked while they are actually # waiting for their upstream to finish - for task in self.dag.topological_sort(): - for key, ti in list(ti_status.to_run.items()): - if task.task_id != ti.task_id: - continue + @provide_session + def _per_task_process(task, key, ti, session=None): + if task.task_id != ti.task_id: + return - ti.refresh_from_db() + ti.refresh_from_db() - task = self.dag.get_task(ti.task_id) - ti.task = task + task = self.dag.get_task(ti.task_id) + ti.task = task - ignore_depends_on_past = ( - self.ignore_first_depends_on_past and - ti.execution_date == (start_date or ti.start_date)) - self.log.debug( - "Task instance to run %s state %s", ti, ti.state) + ignore_depends_on_past = ( + self.ignore_first_depends_on_past and + ti.execution_date == (start_date or ti.start_date)) + self.log.debug( + "Task instance to run %s state %s", ti, ti.state) + + # The task was already marked successful or skipped by a + # different Job. Don't rerun it. + if ti.state == State.SUCCESS: + ti_status.succeeded.add(key) + self.log.debug("Task instance %s succeeded. 
Don't rerun.", ti) + ti_status.to_run.pop(key) + if key in ti_status.running: + ti_status.running.pop(key) + return + elif ti.state == State.SKIPPED: + ti_status.skipped.add(key) + self.log.debug("Task instance %s skipped. Don't rerun.", ti) + ti_status.to_run.pop(key) + if key in ti_status.running: + ti_status.running.pop(key) + return - # The task was already marked successful or skipped by a - # different Job. Don't rerun it. - if ti.state == State.SUCCESS: - ti_status.succeeded.add(key) - self.log.debug("Task instance %s succeeded. Don't rerun.", ti) - ti_status.to_run.pop(key) + # guard against externally modified tasks instances or + # in case max concurrency has been reached at task runtime + elif ti.state == State.NONE: + self.log.warning( + "FIXME: task instance {} state was set to None " + "externally. This should not happen" + ) + ti.set_state(State.SCHEDULED, session=session) + if self.rerun_failed_tasks: + # Rerun failed tasks or upstreamed failed tasks + if ti.state in (State.FAILED, State.UPSTREAM_FAILED): + self.log.error("Task instance {ti} " + "with state {state}".format(ti=ti, + state=ti.state)) if key in ti_status.running: ti_status.running.pop(key) - continue - elif ti.state == State.SKIPPED: - ti_status.skipped.add(key) - self.log.debug("Task instance %s skipped. Don't rerun.", ti) + # Reset the failed task in backfill to scheduled state + ti.set_state(State.SCHEDULED, session=session) + else: + # Default behaviour which works for subdag. + if ti.state in (State.FAILED, State.UPSTREAM_FAILED): + self.log.error("Task instance {ti} " + "with {state} state".format(ti=ti, + state=ti.state)) + ti_status.failed.add(key) ti_status.to_run.pop(key) if key in ti_status.running: ti_status.running.pop(key) - continue + return - # guard against externally modified tasks instances or - # in case max concurrency has been reached at task runtime - elif ti.state == State.NONE: - self.log.warning( - "FIXME: task instance {} state was set to None " - "externally. This should not happen" - ) - ti.set_state(State.SCHEDULED, session=session) - if self.rerun_failed_tasks: - # Rerun failed tasks or upstreamed failed tasks - if ti.state in (State.FAILED, State.UPSTREAM_FAILED): - self.log.error("Task instance {ti} " - "with state {state}".format(ti=ti, - state=ti.state)) - if key in ti_status.running: - ti_status.running.pop(key) - # Reset the failed task in backfill to scheduled state - ti.set_state(State.SCHEDULED, session=session) - else: - # Default behaviour which works for subdag. - if ti.state in (State.FAILED, State.UPSTREAM_FAILED): - self.log.error("Task instance {ti} " - "with {state} state".format(ti=ti, - state=ti.state)) - ti_status.failed.add(key) + backfill_context = DepContext( + deps=RUN_DEPS, + ignore_depends_on_past=ignore_depends_on_past, + ignore_task_deps=self.ignore_task_deps, + flag_upstream_failed=True) + + # Is the task runnable? 
-- then run it + # the dependency checker can change states of tis + if ti.are_dependencies_met( + dep_context=backfill_context, + session=session, + verbose=self.verbose): + ti.refresh_from_db(lock_for_update=True, session=session) + if ti.state in (State.SCHEDULED, State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE): + if executor.has_task(ti): + self.log.debug( + "Task Instance %s already in executor " + "waiting for queue to clear", + ti + ) + else: + self.log.debug('Sending %s to executor', ti) + # Skip scheduled state, we are executing immediately + ti.state = State.QUEUED + ti.queued_dttm = timezone.utcnow() if not ti.queued_dttm else ti.queued_dttm + session.merge(ti) + + cfg_path = None + if executor.__class__ in (executors.LocalExecutor, + executors.SequentialExecutor): + cfg_path = tmp_configuration_copy() + + executor.queue_task_instance( + ti, + mark_success=self.mark_success, + pickle_id=pickle_id, + ignore_task_deps=self.ignore_task_deps, + ignore_depends_on_past=ignore_depends_on_past, + pool=self.pool, + cfg_path=cfg_path) + ti_status.running[key] = ti ti_status.to_run.pop(key) - if key in ti_status.running: - ti_status.running.pop(key) - continue - - backfill_context = DepContext( - deps=RUN_DEPS, - ignore_depends_on_past=ignore_depends_on_past, - ignore_task_deps=self.ignore_task_deps, - flag_upstream_failed=True) - - # Is the task runnable? -- then run it - # the dependency checker can change states of tis - if ti.are_dependencies_met( - dep_context=backfill_context, - session=session, - verbose=self.verbose): - ti.refresh_from_db(lock_for_update=True, session=session) - if ti.state == State.SCHEDULED or ti.state == State.UP_FOR_RETRY: - if executor.has_task(ti): - self.log.debug( - "Task Instance %s already in executor " - "waiting for queue to clear", - ti - ) - else: - self.log.debug('Sending %s to executor', ti) - # Skip scheduled state, we are executing immediately - ti.state = State.QUEUED - session.merge(ti) - - cfg_path = None - if executor.__class__ in (executors.LocalExecutor, - executors.SequentialExecutor): - cfg_path = tmp_configuration_copy() - - executor.queue_task_instance( - ti, - mark_success=self.mark_success, - pickle_id=pickle_id, - ignore_task_deps=self.ignore_task_deps, - ignore_depends_on_past=ignore_depends_on_past, - pool=self.pool, - cfg_path=cfg_path) - ti_status.running[key] = ti - ti_status.to_run.pop(key) - session.commit() - continue + session.commit() + return - if ti.state == State.UPSTREAM_FAILED: - self.log.error("Task instance %s upstream failed", ti) - ti_status.failed.add(key) - ti_status.to_run.pop(key) - if key in ti_status.running: - ti_status.running.pop(key) - continue + if ti.state == State.UPSTREAM_FAILED: + self.log.error("Task instance %s upstream failed", ti) + ti_status.failed.add(key) + ti_status.to_run.pop(key) + if key in ti_status.running: + ti_status.running.pop(key) + return - # special case - if ti.state == State.UP_FOR_RETRY: - self.log.debug( - "Task instance %s retry period not " - "expired yet", ti) - if key in ti_status.running: - ti_status.running.pop(key) - ti_status.to_run[key] = ti - continue + # special case + if ti.state == State.UP_FOR_RETRY: + self.log.debug( + "Task instance %s retry period not " + "expired yet", ti) + if key in ti_status.running: + ti_status.running.pop(key) + ti_status.to_run[key] = ti + return - # all remaining tasks - self.log.debug('Adding %s to not_ready', ti) - ti_status.not_ready.add(key) + # special case + if ti.state == State.UP_FOR_RESCHEDULE: + self.log.debug( + "Task instance 
%s reschedule period not " + "expired yet", ti) + if key in ti_status.running: + ti_status.running.pop(key) + ti_status.to_run[key] = ti + return + + # all remaining tasks + self.log.debug('Adding %s to not_ready', ti) + ti_status.not_ready.add(key) + + for task in self.dag.topological_sort(): + for key, ti in list(ti_status.to_run.items()): + _per_task_process(task, key, ti) # execute the tasks in the queue self.heartbeat() @@ -2391,9 +2341,6 @@ def _process_backfill_task_instances(self, ti_status.active_runs.remove(run) executed_run_dates.append(run.execution_date) - if run.dag.is_paused: - models.DagStat.update([run.dag_id], session=session) - self._log_progress(ti_status) # return updated status @@ -2442,6 +2389,7 @@ def _execute_for_run_dates(self, run_dates, ti_status, executor, pickle_id, Computes the dag runs and their respective task instances for the given run dates and executes the task instances. Returns a list of execution dates of the dag runs that were executed. + :param run_dates: Execution dates for dag runs :type run_dates: list :param ti_status: internal BackfillJob status structure to tis track progress @@ -2451,9 +2399,9 @@ def _execute_for_run_dates(self, run_dates, ti_status, executor, pickle_id, :param pickle_id: numeric id of the pickled dag, None if not pickled :type pickle_id: int :param start_date: backfill start date - :type start_date: datetime + :type start_date: datetime.datetime :param session: the current session object - :type session: Session + :type session: sqlalchemy.orm.session.Session """ for next_run_date in run_dates: dag_run = self._get_dag_run(next_run_date, session=session) @@ -2487,6 +2435,14 @@ def _execute(self, session=None): # Get intervals between the start/end dates, which will turn into dag runs run_dates = self.dag.get_run_dates(start_date=start_date, end_date=self.bf_end_date) + if self.run_backwards: + tasks_that_depend_on_past = [t.task_id for t in self.dag.task_dict.values() if t.depends_on_past] + if tasks_that_depend_on_past: + raise AirflowException( + 'You cannot backfill backwards because one or more tasks depend_on_past: {}'.format( + ",".join(tasks_that_depend_on_past))) + run_dates = run_dates[::-1] + if len(run_dates) == 0: self.log.info("No run dates were found for the given dates and dag interval.") return @@ -2495,7 +2451,7 @@ def _execute(self, session=None): pickle_id = None if not self.donot_pickle and self.executor.__class__ not in ( executors.LocalExecutor, executors.SequentialExecutor): - pickle = models.DagPickle(self.dag) + pickle = DagPickle(self.dag) session.add(pickle) session.commit() pickle_id = pickle.id @@ -2557,6 +2513,7 @@ def __init__( pool=None, *args, **kwargs): self.task_instance = task_instance + self.dag_id = task_instance.dag_id self.ignore_all_deps = ignore_all_deps self.ignore_depends_on_past = ignore_depends_on_past self.ignore_task_deps = ignore_task_deps diff --git a/airflow/lineage/__init__.py b/airflow/lineage/__init__.py index 3e2af478bb5e0..124cbabbd9a3d 100644 --- a/airflow/lineage/__init__.py +++ b/airflow/lineage/__init__.py @@ -21,7 +21,7 @@ from airflow import configuration as conf from airflow.lineage.datasets import DataSet from airflow.utils.log.logging_mixin import LoggingMixin -from airflow.utils.module_loading import import_string, prepare_classpath +from airflow.utils.module_loading import import_string from itertools import chain @@ -36,7 +36,6 @@ def _get_backend(): try: _backend_str = conf.get("lineage", "backend") - prepare_classpath() backend = 
import_string(_backend_str) except ImportError as ie: log.debug("Cannot import %s due to %s", _backend_str, ie) @@ -85,13 +84,13 @@ def wrapper(self, context, *args, **kwargs): def prepare_lineage(func): """ - Prepares the lineage inlets and outlets - inlets can be: - "auto" -> picks up any outlets from direct upstream tasks that have outlets - defined, as such that if A -> B -> C and B does not have outlets but A does, - these are provided as inlets. - "list of task_ids" -> picks up outlets from the upstream task_ids - "list of datasets" -> manually defined list of DataSet + Prepares the lineage inlets and outlets. Inlets can be: + + * "auto" -> picks up any outlets from direct upstream tasks that have outlets defined, as such that + if A -> B -> C and B does not have outlets but A does, these are provided as inlets. + * "list of task_ids" -> picks up outlets from the upstream task_ids + * "list of datasets" -> manually defined list of DataSet + """ @wraps(func) def wrapper(self, context, *args, **kwargs): diff --git a/airflow/lineage/datasets.py b/airflow/lineage/datasets.py index 40c8edc9a8f9d..260277065b6f0 100644 --- a/airflow/lineage/datasets.py +++ b/airflow/lineage/datasets.py @@ -18,6 +18,7 @@ # under the License. import six +from typing import List from jinja2 import Environment @@ -28,7 +29,7 @@ def _inherited(cls): class DataSet(object): - attributes = [] + attributes = [] # type: List[str] type_name = "dataSet" def __init__(self, qualified_name=None, data=None, **kwargs): @@ -96,7 +97,7 @@ def map_type(name): if cls.type_name == name: return cls - raise NotImplemented("No known mapping for {}".format(name)) + raise NotImplementedError("No known mapping for {}".format(name)) class DataBase(DataSet): diff --git a/airflow/logging_config.py b/airflow/logging_config.py index 33c2dc82e1be5..0cf8f4db0d59b 100644 --- a/airflow/logging_config.py +++ b/airflow/logging_config.py @@ -18,11 +18,12 @@ # under the License. # import logging +import warnings from logging.config import dictConfig from airflow import configuration as conf from airflow.exceptions import AirflowConfigException -from airflow.utils.module_loading import import_string, prepare_classpath +from airflow.utils.module_loading import import_string log = logging.getLogger(__name__) @@ -30,10 +31,6 @@ def configure_logging(): logging_class_path = '' try: - # Prepare the classpath so we are sure that the config folder - # is on the python classpath and it is reachable - prepare_classpath() - logging_class_path = conf.get('core', 'logging_config_class') except AirflowConfigException: log.debug('Could not find key logging_config_class in config') @@ -56,9 +53,9 @@ def configure_logging(): .format(logging_class_path, err) ) else: - from airflow.config_templates.airflow_local_settings import ( - DEFAULT_LOGGING_CONFIG as logging_config - ) + logging_class_path = 'airflow.config_templates.' 
\ + 'airflow_local_settings.DEFAULT_LOGGING_CONFIG' + logging_config = import_string(logging_class_path) log.debug('Unable to load custom logging, using default config instead') try: @@ -70,4 +67,36 @@ def configure_logging(): # otherwise Airflow would silently fall back on the default config raise e - return logging_config + validate_logging_config(logging_config) + + return logging_class_path + + +def validate_logging_config(logging_config): + # Now lets validate the other logging-related settings + task_log_reader = conf.get('core', 'task_log_reader') + + logger = logging.getLogger('airflow.task') + + def _get_handler(name): + return next((h for h in logger.handlers if h.name == name), None) + + if _get_handler(task_log_reader) is None: + # Check for pre 1.10 setting that might be in deployed airflow.cfg files + if task_log_reader == "file.task" and _get_handler("task"): + warnings.warn( + "task_log_reader setting in [core] has a deprecated value of " + "{!r}, but no handler with this name was found. Please update " + "your config to use {!r}. Running config has been adjusted to " + "match".format( + task_log_reader, + "task", + ), + DeprecationWarning, + ) + conf.set('core', 'task_log_reader', 'task') + else: + raise AirflowConfigException( + "Configured task_log_reader {!r} was not a handler of the 'airflow.task' " + "logger.".format(task_log_reader) + ) diff --git a/airflow/macros/hive.py b/airflow/macros/hive.py index bb60203e95347..914b9af2c4b1e 100644 --- a/airflow/macros/hive.py +++ b/airflow/macros/hive.py @@ -27,14 +27,14 @@ def max_partition( Gets the max partition for a table. :param schema: The hive schema the table lives in - :type schema: string + :type schema: str :param table: The hive table you are interested in, supports the dot notation as in "my_database.my_table", if a dot is found, the schema param is disregarded - :type table: string + :type table: str :param metastore_conn_id: The hive connection you are interested in. If your default is set you don't need to use this parameter. - :type metastore_conn_id: string + :type metastore_conn_id: str :param filter_map: partition_key:partition_value map used for partition filtering, e.g. {'key1': 'value1', 'key2': 'value2'}. Only partitions matching all partition_key:partition_value @@ -63,7 +63,7 @@ def _closest_date(target_dt, date_list, before_target=None): :param target_dt: The target date :type target_dt: datetime.date :param date_list: The list of dates to search - :type date_list: datetime.date list + :type date_list: list[datetime.date] :param before_target: closest before or after the target :type before_target: bool or None :returns: The closest date @@ -90,7 +90,7 @@ def closest_ds_partition( :param table: A hive table name :type table: str :param ds: A datestamp ``%Y-%m-%d`` e.g. ``yyyy-mm-dd`` - :type ds: datetime.date list + :type ds: list[datetime.date] :param before: closest before (True), after (False) or either side of ds :type before: bool or None :returns: The closest date diff --git a/airflow/migrations/__init__.py b/airflow/migrations/__init__.py index f0f8b68337da6..114d189da14ab 100644 --- a/airflow/migrations/__init__.py +++ b/airflow/migrations/__init__.py @@ -7,13 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - diff --git a/airflow/migrations/env.py b/airflow/migrations/env.py index 828cacf0e0fd6..76c2eb329b0df 100644 --- a/airflow/migrations/env.py +++ b/airflow/migrations/env.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -36,7 +36,7 @@ # for 'autogenerate' support # from myapp import mymodel # target_metadata = mymodel.Base.metadata -target_metadata = models.Base.metadata +target_metadata = models.base.Base.metadata # other values from the config, defined by the needs of env.py, # can be acquired: @@ -85,6 +85,7 @@ def run_migrations_online(): with context.begin_transaction(): context.run_migrations() + if context.is_offline_mode(): run_migrations_offline() else: diff --git a/airflow/migrations/versions/03bc53e68815_add_sm_dag_index.py b/airflow/migrations/versions/03bc53e68815_add_sm_dag_index.py new file mode 100644 index 0000000000000..fc8468155c235 --- /dev/null +++ b/airflow/migrations/versions/03bc53e68815_add_sm_dag_index.py @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""merge_heads_2 + +Revision ID: 03bc53e68815 +Revises: 0a2a5b66e19d, bf00311e1990 +Create Date: 2018-11-24 20:21:46.605414 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = '03bc53e68815' +down_revision = ('0a2a5b66e19d', 'bf00311e1990') +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_index('sm_dag', 'sla_miss', ['dag_id'], unique=False) + + +def downgrade(): + op.drop_index('sm_dag', table_name='sla_miss') diff --git a/airflow/migrations/versions/05f30312d566_merge_heads.py b/airflow/migrations/versions/05f30312d566_merge_heads.py new file mode 100644 index 0000000000000..f869cb8c08e5a --- /dev/null +++ b/airflow/migrations/versions/05f30312d566_merge_heads.py @@ -0,0 +1,40 @@ +# flake8: noqa +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""merge heads + +Revision ID: 05f30312d566 +Revises: 86770d1215c0, 0e2a74e0fc9f +Create Date: 2018-06-17 10:47:23.339972 + +""" + +# revision identifiers, used by Alembic. +revision = '05f30312d566' +down_revision = ('86770d1215c0', '0e2a74e0fc9f') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass diff --git a/airflow/migrations/versions/0a2a5b66e19d_add_task_reschedule_table.py b/airflow/migrations/versions/0a2a5b66e19d_add_task_reschedule_table.py new file mode 100644 index 0000000000000..643a1ca81b678 --- /dev/null +++ b/airflow/migrations/versions/0a2a5b66e19d_add_task_reschedule_table.py @@ -0,0 +1,91 @@ +# flake8: noqa +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""add task_reschedule table + +Revision ID: 0a2a5b66e19d +Revises: 9635ae0956e7 +Create Date: 2018-06-17 22:50:00.053620 + +""" + +# revision identifiers, used by Alembic. +revision = '0a2a5b66e19d' +down_revision = '9635ae0956e7' +branch_labels = None +depends_on = None + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + + +TABLE_NAME = 'task_reschedule' +INDEX_NAME = 'idx_' + TABLE_NAME + '_dag_task_date' + +# For Microsoft SQL Server, TIMESTAMP is a row-id type, +# having nothing to do with date-time. DateTime() will +# be sufficient. 
+def mssql_timestamp(): + return sa.DateTime() + +def mysql_timestamp(): + return mysql.TIMESTAMP(fsp=6) + +def sa_timestamp(): + return sa.TIMESTAMP(timezone=True) + +def upgrade(): + # See 0e2a74e0fc9f_add_time_zone_awareness + conn = op.get_bind() + if conn.dialect.name == 'mysql': + timestamp = mysql_timestamp + elif conn.dialect.name == 'mssql': + timestamp = mssql_timestamp + else: + timestamp = sa_timestamp + + op.create_table( + TABLE_NAME, + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('task_id', sa.String(length=250), nullable=False), + sa.Column('dag_id', sa.String(length=250), nullable=False), + # use explicit server_default=None otherwise mysql implies defaults for first timestamp column + sa.Column('execution_date', timestamp(), nullable=False, server_default=None), + sa.Column('try_number', sa.Integer(), nullable=False), + sa.Column('start_date', timestamp(), nullable=False), + sa.Column('end_date', timestamp(), nullable=False), + sa.Column('duration', sa.Integer(), nullable=False), + sa.Column('reschedule_date', timestamp(), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.ForeignKeyConstraint(['task_id', 'dag_id', 'execution_date'], + ['task_instance.task_id', 'task_instance.dag_id','task_instance.execution_date'], + name='task_reschedule_dag_task_date_fkey') + ) + op.create_index( + INDEX_NAME, + TABLE_NAME, + ['dag_id', 'task_id', 'execution_date'], + unique=False + ) + + +def downgrade(): + op.drop_index(INDEX_NAME, table_name=TABLE_NAME) + op.drop_table(TABLE_NAME) diff --git a/airflow/migrations/versions/0e2a74e0fc9f_add_time_zone_awareness.py b/airflow/migrations/versions/0e2a74e0fc9f_add_time_zone_awareness.py index 64ee41c44d9e2..ac7824fd9e2f5 100644 --- a/airflow/migrations/versions/0e2a74e0fc9f_add_time_zone_awareness.py +++ b/airflow/migrations/versions/0e2a74e0fc9f_add_time_zone_awareness.py @@ -25,16 +25,16 @@ """ +from alembic import op +from sqlalchemy.dialects import mysql +import sqlalchemy as sa + # revision identifiers, used by Alembic. 
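These migrations pick a timestamp column type per database dialect: MySQL needs TIMESTAMP(fsp=6) for sub-second precision, SQL Server's TIMESTAMP is a row-version type so a plain DateTime is used, and other backends get a timezone-aware TIMESTAMP. A compact restatement of that selection as a single helper is sketched below for illustration only; the migrations themselves keep separate per-dialect functions.

    import sqlalchemy as sa
    from sqlalchemy.dialects import mysql

    def timestamp_type(dialect_name):
        """Pick a dialect-appropriate timestamp column type (illustration only)."""
        if dialect_name == "mysql":
            return mysql.TIMESTAMP(fsp=6)     # microsecond precision on MySQL
        if dialect_name == "mssql":
            return sa.DateTime()              # MSSQL TIMESTAMP is not a date-time type
        return sa.TIMESTAMP(timezone=True)    # timezone-aware elsewhere

    for name in ("mysql", "mssql", "postgresql"):
        print(name, timestamp_type(name))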
revision = '0e2a74e0fc9f' down_revision = 'd2ae31099d61' branch_labels = None depends_on = None -from alembic import op -from sqlalchemy.dialects import mysql -import sqlalchemy as sa - def upgrade(): conn = op.get_bind() @@ -69,14 +69,16 @@ def upgrade(): op.alter_column(table_name='log', column_name='dttm', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='log', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6)) - op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6), nullable=False) + op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6), + nullable=False) op.alter_column(table_name='sla_miss', column_name='timestamp', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='task_fail', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='task_fail', column_name='start_date', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='task_fail', column_name='end_date', type_=mysql.TIMESTAMP(fsp=6)) - op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6), nullable=False) + op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6), + nullable=False) op.alter_column(table_name='task_instance', column_name='start_date', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='task_instance', column_name='end_date', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='task_instance', column_name='queued_dttm', type_=mysql.TIMESTAMP(fsp=6)) @@ -84,8 +86,8 @@ def upgrade(): op.alter_column(table_name='xcom', column_name='timestamp', type_=mysql.TIMESTAMP(fsp=6)) op.alter_column(table_name='xcom', column_name='execution_date', type_=mysql.TIMESTAMP(fsp=6)) else: - # sqlite datetime is fine as is not converting - if conn.dialect.name == 'sqlite': + # sqlite and mssql datetime are fine as is. Therefore, not converting + if conn.dialect.name in ('sqlite', 'mssql'): return # we try to be database agnostic, but not every db (e.g. 
sqlserver) @@ -117,14 +119,16 @@ def upgrade(): op.alter_column(table_name='log', column_name='dttm', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='log', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True)) - op.alter_column(table_name='sla_miss', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True), nullable=False) + op.alter_column(table_name='sla_miss', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True), + nullable=False) op.alter_column(table_name='sla_miss', column_name='timestamp', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='task_fail', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='task_fail', column_name='start_date', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='task_fail', column_name='end_date', type_=sa.TIMESTAMP(timezone=True)) - op.alter_column(table_name='task_instance', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True), nullable=False) + op.alter_column(table_name='task_instance', column_name='execution_date', type_=sa.TIMESTAMP(timezone=True), + nullable=False) op.alter_column(table_name='task_instance', column_name='start_date', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='task_instance', column_name='end_date', type_=sa.TIMESTAMP(timezone=True)) op.alter_column(table_name='task_instance', column_name='queued_dttm', type_=sa.TIMESTAMP(timezone=True)) @@ -161,14 +165,16 @@ def downgrade(): op.alter_column(table_name='log', column_name='dttm', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='log', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.DATETIME(fsp=6), nullable=False) + op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.DATETIME(fsp=6), + nullable=False) op.alter_column(table_name='sla_miss', column_name='DATETIME', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='task_fail', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='task_fail', column_name='start_date', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='task_fail', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.DATETIME(fsp=6), nullable=False) + op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.DATETIME(fsp=6), + nullable=False) op.alter_column(table_name='task_instance', column_name='start_date', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='task_instance', column_name='end_date', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='task_instance', column_name='queued_dttm', type_=mysql.DATETIME(fsp=6)) @@ -176,7 +182,7 @@ def downgrade(): op.alter_column(table_name='xcom', column_name='DATETIME', type_=mysql.DATETIME(fsp=6)) op.alter_column(table_name='xcom', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) else: - if conn.dialect.name == 'sqlite': + if conn.dialect.name in ('sqlite', 'mssql'): return # we try to be database agnostic, but not every db (e.g. 
sqlserver) diff --git a/airflow/migrations/versions/127d2bf2dfa7_add_dag_id_state_index_on_dag_run_table.py b/airflow/migrations/versions/127d2bf2dfa7_add_dag_id_state_index_on_dag_run_table.py index 58517256949ab..288a0b60aa821 100644 --- a/airflow/migrations/versions/127d2bf2dfa7_add_dag_id_state_index_on_dag_run_table.py +++ b/airflow/migrations/versions/127d2bf2dfa7_add_dag_id_state_index_on_dag_run_table.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,7 @@ Create Date: 2017-01-25 11:43:51.635667 """ +from alembic import op # revision identifiers, used by Alembic. revision = '127d2bf2dfa7' @@ -30,8 +31,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa def upgrade(): op.create_index('dag_id_state', 'dag_run', ['dag_id', 'state'], unique=False) @@ -39,4 +38,3 @@ def upgrade(): def downgrade(): op.drop_index('dag_id_state', table_name='dag_run') - diff --git a/airflow/migrations/versions/13eb55f81627_for_compatibility.py b/airflow/migrations/versions/13eb55f81627_for_compatibility.py index a14e195bbefe1..b9e88bd01becd 100644 --- a/airflow/migrations/versions/13eb55f81627_for_compatibility.py +++ b/airflow/migrations/versions/13eb55f81627_for_compatibility.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/migrations/versions/1507a7289a2f_create_is_encrypted.py b/airflow/migrations/versions/1507a7289a2f_create_is_encrypted.py index 47bdf13d672d3..fe84254c38caa 100644 --- a/airflow/migrations/versions/1507a7289a2f_create_is_encrypted.py +++ b/airflow/migrations/versions/1507a7289a2f_create_is_encrypted.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,9 @@ Create Date: 2015-08-18 18:57:51.927315 """ +from alembic import op +import sqlalchemy as sa +from sqlalchemy.engine.reflection import Inspector # revision identifiers, used by Alembic. 
revision = '1507a7289a2f' @@ -31,10 +34,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from sqlalchemy.engine.reflection import Inspector - connectionhelper = sa.Table( 'connection', sa.MetaData(), diff --git a/airflow/migrations/versions/1968acfc09e3_add_is_encrypted_column_to_variable_.py b/airflow/migrations/versions/1968acfc09e3_add_is_encrypted_column_to_variable_.py index 57ee27aeb4bfb..16ab349563428 100644 --- a/airflow/migrations/versions/1968acfc09e3_add_is_encrypted_column_to_variable_.py +++ b/airflow/migrations/versions/1968acfc09e3_add_is_encrypted_column_to_variable_.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,8 @@ Create Date: 2016-02-02 17:20:55.692295 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '1968acfc09e3' @@ -31,12 +33,9 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): - op.add_column('variable', sa.Column('is_encrypted', sa.Boolean,default=False)) + op.add_column('variable', sa.Column('is_encrypted', sa.Boolean, default=False)) def downgrade(): diff --git a/airflow/migrations/versions/1b38cef5b76e_add_dagrun.py b/airflow/migrations/versions/1b38cef5b76e_add_dagrun.py index a0af7fa7ad6c2..50d53652c4734 100644 --- a/airflow/migrations/versions/1b38cef5b76e_add_dagrun.py +++ b/airflow/migrations/versions/1b38cef5b76e_add_dagrun.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,28 +25,27 @@ """ +from alembic import op +import sqlalchemy as sa + # revision identifiers, used by Alembic. 
revision = '1b38cef5b76e' down_revision = '502898887f84' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.create_table('dag_run', - sa.Column('id', sa.Integer(), nullable=False), - sa.Column('dag_id', sa.String(length=250), nullable=True), - sa.Column('execution_date', sa.DateTime(), nullable=True), - sa.Column('state', sa.String(length=50), nullable=True), - sa.Column('run_id', sa.String(length=250), nullable=True), - sa.Column('external_trigger', sa.Boolean(), nullable=True), - sa.PrimaryKeyConstraint('id'), - sa.UniqueConstraint('dag_id', 'execution_date'), - sa.UniqueConstraint('dag_id', 'run_id'), - ) + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('dag_id', sa.String(length=250), nullable=True), + sa.Column('execution_date', sa.DateTime(), nullable=True), + sa.Column('state', sa.String(length=50), nullable=True), + sa.Column('run_id', sa.String(length=250), nullable=True), + sa.Column('external_trigger', sa.Boolean(), nullable=True), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('dag_id', 'execution_date'), + sa.UniqueConstraint('dag_id', 'run_id')) def downgrade(): diff --git a/airflow/migrations/versions/211e584da130_add_ti_state_index.py b/airflow/migrations/versions/211e584da130_add_ti_state_index.py index 93347a2649214..b17f390e0b65b 100644 --- a/airflow/migrations/versions/211e584da130_add_ti_state_index.py +++ b/airflow/migrations/versions/211e584da130_add_ti_state_index.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,7 @@ Create Date: 2016-06-30 10:54:24.323588 """ +from alembic import op # revision identifiers, used by Alembic. revision = '211e584da130' @@ -31,9 +32,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.create_index('ti_state', 'task_instance', ['state'], unique=False) diff --git a/airflow/migrations/versions/27c6a30d7c24_add_executor_config_to_task_instance.py b/airflow/migrations/versions/27c6a30d7c24_add_executor_config_to_task_instance.py index b7213a3031ddd..a757d2770971b 100644 --- a/airflow/migrations/versions/27c6a30d7c24_add_executor_config_to_task_instance.py +++ b/airflow/migrations/versions/27c6a30d7c24_add_executor_config_to_task_instance.py @@ -1,16 +1,22 @@ # flake8: noqa # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """kubernetes_resource_checkpointing @@ -20,18 +26,16 @@ """ +from alembic import op +import sqlalchemy as sa +import dill + # revision identifiers, used by Alembic. revision = '27c6a30d7c24' down_revision = '33ae817a1ff4' branch_labels = None depends_on = None - -from alembic import op -import sqlalchemy as sa -import dill - - TASK_INSTANCE_TABLE = "task_instance" NEW_COLUMN = "executor_config" @@ -42,4 +46,3 @@ def upgrade(): def downgrade(): op.drop_column(TASK_INSTANCE_TABLE, NEW_COLUMN) - diff --git a/airflow/migrations/versions/2e541a1dcfed_task_duration.py b/airflow/migrations/versions/2e541a1dcfed_task_duration.py index 7b540430a5dd0..595a5774a6b27 100644 --- a/airflow/migrations/versions/2e541a1dcfed_task_duration.py +++ b/airflow/migrations/versions/2e541a1dcfed_task_duration.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,16 +25,16 @@ """ +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import mysql + # revision identifiers, used by Alembic. revision = '2e541a1dcfed' down_revision = '1b38cef5b76e' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import mysql - def upgrade(): # use batch_alter_table to support SQLite workaround diff --git a/airflow/migrations/versions/2e82aab8ef20_rename_user_table.py b/airflow/migrations/versions/2e82aab8ef20_rename_user_table.py index 03266679279f9..fc8a1aab20e69 100644 --- a/airflow/migrations/versions/2e82aab8ef20_rename_user_table.py +++ b/airflow/migrations/versions/2e82aab8ef20_rename_user_table.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,7 @@ Create Date: 2016-04-02 19:28:15.211915 """ +from alembic import op # revision identifiers, used by Alembic. 
revision = '2e82aab8ef20' @@ -31,9 +32,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.rename_table('user', 'users') @@ -41,4 +39,3 @@ def upgrade(): def downgrade(): op.rename_table('users', 'user') - diff --git a/airflow/migrations/versions/338e90f54d61_more_logging_into_task_isntance.py b/airflow/migrations/versions/338e90f54d61_more_logging_into_task_isntance.py index c101e4a477230..473f76778b89d 100644 --- a/airflow/migrations/versions/338e90f54d61_more_logging_into_task_isntance.py +++ b/airflow/migrations/versions/338e90f54d61_more_logging_into_task_isntance.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,13 +17,17 @@ # specific language governing permissions and limitations # under the License. -"""More logging into task_isntance +"""More logging into task_instance Revision ID: 338e90f54d61 Revises: 13eb55f81627 Create Date: 2015-08-25 06:09:20.460147 """ +# flake8: noqa: E266 + +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '338e90f54d61' @@ -31,9 +35,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): ### commands auto generated by Alembic - please adjust! ### diff --git a/airflow/migrations/versions/33ae817a1ff4_add_kubernetes_resource_checkpointing.py b/airflow/migrations/versions/33ae817a1ff4_add_kubernetes_resource_checkpointing.py index 4347bae92af1e..88a4199d3250e 100644 --- a/airflow/migrations/versions/33ae817a1ff4_add_kubernetes_resource_checkpointing.py +++ b/airflow/migrations/versions/33ae817a1ff4_add_kubernetes_resource_checkpointing.py @@ -1,16 +1,22 @@ # flake8: noqa # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ """kubernetes_resource_checkpointing @@ -19,6 +25,8 @@ Create Date: 2017-09-11 15:26:47.598494 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '33ae817a1ff4' @@ -26,20 +34,27 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - - RESOURCE_TABLE = "kube_resource_version" def upgrade(): + + columns_and_constraints = [ + sa.Column("one_row_id", sa.Boolean, server_default=sa.true(), primary_key=True), + sa.Column("resource_version", sa.String(255)) + ] + + conn = op.get_bind() + + # alembic creates an invalid SQL for mssql dialect + if conn.dialect.name not in ('mssql'): + columns_and_constraints.append(sa.CheckConstraint("one_row_id", name="kube_resource_version_one_row_id")) + table = op.create_table( RESOURCE_TABLE, - sa.Column("one_row_id", sa.Boolean, server_default=sa.true(), primary_key=True), - sa.Column("resource_version", sa.String(255)), - sa.CheckConstraint("one_row_id", name="kube_resource_version_one_row_id") + *columns_and_constraints ) + op.bulk_insert(table, [ {"resource_version": ""} ]) @@ -47,4 +62,3 @@ def upgrade(): def downgrade(): op.drop_table(RESOURCE_TABLE) - diff --git a/airflow/migrations/versions/40e67319e3a9_dagrun_config.py b/airflow/migrations/versions/40e67319e3a9_dagrun_config.py index ecfe58cfc3d1b..3da4d5f543038 100644 --- a/airflow/migrations/versions/40e67319e3a9_dagrun_config.py +++ b/airflow/migrations/versions/40e67319e3a9_dagrun_config.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,8 @@ Create Date: 2015-10-29 08:36:31.726728 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '40e67319e3a9' @@ -31,9 +33,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.add_column('dag_run', sa.Column('conf', sa.PickleType(), nullable=True)) diff --git a/airflow/migrations/versions/41f5f12752f8_add_superuser_field.py b/airflow/migrations/versions/41f5f12752f8_add_superuser_field.py new file mode 100644 index 0000000000000..6e02582b7e840 --- /dev/null +++ b/airflow/migrations/versions/41f5f12752f8_add_superuser_field.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +"""add superuser field + +Revision ID: 41f5f12752f8 +Revises: 03bc53e68815 +Create Date: 2018-12-04 15:50:04.456875 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '41f5f12752f8' +down_revision = '03bc53e68815' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('users', sa.Column('superuser', sa.Boolean(), default=False)) + + +def downgrade(): + op.drop_column('users', 'superuser') diff --git a/airflow/migrations/versions/4446e08588_dagrun_start_end.py b/airflow/migrations/versions/4446e08588_dagrun_start_end.py index dc546f8c77750..29932c92060e9 100644 --- a/airflow/migrations/versions/4446e08588_dagrun_start_end.py +++ b/airflow/migrations/versions/4446e08588_dagrun_start_end.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,15 +25,15 @@ """ +from alembic import op +import sqlalchemy as sa + # revision identifiers, used by Alembic. revision = '4446e08588' down_revision = '561833c1c74b' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.add_column('dag_run', sa.Column('end_date', sa.DateTime(), nullable=True)) diff --git a/airflow/migrations/versions/4addfa1236f1_add_fractional_seconds_to_mysql_tables.py b/airflow/migrations/versions/4addfa1236f1_add_fractional_seconds_to_mysql_tables.py index 80af57836771a..655ff61042d1e 100644 --- a/airflow/migrations/versions/4addfa1236f1_add_fractional_seconds_to_mysql_tables.py +++ b/airflow/migrations/versions/4addfa1236f1_add_fractional_seconds_to_mysql_tables.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,93 +24,147 @@ """ +from alembic import op +from sqlalchemy.dialects import mysql +from alembic import context + # revision identifiers, used by Alembic. 
revision = '4addfa1236f1' down_revision = 'f2ca10b85618' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import mysql -from alembic import context - def upgrade(): if context.config.get_main_option('sqlalchemy.url').startswith('mysql'): - op.alter_column(table_name='dag', column_name='last_scheduler_run', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='dag', column_name='last_pickled', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='dag', column_name='last_expired', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='dag_pickle', column_name='created_dttm', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='dag_run', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='dag_run', column_name='start_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='dag_run', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='import_error', column_name='timestamp', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='job', column_name='start_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='job', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='job', column_name='latest_heartbeat', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='known_event', column_name='start_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='known_event', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='log', column_name='dttm', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='log', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.DATETIME(fsp=6), nullable=False) - op.alter_column(table_name='sla_miss', column_name='timestamp', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='task_fail', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='task_fail', column_name='start_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='task_fail', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.DATETIME(fsp=6), nullable=False) - op.alter_column(table_name='task_instance', column_name='start_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='task_instance', column_name='end_date', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='task_instance', column_name='queued_dttm', type_=mysql.DATETIME(fsp=6)) - - op.alter_column(table_name='xcom', column_name='timestamp', type_=mysql.DATETIME(fsp=6)) - op.alter_column(table_name='xcom', column_name='execution_date', type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='dag', column_name='last_scheduler_run', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='dag', column_name='last_pickled', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='dag', column_name='last_expired', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='dag_pickle', column_name='created_dttm', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='dag_run', column_name='execution_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='dag_run', column_name='start_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='dag_run', column_name='end_date', + type_=mysql.DATETIME(fsp=6)) + + 
op.alter_column(table_name='import_error', column_name='timestamp', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='job', column_name='start_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='job', column_name='end_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='job', column_name='latest_heartbeat', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='known_event', column_name='start_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='known_event', column_name='end_date', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='log', column_name='dttm', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='log', column_name='execution_date', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='sla_miss', column_name='execution_date', + type_=mysql.DATETIME(fsp=6), + nullable=False) + op.alter_column(table_name='sla_miss', column_name='timestamp', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='task_fail', column_name='execution_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='task_fail', column_name='start_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='task_fail', column_name='end_date', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='task_instance', column_name='execution_date', + type_=mysql.DATETIME(fsp=6), + nullable=False) + op.alter_column(table_name='task_instance', column_name='start_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='task_instance', column_name='end_date', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='task_instance', column_name='queued_dttm', + type_=mysql.DATETIME(fsp=6)) + + op.alter_column(table_name='xcom', column_name='timestamp', + type_=mysql.DATETIME(fsp=6)) + op.alter_column(table_name='xcom', column_name='execution_date', + type_=mysql.DATETIME(fsp=6)) def downgrade(): if context.config.get_main_option('sqlalchemy.url').startswith('mysql'): - op.alter_column(table_name='dag', column_name='last_scheduler_run', type_=mysql.DATETIME()) - op.alter_column(table_name='dag', column_name='last_pickled', type_=mysql.DATETIME()) - op.alter_column(table_name='dag', column_name='last_expired', type_=mysql.DATETIME()) - - op.alter_column(table_name='dag_pickle', column_name='created_dttm', type_=mysql.DATETIME()) - - op.alter_column(table_name='dag_run', column_name='execution_date', type_=mysql.DATETIME()) - op.alter_column(table_name='dag_run', column_name='start_date', type_=mysql.DATETIME()) - op.alter_column(table_name='dag_run', column_name='end_date', type_=mysql.DATETIME()) - - op.alter_column(table_name='import_error', column_name='timestamp', type_=mysql.DATETIME()) - - op.alter_column(table_name='job', column_name='start_date', type_=mysql.DATETIME()) - op.alter_column(table_name='job', column_name='end_date', type_=mysql.DATETIME()) - op.alter_column(table_name='job', column_name='latest_heartbeat', type_=mysql.DATETIME()) - - op.alter_column(table_name='known_event', column_name='start_date', type_=mysql.DATETIME()) - op.alter_column(table_name='known_event', column_name='end_date', type_=mysql.DATETIME()) - - op.alter_column(table_name='log', column_name='dttm', type_=mysql.DATETIME()) - op.alter_column(table_name='log', column_name='execution_date', type_=mysql.DATETIME()) - - op.alter_column(table_name='sla_miss', column_name='execution_date', type_=mysql.DATETIME(), nullable=False) - op.alter_column(table_name='sla_miss', column_name='timestamp', 
type_=mysql.DATETIME()) - - op.alter_column(table_name='task_fail', column_name='execution_date', type_=mysql.DATETIME()) - op.alter_column(table_name='task_fail', column_name='start_date', type_=mysql.DATETIME()) - op.alter_column(table_name='task_fail', column_name='end_date', type_=mysql.DATETIME()) - - op.alter_column(table_name='task_instance', column_name='execution_date', type_=mysql.DATETIME(), nullable=False) - op.alter_column(table_name='task_instance', column_name='start_date', type_=mysql.DATETIME()) - op.alter_column(table_name='task_instance', column_name='end_date', type_=mysql.DATETIME()) - op.alter_column(table_name='task_instance', column_name='queued_dttm', type_=mysql.DATETIME()) - - op.alter_column(table_name='xcom', column_name='timestamp', type_=mysql.DATETIME()) - op.alter_column(table_name='xcom', column_name='execution_date', type_=mysql.DATETIME()) + op.alter_column(table_name='dag', column_name='last_scheduler_run', + type_=mysql.DATETIME()) + op.alter_column(table_name='dag', column_name='last_pickled', + type_=mysql.DATETIME()) + op.alter_column(table_name='dag', column_name='last_expired', + type_=mysql.DATETIME()) + + op.alter_column(table_name='dag_pickle', column_name='created_dttm', + type_=mysql.DATETIME()) + + op.alter_column(table_name='dag_run', column_name='execution_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='dag_run', column_name='start_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='dag_run', column_name='end_date', + type_=mysql.DATETIME()) + + op.alter_column(table_name='import_error', column_name='timestamp', + type_=mysql.DATETIME()) + + op.alter_column(table_name='job', column_name='start_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='job', column_name='end_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='job', column_name='latest_heartbeat', + type_=mysql.DATETIME()) + + op.alter_column(table_name='known_event', column_name='start_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='known_event', column_name='end_date', + type_=mysql.DATETIME()) + + op.alter_column(table_name='log', column_name='dttm', + type_=mysql.DATETIME()) + op.alter_column(table_name='log', column_name='execution_date', + type_=mysql.DATETIME()) + + op.alter_column(table_name='sla_miss', column_name='execution_date', + type_=mysql.DATETIME(), nullable=False) + op.alter_column(table_name='sla_miss', column_name='timestamp', + type_=mysql.DATETIME()) + + op.alter_column(table_name='task_fail', column_name='execution_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='task_fail', column_name='start_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='task_fail', column_name='end_date', + type_=mysql.DATETIME()) + + op.alter_column(table_name='task_instance', column_name='execution_date', + type_=mysql.DATETIME(), + nullable=False) + op.alter_column(table_name='task_instance', column_name='start_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='task_instance', column_name='end_date', + type_=mysql.DATETIME()) + op.alter_column(table_name='task_instance', column_name='queued_dttm', + type_=mysql.DATETIME()) + + op.alter_column(table_name='xcom', column_name='timestamp', + type_=mysql.DATETIME()) + op.alter_column(table_name='xcom', column_name='execution_date', + type_=mysql.DATETIME()) diff --git a/airflow/migrations/versions/502898887f84_adding_extra_to_log.py b/airflow/migrations/versions/502898887f84_adding_extra_to_log.py index f5cded7ea5776..632720a4e27b0 100644 
--- a/airflow/migrations/versions/502898887f84_adding_extra_to_log.py +++ b/airflow/migrations/versions/502898887f84_adding_extra_to_log.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,8 @@ Create Date: 2015-11-03 22:50:49.794097 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '502898887f84' @@ -31,9 +33,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.add_column('log', sa.Column('extra', sa.Text(), nullable=True)) diff --git a/airflow/migrations/versions/52d714495f0_job_id_indices.py b/airflow/migrations/versions/52d714495f0_job_id_indices.py index 43893dba8a081..94374cb68aa30 100644 --- a/airflow/migrations/versions/52d714495f0_job_id_indices.py +++ b/airflow/migrations/versions/52d714495f0_job_id_indices.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,7 @@ Create Date: 2015-10-20 03:17:01.962542 """ +from alembic import op # revision identifiers, used by Alembic. revision = '52d714495f0' @@ -31,12 +32,10 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): - op.create_index('idx_job_state_heartbeat', 'job', ['state', 'latest_heartbeat'], unique=False) + op.create_index('idx_job_state_heartbeat', 'job', + ['state', 'latest_heartbeat'], unique=False) def downgrade(): diff --git a/airflow/migrations/versions/561833c1c74b_add_password_column_to_user.py b/airflow/migrations/versions/561833c1c74b_add_password_column_to_user.py index ad82c9378b2f1..a26a105ac8299 100644 --- a/airflow/migrations/versions/561833c1c74b_add_password_column_to_user.py +++ b/airflow/migrations/versions/561833c1c74b_add_password_column_to_user.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,8 @@ Create Date: 2015-11-30 06:51:25.872557 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. 
revision = '561833c1c74b' @@ -31,9 +33,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.add_column('user', sa.Column('password', sa.String(255))) diff --git a/airflow/migrations/versions/5e7d17757c7a_add_pid_field_to_taskinstance.py b/airflow/migrations/versions/5e7d17757c7a_add_pid_field_to_taskinstance.py index 5e2963f060665..77a35db48fad5 100644 --- a/airflow/migrations/versions/5e7d17757c7a_add_pid_field_to_taskinstance.py +++ b/airflow/migrations/versions/5e7d17757c7a_add_pid_field_to_taskinstance.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,15 +24,15 @@ """ +from alembic import op +import sqlalchemy as sa + # revision identifiers, used by Alembic. revision = '5e7d17757c7a' down_revision = '8504051e801b' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.add_column('task_instance', sa.Column('pid', sa.Integer)) diff --git a/airflow/migrations/versions/64de9cddf6c9_add_task_fails_journal_table.py b/airflow/migrations/versions/64de9cddf6c9_add_task_fails_journal_table.py index be747383289a9..2def57e904870 100644 --- a/airflow/migrations/versions/64de9cddf6c9_add_task_fails_journal_table.py +++ b/airflow/migrations/versions/64de9cddf6c9_add_task_fails_journal_table.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,8 @@ Create Date: 2016-08-03 14:02:59.203021 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '64de9cddf6c9' @@ -30,9 +32,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.create_table( @@ -47,5 +46,6 @@ def upgrade(): sa.PrimaryKeyConstraint('id'), ) + def downgrade(): op.drop_table('task_fail') diff --git a/airflow/migrations/versions/8504051e801b_xcom_dag_task_indices.py b/airflow/migrations/versions/8504051e801b_xcom_dag_task_indices.py index 47473e318d625..fdcbc59df8d00 100644 --- a/airflow/migrations/versions/8504051e801b_xcom_dag_task_indices.py +++ b/airflow/migrations/versions/8504051e801b_xcom_dag_task_indices.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,18 +25,18 @@ """ +from alembic import op + # revision identifiers, used by Alembic. 
revision = '8504051e801b' down_revision = '4addfa1236f1' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): - op.create_index('idx_xcom_dag_task_date', 'xcom', ['dag_id', 'task_id', 'execution_date'], unique=False) + op.create_index('idx_xcom_dag_task_date', 'xcom', + ['dag_id', 'task_id', 'execution_date'], unique=False) def downgrade(): diff --git a/airflow/migrations/versions/856955da8476_fix_sqlite_foreign_key.py b/airflow/migrations/versions/856955da8476_fix_sqlite_foreign_key.py new file mode 100644 index 0000000000000..52a817081be7c --- /dev/null +++ b/airflow/migrations/versions/856955da8476_fix_sqlite_foreign_key.py @@ -0,0 +1,88 @@ +# flake8: noqa +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""fix sqlite foreign key + +Revision ID: 856955da8476 +Revises: f23433877c24 +Create Date: 2018-06-17 15:54:53.844230 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = '856955da8476' +down_revision = 'f23433877c24' +branch_labels = None +depends_on = None + + +def upgrade(): + conn = op.get_bind() + if conn.dialect.name == 'sqlite': + # Fix broken foreign-key constraint for existing SQLite DBs. + # + # Re-define tables and use copy_from to avoid reflection + # which would fail because referenced user table doesn't exist. + # + # Use batch_alter_table to support SQLite workaround. 
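The comment block just above describes the approach; before the full table re-definitions that follow, here is a tiny, self-contained sketch of the copy_from / batch_alter_table pattern. The two-column chart and users stand-ins are invented for illustration and are much smaller than the real schema. Passing copy_from hands batch mode an explicit Table, so Alembic never reflects the existing table, which is what would fail when its foreign key points at a table that no longer exists.

```python
import sqlalchemy as sa
from alembic.migration import MigrationContext
from alembic.operations import Operations

engine = sa.create_engine("sqlite://")
metadata = sa.MetaData()

# Invented, minimal stand-ins for the real tables.
chart = sa.Table(
    "chart", metadata,
    sa.Column("id", sa.Integer(), primary_key=True),
    sa.Column("user_id", sa.Integer(), nullable=True),
)
sa.Table("users", metadata, sa.Column("id", sa.Integer(), primary_key=True))
metadata.create_all(engine)

with engine.begin() as conn:
    op = Operations(MigrationContext.configure(conn))
    # Batch mode copies the data into a rebuilt table that carries the new
    # foreign key, which is how constraints get "added" on SQLite.
    with op.batch_alter_table("chart", copy_from=chart) as batch_op:
        batch_op.create_foreign_key("chart_user_id_fkey", "users",
                                    ["user_id"], ["id"])
```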
+ chart_table = sa.Table('chart', + sa.MetaData(), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('label', sa.String(length=200), nullable=True), + sa.Column('conn_id', sa.String(length=250), nullable=False), + sa.Column('user_id', sa.Integer(), nullable=True), + sa.Column('chart_type', sa.String(length=100), nullable=True), + sa.Column('sql_layout', sa.String(length=50), nullable=True), + sa.Column('sql', sa.Text(), nullable=True), + sa.Column('y_log_scale', sa.Boolean(), nullable=True), + sa.Column('show_datatable', sa.Boolean(), nullable=True), + sa.Column('show_sql', sa.Boolean(), nullable=True), + sa.Column('height', sa.Integer(), nullable=True), + sa.Column('default_params', sa.String(length=5000), nullable=True), + sa.Column('x_is_date', sa.Boolean(), nullable=True), + sa.Column('iteration_no', sa.Integer(), nullable=True), + sa.Column('last_modified', sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint('id')) + with op.batch_alter_table('chart', copy_from=chart_table) as batch_op: + batch_op.create_foreign_key('chart_user_id_fkey', 'users', + ['user_id'], ['id']) + + known_event_table = sa.Table('known_event', + sa.MetaData(), + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('label', sa.String(length=200), nullable=True), + sa.Column('start_date', sa.DateTime(), nullable=True), + sa.Column('end_date', sa.DateTime(), nullable=True), + sa.Column('user_id', sa.Integer(), nullable=True), + sa.Column('known_event_type_id', sa.Integer(), nullable=True), + sa.Column('description', sa.Text(), nullable=True), + sa.ForeignKeyConstraint(['known_event_type_id'], + ['known_event_type.id'], ), + sa.PrimaryKeyConstraint('id')) + with op.batch_alter_table('chart', copy_from=known_event_table) as batch_op: + batch_op.create_foreign_key('known_event_user_id_fkey', 'users', + ['user_id'], ['id']) + + +def downgrade(): + # Downgrade would fail because the broken FK constraint can't be re-created. + pass diff --git a/airflow/migrations/versions/86770d1215c0_add_kubernetes_scheduler_uniqueness.py b/airflow/migrations/versions/86770d1215c0_add_kubernetes_scheduler_uniqueness.py index 6bc48f1105639..633201b03b6cd 100644 --- a/airflow/migrations/versions/86770d1215c0_add_kubernetes_scheduler_uniqueness.py +++ b/airflow/migrations/versions/86770d1215c0_add_kubernetes_scheduler_uniqueness.py @@ -1,16 +1,22 @@ # flake8: noqa # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """add kubernetes scheduler uniqueness @@ -19,6 +25,8 @@ Create Date: 2018-04-03 15:31:20.814328 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = '86770d1215c0' @@ -26,20 +34,27 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - - RESOURCE_TABLE = "kube_worker_uuid" def upgrade(): + + columns_and_constraints = [ + sa.Column("one_row_id", sa.Boolean, server_default=sa.true(), primary_key=True), + sa.Column("worker_uuid", sa.String(255)) + ] + + conn = op.get_bind() + + # alembic creates an invalid SQL for mssql dialect + if conn.dialect.name not in ('mssql'): + columns_and_constraints.append(sa.CheckConstraint("one_row_id", name="kube_worker_one_row_id")) + table = op.create_table( RESOURCE_TABLE, - sa.Column("one_row_id", sa.Boolean, server_default=sa.true(), primary_key=True), - sa.Column("worker_uuid", sa.String(255)), - sa.CheckConstraint("one_row_id", name="kube_worker_one_row_id") + *columns_and_constraints ) + op.bulk_insert(table, [ {"worker_uuid": ""} ]) diff --git a/airflow/migrations/versions/939bb1e647c8_task_reschedule_fk_on_cascade_delete.py b/airflow/migrations/versions/939bb1e647c8_task_reschedule_fk_on_cascade_delete.py new file mode 100644 index 0000000000000..b425b4aecec34 --- /dev/null +++ b/airflow/migrations/versions/939bb1e647c8_task_reschedule_fk_on_cascade_delete.py @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""task reschedule fk on cascade delete + +Revision ID: 939bb1e647c8 +Revises: 4ebbffe0a39a +Create Date: 2019-02-04 20:21:50.669751 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = '939bb1e647c8' +down_revision = 'dd4ecb8fbee3' +branch_labels = None +depends_on = None + + +def upgrade(): + with op.batch_alter_table('task_reschedule') as batch_op: + batch_op.drop_constraint( + 'task_reschedule_dag_task_date_fkey', + type_='foreignkey' + ) + batch_op.create_foreign_key( + 'task_reschedule_dag_task_date_fkey', + 'task_instance', + ['task_id', 'dag_id', 'execution_date'], + ['task_id', 'dag_id', 'execution_date'], + ondelete='CASCADE' + ) + + +def downgrade(): + with op.batch_alter_table('task_reschedule') as batch_op: + batch_op.drop_constraint( + 'task_reschedule_dag_task_date_fkey', + type_='foreignkey' + ) + batch_op.create_foreign_key( + 'task_reschedule_dag_task_date_fkey', + 'task_instance', + ['task_id', 'dag_id', 'execution_date'], + ['task_id', 'dag_id', 'execution_date'] + ) diff --git a/airflow/migrations/versions/947454bf1dff_add_ti_job_id_index.py b/airflow/migrations/versions/947454bf1dff_add_ti_job_id_index.py index 4c8bce9b471e7..6ff41baa28c7a 100644 --- a/airflow/migrations/versions/947454bf1dff_add_ti_job_id_index.py +++ b/airflow/migrations/versions/947454bf1dff_add_ti_job_id_index.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,7 @@ Create Date: 2017-08-15 15:12:13.845074 """ +from alembic import op # revision identifiers, used by Alembic. revision = '947454bf1dff' @@ -31,9 +32,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.create_index('ti_job_id', 'task_instance', ['job_id'], unique=False) diff --git a/airflow/migrations/versions/9635ae0956e7_index_faskfail.py b/airflow/migrations/versions/9635ae0956e7_index_faskfail.py new file mode 100644 index 0000000000000..6b21c3474acc4 --- /dev/null +++ b/airflow/migrations/versions/9635ae0956e7_index_faskfail.py @@ -0,0 +1,41 @@ +# flake8: noqa +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""index-faskfail + +Revision ID: 9635ae0956e7 +Revises: 856955da8476 +Create Date: 2018-06-17 21:40:01.963540 + +""" +from alembic import op + +# revision identifiers, used by Alembic. 
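An aside on the task_reschedule migration above (939bb1e647c8): the foreign key is dropped and re-created with ondelete='CASCADE' inside batch_alter_table, the same SQLite-friendly mechanism used elsewhere in this patch. The snippet below is a standalone illustration of what the CASCADE option buys, using an invented two-column schema instead of the real composite key; it is not part of the patch.

```python
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")

@sa.event.listens_for(engine, "connect")
def _enable_fks(dbapi_conn, _record):
    dbapi_conn.execute("PRAGMA foreign_keys=ON")  # SQLite enforces FKs per connection

metadata = sa.MetaData()
ti = sa.Table("task_instance", metadata,
              sa.Column("task_id", sa.String(250), primary_key=True))
tr = sa.Table("task_reschedule", metadata,
              sa.Column("id", sa.Integer, primary_key=True),
              sa.Column("task_id", sa.String(250),
                        sa.ForeignKey("task_instance.task_id", ondelete="CASCADE")))
metadata.create_all(engine)

with engine.begin() as conn:
    conn.execute(ti.insert(), [{"task_id": "t1"}])
    conn.execute(tr.insert(), [{"task_id": "t1"}])
    conn.execute(ti.delete().where(ti.c.task_id == "t1"))
    # The reschedule row is removed together with its task instance.
    print(conn.execute(sa.select(sa.func.count()).select_from(tr)).scalar())  # 0
```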
+revision = '9635ae0956e7' +down_revision = '856955da8476' +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_index('idx_task_fail_dag_task_date', 'task_fail', ['dag_id', 'task_id', 'execution_date'], unique=False) + + +def downgrade(): + op.drop_index('idx_task_fail_dag_task_date', table_name='task_fail') diff --git a/airflow/migrations/versions/__init__.py b/airflow/migrations/versions/__init__.py index f0f8b68337da6..114d189da14ab 100644 --- a/airflow/migrations/versions/__init__.py +++ b/airflow/migrations/versions/__init__.py @@ -7,13 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - diff --git a/airflow/migrations/versions/a56c9515abdc_remove_dag_stat_table.py b/airflow/migrations/versions/a56c9515abdc_remove_dag_stat_table.py new file mode 100644 index 0000000000000..89dba33c3e394 --- /dev/null +++ b/airflow/migrations/versions/a56c9515abdc_remove_dag_stat_table.py @@ -0,0 +1,47 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Remove dag_stat table + +Revision ID: a56c9515abdc +Revises: c8ffec048a3b +Create Date: 2018-12-27 10:27:59.715872 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = 'a56c9515abdc' +down_revision = 'c8ffec048a3b' +branch_labels = None +depends_on = None + + +def upgrade(): + op.drop_table("dag_stats") + + +def downgrade(): + op.create_table('dag_stats', + sa.Column('dag_id', sa.String(length=250), nullable=False), + sa.Column('state', sa.String(length=50), nullable=False), + sa.Column('count', sa.Integer(), nullable=False, default=0), + sa.Column('dirty', sa.Boolean(), nullable=False, default=False), + sa.PrimaryKeyConstraint('dag_id', 'state')) diff --git a/airflow/migrations/versions/bba5a7cfc896_add_a_column_to_track_the_encryption_.py b/airflow/migrations/versions/bba5a7cfc896_add_a_column_to_track_the_encryption_.py index c780e5830dce6..503cd0b6f0406 100644 --- a/airflow/migrations/versions/bba5a7cfc896_add_a_column_to_track_the_encryption_.py +++ b/airflow/migrations/versions/bba5a7cfc896_add_a_column_to_track_the_encryption_.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,18 +25,19 @@ """ +from alembic import op +import sqlalchemy as sa + # revision identifiers, used by Alembic. revision = 'bba5a7cfc896' down_revision = 'bbc73705a13e' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): - op.add_column('connection', sa.Column('is_extra_encrypted', sa.Boolean,default=False)) + op.add_column('connection', + sa.Column('is_extra_encrypted', sa.Boolean, default=False)) def downgrade(): diff --git a/airflow/migrations/versions/bbc73705a13e_add_notification_sent_column_to_sla_miss.py b/airflow/migrations/versions/bbc73705a13e_add_notification_sent_column_to_sla_miss.py index 84c54fad7157a..9855a6d4daf3c 100644 --- a/airflow/migrations/versions/bbc73705a13e_add_notification_sent_column_to_sla_miss.py +++ b/airflow/migrations/versions/bbc73705a13e_add_notification_sent_column_to_sla_miss.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,6 +24,8 @@ Create Date: 2016-01-14 18:05:54.871682 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = 'bbc73705a13e' @@ -31,12 +33,9 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): - op.add_column('sla_miss', sa.Column('notification_sent', sa.Boolean,default=False)) + op.add_column('sla_miss', sa.Column('notification_sent', sa.Boolean, default=False)) def downgrade(): diff --git a/airflow/migrations/versions/bdaa763e6c56_make_xcom_value_column_a_large_binary.py b/airflow/migrations/versions/bdaa763e6c56_make_xcom_value_column_a_large_binary.py index 5e06766182f6d..a1a5270c7bff4 100644 --- a/airflow/migrations/versions/bdaa763e6c56_make_xcom_value_column_a_large_binary.py +++ b/airflow/migrations/versions/bdaa763e6c56_make_xcom_value_column_a_large_binary.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,9 @@ Create Date: 2017-08-14 16:06:31.568971 """ +from alembic import op +import dill +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = 'bdaa763e6c56' @@ -30,15 +33,10 @@ branch_labels = None depends_on = None -from alembic import op -import dill -import sqlalchemy as sa - def upgrade(): - # There can be data truncation here as LargeBinary can be smaller than the pickle + # There can be data truncation here as LargeBinary can be smaller than the pickle # type. 
- # use batch_alter_table to support SQLite workaround with op.batch_alter_table("xcom") as batch_op: batch_op.alter_column('value', type_=sa.LargeBinary()) diff --git a/airflow/migrations/versions/bf00311e1990_add_index_to_taskinstance.py b/airflow/migrations/versions/bf00311e1990_add_index_to_taskinstance.py new file mode 100644 index 0000000000000..528bd53b366e5 --- /dev/null +++ b/airflow/migrations/versions/bf00311e1990_add_index_to_taskinstance.py @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""add index to taskinstance + +Revision ID: bf00311e1990 +Revises: dd25f486b8ea +Create Date: 2018-09-12 09:53:52.007433 + +""" + +from alembic import op + +# revision identifiers, used by Alembic. +revision = 'bf00311e1990' +down_revision = 'dd25f486b8ea' +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_index( + 'ti_dag_date', + 'task_instance', + ['dag_id', 'execution_date'], + unique=False + ) + + +def downgrade(): + op.drop_index('ti_dag_date', table_name='task_instance') diff --git a/airflow/migrations/versions/c8ffec048a3b_add_fields_to_dag.py b/airflow/migrations/versions/c8ffec048a3b_add_fields_to_dag.py new file mode 100644 index 0000000000000..70282715edfa6 --- /dev/null +++ b/airflow/migrations/versions/c8ffec048a3b_add_fields_to_dag.py @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""add fields to dag + +Revision ID: c8ffec048a3b +Revises: 41f5f12752f8 +Create Date: 2018-12-23 21:55:46.463634 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. 
+revision = 'c8ffec048a3b' +down_revision = '41f5f12752f8' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('dag', sa.Column('description', sa.Text(), nullable=True)) + op.add_column('dag', sa.Column('default_view', sa.String(25), nullable=True)) + + +def downgrade(): + op.drop_column('dag', 'description') + op.drop_column('dag', 'default_view') diff --git a/airflow/migrations/versions/cc1e65623dc7_add_max_tries_column_to_task_instance.py b/airflow/migrations/versions/cc1e65623dc7_add_max_tries_column_to_task_instance.py index 0503d93df5bec..03195a7536d3d 100644 --- a/airflow/migrations/versions/cc1e65623dc7_add_max_tries_column_to_task_instance.py +++ b/airflow/migrations/versions/cc1e65623dc7_add_max_tries_column_to_task_instance.py @@ -1,3 +1,4 @@ +# flake8: noqa # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -6,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,24 +25,38 @@ """ +from alembic import op +import sqlalchemy as sa +from airflow import settings +from airflow.models import DagBag + +from sqlalchemy import Column, Integer, String +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.ext.declarative import declarative_base + # revision identifiers, used by Alembic. revision = 'cc1e65623dc7' down_revision = '127d2bf2dfa7' branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from airflow import settings -from airflow.models import DagBag, TaskInstance -from sqlalchemy.engine.reflection import Inspector - +Base = declarative_base() BATCH_SIZE = 5000 +ID_LEN = 250 + + +class TaskInstance(Base): + __tablename__ = "task_instance" + + task_id = Column(String(ID_LEN), primary_key=True) + dag_id = Column(String(ID_LEN), primary_key=True) + execution_date = Column(sa.DateTime, primary_key=True) + max_tries = Column(Integer) + try_number = Column(Integer, default=0) def upgrade(): - op.add_column('task_instance', sa.Column('max_tries', sa.Integer, - server_default="-1")) + op.add_column('task_instance', sa.Column('max_tries', sa.Integer, server_default="-1")) # Check if table task_instance exist before data migration. This check is # needed for database that does not create table until migration finishes. # Checking task_instance table exists prevent the error of querying @@ -111,7 +126,7 @@ def downgrade(): # max number of self retry (task.retries) minus number of # times left for task instance to try the task. ti.try_number = max(0, task.retries - (ti.max_tries - - ti.try_number)) + ti.try_number)) ti.max_tries = -1 session.merge(ti) session.commit() diff --git a/airflow/migrations/versions/d2ae31099d61_increase_text_size_for_mysql.py b/airflow/migrations/versions/d2ae31099d61_increase_text_size_for_mysql.py index ff67b4fb16d3a..db5afaf023e54 100644 --- a/airflow/migrations/versions/d2ae31099d61_increase_text_size_for_mysql.py +++ b/airflow/migrations/versions/d2ae31099d61_increase_text_size_for_mysql.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
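An aside on the cc1e65623dc7 hunk above: instead of importing TaskInstance from airflow.models, the data migration now declares a minimal local copy of the table, presumably so the migration keeps working however the real ORM model evolves afterwards. A bare-bones, self-contained sketch of that pattern follows; the column set is trimmed and the SQLite engine is only for illustration.

```python
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class TaskInstance(Base):
    """Migration-local model: only the columns this data migration touches."""
    __tablename__ = "task_instance"
    task_id = sa.Column(sa.String(250), primary_key=True)
    dag_id = sa.Column(sa.String(250), primary_key=True)
    execution_date = sa.Column(sa.DateTime, primary_key=True)
    max_tries = sa.Column(sa.Integer)
    try_number = sa.Column(sa.Integer, default=0)


engine = sa.create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
# The real migration batches over rows like this without touching airflow.models.
print(session.query(TaskInstance).filter(TaskInstance.max_tries == -1).count())
```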
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,9 @@ Create Date: 2017-08-18 17:07:16.686130 """ +from alembic import op +from sqlalchemy.dialects import mysql +from alembic import context # revision identifiers, used by Alembic. revision = 'd2ae31099d61' @@ -30,11 +33,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import mysql -from alembic import context - def upgrade(): if context.config.get_main_option('sqlalchemy.url').startswith('mysql'): diff --git a/scripts/ci/load_fixtures.sh b/airflow/migrations/versions/dd25f486b8ea_add_idx_log_dag.py old mode 100755 new mode 100644 similarity index 64% rename from scripts/ci/load_fixtures.sh rename to airflow/migrations/versions/dd25f486b8ea_add_idx_log_dag.py index 55beb919dfda2..3249a2e0589cb --- a/scripts/ci/load_fixtures.sh +++ b/airflow/migrations/versions/dd25f486b8ea_add_idx_log_dag.py @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -8,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,17 +16,26 @@ # specific language governing permissions and limitations # under the License. -set -o verbose +from alembic import op + +"""add idx_log_dag + +Revision ID: dd25f486b8ea +Revises: 9635ae0956e7 +Create Date: 2018-08-07 06:41:41.028249 + +""" + +# revision identifiers, used by Alembic. +revision = 'dd25f486b8ea' +down_revision = '9635ae0956e7' +branch_labels = None +depends_on = None + -DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) -FIXTURES_DIR="$DIR/ldif" -LOAD_ORDER=("example.com.ldif" "manager.example.com.ldif" "users.example.com.ldif" "groups.example.com.ldif") +def upgrade(): + op.create_index('idx_log_dag', 'log', ['dag_id'], unique=False) -load_fixture () { - ldapadd -x -H ldap://127.0.0.1:3890/ -D "cn=Manager,dc=example,dc=com" -w insecure -f $1 -} -for FILE in "${LOAD_ORDER[@]}" -do - load_fixture "${FIXTURES_DIR}/${FILE}" -done; +def downgrade(): + op.drop_index('idx_log_dag', table_name='log') diff --git a/airflow/migrations/versions/dd4ecb8fbee3_add_schedule_interval_to_dag.py b/airflow/migrations/versions/dd4ecb8fbee3_add_schedule_interval_to_dag.py new file mode 100644 index 0000000000000..3b2e6d577ac9d --- /dev/null +++ b/airflow/migrations/versions/dd4ecb8fbee3_add_schedule_interval_to_dag.py @@ -0,0 +1,42 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Add schedule interval to dag + +Revision ID: dd4ecb8fbee3 +Revises: c8ffec048a3b +Create Date: 2018-12-27 18:39:25.748032 + +""" + +from alembic import op +import sqlalchemy as sa + +# revision identifiers, used by Alembic. +revision = 'dd4ecb8fbee3' +down_revision = 'c8ffec048a3b' +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column('dag', sa.Column('schedule_interval', sa.Text(), nullable=True)) + + +def downgrade(): + op.drop_column('dag', sa.Column('schedule_interval', sa.Text(), nullable=True)) diff --git a/airflow/migrations/versions/e3a246e0dc1_current_schema.py b/airflow/migrations/versions/e3a246e0dc1_current_schema.py index 6c63d0a9dd337..cbf98976458d4 100644 --- a/airflow/migrations/versions/e3a246e0dc1_current_schema.py +++ b/airflow/migrations/versions/e3a246e0dc1_current_schema.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -25,17 +25,17 @@ """ +from alembic import op +import sqlalchemy as sa +from sqlalchemy import func +from sqlalchemy.engine.reflection import Inspector + # revision identifiers, used by Alembic. revision = 'e3a246e0dc1' down_revision = None branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa -from sqlalchemy import func -from sqlalchemy.engine.reflection import Inspector - def upgrade(): conn = op.get_bind() diff --git a/airflow/migrations/versions/f23433877c24_fix_mysql_not_null_constraint.py b/airflow/migrations/versions/f23433877c24_fix_mysql_not_null_constraint.py new file mode 100644 index 0000000000000..3e643f629dfcd --- /dev/null +++ b/airflow/migrations/versions/f23433877c24_fix_mysql_not_null_constraint.py @@ -0,0 +1,52 @@ +# flake8: noqa +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""fix mysql not null constraint + +Revision ID: f23433877c24 +Revises: 05f30312d566 +Create Date: 2018-06-17 10:16:31.412131 + +""" +from alembic import op +from sqlalchemy.dialects import mysql + +# revision identifiers, used by Alembic. 
+revision = 'f23433877c24' +down_revision = '05f30312d566' +branch_labels = None +depends_on = None + + +def upgrade(): + conn = op.get_bind() + if conn.dialect.name == 'mysql': + conn.execute("SET time_zone = '+00:00'") + op.alter_column('task_fail', 'execution_date', existing_type=mysql.TIMESTAMP(fsp=6), nullable=False) + op.alter_column('xcom', 'execution_date', existing_type=mysql.TIMESTAMP(fsp=6), nullable=False) + op.alter_column('xcom', 'timestamp', existing_type=mysql.TIMESTAMP(fsp=6), nullable=False) + + +def downgrade(): + conn = op.get_bind() + if conn.dialect.name == 'mysql': + conn.execute("SET time_zone = '+00:00'") + op.alter_column('xcom', 'timestamp', existing_type=mysql.TIMESTAMP(fsp=6), nullable=True) + op.alter_column('xcom', 'execution_date', existing_type=mysql.TIMESTAMP(fsp=6), nullable=True) + op.alter_column('task_fail', 'execution_date', existing_type=mysql.TIMESTAMP(fsp=6), nullable=True) diff --git a/airflow/migrations/versions/f2ca10b85618_add_dag_stats_table.py b/airflow/migrations/versions/f2ca10b85618_add_dag_stats_table.py index 7c23d507127a2..e14b4b8025109 100644 --- a/airflow/migrations/versions/f2ca10b85618_add_dag_stats_table.py +++ b/airflow/migrations/versions/f2ca10b85618_add_dag_stats_table.py @@ -6,9 +6,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,8 @@ Create Date: 2016-07-20 15:08:28.247537 """ +from alembic import op +import sqlalchemy as sa # revision identifiers, used by Alembic. revision = 'f2ca10b85618' @@ -30,9 +32,6 @@ branch_labels = None depends_on = None -from alembic import op -import sqlalchemy as sa - def upgrade(): op.create_table('dag_stats', diff --git a/airflow/minihivecluster.py b/airflow/minihivecluster.py deleted file mode 100644 index c5441c67a1b61..0000000000000 --- a/airflow/minihivecluster.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016 Bolke de Bruin -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. 
-import os -import subprocess -import select -import re - - -class MiniHiveCluster(object): - def __init__(self): - self._minicluster_home = os.environ['MINICLUSTER_HOME'] - self._minicluster_class = "com.ing.minicluster.MiniCluster" - self._start_mini_cluster() - self._is_started() - - def _start_mini_cluster(self): - classpath = os.path.join(self._minicluster_home, "*") - cmd = ["java", "-cp", classpath, self._minicluster_class] - - self.hive = subprocess.Popen(cmd, bufsize=0, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, universal_newlines=True, - close_fds=True) - - def terminate(self): - self.hive.terminate() - - def _is_started(self): - while self.hive.poll() is None: - rlist, wlist, xlist = select.select([self.hive.stderr, self.hive.stdout], [], []) - for f in rlist: - line = f.readline() - print(line,) - m = re.match(".*Starting ThriftBinaryCLIService", line) - if m: - return True diff --git a/airflow/models.py b/airflow/models/__init__.py similarity index 80% rename from airflow/models.py rename to airflow/models/__init__.py index 4706c2d50b6e1..702ed7f7e0431 100755 --- a/airflow/models.py +++ b/airflow/models/__init__.py @@ -22,13 +22,21 @@ from __future__ import print_function from __future__ import unicode_literals +from builtins import ImportError as BuiltinImportError, bytes, object, str +from collections import defaultdict, namedtuple, OrderedDict +import copy +from typing import Iterable + from future.standard_library import install_aliases -from builtins import str -from builtins import object, bytes -import copy -from collections import namedtuple, defaultdict -import cryptography +from airflow.models.base import Base, ID_LEN + +try: + # Fix Python > 3.7 deprecation + from collections.abc import Hashable +except BuiltinImportError: + # Preserve Python < 3.3 compatibility + from collections import Hashable from datetime import timedelta import dill @@ -36,51 +44,61 @@ import getpass import imp import importlib -import itertools import zipfile import jinja2 import json import logging -import numbers import os +import pendulum import pickle import re import signal import sys +import time import textwrap import traceback import warnings import hashlib -import uuid from datetime import datetime -from urllib.parse import urlparse, quote, parse_qsl +from urllib.parse import quote from sqlalchemy import ( - Column, Integer, String, DateTime, Text, Boolean, ForeignKey, PickleType, - Index, Float, LargeBinary) -from sqlalchemy import func, or_, and_, true as sqltrue -from sqlalchemy.ext.declarative import declarative_base, declared_attr + Boolean, Column, DateTime, Float, ForeignKey, Index, + Integer, LargeBinary, PickleType, String, Text, UniqueConstraint, and_, + func, or_ +) +from sqlalchemy.ext.declarative import declared_attr from sqlalchemy.orm import reconstructor, relationship, synonym -from sqlalchemy_utc import UtcDateTime -from croniter import croniter +from croniter import ( + croniter, CroniterBadCronError, CroniterBadDateError, CroniterNotAlphaError +) import six from airflow import settings, utils from airflow.executors import GetDefaultExecutor, LocalExecutor from airflow import configuration from airflow.exceptions import ( - AirflowDagCycleException, AirflowException, AirflowSkipException, AirflowTaskTimeout + AirflowDagCycleException, AirflowException, AirflowSkipException, AirflowTaskTimeout, + AirflowRescheduleException ) from airflow.dag.base_dag import BaseDag, BaseDagBag from airflow.lineage import apply_lineage, prepare_lineage +from 
airflow.models.dagpickle import DagPickle +from airflow.models.errors import ImportError # noqa: F401 +from airflow.models.slamiss import SlaMiss # noqa: F401 +from airflow.models.kubernetes import KubeWorkerIdentifier, KubeResourceVersion # noqa: F401 +from airflow.models.log import Log +from airflow.models.taskfail import TaskFail +from airflow.models.taskreschedule import TaskReschedule from airflow.ti_deps.deps.not_in_retry_period_dep import NotInRetryPeriodDep from airflow.ti_deps.deps.prev_dagrun_dep import PrevDagrunDep from airflow.ti_deps.deps.trigger_rule_dep import TriggerRuleDep from airflow.ti_deps.dep_context import DepContext, QUEUE_DEPS, RUN_DEPS from airflow.utils import timezone +from airflow.utils.dag_processing import list_py_file_paths from airflow.utils.dates import cron_presets, date_range as utils_date_range from airflow.utils.db import provide_session from airflow.utils.decorators import apply_defaults @@ -89,21 +107,48 @@ as_tuple, is_container, validate_key, pprinttable) from airflow.utils.operator_resources import Resources from airflow.utils.state import State +from airflow.utils.sqlalchemy import UtcDateTime, Interval from airflow.utils.timeout import timeout from airflow.utils.trigger_rule import TriggerRule from airflow.utils.weight_rule import WeightRule from airflow.utils.net import get_hostname from airflow.utils.log.logging_mixin import LoggingMixin + install_aliases() -Base = declarative_base() -ID_LEN = 250 XCOM_RETURN_KEY = 'return_value' Stats = settings.Stats +class InvalidFernetToken(Exception): + # If Fernet isn't loaded we need a valid exception class to catch. If it is + # loaded this will get reset to the actual class once get_fernet() is called + pass + + +class NullFernet(object): + """ + A "Null" encryptor class that doesn't encrypt or decrypt but that presents + a similar interface to Fernet. + + The purpose of this is to make the rest of the code not have to know the + difference, and to only display the message once, not 20 times when + `airflow initdb` is ran. + """ + is_encrypted = False + + def decrpyt(self, b): + return b + + def encrypt(self, b): + return b + + +_fernet = None + + def get_fernet(): """ Deferred load of Fernet key. @@ -112,17 +157,43 @@ def get_fernet(): or because the Fernet key is invalid. :return: Fernet object - :raises: AirflowException if there's a problem trying to load Fernet + :raises: airflow.exceptions.AirflowException if there's a problem trying to load Fernet """ + global _fernet + log = LoggingMixin().log + + if _fernet: + return _fernet try: - from cryptography.fernet import Fernet - except ImportError: - raise AirflowException('Failed to import Fernet, it may not be installed') + from cryptography.fernet import Fernet, MultiFernet, InvalidToken + global InvalidFernetToken + InvalidFernetToken = InvalidToken + + except BuiltinImportError: + log.warning( + "cryptography not found - values will not be stored encrypted." + ) + _fernet = NullFernet() + return _fernet + try: - return Fernet(configuration.conf.get('core', 'FERNET_KEY').encode('utf-8')) + fernet_key = configuration.conf.get('core', 'FERNET_KEY') + if not fernet_key: + log.warning( + "empty cryptography key - values will not be stored encrypted." 
+ ) + _fernet = NullFernet() + else: + _fernet = MultiFernet([ + Fernet(fernet_part.encode('utf-8')) + for fernet_part in fernet_key.split(',') + ]) + _fernet.is_encrypted = True except (ValueError, TypeError) as ve: raise AirflowException("Could not create Fernet object: {}".format(ve)) + return _fernet + # Used by DAG context_managers _CONTEXT_MANAGER_DAG = None @@ -178,6 +249,20 @@ def clear_task_instances(tis, dr.start_date = timezone.utcnow() +def get_last_dagrun(dag_id, session, include_externally_triggered=False): + """ + Returns the last dag run for a dag, None if there was none. + Last dag run can be any type of run eg. scheduled or backfilled. + Overridden DagRuns are ignored. + """ + DR = DagRun + query = session.query(DR).filter(DR.dag_id == dag_id) + if not include_externally_triggered: + query = query.filter(DR.external_trigger == False) # noqa + query = query.order_by(DR.execution_date.desc()) + return query.first() + + class DagBag(BaseDagBag, LoggingMixin): """ A dagbag is a collection of dags, parsed out of a folder tree and has high @@ -210,7 +295,8 @@ def __init__( self, dag_folder=None, executor=None, - include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES')): + include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'), + safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')): # do not use default arg in signature, to fix import cycle on plugin load if executor is None: @@ -225,12 +311,10 @@ def __init__( self.import_errors = {} self.has_logged = False - if include_examples: - example_dag_folder = os.path.join( - os.path.dirname(__file__), - 'example_dags') - self.collect_dags(example_dag_folder) - self.collect_dags(dag_folder) + self.collect_dags( + dag_folder=dag_folder, + include_examples=include_examples, + safe_mode=safe_mode) def size(self): """ @@ -238,6 +322,10 @@ def size(self): """ return len(self.dags) + @property + def dag_ids(self): + return self.dags.keys() + def get_dag(self, dag_id): """ Gets the DAG out of the dictionary, and refreshes it if expired @@ -296,7 +384,8 @@ def process_file(self, filepath, only_if_updated=True, safe_mode=True): return found_dags mods = [] - if not zipfile.is_zipfile(filepath): + is_zipfile = zipfile.is_zipfile(filepath) + if not is_zipfile: if safe_mode and os.path.isfile(filepath): with open(filepath, 'rb') as f: content = f.read() @@ -368,13 +457,23 @@ def process_file(self, filepath, only_if_updated=True, safe_mode=True): if isinstance(dag, DAG): if not dag.full_filepath: dag.full_filepath = filepath - if dag.fileloc != filepath: + if dag.fileloc != filepath and not is_zipfile: dag.fileloc = filepath try: dag.is_subdag = False self.bag_dag(dag, parent_dag=dag, root_dag=dag) + if isinstance(dag._schedule_interval, six.string_types): + croniter(dag._schedule_interval) found_dags.append(dag) found_dags += dag.subdags + except (CroniterBadCronError, + CroniterBadDateError, + CroniterNotAlphaError) as cron_e: + self.log.exception("Failed to bag_dag: %s", dag.full_filepath) + self.import_errors[dag.full_filepath] = \ + "Invalid Cron expression: " + str(cron_e) + self.file_last_changed[dag.full_filepath] = \ + file_last_changed_on_disk except AirflowDagCycleException as cycle_exception: self.log.exception("Failed to bag_dag: %s", dag.full_filepath) self.import_errors[dag.full_filepath] = str(cycle_exception) @@ -385,39 +484,28 @@ def process_file(self, filepath, only_if_updated=True, safe_mode=True): return found_dags @provide_session - def kill_zombies(self, session=None): - 
""" - Fails tasks that haven't had a heartbeat in too long - """ - from airflow.jobs import LocalTaskJob as LJ - self.log.info("Finding 'running' jobs without a recent heartbeat") - TI = TaskInstance - secs = configuration.conf.getint('scheduler', 'scheduler_zombie_task_threshold') - limit_dttm = timezone.utcnow() - timedelta(seconds=secs) - self.log.info("Failing jobs without heartbeat after %s", limit_dttm) - - tis = ( - session.query(TI) - .join(LJ, TI.job_id == LJ.id) - .filter(TI.state == State.RUNNING) - .filter( - or_( - LJ.state != State.RUNNING, - LJ.latest_heartbeat < limit_dttm, - )) - .all() - ) - - for ti in tis: - if ti and ti.dag_id in self.dags: - dag = self.dags[ti.dag_id] - if ti.task_id in dag.task_ids: - task = dag.get_task(ti.task_id) - - # now set non db backed vars on ti - ti.task = task + def kill_zombies(self, zombies, session=None): + """ + Fail given zombie tasks, which are tasks that haven't + had a heartbeat for too long, in the current DagBag. + + :param zombies: zombie task instances to kill. + :type zombies: airflow.utils.dag_processing.SimpleTaskInstance + :param session: DB session. + :type session: sqlalchemy.orm.session.Session + """ + for zombie in zombies: + if zombie.dag_id in self.dags: + dag = self.dags[zombie.dag_id] + if zombie.task_id in dag.task_ids: + task = dag.get_task(zombie.task_id) + ti = TaskInstance(task, zombie.execution_date) + # Get properties needed for failure handling from SimpleTaskInstance. + ti.start_date = zombie.start_date + ti.end_date = zombie.end_date + ti.try_number = zombie.try_number + ti.state = zombie.state ti.test_mode = configuration.getboolean('core', 'unit_test_mode') - ti.handle_failure("{} detected as zombie".format(ti), ti.test_mode, ti.get_template_context()) self.log.info( @@ -449,7 +537,7 @@ def bag_dag(self, dag, parent_dag, root_dag): self.bag_dag(subdag, parent_dag=dag, root_dag=root_dag) self.dags[dag.dag_id] = dag - self.log.debug('Loaded DAG {dag}'.format(**locals())) + self.log.debug('Loaded DAG %s', dag) except AirflowDagCycleException as cycle_exception: # There was an error in bagging the dag. Remove it from the list of dags self.log.exception('Exception bagging dag: {dag.dag_id}'.format(**locals())) @@ -464,15 +552,20 @@ def bag_dag(self, dag, parent_dag, root_dag): def collect_dags( self, dag_folder=None, - only_if_updated=True): + only_if_updated=True, + include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'), + safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')): """ Given a file path or a folder, this method looks for python modules, imports them and adds them to the dagbag collection. - Note that if a .airflowignore file is found while processing, - the directory, it will behaves much like a .gitignore does, + Note that if a ``.airflowignore`` file is found while processing + the directory, it will behave much like a ``.gitignore``, ignoring files that match any of the regex patterns specified in the file. + + **Note**: The patterns in .airflowignore are treated as + un-anchored regexes, not shell-like glob patterns. 
""" start_dttm = timezone.utcnow() dag_folder = dag_folder or self.dag_folder @@ -481,42 +574,27 @@ def collect_dags( stats = [] FileLoadStat = namedtuple( 'FileLoadStat', "file duration dag_num task_num dags") - if os.path.isfile(dag_folder): - self.process_file(dag_folder, only_if_updated=only_if_updated) - elif os.path.isdir(dag_folder): - for root, dirs, files in os.walk(dag_folder, followlinks=True): - patterns = [] - ignore_file = os.path.join(root, '.airflowignore') - if os.path.isfile(ignore_file): - with open(ignore_file, 'r') as f: - patterns += [p for p in f.read().split('\n') if p] - for f in files: - try: - filepath = os.path.join(root, f) - if not os.path.isfile(filepath): - continue - mod_name, file_ext = os.path.splitext( - os.path.split(filepath)[-1]) - if file_ext != '.py' and not zipfile.is_zipfile(filepath): - continue - if not any( - [re.findall(p, filepath) for p in patterns]): - ts = timezone.utcnow() - found_dags = self.process_file( - filepath, only_if_updated=only_if_updated) - - td = timezone.utcnow() - ts - td = td.total_seconds() + ( - float(td.microseconds) / 1000000) - stats.append(FileLoadStat( - filepath.replace(dag_folder, ''), - td, - len(found_dags), - sum([len(dag.tasks) for dag in found_dags]), - str([dag.dag_id for dag in found_dags]), - )) - except Exception as e: - self.log.exception(e) + + for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode, + include_examples=include_examples): + try: + ts = timezone.utcnow() + found_dags = self.process_file( + filepath, only_if_updated=only_if_updated, + safe_mode=safe_mode) + + td = timezone.utcnow() - ts + td = td.total_seconds() + ( + float(td.microseconds) / 1000000) + stats.append(FileLoadStat( + filepath.replace(dag_folder, ''), + td, + len(found_dags), + sum([len(dag.tasks) for dag in found_dags]), + str([dag.dag_id for dag in found_dags]), + )) + except Exception as e: + self.log.exception(e) Stats.gauge( 'collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1) Stats.gauge( @@ -546,21 +624,6 @@ def dagbag_report(self): table=pprinttable(stats), ) - @provide_session - def deactivate_inactive_dags(self, session=None): - active_dag_ids = [dag.dag_id for dag in list(self.dags.values())] - for dag in session.query( - DagModel).filter(~DagModel.dag_id.in_(active_dag_ids)).all(): - dag.is_active = False - session.merge(dag) - session.commit() - - @provide_session - def paused_dags(self, session=None): - dag_ids = [dp.dag_id for dp in session.query(DagModel).filter( - DagModel.is_paused.__eq__(True))] - return dag_ids - class User(Base): __tablename__ = "users" @@ -568,7 +631,7 @@ class User(Base): id = Column(Integer, primary_key=True) username = Column(String(ID_LEN), unique=True) email = Column(String(500)) - superuser = False + superuser = Column(Boolean(), default=False) def __repr__(self): return self.username @@ -580,260 +643,6 @@ def is_superuser(self): return self.superuser -class Connection(Base, LoggingMixin): - """ - Placeholder to store information about different database instances - connection information. The idea here is that scripts use references to - database instances (conn_id) instead of hard coding hostname, logins and - passwords when using operators or hooks. 
- """ - __tablename__ = "connection" - - id = Column(Integer(), primary_key=True) - conn_id = Column(String(ID_LEN)) - conn_type = Column(String(500)) - host = Column(String(500)) - schema = Column(String(500)) - login = Column(String(500)) - _password = Column('password', String(5000)) - port = Column(Integer()) - is_encrypted = Column(Boolean, unique=False, default=False) - is_extra_encrypted = Column(Boolean, unique=False, default=False) - _extra = Column('extra', String(5000)) - - _types = [ - ('docker', 'Docker Registry',), - ('fs', 'File (path)'), - ('ftp', 'FTP',), - ('google_cloud_platform', 'Google Cloud Platform'), - ('hdfs', 'HDFS',), - ('http', 'HTTP',), - ('hive_cli', 'Hive Client Wrapper',), - ('hive_metastore', 'Hive Metastore Thrift',), - ('hiveserver2', 'Hive Server 2 Thrift',), - ('jdbc', 'Jdbc Connection',), - ('jenkins', 'Jenkins'), - ('mysql', 'MySQL',), - ('postgres', 'Postgres',), - ('oracle', 'Oracle',), - ('vertica', 'Vertica',), - ('presto', 'Presto',), - ('s3', 'S3',), - ('samba', 'Samba',), - ('sqlite', 'Sqlite',), - ('ssh', 'SSH',), - ('cloudant', 'IBM Cloudant',), - ('mssql', 'Microsoft SQL Server'), - ('mesos_framework-id', 'Mesos Framework ID'), - ('jira', 'JIRA',), - ('redis', 'Redis',), - ('wasb', 'Azure Blob Storage'), - ('databricks', 'Databricks',), - ('aws', 'Amazon Web Services',), - ('emr', 'Elastic MapReduce',), - ('snowflake', 'Snowflake',), - ('segment', 'Segment',), - ('azure_data_lake', 'Azure Data Lake'), - ('cassandra', 'Cassandra',), - ] - - def __init__( - self, conn_id=None, conn_type=None, - host=None, login=None, password=None, - schema=None, port=None, extra=None, - uri=None): - self.conn_id = conn_id - if uri: - self.parse_from_uri(uri) - else: - self.conn_type = conn_type - self.host = host - self.login = login - self.password = password - self.schema = schema - self.port = port - self.extra = extra - - def parse_from_uri(self, uri): - temp_uri = urlparse(uri) - hostname = temp_uri.hostname or '' - if '%2f' in hostname: - hostname = hostname.replace('%2f', '/').replace('%2F', '/') - conn_type = temp_uri.scheme - if conn_type == 'postgresql': - conn_type = 'postgres' - self.conn_type = conn_type - self.host = hostname - self.schema = temp_uri.path[1:] - self.login = temp_uri.username - self.password = temp_uri.password - self.port = temp_uri.port - if temp_uri.query: - self.extra = json.dumps(dict(parse_qsl(temp_uri.query))) - - def get_password(self): - if self._password and self.is_encrypted: - try: - fernet = get_fernet() - except AirflowException: - raise AirflowException( - "Can't decrypt encrypted password for login={}, \ - FERNET_KEY configuration is missing".format(self.login)) - return fernet.decrypt(bytes(self._password, 'utf-8')).decode() - else: - return self._password - - def set_password(self, value): - if value: - try: - fernet = get_fernet() - self._password = fernet.encrypt(bytes(value, 'utf-8')).decode() - self.is_encrypted = True - except AirflowException: - self.log.exception("Failed to load fernet while encrypting value, " - "using non-encrypted value.") - self._password = value - self.is_encrypted = False - - @declared_attr - def password(cls): - return synonym('_password', - descriptor=property(cls.get_password, cls.set_password)) - - def get_extra(self): - if self._extra and self.is_extra_encrypted: - try: - fernet = get_fernet() - except AirflowException: - raise AirflowException( - "Can't decrypt `extra` params for login={},\ - FERNET_KEY configuration is missing".format(self.login)) - return 
fernet.decrypt(bytes(self._extra, 'utf-8')).decode() - else: - return self._extra - - def set_extra(self, value): - if value: - try: - fernet = get_fernet() - self._extra = fernet.encrypt(bytes(value, 'utf-8')).decode() - self.is_extra_encrypted = True - except AirflowException: - self.log.exception("Failed to load fernet while encrypting value, " - "using non-encrypted value.") - self._extra = value - self.is_extra_encrypted = False - else: - self._extra = value - self.is_extra_encrypted = False - - @declared_attr - def extra(cls): - return synonym('_extra', - descriptor=property(cls.get_extra, cls.set_extra)) - - def get_hook(self): - try: - if self.conn_type == 'mysql': - from airflow.hooks.mysql_hook import MySqlHook - return MySqlHook(mysql_conn_id=self.conn_id) - elif self.conn_type == 'google_cloud_platform': - from airflow.contrib.hooks.bigquery_hook import BigQueryHook - return BigQueryHook(bigquery_conn_id=self.conn_id) - elif self.conn_type == 'postgres': - from airflow.hooks.postgres_hook import PostgresHook - return PostgresHook(postgres_conn_id=self.conn_id) - elif self.conn_type == 'hive_cli': - from airflow.hooks.hive_hooks import HiveCliHook - return HiveCliHook(hive_cli_conn_id=self.conn_id) - elif self.conn_type == 'presto': - from airflow.hooks.presto_hook import PrestoHook - return PrestoHook(presto_conn_id=self.conn_id) - elif self.conn_type == 'hiveserver2': - from airflow.hooks.hive_hooks import HiveServer2Hook - return HiveServer2Hook(hiveserver2_conn_id=self.conn_id) - elif self.conn_type == 'sqlite': - from airflow.hooks.sqlite_hook import SqliteHook - return SqliteHook(sqlite_conn_id=self.conn_id) - elif self.conn_type == 'jdbc': - from airflow.hooks.jdbc_hook import JdbcHook - return JdbcHook(jdbc_conn_id=self.conn_id) - elif self.conn_type == 'mssql': - from airflow.hooks.mssql_hook import MsSqlHook - return MsSqlHook(mssql_conn_id=self.conn_id) - elif self.conn_type == 'oracle': - from airflow.hooks.oracle_hook import OracleHook - return OracleHook(oracle_conn_id=self.conn_id) - elif self.conn_type == 'vertica': - from airflow.contrib.hooks.vertica_hook import VerticaHook - return VerticaHook(vertica_conn_id=self.conn_id) - elif self.conn_type == 'cloudant': - from airflow.contrib.hooks.cloudant_hook import CloudantHook - return CloudantHook(cloudant_conn_id=self.conn_id) - elif self.conn_type == 'jira': - from airflow.contrib.hooks.jira_hook import JiraHook - return JiraHook(jira_conn_id=self.conn_id) - elif self.conn_type == 'redis': - from airflow.contrib.hooks.redis_hook import RedisHook - return RedisHook(redis_conn_id=self.conn_id) - elif self.conn_type == 'wasb': - from airflow.contrib.hooks.wasb_hook import WasbHook - return WasbHook(wasb_conn_id=self.conn_id) - elif self.conn_type == 'docker': - from airflow.hooks.docker_hook import DockerHook - return DockerHook(docker_conn_id=self.conn_id) - elif self.conn_type == 'azure_data_lake': - from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook - return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id) - elif self.conn_type == 'cassandra': - from airflow.contrib.hooks.cassandra_hook import CassandraHook - return CassandraHook(cassandra_conn_id=self.conn_id) - except Exception: - pass - - def __repr__(self): - return self.conn_id - - @property - def extra_dejson(self): - """Returns the extra property by deserializing json.""" - obj = {} - if self.extra: - try: - obj = json.loads(self.extra) - except Exception as e: - self.log.exception(e) - self.log.error("Failed parsing the json 
for conn_id %s", self.conn_id) - - return obj - - -class DagPickle(Base): - """ - Dags can originate from different places (user repos, master repo, ...) - and also get executed in different places (different executors). This - object represents a version of a DAG and becomes a source of truth for - a BackfillJob execution. A pickle is a native python serialized object, - and in this case gets stored in the database for the duration of the job. - - The executors pick up the DagPickle id and read the dag definition from - the database. - """ - id = Column(Integer, primary_key=True) - pickle = Column(PickleType(pickler=dill)) - created_dttm = Column(UtcDateTime, default=timezone.utcnow) - pickle_hash = Column(Text) - - __tablename__ = "dag_pickle" - - def __init__(self, dag): - self.dag_id = dag.dag_id - if hasattr(dag, 'template_env'): - dag.template_env = None - self.pickle_hash = hash(dag) - self.pickle = dag - - class TaskInstance(Base, LoggingMixin): """ Task instances store the state of a task instance. This table is the @@ -861,7 +670,7 @@ class TaskInstance(Base, LoggingMixin): max_tries = Column(Integer) hostname = Column(String(1000)) unixname = Column(String(1000)) - job_id = Column(Integer, index=True) + job_id = Column(Integer) pool = Column(String(50)) queue = Column(String(50)) priority_weight = Column(Integer) @@ -872,9 +681,11 @@ class TaskInstance(Base, LoggingMixin): __table_args__ = ( Index('ti_dag_state', dag_id, state), + Index('ti_dag_date', dag_id, execution_date), Index('ti_state', state), Index('ti_state_lkp', dag_id, task_id, execution_date, state), Index('ti_pool', pool, state, priority_weight), + Index('ti_job_id', job_id), ) def __init__(self, task, execution_date, state=None): @@ -921,7 +732,7 @@ def init_on_load(self): @property def try_number(self): """ - Return the try number that this task number will be when it is acutally + Return the try number that this task number will be when it is actually run. If the TI is currently running, this will match the column in the @@ -1046,15 +857,15 @@ def generate_command(dag_id, :type mark_success: bool :param ignore_all_deps: Ignore all ignorable dependencies. Overrides the other ignore_* parameters. - :type ignore_all_deps: boolean + :type ignore_all_deps: bool :param ignore_depends_on_past: Ignore depends_on_past parameter of DAGs (e.g. for Backfills) - :type ignore_depends_on_past: boolean + :type ignore_depends_on_past: bool :param ignore_task_deps: Ignore task-specific dependencies such as depends_on_past and trigger rule - :type ignore_task_deps: boolean + :type ignore_task_deps: bool :param ignore_ti_state: Ignore the task instance's previous failure/success - :type ignore_ti_state: boolean + :type ignore_ti_state: bool :param local: Whether to run the task locally :type local: bool :param pickle_id: If the DAG was serialized to the DB, the ID @@ -1098,10 +909,10 @@ def log_url(self): BASE_URL = configuration.conf.get('webserver', 'BASE_URL') if settings.RBAC: return BASE_URL + ( - "/log/list/" - "?_flt_3_dag_id={self.dag_id}" - "&_flt_3_task_id={self.task_id}" - "&_flt_3_execution_date={iso}" + "/log?" 
+ "execution_date={iso}" + "&task_id={self.task_id}" + "&dag_id={self.dag_id}" ).format(**locals()) else: return BASE_URL + ( @@ -1214,7 +1025,7 @@ def key(self): """ Returns a tuple that identifies the task instance uniquely """ - return self.dag_id, self.task_id, self.execution_date + return self.dag_id, self.task_id, self.execution_date, self.try_number @provide_session def set_state(self, state, session=None): @@ -1257,11 +1068,8 @@ def are_dependents_done(self, session=None): count = ti[0][0] return count == len(task.downstream_task_ids) - @property @provide_session - def previous_ti(self, session=None): - """ The task instance for the task that ran before this task instance """ - + def _get_previous_ti(self, session=None): dag = self.task.dag if dag: dr = self.get_dagrun(session=session) @@ -1287,6 +1095,11 @@ def previous_ti(self, session=None): return None + @property + def previous_ti(self): + """The task instance for the task that ran before this task instance.""" + return self._get_previous_ti() + @provide_session def are_dependencies_met( self, @@ -1302,10 +1115,10 @@ def are_dependencies_met( should be evaluated. :type dep_context: DepContext :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session :param verbose: whether log details on failed dependencies on info or debug log level - :type verbose: boolean + :type verbose: bool """ dep_context = dep_context or DepContext() failed = False @@ -1445,19 +1258,19 @@ def _check_and_change_state_before_execution( executed, in preparation for _run_raw_task :param verbose: whether to turn on more verbose logging - :type verbose: boolean + :type verbose: bool :param ignore_all_deps: Ignore all of the non-critical dependencies, just runs - :type ignore_all_deps: boolean + :type ignore_all_deps: bool :param ignore_depends_on_past: Ignore depends_on_past DAG attribute - :type ignore_depends_on_past: boolean + :type ignore_depends_on_past: bool :param ignore_task_deps: Don't check the dependencies of this TI's task - :type ignore_task_deps: boolean + :type ignore_task_deps: bool :param ignore_ti_state: Disregards previous task instance state - :type ignore_ti_state: boolean + :type ignore_ti_state: bool :param mark_success: Don't run the task, mark its state as success - :type mark_success: boolean + :type mark_success: bool :param test_mode: Doesn't record success or failure in the DB - :type test_mode: boolean + :type test_mode: bool :param pool: specifies the pool to use to run the task instance :type pool: str :return: whether the state was changed to running or not @@ -1488,15 +1301,17 @@ def _check_and_change_state_before_execution( return False # TODO: Logging needs cleanup, not clear what is being printed - hr = "\n" + ("-" * 80) + "\n" # Line break + hr = "\n" + ("-" * 80) # Line break # For reporting purposes, we report based on 1-indexed, # not 0-indexed lists (i.e. Attempt 1 instead of # Attempt 0 for the first attempt). - msg = "Starting attempt {attempt} of {total}".format( - attempt=self.try_number, - total=self.max_tries + 1) + # Set the task start date. 
In case it was re-scheduled use the initial + # start date that is recorded in task_reschedule table self.start_date = timezone.utcnow() + task_reschedules = TaskReschedule.find_for_task_instance(self, session) + if task_reschedules: + self.start_date = task_reschedules[0].start_date dep_context = DepContext( deps=RUN_DEPS - QUEUE_DEPS, @@ -1514,11 +1329,12 @@ def _check_and_change_state_before_execution( # have been running prematurely. This should be handled in the # scheduling mechanism. self.state = State.NONE - msg = ("FIXME: Rescheduling due to concurrency limits reached at task " - "runtime. Attempt {attempt} of {total}. State set to NONE.").format( - attempt=self.try_number, - total=self.max_tries + 1) - self.log.warning(hr + msg + hr) + self.log.warning(hr) + self.log.warning( + "FIXME: Rescheduling due to concurrency limits reached at task runtime. Attempt %s of " + "%s. State set to NONE.", self.try_number, self.max_tries + 1 + ) + self.log.warning(hr) self.queued_dttm = timezone.utcnow() self.log.info("Queuing into pool %s", self.pool) @@ -1529,13 +1345,14 @@ def _check_and_change_state_before_execution( # Another worker might have started running this task instance while # the current worker process was blocked on refresh_from_db if self.state == State.RUNNING: - msg = "Task Instance already running {}".format(self) - self.log.warning(msg) + self.log.warning("Task Instance already running %s", self) session.commit() return False # print status message - self.log.info(hr + msg + hr) + self.log.info(hr) + self.log.info("Starting attempt %s of %s", self.try_number, self.max_tries + 1) + self.log.info(hr) self._try_number += 1 if not test_mode: @@ -1552,12 +1369,9 @@ def _check_and_change_state_before_execution( settings.engine.dispose() if verbose: if mark_success: - msg = "Marking success for {} on {}".format(self.task, - self.execution_date) - self.log.info(msg) + self.log.info("Marking success for %s on %s", self.task, self.execution_date) else: - msg = "Executing {} on {}".format(self.task, self.execution_date) - self.log.info(msg) + self.log.info("Executing %s on %s", self.task, self.execution_date) return True @provide_session @@ -1575,9 +1389,9 @@ def _run_raw_task( only after another function changes the state to running. 
:param mark_success: Don't run the task, mark its state as success - :type mark_success: boolean + :type mark_success: bool :param test_mode: Doesn't record success or failure in the DB - :type test_mode: boolean + :type test_mode: bool :param pool: specifies the pool to use to run the task instance :type pool: str """ @@ -1590,6 +1404,7 @@ def _run_raw_task( self.operator = task.__class__.__name__ context = {} + actual_start_date = timezone.utcnow() try: if not mark_success: context = self.get_template_context() @@ -1606,6 +1421,8 @@ def signal_handler(signum, frame): # Don't clear Xcom until the task is certain to execute self.clear_xcom_data() + start_time = time.time() + self.render_templates() task_copy.pre_execute(context=context) @@ -1643,6 +1460,14 @@ def signal_handler(signum, frame): else: raise + end_time = time.time() + duration = end_time - start_time + Stats.timing( + 'dag.{dag_id}.{task_id}.duration'.format( + dag_id=task_copy.dag_id, + task_id=task_copy.task_id), + duration) + Stats.incr('operator_successes_{}'.format( self.task.__class__.__name__), 1, 1) Stats.incr('ti_successes') @@ -1651,11 +1476,15 @@ def signal_handler(signum, frame): except AirflowSkipException: self.refresh_from_db(lock_for_update=True) self.state = State.SKIPPED + except AirflowRescheduleException as reschedule_exception: + self.refresh_from_db() + self._handle_reschedule(actual_start_date, reschedule_exception, test_mode, context) + return except AirflowException as e: self.refresh_from_db() - # for case when task is marked as success externally + # for case when task is marked as success/failed externally # current behavior doesn't hit the success callback - if self.state == State.SUCCESS: + if self.state in {State.SUCCESS, State.FAILED}: return else: self.handle_failure(e, test_mode, context) @@ -1664,14 +1493,6 @@ def signal_handler(signum, frame): self.handle_failure(e, test_mode, context) raise - # Recording SUCCESS - self.end_date = timezone.utcnow() - self.set_duration() - if not test_mode: - session.add(Log(self.state, self)) - session.merge(self) - session.commit() - # Success callback try: if task.on_success_callback: @@ -1680,6 +1501,12 @@ def signal_handler(signum, frame): self.log.error("Failed when executing success callback") self.log.exception(e3) + # Recording SUCCESS + self.end_date = timezone.utcnow() + self.set_duration() + if not test_mode: + session.add(Log(self.state, self)) + session.merge(self) session.commit() @provide_session @@ -1722,6 +1549,32 @@ def dry_run(self): self.render_templates() task_copy.dry_run() + @provide_session + def _handle_reschedule(self, actual_start_date, reschedule_exception, test_mode=False, context=None, + session=None): + # Don't record reschedule request in test mode + if test_mode: + return + + self.end_date = timezone.utcnow() + self.set_duration() + + # Log reschedule request + session.add(TaskReschedule(self.task, self.execution_date, self._try_number, + actual_start_date, self.end_date, + reschedule_exception.reschedule_date)) + + # set state + self.state = State.UP_FOR_RESCHEDULE + + # Decrement try_number so subsequent runs will use the same try number and write + # to same log file. 
+ self._try_number -= 1 + + session.merge(self) + session.commit() + self.log.info('Rescheduling task, marking task as UP_FOR_RESCHEDULE') + @provide_session def handle_failure(self, error, test_mode=False, context=None, session=None): self.log.exception(error) @@ -1736,6 +1589,9 @@ def handle_failure(self, error, test_mode=False, context=None, session=None): # Log failure duration session.add(TaskFail(task, self.execution_date, self.start_date, self.end_date)) + if context is not None: + context['exception'] = error + # Let's go deeper try: # Since this function is called only when the TI state is running, @@ -1746,7 +1602,7 @@ def handle_failure(self, error, test_mode=False, context=None, session=None): self.state = State.UP_FOR_RETRY self.log.info('Marking task as UP_FOR_RETRY') if task.email_on_retry and task.email: - self.email_alert(error, is_retry=True) + self.email_alert(error) else: self.state = State.FAILED if task.retries: @@ -1754,7 +1610,7 @@ def handle_failure(self, error, test_mode=False, context=None, session=None): else: self.log.info('Marking task as FAILED.') if task.email_on_failure and task.email: - self.email_alert(error, is_retry=False) + self.email_alert(error) except Exception as e2: self.log.error('Failed to send email to: %s', task.email) self.log.exception(e2) @@ -1785,47 +1641,60 @@ def get_template_context(self, session=None): if 'tables' in task.params: tables = task.params['tables'] + params = {} + run_id = '' + dag_run = None + if hasattr(task, 'dag'): + if task.dag.params: + params.update(task.dag.params) + dag_run = ( + session.query(DagRun) + .filter_by( + dag_id=task.dag.dag_id, + execution_date=self.execution_date) + .first() + ) + run_id = dag_run.run_id if dag_run else None + session.expunge_all() + session.commit() + ds = self.execution_date.strftime('%Y-%m-%d') ts = self.execution_date.isoformat() yesterday_ds = (self.execution_date - timedelta(1)).strftime('%Y-%m-%d') tomorrow_ds = (self.execution_date + timedelta(1)).strftime('%Y-%m-%d') - prev_execution_date = task.dag.previous_schedule(self.execution_date) - next_execution_date = task.dag.following_schedule(self.execution_date) + # For manually triggered dagruns that aren't run on a schedule, next/previous + # schedule dates don't make sense, and should be set to execution date for + # consistency with how execution_date is set for manually triggered tasks, i.e. + # triggered_date == execution_date. 
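+ # For example (illustrative timestamp): a @daily DAG triggered manually with execution_date + # 2018-01-01T03:00 gets prev_execution_date == next_execution_date == 2018-01-01T03:00 below, + # rather than the neighbouring schedule boundaries.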
+ if dag_run and dag_run.external_trigger: + prev_execution_date = self.execution_date + next_execution_date = self.execution_date + else: + prev_execution_date = task.dag.previous_schedule(self.execution_date) + next_execution_date = task.dag.following_schedule(self.execution_date) next_ds = None + next_ds_nodash = None if next_execution_date: next_ds = next_execution_date.strftime('%Y-%m-%d') + next_ds_nodash = next_ds.replace('-', '') prev_ds = None + prev_ds_nodash = None if prev_execution_date: prev_ds = prev_execution_date.strftime('%Y-%m-%d') + prev_ds_nodash = prev_ds.replace('-', '') ds_nodash = ds.replace('-', '') - ts_nodash = ts.replace('-', '').replace(':', '') + ts_nodash = self.execution_date.strftime('%Y%m%dT%H%M%S') + ts_nodash_with_tz = ts.replace('-', '').replace(':', '') yesterday_ds_nodash = yesterday_ds.replace('-', '') tomorrow_ds_nodash = tomorrow_ds.replace('-', '') ti_key_str = "{task.dag_id}__{task.task_id}__{ds_nodash}" ti_key_str = ti_key_str.format(**locals()) - params = {} - run_id = '' - dag_run = None - if hasattr(task, 'dag'): - if task.dag.params: - params.update(task.dag.params) - dag_run = ( - session.query(DagRun) - .filter_by( - dag_id=task.dag.dag_id, - execution_date=self.execution_date) - .first() - ) - run_id = dag_run.run_id if dag_run else None - session.expunge_all() - session.commit() - if task.params: params.update(task.params) @@ -1866,10 +1735,13 @@ def __repr__(self): 'dag': task.dag, 'ds': ds, 'next_ds': next_ds, + 'next_ds_nodash': next_ds_nodash, 'prev_ds': prev_ds, + 'prev_ds_nodash': prev_ds_nodash, 'ds_nodash': ds_nodash, 'ts': ts, 'ts_nodash': ts_nodash, + 'ts_nodash_with_tz': ts_nodash_with_tz, 'yesterday_ds': yesterday_ds, 'yesterday_ds_nodash': yesterday_ds_nodash, 'tomorrow_ds': tomorrow_ds, @@ -1918,22 +1790,43 @@ def render_templates(self): rendered_content = rt(attr, content, jinja_context) setattr(task, attr, rendered_content) - def email_alert(self, exception, is_retry=False): - task = self.task - title = "Airflow alert: {self}".format(**locals()) - exception = str(exception).replace('\n', '
<br>') + def email_alert(self, exception): + exception_html = str(exception).replace('\n', '<br>') + jinja_context = self.get_template_context() + # This function is called after changing the state + # from State.RUNNING so need to subtract 1 from self.try_number. + jinja_context.update(dict( + exception=exception, + exception_html=exception_html, + try_number=self.try_number - 1, + max_tries=self.max_tries)) + + jinja_env = self.task.get_template_env() + + default_subject = 'Airflow alert: {{ti}}' # For reporting purposes, we report based on 1-indexed, # not 0-indexed lists (i.e. Try 1 instead of # Try 0 for the first attempt). - body = ( - "Try {try_number} out of {max_tries}<br>" - "Exception:<br>{exception}<br>" - "Log: <a href='{self.log_url}'>Link</a><br>" - "Host: {self.hostname}<br>" - "Log file: {self.log_filepath}<br>" - "Mark success: <a href='{self.mark_success_url}'>Link</a><br>" - ).format(try_number=self.try_number, max_tries=self.max_tries + 1, **locals()) - send_email(task.email, title, body) + default_html_content = ( + 'Try {{try_number}} out of {{max_tries + 1}}<br>' + 'Exception:<br>{{exception_html}}<br>' + 'Log: <a href="{{ti.log_url}}">Link</a><br>' + 'Host: {{ti.hostname}}<br>' + 'Log file: {{ti.log_filepath}}<br>' + 'Mark success: <a href="{{ti.mark_success_url}}">Link</a><br>
    ' + ) + + def render(key, content): + if configuration.has_option('email', key): + path = configuration.get('email', key) + with open(path) as f: + content = f.read() + + return jinja_env.from_string(content).render(**jinja_context) + + subject = render('subject_template', default_subject) + html_content = render('html_content_template', default_html_content) + send_email(self.task.email, subject, html_content) def set_duration(self): if self.end_date and self.start_date: @@ -1950,7 +1843,7 @@ def xcom_push( Make an XCom available for tasks to pull. :param key: A key for the XCom - :type key: string + :type key: str :param value: A value for the XCom. The value is pickled and stored in the database. :type value: any pickleable object @@ -1996,13 +1889,13 @@ def xcom_pull( available as a constant XCOM_RETURN_KEY. This key is automatically given to XComs returned by tasks (as opposed to being pushed manually). To remove the filter, pass key=None. - :type key: string + :type key: str :param task_ids: Only XComs from tasks with matching ids will be pulled. Can pass None to remove the filter. - :type task_ids: string or iterable of strings (representing task_ids) + :type task_ids: str or iterable of strings (representing task_ids) :param dag_id: If provided, only pulls XComs from this DAG. If None (default), the DAG of the calling task is used. - :type dag_id: string + :type dag_id: str :param include_prior_dates: If False, only XComs from the current execution_date are returned. If True, XComs from previous dates are returned as well. @@ -2041,114 +1934,6 @@ def init_run_context(self, raw=False): self._set_context(self) -class TaskFail(Base): - """ - TaskFail tracks the failed run durations of each task instance. - """ - - __tablename__ = "task_fail" - - task_id = Column(String(ID_LEN), primary_key=True) - dag_id = Column(String(ID_LEN), primary_key=True) - execution_date = Column(UtcDateTime, primary_key=True) - start_date = Column(UtcDateTime) - end_date = Column(UtcDateTime) - duration = Column(Float) - - def __init__(self, task, execution_date, start_date, end_date): - self.dag_id = task.dag_id - self.task_id = task.task_id - self.execution_date = execution_date - self.start_date = start_date - self.end_date = end_date - if self.end_date and self.start_date: - self.duration = (self.end_date - self.start_date).total_seconds() - else: - self.duration = None - - -class Log(Base): - """ - Used to actively log events to the database - """ - - __tablename__ = "log" - - id = Column(Integer, primary_key=True) - dttm = Column(UtcDateTime) - dag_id = Column(String(ID_LEN)) - task_id = Column(String(ID_LEN)) - event = Column(String(30)) - execution_date = Column(UtcDateTime) - owner = Column(String(500)) - extra = Column(Text) - - def __init__(self, event, task_instance, owner=None, extra=None, **kwargs): - self.dttm = timezone.utcnow() - self.event = event - self.extra = extra - - task_owner = None - - if task_instance: - self.dag_id = task_instance.dag_id - self.task_id = task_instance.task_id - self.execution_date = task_instance.execution_date - task_owner = task_instance.task.owner - - if 'task_id' in kwargs: - self.task_id = kwargs['task_id'] - if 'dag_id' in kwargs: - self.dag_id = kwargs['dag_id'] - if 'execution_date' in kwargs: - if kwargs['execution_date']: - self.execution_date = kwargs['execution_date'] - - self.owner = owner or task_owner - - -class SkipMixin(LoggingMixin): - @provide_session - def skip(self, dag_run, execution_date, tasks, session=None): - """ - Sets tasks 
instances to skipped from the same dag run. - - :param dag_run: the DagRun for which to set the tasks to skipped - :param execution_date: execution_date - :param tasks: tasks to skip (not task_ids) - :param session: db session to use - """ - if not tasks: - return - - task_ids = [d.task_id for d in tasks] - now = timezone.utcnow() - - if dag_run: - session.query(TaskInstance).filter( - TaskInstance.dag_id == dag_run.dag_id, - TaskInstance.execution_date == dag_run.execution_date, - TaskInstance.task_id.in_(task_ids) - ).update({TaskInstance.state: State.SKIPPED, - TaskInstance.start_date: now, - TaskInstance.end_date: now}, - synchronize_session=False) - session.commit() - else: - assert execution_date is not None, "Execution date is None and no dag run" - - self.log.warning("No DAG RUN present this should not happen") - # this is defensive against dag runs that are not complete - for task in tasks: - ti = TaskInstance(task, execution_date=execution_date) - ti.state = State.SKIPPED - ti.start_date = now - ti.end_date = now - session.merge(ti) - - session.commit() - - @functools.total_ordering class BaseOperator(LoggingMixin): """ @@ -2171,20 +1956,20 @@ class derived from this one results in the creation of a task object, be set by using the set_upstream and/or set_downstream methods. :param task_id: a unique, meaningful id for the task - :type task_id: string + :type task_id: str :param owner: the owner of the task, using the unix username is recommended - :type owner: string + :type owner: str :param retries: the number of retries that should be performed before failing the task :type retries: int :param retry_delay: delay between retries - :type retry_delay: timedelta + :type retry_delay: datetime.timedelta :param retry_exponential_backoff: allow progressive longer waits between retries by using exponential backoff algorithm on retry delay (delay will be converted into seconds) :type retry_exponential_backoff: bool :param max_retry_delay: maximum delay interval between retries - :type max_retry_delay: timedelta + :type max_retry_delay: datetime.timedelta :param start_date: The ``start_date`` for the task, determines the ``execution_date`` for the first task instance. The best practice is to have the start_date rounded @@ -2201,9 +1986,9 @@ class derived from this one results in the creation of a task object, ``TimeSensor`` and ``TimeDeltaSensor``. We advise against using dynamic ``start_date`` and recommend using fixed ones. Read the FAQ entry about start_date for more information. - :type start_date: datetime + :type start_date: datetime.datetime :param end_date: if specified, the scheduler won't go beyond this date - :type end_date: datetime + :type end_date: datetime.datetime :param depends_on_past: when set to true, task instances will run sequentially while relying on the previous task's schedule to succeed. The task instance for the start_date is allowed to run. @@ -2220,7 +2005,7 @@ class derived from this one results in the creation of a task object, does support targeting specific queues. :type queue: str :param dag: a reference to the dag the task is attached to (if any) - :type dag: DAG + :type dag: airflow.models.DAG :param priority_weight: priority weight of this task against other task. This allows the executor to trigger higher priority tasks before others when things get backed up. 
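A minimal sketch of how the arguments documented above are typically wired up, either per task or via ``default_args`` (the dag id, dates and retry values here are illustrative, not taken from this patch)::

    from datetime import datetime, timedelta

    from airflow import DAG
    from airflow.operators.dummy_operator import DummyOperator

    dag = DAG(
        dag_id='example_defaults',
        start_date=datetime(2018, 1, 1),   # rounded to the daily schedule
        schedule_interval='@daily',
        default_args={
            'owner': 'airflow',
            'retries': 2,
            'retry_delay': timedelta(minutes=5),
        },
    )
    noop = DummyOperator(task_id='noop', dag=dag, priority_weight=10)

Keeping ``start_date`` rounded to the schedule (midnight for a daily DAG) makes the first ``execution_date`` predictable, in line with the guidance above.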
@@ -2274,13 +2059,14 @@ class derived from this one results in the creation of a task object, :type on_failure_callback: callable :param on_retry_callback: much like the ``on_failure_callback`` except that it is executed when retries occur. + :type on_retry_callback: callable :param on_success_callback: much like the ``on_failure_callback`` except that it is executed when the task succeeds. :type on_success_callback: callable :param trigger_rule: defines the rule by which dependencies are applied for the task to get triggered. Options are: ``{ all_success | all_failed | all_done | one_success | - one_failed | dummy}`` + one_failed | none_failed | none_skipped | dummy}`` default is ``all_success``. Options can be set as string or using the constants defined in the static class ``airflow.utils.TriggerRule`` @@ -2296,25 +2082,37 @@ class derived from this one results in the creation of a task object, :param executor_config: Additional task-level configuration parameters that are interpreted by a specific executor. Parameters are namespaced by the name of executor. - ``example: to run this task in a specific docker container through - the KubernetesExecutor - MyOperator(..., - executor_config={ - "KubernetesExecutor": - {"image": "myCustomDockerImage"} - } - )`` + + **Example**: to run this task in a specific docker container through + the KubernetesExecutor :: + + MyOperator(..., + executor_config={ + "KubernetesExecutor": + {"image": "myCustomDockerImage"} + } + ) + :type executor_config: dict """ # For derived classes to define which fields will get jinjaified - template_fields = [] + template_fields = [] # type: Iterable[str] # Defines which files extensions to look for in the templated fields - template_ext = [] + template_ext = [] # type: Iterable[str] # Defines the color in the UI ui_color = '#fff' ui_fgcolor = '#000' + # base list which includes all the attrs that don't need deep copy. + _base_operator_shallow_copy_attrs = ('user_defined_macros', + 'user_defined_filters', + 'params', + '_log',) + + # each operator should override this class attr for shallow copy attrs. + shallow_copy_attrs = () # type: Iterable[str] + @apply_defaults def __init__( self, @@ -2335,7 +2133,6 @@ def __init__( dag=None, params=None, default_args=None, - adhoc=False, priority_weight=1, weight_rule=WeightRule.DOWNSTREAM, queue=configuration.conf.get('celery', 'default_queue'), @@ -2358,29 +2155,36 @@ def __init__( if args or kwargs: # TODO remove *args and **kwargs in Airflow 2.0 warnings.warn( - 'Invalid arguments were passed to {c}. Support for ' - 'passing such arguments will be dropped in Airflow 2.0. ' - 'Invalid arguments were:' + 'Invalid arguments were passed to {c} (task_id: {t}). ' + 'Support for passing such arguments will be dropped in ' + 'Airflow 2.0. 
Invalid arguments were:' '\n*args: {a}\n**kwargs: {k}'.format( - c=self.__class__.__name__, a=args, k=kwargs), - category=PendingDeprecationWarning + c=self.__class__.__name__, a=args, k=kwargs, t=task_id), + category=PendingDeprecationWarning, + stacklevel=3 ) - validate_key(task_id) self.task_id = task_id self.owner = owner self.email = email self.email_on_retry = email_on_retry self.email_on_failure = email_on_failure + self.start_date = start_date if start_date and not isinstance(start_date, datetime): self.log.warning("start_date for %s isn't datetime.datetime", self) + elif start_date: + self.start_date = timezone.convert_to_utc(start_date) + self.end_date = end_date + if end_date: + self.end_date = timezone.convert_to_utc(end_date) + if not TriggerRule.is_valid(trigger_rule): raise AirflowException( "The trigger_rule must be one of {all_triggers}," "'{d}.{t}'; received '{tr}'." - .format(all_triggers=TriggerRule.all_triggers, + .format(all_triggers=TriggerRule.all_triggers(), d=dag.dag_id if dag else "", t=task_id, tr=trigger_rule)) self.trigger_rule = trigger_rule @@ -2413,7 +2217,6 @@ def __init__( self.retry_exponential_backoff = retry_exponential_backoff self.max_retry_delay = max_retry_delay self.params = params or {} # Available in templates! - self.adhoc = adhoc self.priority_weight = priority_weight if not WeightRule.is_valid(weight_rule): raise AirflowException( @@ -2473,7 +2276,6 @@ def __init__( 'schedule_interval', 'depends_on_past', 'wait_for_downstream', - 'adhoc', 'priority_weight', 'sla', 'execution_timeout', @@ -2483,10 +2285,10 @@ def __init__( } def __eq__(self, other): - return ( - type(self) == type(other) and - all(self.__dict__.get(c, None) == other.__dict__.get(c, None) - for c in self._comps)) + if (type(self) == type(other) and + self.task_id == other.task_id): + return all(self.__dict__.get(c, None) == other.__dict__.get(c, None) for c in self._comps) + return False def __ne__(self, other): return not self == other @@ -2681,17 +2483,13 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result + shallow_copy = cls.shallow_copy_attrs + cls._base_operator_shallow_copy_attrs + for k, v in list(self.__dict__.items()): - if k not in ('user_defined_macros', 'user_defined_filters', - 'params', '_log'): + if k not in shallow_copy: setattr(result, k, copy.deepcopy(v, memo)) - result.params = self.params - if hasattr(self, 'user_defined_macros'): - result.user_defined_macros = self.user_defined_macros - if hasattr(self, 'user_defined_filters'): - result.user_defined_filters = self.user_defined_filters - if hasattr(self, '_log'): - result._log = self._log + else: + setattr(result, k, copy.copy(v)) return result def __getstate__(self): @@ -2709,25 +2507,19 @@ def render_template_from_field(self, attr, content, context, jinja_env): Renders a template from a field. If the field is a string, it will simply render the string and return the result. If it is a collection or nested set of collections, it will traverse the structure and render - all strings in it. + all elements in it. If the field has another type, it will return it as it is. 
""" rt = self.render_template if isinstance(content, six.string_types): result = jinja_env.from_string(content).render(**context) elif isinstance(content, (list, tuple)): result = [rt(attr, e, context) for e in content] - elif isinstance(content, numbers.Number): - result = content elif isinstance(content, dict): result = { k: rt("{}[{}]".format(attr, k), v, context) for k, v in list(content.items())} else: - param_type = type(content) - msg = ( - "Type '{param_type}' used for parameter '{attr}' is " - "not supported for templating").format(**locals()) - raise AirflowException(msg) + result = content return result def render_template(self, attr, content, context): @@ -2735,9 +2527,7 @@ def render_template(self, attr, content, context): Renders a template either from a file or directly in a field, and returns the rendered result. """ - jinja_env = self.dag.get_template_env() \ - if hasattr(self, 'dag') \ - else jinja2.Environment(cache_size=0) + jinja_env = self.get_template_env() exts = self.__class__.template_ext if ( @@ -2747,6 +2537,11 @@ def render_template(self, attr, content, context): else: return self.render_template_from_field(attr, content, context, jinja_env) + def get_template_env(self): + return self.dag.get_template_env() \ + if hasattr(self, 'dag') \ + else jinja2.Environment(cache_size=0) + def prepare_template(self): """ Hook that is triggered after the templated fields get replaced @@ -2760,14 +2555,24 @@ def resolve_template_files(self): # Getting the content of files for template_field / template_ext for attr in self.template_fields: content = getattr(self, attr) - if content is not None and \ - isinstance(content, six.string_types) and \ + if content is None: + continue + elif isinstance(content, six.string_types) and \ any([content.endswith(ext) for ext in self.template_ext]): - env = self.dag.get_template_env() + env = self.get_template_env() try: setattr(self, attr, env.loader.get_source(env, content)[0]) except Exception as e: self.log.exception(e) + elif isinstance(content, list): + env = self.dag.get_template_env() + for i in range(len(content)): + if isinstance(content[i], six.string_types) and \ + any([content[i].endswith(ext) for ext in self.template_ext]): + try: + content[i] = env.loader.get_source(env, content[i])[0] + except Exception as e: + self.log.exception(e) self.prepare_template() @property @@ -2924,7 +2729,7 @@ def task_type(self): def add_only_new(self, item_set, item): if item in item_set: - raise AirflowException( + self.log.warning( 'Dependency {self}, {item} already registered' ''.format(**locals())) else: @@ -3048,15 +2853,83 @@ class DagModel(Base): fileloc = Column(String(2000)) # String representing the owners owners = Column(String(2000)) + # Description of the dag + description = Column(Text) + # Default view of the inside the webserver + default_view = Column(String(25)) + # Schedule interval + schedule_interval = Column(Interval) def __repr__(self): return "".format(self=self) + @property + def timezone(self): + return settings.TIMEZONE + + @staticmethod + @provide_session + def get_dagmodel(dag_id, session=None): + return session.query(DagModel).filter(DagModel.dag_id == dag_id).first() + @classmethod @provide_session def get_current(cls, dag_id, session=None): return session.query(cls).filter(cls.dag_id == dag_id).first() + def get_default_view(self): + if self.default_view is None: + return configuration.conf.get('webserver', 'dag_default_view').lower() + else: + return self.default_view + + @provide_session + def 
get_last_dagrun(self, session=None, include_externally_triggered=False): + return get_last_dagrun(self.dag_id, session=session, + include_externally_triggered=include_externally_triggered) + + @property + def safe_dag_id(self): + return self.dag_id.replace('.', '__dot__') + + def get_dag(self): + return DagBag(dag_folder=self.fileloc).get_dag(self.dag_id) + + @provide_session + def create_dagrun(self, + run_id, + state, + execution_date, + start_date=None, + external_trigger=False, + conf=None, + session=None): + """ + Creates a dag run from this dag including the tasks associated with this dag. + Returns the dag run. + + :param run_id: defines the the run id for this dag run + :type run_id: str + :param execution_date: the execution date of this dag run + :type execution_date: datetime.datetime + :param state: the state of the dag run + :type state: airflow.utils.state.State + :param start_date: the date this dag run should be evaluated + :type start_date: datetime.datetime + :param external_trigger: whether this dag run is externally triggered + :type external_trigger: bool + :param session: database session + :type session: sqlalchemy.orm.session.Session + """ + + return self.get_dag().create_dagrun(run_id=run_id, + state=state, + execution_date=execution_date, + start_date=start_date, + external_trigger=external_trigger, + conf=conf, + session=session) + @functools.total_ordering class DAG(BaseDag, LoggingMixin): @@ -3072,9 +2945,9 @@ class DAG(BaseDag, LoggingMixin): added once to a DAG. :param dag_id: The id of the DAG - :type dag_id: string + :type dag_id: str :param description: The description for the DAG to e.g. be shown on the webserver - :type description: string + :type description: str :param schedule_interval: Defines how often that DAG runs, this timedelta object gets added to your latest task instance's execution_date to figure out the next schedule @@ -3091,7 +2964,7 @@ class DAG(BaseDag, LoggingMixin): defines where jinja will look for your templates. Order matters. Note that jinja/airflow includes the path of your DAG file by default - :type template_searchpath: string or list of stings + :type template_searchpath: str or list[str] :param user_defined_macros: a dictionary of macros that will be exposed in your jinja templates. For example, passing ``dict(foo='bar')`` to this argument allows you to ``{{ foo }}`` in all jinja @@ -3130,9 +3003,9 @@ class DAG(BaseDag, LoggingMixin): :type sla_miss_callback: types.FunctionType :param default_view: Specify DAG default view (tree, graph, duration, gantt, landing_times) - :type default_view: string + :type default_view: str :param orientation: Specify DAG orientation in graph view (LR, TB, RL, BT) - :type orientation: string + :type orientation: str :param catchup: Perform scheduler catchup (or only run latest)? Defaults to True :type catchup: bool :param on_failure_callback: A function to be called when a DagRun of this dag fails. 
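# --- Illustrative example (not part of the patch) ---------------------------
# The DAG-level arguments documented above fit together as shown below. This
# is a sketch only: the dag_id, search path, macro name and callback are
# invented, not taken from the patch.
from datetime import datetime

from airflow import DAG


def _notify_failure(context):
    # the callback receives the task context for the failed DagRun
    print('DAG run failed: {}'.format(context))


dag = DAG(
    dag_id='templated_reporting',                     # hypothetical id
    description='Nightly reporting DAG',
    start_date=datetime(2019, 1, 1),
    schedule_interval='0 3 * * *',                    # str (cron) or timedelta
    template_searchpath=['/opt/airflow/sql'],         # str or list[str]
    user_defined_macros={'env': 'staging'},           # usable as {{ env }}
    default_view='graph',                             # None falls back to webserver config
    catchup=False,
    on_failure_callback=_notify_failure,
)
# -----------------------------------------------------------------------------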
@@ -3158,10 +3031,11 @@ def __init__( 'core', 'max_active_runs_per_dag'), dagrun_timeout=None, sla_miss_callback=None, - default_view=configuration.conf.get('webserver', 'dag_default_view').lower(), + default_view=None, orientation=configuration.conf.get('webserver', 'dag_orientation'), catchup=configuration.conf.getboolean('scheduler', 'catchup_by_default'), on_success_callback=None, on_failure_callback=None, + doc_md=None, params=None): self.user_defined_macros = user_defined_macros @@ -3196,7 +3070,8 @@ def __init__( timezone.parse(self.default_args['start_date']) ) self.timezone = self.default_args['start_date'].tzinfo - else: + + if not hasattr(self, 'timezone') or not self.timezone: self.timezone = settings.TIMEZONE self.start_date = timezone.convert_to_utc(start_date) @@ -3213,7 +3088,7 @@ def __init__( ) self.schedule_interval = schedule_interval - if schedule_interval in cron_presets: + if isinstance(schedule_interval, Hashable) and schedule_interval in cron_presets: self._schedule_interval = cron_presets.get(schedule_interval) elif schedule_interval == '@once': self._schedule_interval = None @@ -3228,7 +3103,7 @@ def __init__( self.max_active_runs = max_active_runs self.dagrun_timeout = dagrun_timeout self.sla_miss_callback = sla_miss_callback - self.default_view = default_view + self._default_view = default_view self.orientation = orientation self.catchup = catchup self.is_subdag = False # DagBag.bag_dag() will set this to True if appropriate @@ -3236,6 +3111,9 @@ def __init__( self.partial = False self.on_success_callback = on_success_callback self.on_failure_callback = on_failure_callback + self.doc_md = doc_md + + self._old_context_manager_dags = [] self._comps = { 'dag_id', @@ -3252,12 +3130,13 @@ def __repr__(self): return "".format(self=self) def __eq__(self, other): - return ( - type(self) == type(other) and + if (type(self) == type(other) and + self.dag_id == other.dag_id): + # Use getattr() instead of __dict__ as __dict__ doesn't return # correct values for properties. - all(getattr(self, c, None) == getattr(other, c, None) - for c in self._comps)) + return all(getattr(self, c, None) == getattr(other, c, None) for c in self._comps) + return False def __ne__(self, other): return not self == other @@ -3284,16 +3163,23 @@ def __hash__(self): def __enter__(self): global _CONTEXT_MANAGER_DAG - self._old_context_manager_dag = _CONTEXT_MANAGER_DAG + self._old_context_manager_dags.append(_CONTEXT_MANAGER_DAG) _CONTEXT_MANAGER_DAG = self return self def __exit__(self, _type, _value, _tb): global _CONTEXT_MANAGER_DAG - _CONTEXT_MANAGER_DAG = self._old_context_manager_dag + _CONTEXT_MANAGER_DAG = self._old_context_manager_dags.pop() # /Context Manager ---------------------------------------------- + def get_default_view(self): + """This is only there for backward compatible jinja2 templates""" + if self._default_view is None: + return configuration.conf.get('webserver', 'dag_default_view').lower() + else: + return self._default_view + def date_range(self, start_date, num=None, end_date=timezone.utcnow()): if num: end_date = None @@ -3301,34 +3187,77 @@ def date_range(self, start_date, num=None, end_date=timezone.utcnow()): start_date=start_date, end_date=end_date, num=num, delta=self._schedule_interval) + def is_fixed_time_schedule(self): + """ + Figures out if the DAG schedule has a fixed time (e.g. 3 AM). + + :return: True if the schedule has a fixed time, False if not. 
+ """ + now = datetime.now() + cron = croniter(self._schedule_interval, now) + + start = cron.get_next(datetime) + cron_next = cron.get_next(datetime) + + if cron_next.minute == start.minute and cron_next.hour == start.hour: + return True + + return False + def following_schedule(self, dttm): """ - Calculates the following schedule for this dag in local time + Calculates the following schedule for this dag in UTC. :param dttm: utc datetime :return: utc datetime """ if isinstance(self._schedule_interval, six.string_types): - dttm = timezone.make_naive(dttm, self.timezone) - cron = croniter(self._schedule_interval, dttm) - following = timezone.make_aware(cron.get_next(datetime), self.timezone) + # we don't want to rely on the transitions created by + # croniter as they are not always correct + dttm = pendulum.instance(dttm) + naive = timezone.make_naive(dttm, self.timezone) + cron = croniter(self._schedule_interval, naive) + + # We assume that DST transitions happen on the minute/hour + if not self.is_fixed_time_schedule(): + # relative offset (eg. every 5 minutes) + delta = cron.get_next(datetime) - naive + following = dttm.in_timezone(self.timezone).add_timedelta(delta) + else: + # absolute (e.g. 3 AM) + naive = cron.get_next(datetime) + tz = pendulum.timezone(self.timezone.name) + following = timezone.make_aware(naive, tz) return timezone.convert_to_utc(following) - elif isinstance(self._schedule_interval, timedelta): + elif self._schedule_interval is not None: return dttm + self._schedule_interval def previous_schedule(self, dttm): """ - Calculates the previous schedule for this dag in local time + Calculates the previous schedule for this dag in UTC :param dttm: utc datetime :return: utc datetime """ if isinstance(self._schedule_interval, six.string_types): - dttm = timezone.make_naive(dttm, self.timezone) - cron = croniter(self._schedule_interval, dttm) - prev = timezone.make_aware(cron.get_prev(datetime), self.timezone) - return timezone.convert_to_utc(prev) - elif isinstance(self._schedule_interval, timedelta): + # we don't want to rely on the transitions created by + # croniter as they are not always correct + dttm = pendulum.instance(dttm) + naive = timezone.make_naive(dttm, self.timezone) + cron = croniter(self._schedule_interval, naive) + + # We assume that DST transitions happen on the minute/hour + if not self.is_fixed_time_schedule(): + # relative offset (eg. every 5 minutes) + delta = naive - cron.get_prev(datetime) + previous = dttm.in_timezone(self.timezone).subtract_timedelta(delta) + else: + # absolute (e.g. 3 AM) + naive = cron.get_prev(datetime) + tz = pendulum.timezone(self.timezone.name) + previous = timezone.make_aware(naive, tz) + return timezone.convert_to_utc(previous) + elif self._schedule_interval is not None: return dttm - self._schedule_interval def get_run_dates(self, start_date, end_date=None): @@ -3379,23 +3308,8 @@ def normalize_schedule(self, dttm): @provide_session def get_last_dagrun(self, session=None, include_externally_triggered=False): - """ - Returns the last dag run for this dag, None if there was none. - Last dag run can be any type of run eg. scheduled or backfilled. 
- Overridden DagRuns are ignored - """ - DR = DagRun - qry = session.query(DR).filter( - DR.dag_id == self.dag_id, - ) - if not include_externally_triggered: - qry = qry.filter(DR.external_trigger.__eq__(False)) - - qry = qry.order_by(DR.execution_date.desc()) - - last = qry.first() - - return last + return get_last_dagrun(self.dag_id, session=session, + include_externally_triggered=include_externally_triggered) @property def dag_id(self): @@ -3446,14 +3360,6 @@ def tasks(self, val): def task_ids(self): return list(self.task_dict.keys()) - @property - def active_task_ids(self): - return list(k for k, v in self.task_dict.items() if not v.adhoc) - - @property - def active_tasks(self): - return [t for t in self.tasks if not t.adhoc] - @property def filepath(self): """ @@ -3474,13 +3380,8 @@ def folder(self): def owner(self): return ", ".join(list(set([t.owner for t in self.tasks]))) - @property @provide_session - def concurrency_reached(self, session=None): - """ - Returns a boolean indicating whether the concurrency limit for this DAG - has been reached - """ + def _get_concurrency_reached(self, session=None): TI = TaskInstance qry = session.query(func.count(TI.task_id)).filter( TI.dag_id == self.dag_id, @@ -3489,15 +3390,26 @@ def concurrency_reached(self, session=None): return qry.scalar() >= self.concurrency @property - @provide_session - def is_paused(self, session=None): + def concurrency_reached(self): """ - Returns a boolean indicating whether this DAG is paused + Returns a boolean indicating whether the concurrency limit for this DAG + has been reached """ + return self._get_concurrency_reached() + + @provide_session + def _get_is_paused(self, session=None): qry = session.query(DagModel).filter( DagModel.dag_id == self.dag_id) return qry.value('is_paused') + @property + def is_paused(self): + """ + Returns a boolean indicating whether this DAG is paused + """ + return self._get_is_paused() + @provide_session def handle_callback(self, dagrun, success=True, reason=None, session=None): """ @@ -3505,8 +3417,10 @@ def handle_callback(self, dagrun, success=True, reason=None, session=None): on_failure_callback or on_success_callback. This method gets the context of a single TaskInstance part of this DagRun and passes that to the callable along with a 'reason', primarily to differentiate DagRun failures. - .. note:: - The logs end up in $AIRFLOW_HOME/logs/scheduler/latest/PROJECT/DAG_FILE.py.log + + .. 
note: The logs end up in + ``$AIRFLOW_HOME/logs/scheduler/latest/PROJECT/DAG_FILE.py.log`` + :param dagrun: DagRun object :param success: Flag to specify if failure or success callback should be called :param reason: Completion reason @@ -3577,24 +3491,25 @@ def get_dagrun(self, execution_date, session=None): return dagrun - @property @provide_session - def latest_execution_date(self, session=None): + def _get_latest_execution_date(self, session=None): + return session.query(func.max(DagRun.execution_date)).filter( + DagRun.dag_id == self.dag_id + ).scalar() + + @property + def latest_execution_date(self): """ Returns the latest date for which at least one dag run exists """ - execution_date = session.query(func.max(DagRun.execution_date)).filter( - DagRun.dag_id == self.dag_id - ).scalar() - return execution_date + return self._get_latest_execution_date() @property def subdags(self): """ Returns a list of the subdag objects associated to this DAG """ - # Check SubDag for class but don't check class directly, see - # https://github.com/airbnb/airflow/issues/1168 + # Check SubDag for class but don't check class directly from airflow.operators.subdag_operator import SubDagOperator subdag_lst = [] for task in self.tasks: @@ -3671,8 +3586,8 @@ def topological_sort(self): :return: list of tasks in topological order """ - # copy the the tasks so we leave it unmodified - graph_unsorted = self.tasks[:] + # convert into an OrderedDict to speedup lookup while keeping order the same + graph_unsorted = OrderedDict((task.task_id, task) for task in self.tasks) graph_sorted = [] @@ -3695,14 +3610,14 @@ def topological_sort(self): # not, we need to bail out as the graph therefore can't be # sorted. acyclic = False - for node in list(graph_unsorted): + for node in list(graph_unsorted.values()): for edge in node.upstream_list: - if edge in graph_unsorted: + if edge.task_id in graph_unsorted: break # no edges in upstream tasks else: acyclic = True - graph_unsorted.remove(node) + del graph_unsorted[node.task_id] graph_sorted.append(node) if not acyclic: @@ -3730,7 +3645,6 @@ def set_dag_runs_state( for dr in drs: dr.state = state dirty_ids.append(dr.dag_id) - DagStat.update(dirty_ids, session=session) @provide_session def clear( @@ -3739,9 +3653,11 @@ def clear( only_running=False, confirm_prompt=False, include_subdags=True, + include_parentdag=True, reset_dag_runs=True, dry_run=False, session=None, + get_tis=False, ): """ Clears a set of task instances associated with the current dag for @@ -3762,15 +3678,39 @@ def clear( tis = session.query(TI).filter(TI.dag_id == self.dag_id) tis = tis.filter(TI.task_id.in_(self.task_ids)) + if include_parentdag and self.is_subdag: + + p_dag = self.parent_dag.sub_dag( + task_regex=self.dag_id.split('.')[1], + include_upstream=False, + include_downstream=True) + + tis = tis.union(p_dag.clear( + start_date=start_date, end_date=end_date, + only_failed=only_failed, + only_running=only_running, + confirm_prompt=confirm_prompt, + include_subdags=include_subdags, + include_parentdag=False, + reset_dag_runs=reset_dag_runs, + get_tis=True, + session=session, + )) + if start_date: tis = tis.filter(TI.execution_date >= start_date) if end_date: tis = tis.filter(TI.execution_date <= end_date) if only_failed: - tis = tis.filter(TI.state == State.FAILED) + tis = tis.filter(or_( + TI.state == State.FAILED, + TI.state == State.UPSTREAM_FAILED)) if only_running: tis = tis.filter(TI.state == State.RUNNING) + if get_tis: + return tis + if dry_run: tis = tis.all() session.expunge_all() @@ 
-3814,6 +3754,7 @@ def clear_dags( only_running=False, confirm_prompt=False, include_subdags=True, + include_parentdag=False, reset_dag_runs=True, dry_run=False, ): @@ -3826,6 +3767,7 @@ def clear_dags( only_running=only_running, confirm_prompt=False, include_subdags=include_subdags, + include_parentdag=include_parentdag, reset_dag_runs=reset_dag_runs, dry_run=True) all_tis.extend(tis) @@ -4012,16 +3954,6 @@ def add_tasks(self, tasks): for task in tasks: self.add_task(task) - @provide_session - def db_merge(self, session=None): - BO = BaseOperator - tasks = session.query(BO).filter(BO.dag_id == self.dag_id).all() - for t in tasks: - session.delete(t) - session.commit() - session.merge(self) - session.commit() - def run( self, start_date=None, @@ -4037,20 +3969,21 @@ def run( verbose=False, conf=None, rerun_failed_tasks=False, + run_backwards=False, ): """ Runs the DAG. :param start_date: the start date of the range to run - :type start_date: datetime + :type start_date: datetime.datetime :param end_date: the end date of the range to run - :type end_date: datetime + :type end_date: datetime.datetime :param mark_success: True to mark jobs as succeeded without running them :type mark_success: bool :param local: True to run the tasks using the LocalExecutor :type local: bool :param executor: The executor instance to run the tasks - :type executor: BaseExecutor + :type executor: airflow.executor.BaseExecutor :param donot_pickle: True to avoid pickling DAG object and send to workers :type donot_pickle: bool :param ignore_task_deps: True to skip upstream tasks @@ -4059,14 +3992,19 @@ def run( dependencies for the first set of tasks only :type ignore_first_depends_on_past: bool :param pool: Resource pool to use - :type pool: string + :type pool: str :param delay_on_limit_secs: Time in seconds to wait before next attempt to run dag run when max_active_runs limit has been reached :type delay_on_limit_secs: float :param verbose: Make logging output more verbose - :type verbose: boolean + :type verbose: bool :param conf: user defined dictionary passed from CLI :type conf: dict + :param rerun_failed_tasks: + :type: bool + :param run_backwards: + :type: bool + """ from airflow.jobs import BackfillJob if not executor and local: @@ -4087,6 +4025,7 @@ def run( verbose=verbose, conf=conf, rerun_failed_tasks=rerun_failed_tasks, + run_backwards=run_backwards, ) job.run() @@ -4113,17 +4052,17 @@ def create_dagrun(self, Returns the dag run. :param run_id: defines the the run id for this dag run - :type run_id: string + :type run_id: str :param execution_date: the execution date of this dag run - :type execution_date: datetime + :type execution_date: datetime.datetime :param state: the state of the dag run - :type state: State + :type state: airflow.utils.state.State :param start_date: the date this dag run should be evaluated :type start_date: datetime :param external_trigger: whether this dag run is externally triggered :type external_trigger: bool :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session """ run = DagRun( dag_id=self.dag_id, @@ -4136,8 +4075,6 @@ def create_dagrun(self, ) session.add(run) - DagStat.set_dirty(dag_id=self.dag_id, session=session) - session.commit() run.dag = self @@ -4158,7 +4095,7 @@ def sync_to_db(self, owner=None, sync_time=None, session=None): SubDagOperator. 
:param dag: the DAG object to save to the DB - :type dag: DAG + :type dag: airflow.models.DAG :param sync_time: The time that the DAG should be marked as sync'ed :type sync_time: datetime :return: None @@ -4179,6 +4116,9 @@ def sync_to_db(self, owner=None, sync_time=None, session=None): orm_dag.owners = owner orm_dag.is_active = True orm_dag.last_scheduler_run = sync_time + orm_dag.default_view = self._default_view + orm_dag.description = self.description + orm_dag.schedule_interval = self.schedule_interval session.merge(orm_dag) session.commit() @@ -4203,6 +4143,7 @@ def deactivate_unknown_dags(active_dag_ids, session=None): DagModel).filter(~DagModel.dag_id.in_(active_dag_ids)).all(): dag.is_active = False session.merge(dag) + session.commit() @staticmethod @provide_session @@ -4351,6 +4292,7 @@ def __repr__(self): class Variable(Base, LoggingMixin): __tablename__ = "variable" + __NO_DEFAULT_SENTINEL = object() id = Column(Integer, primary_key=True) key = Column(String(ID_LEN), unique=True) @@ -4366,32 +4308,23 @@ def get_val(self): if self._val and self.is_encrypted: try: fernet = get_fernet() - except Exception: - log.error("Can't decrypt _val for key={}, FERNET_KEY " - "configuration missing".format(self.key)) - return None - try: return fernet.decrypt(bytes(self._val, 'utf-8')).decode() - except cryptography.fernet.InvalidToken: + except InvalidFernetToken: log.error("Can't decrypt _val for key={}, invalid token " "or value".format(self.key)) return None + except Exception: + log.error("Can't decrypt _val for key={}, FERNET_KEY " + "configuration missing".format(self.key)) + return None else: return self._val def set_val(self, value): if value: - try: - fernet = get_fernet() - self._val = fernet.encrypt(bytes(value, 'utf-8')).decode() - self.is_encrypted = True - except AirflowException: - self.log.exception( - "Failed to load fernet while encrypting value, " - "using non-encrypted value." - ) - self._val = value - self.is_encrypted = False + fernet = get_fernet() + self._val = fernet.encrypt(bytes(value, 'utf-8')).decode() + self.is_encrypted = fernet.is_encrypted @declared_attr def val(cls): @@ -4405,18 +4338,17 @@ def setdefault(cls, key, default, deserialize_json=False): for a key, and if it isn't there, stores the default value and returns it. 
:param key: Dict key for this Variable - :type key: String + :type key: str :param default: Default value to set and return if the variable - isn't already in the DB + isn't already in the DB :type default: Mixed :param deserialize_json: Store this as a JSON encoded value in the DB - and un-encode it when retrieving a value + and un-encode it when retrieving a value :return: Mixed """ - default_sentinel = object() - obj = Variable.get(key, default_var=default_sentinel, + obj = Variable.get(key, default_var=None, deserialize_json=deserialize_json) - if obj is default_sentinel: + if obj is None: if default is not None: Variable.set(key, default, serialize_json=deserialize_json) return default @@ -4427,10 +4359,10 @@ def setdefault(cls, key, default, deserialize_json=False): @classmethod @provide_session - def get(cls, key, default_var=None, deserialize_json=False, session=None): + def get(cls, key, default_var=__NO_DEFAULT_SENTINEL, deserialize_json=False, session=None): obj = session.query(cls).filter(cls.key == key).first() if obj is None: - if default_var is not None: + if default_var is not cls.__NO_DEFAULT_SENTINEL: return default_var else: raise KeyError('Variable {} does not exist'.format(key)) @@ -4453,6 +4385,11 @@ def set(cls, key, value, serialize_json=False, session=None): session.add(Variable(key=key, val=stored_value)) session.flush() + def rotate_fernet_key(self): + fernet = get_fernet() + if self._val and self.is_encrypted: + self._val = fernet.rotate(self._val.encode('utf-8')).decode() + class XCom(Base, LoggingMixin): """ @@ -4464,7 +4401,7 @@ class XCom(Base, LoggingMixin): key = Column(String(512)) value = Column(LargeBinary) timestamp = Column( - DateTime, default=timezone.utcnow, nullable=False) + UtcDateTime, default=timezone.utcnow, nullable=False) execution_date = Column(UtcDateTime, nullable=False) # source information @@ -4512,7 +4449,8 @@ def set( """ Store an XCom value. TODO: "pickling" has been deprecated and JSON is preferred. - "pickling" will be removed in Airflow 2.0. + "pickling" will be removed in Airflow 2.0. + :return: None """ session.expunge_all() @@ -4562,7 +4500,8 @@ def get_one(cls, """ Retrieve an XCom value, optionally meeting certain criteria. TODO: "pickling" has been deprecated and JSON is preferred. - "pickling" will be removed in Airflow 2.0. + "pickling" will be removed in Airflow 2.0. + :return: XCom value """ filters = [] @@ -4610,7 +4549,7 @@ def get_many(cls, """ Retrieve an XCom value, optionally meeting certain criteria TODO: "pickling" has been deprecated and JSON is preferred. - "pickling" will be removed in Airflow 2.0. + "pickling" will be removed in Airflow 2.0. 
""" filters = [] if key: @@ -4645,122 +4584,6 @@ def delete(cls, xcoms, session=None): session.commit() -class DagStat(Base): - __tablename__ = "dag_stats" - - dag_id = Column(String(ID_LEN), primary_key=True) - state = Column(String(50), primary_key=True) - count = Column(Integer, default=0) - dirty = Column(Boolean, default=False) - - def __init__(self, dag_id, state, count=0, dirty=False): - self.dag_id = dag_id - self.state = state - self.count = count - self.dirty = dirty - - @staticmethod - @provide_session - def set_dirty(dag_id, session=None): - """ - :param dag_id: the dag_id to mark dirty - :param session: database session - :return: - """ - DagStat.create(dag_id=dag_id, session=session) - - try: - stats = session.query(DagStat).filter( - DagStat.dag_id == dag_id - ).with_for_update().all() - - for stat in stats: - stat.dirty = True - session.commit() - except Exception as e: - session.rollback() - log = LoggingMixin().log - log.warning("Could not update dag stats for %s", dag_id) - log.exception(e) - - @staticmethod - @provide_session - def update(dag_ids=None, dirty_only=True, session=None): - """ - Updates the stats for dirty/out-of-sync dags - - :param dag_ids: dag_ids to be updated - :type dag_ids: list - :param dirty_only: only updated for marked dirty, defaults to True - :type dirty_only: bool - :param session: db session to use - :type session: Session - """ - try: - qry = session.query(DagStat) - if dag_ids: - qry = qry.filter(DagStat.dag_id.in_(set(dag_ids))) - if dirty_only: - qry = qry.filter(DagStat.dirty == True) # noqa - - qry = qry.with_for_update().all() - - ids = set([dag_stat.dag_id for dag_stat in qry]) - - # avoid querying with an empty IN clause - if len(ids) == 0: - session.commit() - return - - dagstat_states = set(itertools.product(ids, State.dag_states)) - qry = ( - session.query(DagRun.dag_id, DagRun.state, func.count('*')) - .filter(DagRun.dag_id.in_(ids)) - .group_by(DagRun.dag_id, DagRun.state) - ) - - counts = {(dag_id, state): count for dag_id, state, count in qry} - for dag_id, state in dagstat_states: - count = 0 - if (dag_id, state) in counts: - count = counts[(dag_id, state)] - - session.merge( - DagStat(dag_id=dag_id, state=state, count=count, dirty=False) - ) - - session.commit() - except Exception as e: - session.rollback() - log = LoggingMixin().log - log.warning("Could not update dag stat table") - log.exception(e) - - @staticmethod - @provide_session - def create(dag_id, session=None): - """ - Creates the missing states the stats table for the dag specified - - :param dag_id: dag id of the dag to create stats for - :param session: database session - :return: - """ - # unfortunately sqlalchemy does not know upsert - qry = session.query(DagStat).filter(DagStat.dag_id == dag_id).all() - states = [dag_stat.state for dag_stat in qry] - for state in State.dag_states: - if state not in states: - try: - session.merge(DagStat(dag_id=dag_id, state=state)) - session.commit() - except Exception as e: - session.rollback() - log = LoggingMixin().log - log.warning("Could not create stat record") - log.exception(e) - - class DagRun(Base, LoggingMixin): """ DagRun describes an instance of a Dag. 
It can be created @@ -4784,7 +4607,9 @@ class DagRun(Base, LoggingMixin): dag = None __table_args__ = ( - Index('dr_run_id', dag_id, run_id, unique=True), + Index('dag_id_state', dag_id, _state), + UniqueConstraint('dag_id', 'execution_date'), + UniqueConstraint('dag_id', 'run_id'), ) def __repr__(self): @@ -4803,12 +4628,7 @@ def get_state(self): def set_state(self, state): if self._state != state: self._state = state - if self.dag_id is not None: - # FIXME: Due to the scoped_session factor we we don't get a clean - # session here, so something really weird goes on: - # if you try to close the session dag runs will end up detached - session = settings.Session() - DagStat.set_dirty(self.dag_id, session=session) + self.end_date = timezone.utcnow() if self._state in State.finished() else None @declared_attr def state(self): @@ -4847,20 +4667,20 @@ def find(dag_id=None, run_id=None, execution_date=None, Returns a set of dag runs for the given search criteria. :param dag_id: the dag_id to find dag runs for - :type dag_id: integer, list + :type dag_id: int, list :param run_id: defines the the run id for this dag run - :type run_id: string + :type run_id: str :param execution_date: the execution date - :type execution_date: datetime + :type execution_date: datetime.datetime :param state: the state of the dag run - :type state: State + :type state: airflow.utils.state.State :param external_trigger: whether this dag run is externally triggered :type external_trigger: bool :param no_backfills: return no backfills (True), return all (False). - Defaults to False + Defaults to False :type no_backfills: bool :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session """ DR = DagRun @@ -4977,7 +4797,6 @@ def update_state(self, session=None): dag = self.get_dag() tis = self.get_task_instances(session=session) - self.log.debug("Updating state for %s considering %s task(s)", self, len(tis)) for ti in list(tis): @@ -5003,12 +4822,13 @@ def update_state(self, session=None): no_dependencies_met = True for ut in unfinished_tasks: # We need to flag upstream and check for changes because upstream - # failures can result in deadlock false positives + # failures/re-schedules can result in deadlock false positives old_state = ut.state deps_met = ut.are_dependencies_met( dep_context=DepContext( flag_upstream_failed=True, - ignore_in_retry_period=True), + ignore_in_retry_period=True, + ignore_in_reschedule_period=True), session=session) if deps_met or old_state != ut.current_state(session=session): no_dependencies_met = False @@ -5017,37 +4837,35 @@ def update_state(self, session=None): duration = (timezone.utcnow() - start_dttm).total_seconds() * 1000 Stats.timing("dagrun.dependency-check.{}".format(self.dag_id), duration) - # future: remove the check on adhoc tasks (=active_tasks) - if len(tis) == len(dag.active_tasks): - root_ids = [t.task_id for t in dag.roots] - roots = [t for t in tis if t.task_id in root_ids] - - # if all roots finished and at least one failed, the run failed - if (not unfinished_tasks and - any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)): - self.log.info('Marking run %s failed', self) - self.state = State.FAILED - dag.handle_callback(self, success=False, reason='task_failure', - session=session) - - # if all roots succeeded and no unfinished tasks, the run succeeded - elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED) - for r in roots): - self.log.info('Marking run %s successful', self) - self.state 
= State.SUCCESS - dag.handle_callback(self, success=True, reason='success', session=session) - - # if *all tasks* are deadlocked, the run failed - elif (unfinished_tasks and none_depends_on_past and - none_task_concurrency and no_dependencies_met): - self.log.info('Deadlock; marking run %s failed', self) - self.state = State.FAILED - dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', - session=session) - - # finally, if the roots aren't done, the dag is still running - else: - self.state = State.RUNNING + root_ids = [t.task_id for t in dag.roots] + roots = [t for t in tis if t.task_id in root_ids] + + # if all roots finished and at least one failed, the run failed + if (not unfinished_tasks and + any(r.state in (State.FAILED, State.UPSTREAM_FAILED) for r in roots)): + self.log.info('Marking run %s failed', self) + self.set_state(State.FAILED) + dag.handle_callback(self, success=False, reason='task_failure', + session=session) + + # if all roots succeeded and no unfinished tasks, the run succeeded + elif not unfinished_tasks and all(r.state in (State.SUCCESS, State.SKIPPED) + for r in roots): + self.log.info('Marking run %s successful', self) + self.set_state(State.SUCCESS) + dag.handle_callback(self, success=True, reason='success', session=session) + + # if *all tasks* are deadlocked, the run failed + elif (unfinished_tasks and none_depends_on_past and + none_task_concurrency and no_dependencies_met): + self.log.info('Deadlock; marking run %s failed', self) + self.set_state(State.FAILED) + dag.handle_callback(self, success=False, reason='all_tasks_deadlocked', + session=session) + + # finally, if the roots aren't done, the dag is still running + else: + self.set_state(State.RUNNING) # todo: determine we want to use with_for_update to make sure to lock the run session.merge(self) @@ -5091,10 +4909,13 @@ def verify_integrity(self, session=None): # check for missing tasks for task in six.itervalues(dag.task_dict): - if task.adhoc: + if task.start_date > self.execution_date and not self.is_backfill: continue if task.task_id not in task_ids: + Stats.incr( + "task_instance_created-{}".format(task.__class__.__name__), + 1, 1) ti = TaskInstance(task, self.execution_date) session.add(ti) @@ -5108,7 +4929,7 @@ def get_run(session, dag_id, execution_date): :param execution_date: execution date :type execution_date: datetime :return: DagRun corresponding to the given dag_id and execution date - if one exists. None otherwise. + if one exists. None otherwise. :rtype: DagRun """ qry = session.query(DagRun).filter( @@ -5121,7 +4942,10 @@ def get_run(session, dag_id, execution_date): @property def is_backfill(self): from airflow.jobs import BackfillJob - return self.run_id.startswith(BackfillJob.ID_PREFIX) + return ( + self.run_id is not None and + self.run_id.startswith(BackfillJob.ID_PREFIX) + ) @classmethod @provide_session @@ -5202,84 +5026,6 @@ def open_slots(self, session): return self.slots - used_slots - queued_slots -class SlaMiss(Base): - """ - Model that stores a history of the SLA that have been missed. - It is used to keep track of SLA failures over time and to avoid double - triggering alert emails. 
- """ - __tablename__ = "sla_miss" - - task_id = Column(String(ID_LEN), primary_key=True) - dag_id = Column(String(ID_LEN), primary_key=True) - execution_date = Column(UtcDateTime, primary_key=True) - email_sent = Column(Boolean, default=False) - timestamp = Column(UtcDateTime) - description = Column(Text) - notification_sent = Column(Boolean, default=False) - - def __repr__(self): - return str(( - self.dag_id, self.task_id, self.execution_date.isoformat())) - - -class ImportError(Base): - __tablename__ = "import_error" - id = Column(Integer, primary_key=True) - timestamp = Column(UtcDateTime) - filename = Column(String(1024)) - stacktrace = Column(Text) - - -class KubeResourceVersion(Base): - __tablename__ = "kube_resource_version" - one_row_id = Column(Boolean, server_default=sqltrue(), primary_key=True) - resource_version = Column(String(255)) - - @staticmethod - @provide_session - def get_current_resource_version(session=None): - (resource_version,) = session.query(KubeResourceVersion.resource_version).one() - return resource_version - - @staticmethod - @provide_session - def checkpoint_resource_version(resource_version, session=None): - if resource_version: - session.query(KubeResourceVersion).update({ - KubeResourceVersion.resource_version: resource_version - }) - session.commit() - - @staticmethod - @provide_session - def reset_resource_version(session=None): - session.query(KubeResourceVersion).update({ - KubeResourceVersion.resource_version: '0' - }) - session.commit() - return '0' - - -class KubeWorkerIdentifier(Base): - __tablename__ = "kube_worker_uuid" - one_row_id = Column(Boolean, server_default=sqltrue(), primary_key=True) - worker_uuid = Column(String(255)) - - @staticmethod - @provide_session - def get_or_create_current_kube_worker_uuid(session=None): - (worker_uuid,) = session.query(KubeWorkerIdentifier.worker_uuid).one() - if worker_uuid == '': - worker_uuid = str(uuid.uuid4()) - KubeWorkerIdentifier.checkpoint_kube_worker_uuid(worker_uuid, session) - return worker_uuid - - @staticmethod - @provide_session - def checkpoint_kube_worker_uuid(worker_uuid, session=None): - if worker_uuid: - session.query(KubeWorkerIdentifier).update({ - KubeWorkerIdentifier.worker_uuid: worker_uuid - }) - session.commit() +# To avoid circular import on Python2.7 we need to define this at the _bottom_ +from airflow.models.connection import Connection # noqa: E402,F401 +from airflow.models.skipmixin import SkipMixin # noqa: F401 diff --git a/airflow/models/base.py b/airflow/models/base.py new file mode 100644 index 0000000000000..97c6b777984d8 --- /dev/null +++ b/airflow/models/base.py @@ -0,0 +1,35 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from typing import Any +from sqlalchemy import MetaData +from sqlalchemy.ext.declarative import declarative_base + +import airflow + +SQL_ALCHEMY_SCHEMA = airflow.configuration.get("core", "SQL_ALCHEMY_SCHEMA") + +metadata = ( + None + if not SQL_ALCHEMY_SCHEMA or SQL_ALCHEMY_SCHEMA.isspace() + else MetaData(schema=SQL_ALCHEMY_SCHEMA) +) +Base = declarative_base(metadata=metadata) # type: Any + +ID_LEN = 250 diff --git a/airflow/models/connection.py b/airflow/models/connection.py new file mode 100644 index 0000000000000..1ac53553a9609 --- /dev/null +++ b/airflow/models/connection.py @@ -0,0 +1,293 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import json +from builtins import bytes +from urllib.parse import urlparse, unquote, parse_qsl + +from sqlalchemy import Column, Integer, String, Boolean +from sqlalchemy.ext.declarative import declared_attr +from sqlalchemy.orm import synonym + +from airflow.exceptions import AirflowException +from airflow import LoggingMixin +from airflow.models import get_fernet +from airflow.models.base import Base, ID_LEN + + +# Python automatically converts all letters to lowercase in hostname +# See: https://issues.apache.org/jira/browse/AIRFLOW-3615 +def parse_netloc_to_hostname(uri_parts): + hostname = unquote(uri_parts.hostname or '') + if '/' in hostname: + hostname = uri_parts.netloc + if "@" in hostname: + hostname = hostname.rsplit("@", 1)[1] + if ":" in hostname: + hostname = hostname.split(":", 1)[0] + hostname = unquote(hostname) + return hostname + + +class Connection(Base, LoggingMixin): + """ + Placeholder to store information about different database instances + connection information. The idea here is that scripts use references to + database instances (conn_id) instead of hard coding hostname, logins and + passwords when using operators or hooks. 
+ """ + __tablename__ = "connection" + + id = Column(Integer(), primary_key=True) + conn_id = Column(String(ID_LEN)) + conn_type = Column(String(500)) + host = Column(String(500)) + schema = Column(String(500)) + login = Column(String(500)) + _password = Column('password', String(5000)) + port = Column(Integer()) + is_encrypted = Column(Boolean, unique=False, default=False) + is_extra_encrypted = Column(Boolean, unique=False, default=False) + _extra = Column('extra', String(5000)) + + _types = [ + ('docker', 'Docker Registry',), + ('fs', 'File (path)'), + ('ftp', 'FTP',), + ('google_cloud_platform', 'Google Cloud Platform'), + ('hdfs', 'HDFS',), + ('http', 'HTTP',), + ('hive_cli', 'Hive Client Wrapper',), + ('hive_metastore', 'Hive Metastore Thrift',), + ('hiveserver2', 'Hive Server 2 Thrift',), + ('jdbc', 'Jdbc Connection',), + ('jenkins', 'Jenkins'), + ('mysql', 'MySQL',), + ('postgres', 'Postgres',), + ('oracle', 'Oracle',), + ('vertica', 'Vertica',), + ('presto', 'Presto',), + ('s3', 'S3',), + ('samba', 'Samba',), + ('sqlite', 'Sqlite',), + ('ssh', 'SSH',), + ('cloudant', 'IBM Cloudant',), + ('mssql', 'Microsoft SQL Server'), + ('mesos_framework-id', 'Mesos Framework ID'), + ('jira', 'JIRA',), + ('redis', 'Redis',), + ('wasb', 'Azure Blob Storage'), + ('databricks', 'Databricks',), + ('aws', 'Amazon Web Services',), + ('emr', 'Elastic MapReduce',), + ('snowflake', 'Snowflake',), + ('segment', 'Segment',), + ('azure_data_lake', 'Azure Data Lake'), + ('azure_container_instances', 'Azure Container Instances'), + ('azure_cosmos', 'Azure CosmosDB'), + ('cassandra', 'Cassandra',), + ('qubole', 'Qubole'), + ('mongo', 'MongoDB'), + ('gcpcloudsql', 'Google Cloud SQL'), + ] + + def __init__( + self, conn_id=None, conn_type=None, + host=None, login=None, password=None, + schema=None, port=None, extra=None, + uri=None): + self.conn_id = conn_id + if uri: + self.parse_from_uri(uri) + else: + self.conn_type = conn_type + self.host = host + self.login = login + self.password = password + self.schema = schema + self.port = port + self.extra = extra + + def parse_from_uri(self, uri): + uri_parts = urlparse(uri) + conn_type = uri_parts.scheme + if conn_type == 'postgresql': + conn_type = 'postgres' + elif '-' in conn_type: + conn_type = conn_type.replace('-', '_') + self.conn_type = conn_type + self.host = parse_netloc_to_hostname(uri_parts) + quoted_schema = uri_parts.path[1:] + self.schema = unquote(quoted_schema) if quoted_schema else quoted_schema + self.login = unquote(uri_parts.username) \ + if uri_parts.username else uri_parts.username + self.password = unquote(uri_parts.password) \ + if uri_parts.password else uri_parts.password + self.port = uri_parts.port + if uri_parts.query: + self.extra = json.dumps(dict(parse_qsl(uri_parts.query))) + + def get_password(self): + if self._password and self.is_encrypted: + fernet = get_fernet() + if not fernet.is_encrypted: + raise AirflowException( + "Can't decrypt encrypted password for login={}, \ + FERNET_KEY configuration is missing".format(self.login)) + return fernet.decrypt(bytes(self._password, 'utf-8')).decode() + else: + return self._password + + def set_password(self, value): + if value: + fernet = get_fernet() + self._password = fernet.encrypt(bytes(value, 'utf-8')).decode() + self.is_encrypted = fernet.is_encrypted + + @declared_attr + def password(cls): + return synonym('_password', + descriptor=property(cls.get_password, cls.set_password)) + + def get_extra(self): + if self._extra and self.is_extra_encrypted: + fernet = get_fernet() + if not 
fernet.is_encrypted: + raise AirflowException( + "Can't decrypt `extra` params for login={},\ + FERNET_KEY configuration is missing".format(self.login)) + return fernet.decrypt(bytes(self._extra, 'utf-8')).decode() + else: + return self._extra + + def set_extra(self, value): + if value: + fernet = get_fernet() + self._extra = fernet.encrypt(bytes(value, 'utf-8')).decode() + self.is_extra_encrypted = fernet.is_encrypted + else: + self._extra = value + self.is_extra_encrypted = False + + @declared_attr + def extra(cls): + return synonym('_extra', + descriptor=property(cls.get_extra, cls.set_extra)) + + def rotate_fernet_key(self): + fernet = get_fernet() + if self._password and self.is_encrypted: + self._password = fernet.rotate(self._password.encode('utf-8')).decode() + if self._extra and self.is_extra_encrypted: + self._extra = fernet.rotate(self._extra.encode('utf-8')).decode() + + def get_hook(self): + try: + if self.conn_type == 'mysql': + from airflow.hooks.mysql_hook import MySqlHook + return MySqlHook(mysql_conn_id=self.conn_id) + elif self.conn_type == 'google_cloud_platform': + from airflow.contrib.hooks.bigquery_hook import BigQueryHook + return BigQueryHook(bigquery_conn_id=self.conn_id) + elif self.conn_type == 'postgres': + from airflow.hooks.postgres_hook import PostgresHook + return PostgresHook(postgres_conn_id=self.conn_id) + elif self.conn_type == 'hive_cli': + from airflow.hooks.hive_hooks import HiveCliHook + return HiveCliHook(hive_cli_conn_id=self.conn_id) + elif self.conn_type == 'presto': + from airflow.hooks.presto_hook import PrestoHook + return PrestoHook(presto_conn_id=self.conn_id) + elif self.conn_type == 'hiveserver2': + from airflow.hooks.hive_hooks import HiveServer2Hook + return HiveServer2Hook(hiveserver2_conn_id=self.conn_id) + elif self.conn_type == 'sqlite': + from airflow.hooks.sqlite_hook import SqliteHook + return SqliteHook(sqlite_conn_id=self.conn_id) + elif self.conn_type == 'jdbc': + from airflow.hooks.jdbc_hook import JdbcHook + return JdbcHook(jdbc_conn_id=self.conn_id) + elif self.conn_type == 'mssql': + from airflow.hooks.mssql_hook import MsSqlHook + return MsSqlHook(mssql_conn_id=self.conn_id) + elif self.conn_type == 'oracle': + from airflow.hooks.oracle_hook import OracleHook + return OracleHook(oracle_conn_id=self.conn_id) + elif self.conn_type == 'vertica': + from airflow.contrib.hooks.vertica_hook import VerticaHook + return VerticaHook(vertica_conn_id=self.conn_id) + elif self.conn_type == 'cloudant': + from airflow.contrib.hooks.cloudant_hook import CloudantHook + return CloudantHook(cloudant_conn_id=self.conn_id) + elif self.conn_type == 'jira': + from airflow.contrib.hooks.jira_hook import JiraHook + return JiraHook(jira_conn_id=self.conn_id) + elif self.conn_type == 'redis': + from airflow.contrib.hooks.redis_hook import RedisHook + return RedisHook(redis_conn_id=self.conn_id) + elif self.conn_type == 'wasb': + from airflow.contrib.hooks.wasb_hook import WasbHook + return WasbHook(wasb_conn_id=self.conn_id) + elif self.conn_type == 'docker': + from airflow.hooks.docker_hook import DockerHook + return DockerHook(docker_conn_id=self.conn_id) + elif self.conn_type == 'azure_data_lake': + from airflow.contrib.hooks.azure_data_lake_hook import AzureDataLakeHook + return AzureDataLakeHook(azure_data_lake_conn_id=self.conn_id) + elif self.conn_type == 'azure_cosmos': + from airflow.contrib.hooks.azure_cosmos_hook import AzureCosmosDBHook + return AzureCosmosDBHook(azure_cosmos_conn_id=self.conn_id) + elif self.conn_type == 'cassandra': + 
from airflow.contrib.hooks.cassandra_hook import CassandraHook + return CassandraHook(cassandra_conn_id=self.conn_id) + elif self.conn_type == 'mongo': + from airflow.contrib.hooks.mongo_hook import MongoHook + return MongoHook(conn_id=self.conn_id) + elif self.conn_type == 'gcpcloudsql': + from airflow.contrib.hooks.gcp_sql_hook import CloudSqlDatabaseHook + return CloudSqlDatabaseHook(gcp_cloudsql_conn_id=self.conn_id) + except Exception: + pass + + def __repr__(self): + return self.conn_id + + def debug_info(self): + return ("id: {}. Host: {}, Port: {}, Schema: {}, " + "Login: {}, Password: {}, extra: {}". + format(self.conn_id, + self.host, + self.port, + self.schema, + self.login, + "XXXXXXXX" if self.password else None, + self.extra_dejson)) + + @property + def extra_dejson(self): + """Returns the extra property by deserializing json.""" + obj = {} + if self.extra: + try: + obj = json.loads(self.extra) + except Exception as e: + self.log.exception(e) + self.log.error("Failed parsing the json for conn_id %s", self.conn_id) + + return obj diff --git a/airflow/models/dagpickle.py b/airflow/models/dagpickle.py new file mode 100644 index 0000000000000..e261ab1ac009d --- /dev/null +++ b/airflow/models/dagpickle.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import dill +from sqlalchemy import Column, Integer, PickleType, Text + +from airflow.models.base import Base +from airflow.utils import timezone +from airflow.utils.sqlalchemy import UtcDateTime + + +class DagPickle(Base): + """ + Dags can originate from different places (user repos, master repo, ...) + and also get executed in different places (different executors). This + object represents a version of a DAG and becomes a source of truth for + a BackfillJob execution. A pickle is a native python serialized object, + and in this case gets stored in the database for the duration of the job. + + The executors pick up the DagPickle id and read the dag definition from + the database. + """ + + id = Column(Integer, primary_key=True) + pickle = Column(PickleType(pickler=dill)) + created_dttm = Column(UtcDateTime, default=timezone.utcnow) + pickle_hash = Column(Text) + + __tablename__ = "dag_pickle" + + def __init__(self, dag): + self.dag_id = dag.dag_id + if hasattr(dag, 'template_env'): + dag.template_env = None + self.pickle_hash = hash(dag) + self.pickle = dag diff --git a/airflow/models/errors.py b/airflow/models/errors.py new file mode 100644 index 0000000000000..6a3797ca3df9b --- /dev/null +++ b/airflow/models/errors.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sqlalchemy import Integer, Column, String, Text + +from airflow.models.base import Base +from airflow.utils.sqlalchemy import UtcDateTime + + +class ImportError(Base): + __tablename__ = "import_error" + id = Column(Integer, primary_key=True) + timestamp = Column(UtcDateTime) + filename = Column(String(1024)) + stacktrace = Column(Text) diff --git a/airflow/models/kubernetes.py b/airflow/models/kubernetes.py new file mode 100644 index 0000000000000..a18689eefd316 --- /dev/null +++ b/airflow/models/kubernetes.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
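The new `ImportError` model gives DAG-file import failures a home in the metadata database. A hedged sketch of listing the recorded errors, assuming an initialized Airflow metadata DB and the module paths introduced by this change; the alias avoids shadowing the Python builtin of the same name:

```python
from airflow.models.errors import ImportError as AirflowImportError
from airflow.utils.db import provide_session


@provide_session
def list_import_errors(session=None):
    """Print filename and stack trace for every recorded DAG import error."""
    for err in session.query(AirflowImportError).all():
        print("{}: {}".format(err.filename, err.stacktrace))


if __name__ == "__main__":
    list_import_errors()
```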
+ +import uuid + +from sqlalchemy import Column, Boolean, true as sqltrue, String + +from airflow.models.base import Base +from airflow.utils.db import provide_session + + +class KubeResourceVersion(Base): + __tablename__ = "kube_resource_version" + one_row_id = Column(Boolean, server_default=sqltrue(), primary_key=True) + resource_version = Column(String(255)) + + @staticmethod + @provide_session + def get_current_resource_version(session=None): + (resource_version,) = session.query(KubeResourceVersion.resource_version).one() + return resource_version + + @staticmethod + @provide_session + def checkpoint_resource_version(resource_version, session=None): + if resource_version: + session.query(KubeResourceVersion).update({ + KubeResourceVersion.resource_version: resource_version + }) + session.commit() + + @staticmethod + @provide_session + def reset_resource_version(session=None): + session.query(KubeResourceVersion).update({ + KubeResourceVersion.resource_version: '0' + }) + session.commit() + return '0' + + +class KubeWorkerIdentifier(Base): + __tablename__ = "kube_worker_uuid" + one_row_id = Column(Boolean, server_default=sqltrue(), primary_key=True) + worker_uuid = Column(String(255)) + + @staticmethod + @provide_session + def get_or_create_current_kube_worker_uuid(session=None): + (worker_uuid,) = session.query(KubeWorkerIdentifier.worker_uuid).one() + if worker_uuid == '': + worker_uuid = str(uuid.uuid4()) + KubeWorkerIdentifier.checkpoint_kube_worker_uuid(worker_uuid, session) + return worker_uuid + + @staticmethod + @provide_session + def checkpoint_kube_worker_uuid(worker_uuid, session=None): + if worker_uuid: + session.query(KubeWorkerIdentifier).update({ + KubeWorkerIdentifier.worker_uuid: worker_uuid + }) + session.commit() diff --git a/airflow/models/log.py b/airflow/models/log.py new file mode 100644 index 0000000000000..fa3fed60ee240 --- /dev/null +++ b/airflow/models/log.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
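`KubeResourceVersion` and `KubeWorkerIdentifier` are single-row checkpoint tables: the Kubernetes executor reads the last seen watch resource version (or worker UUID) and writes a new one after processing. A hedged usage sketch against an initialized Airflow metadata DB, using only the static methods defined in the hunk above:

```python
from airflow.models.kubernetes import KubeResourceVersion, KubeWorkerIdentifier

# Read the last checkpointed Kubernetes watch resource version ('0' after a reset).
current = KubeResourceVersion.get_current_resource_version()

# Persist a newer resource version seen by the watcher (falsy values are ignored).
KubeResourceVersion.checkpoint_resource_version('12345')

# Get (or lazily create) a stable UUID identifying this worker deployment.
worker_uuid = KubeWorkerIdentifier.get_or_create_current_kube_worker_uuid()
print(current, worker_uuid)
```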
+ +from sqlalchemy import Column, Integer, String, Text, Index + +from airflow.models.base import Base, ID_LEN +from airflow.utils import timezone +from airflow.utils.sqlalchemy import UtcDateTime + + +class Log(Base): + """ + Used to actively log events to the database + """ + + __tablename__ = "log" + + id = Column(Integer, primary_key=True) + dttm = Column(UtcDateTime) + dag_id = Column(String(ID_LEN)) + task_id = Column(String(ID_LEN)) + event = Column(String(30)) + execution_date = Column(UtcDateTime) + owner = Column(String(500)) + extra = Column(Text) + + __table_args__ = ( + Index('idx_log_dag', dag_id), + ) + + def __init__(self, event, task_instance, owner=None, extra=None, **kwargs): + self.dttm = timezone.utcnow() + self.event = event + self.extra = extra + + task_owner = None + + if task_instance: + self.dag_id = task_instance.dag_id + self.task_id = task_instance.task_id + self.execution_date = task_instance.execution_date + task_owner = task_instance.task.owner + + if 'task_id' in kwargs: + self.task_id = kwargs['task_id'] + if 'dag_id' in kwargs: + self.dag_id = kwargs['dag_id'] + if 'execution_date' in kwargs: + if kwargs['execution_date']: + self.execution_date = kwargs['execution_date'] + + self.owner = owner or task_owner diff --git a/airflow/models/skipmixin.py b/airflow/models/skipmixin.py new file mode 100644 index 0000000000000..c0adbd20aa4a4 --- /dev/null +++ b/airflow/models/skipmixin.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.models import TaskInstance +from airflow.utils import timezone +from airflow.utils.db import provide_session +from airflow.utils.log.logging_mixin import LoggingMixin +from airflow.utils.state import State + + +class SkipMixin(LoggingMixin): + @provide_session + def skip(self, dag_run, execution_date, tasks, session=None): + """ + Sets tasks instances to skipped from the same dag run. 
+ + :param dag_run: the DagRun for which to set the tasks to skipped + :param execution_date: execution_date + :param tasks: tasks to skip (not task_ids) + :param session: db session to use + """ + if not tasks: + return + + task_ids = [d.task_id for d in tasks] + now = timezone.utcnow() + + if dag_run: + session.query(TaskInstance).filter( + TaskInstance.dag_id == dag_run.dag_id, + TaskInstance.execution_date == dag_run.execution_date, + TaskInstance.task_id.in_(task_ids) + ).update({TaskInstance.state: State.SKIPPED, + TaskInstance.start_date: now, + TaskInstance.end_date: now}, + synchronize_session=False) + session.commit() + else: + assert execution_date is not None, "Execution date is None and no dag run" + + self.log.warning("No DAG RUN present this should not happen") + # this is defensive against dag runs that are not complete + for task in tasks: + ti = TaskInstance(task, execution_date=execution_date) + ti.state = State.SKIPPED + ti.start_date = now + ti.end_date = now + session.merge(ti) + + session.commit() diff --git a/airflow/models/slamiss.py b/airflow/models/slamiss.py new file mode 100644 index 0000000000000..0981be886ea01 --- /dev/null +++ b/airflow/models/slamiss.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sqlalchemy import Boolean, Column, String, Index, Text + +from airflow.models.base import Base, ID_LEN +from airflow.utils.sqlalchemy import UtcDateTime + + +class SlaMiss(Base): + """ + Model that stores a history of the SLA that have been missed. + It is used to keep track of SLA failures over time and to avoid double + triggering alert emails. + """ + __tablename__ = "sla_miss" + + task_id = Column(String(ID_LEN), primary_key=True) + dag_id = Column(String(ID_LEN), primary_key=True) + execution_date = Column(UtcDateTime, primary_key=True) + email_sent = Column(Boolean, default=False) + timestamp = Column(UtcDateTime) + description = Column(Text) + notification_sent = Column(Boolean, default=False) + + __table_args__ = ( + Index('sm_dag', dag_id, unique=False), + ) + + def __repr__(self): + return str(( + self.dag_id, self.task_id, self.execution_date.isoformat())) diff --git a/airflow/models/taskfail.py b/airflow/models/taskfail.py new file mode 100755 index 0000000000000..d9cc7af922730 --- /dev/null +++ b/airflow/models/taskfail.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
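`SkipMixin.skip` flips task instances to the SKIPPED state in bulk, via the DagRun when one exists and otherwise one merged TaskInstance at a time. A hedged sketch of a custom operator that skips all of its direct downstream tasks at runtime, in the spirit of a short-circuit; the operator name and condition are illustrative, while the `skip()` signature and the context keys match the hunks in this diff:

```python
from airflow.models import BaseOperator
from airflow.models.skipmixin import SkipMixin
from airflow.utils.decorators import apply_defaults


class SkipDownstreamOperator(BaseOperator, SkipMixin):
    """Skip every direct downstream task unless ``condition`` is truthy."""

    @apply_defaults
    def __init__(self, condition, *args, **kwargs):
        super(SkipDownstreamOperator, self).__init__(*args, **kwargs)
        self.condition = condition

    def execute(self, context):
        if self.condition:
            self.log.info("Condition met, proceeding")
            return
        downstream_tasks = context['task'].downstream_list
        self.log.info("Condition not met, skipping %s", downstream_tasks)
        if downstream_tasks:
            self.skip(context['dag_run'],
                      context['ti'].execution_date,
                      downstream_tasks)
```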
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sqlalchemy import Column, Index, Integer, String + +from airflow.models.base import Base, ID_LEN +from airflow.utils.sqlalchemy import UtcDateTime + + +class TaskFail(Base): + """ + TaskFail tracks the failed run durations of each task instance. + """ + + __tablename__ = "task_fail" + + id = Column(Integer, primary_key=True) + task_id = Column(String(ID_LEN), nullable=False) + dag_id = Column(String(ID_LEN), nullable=False) + execution_date = Column(UtcDateTime, nullable=False) + start_date = Column(UtcDateTime) + end_date = Column(UtcDateTime) + duration = Column(Integer) + + __table_args__ = ( + Index('idx_task_fail_dag_task_date', dag_id, task_id, execution_date, + unique=False), + ) + + def __init__(self, task, execution_date, start_date, end_date): + self.dag_id = task.dag_id + self.task_id = task.task_id + self.execution_date = execution_date + self.start_date = start_date + self.end_date = end_date + if self.end_date and self.start_date: + self.duration = (self.end_date - self.start_date).total_seconds() + else: + self.duration = None diff --git a/airflow/models/taskreschedule.py b/airflow/models/taskreschedule.py new file mode 100755 index 0000000000000..4bdb6431f1ae4 --- /dev/null +++ b/airflow/models/taskreschedule.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from sqlalchemy import Column, ForeignKeyConstraint, Index, Integer, String, asc + +from airflow.models.base import Base, ID_LEN +from airflow.utils.db import provide_session +from airflow.utils.sqlalchemy import UtcDateTime + + +class TaskReschedule(Base): + """ + TaskReschedule tracks rescheduled task instances. 
+ """ + + __tablename__ = "task_reschedule" + + id = Column(Integer, primary_key=True) + task_id = Column(String(ID_LEN), nullable=False) + dag_id = Column(String(ID_LEN), nullable=False) + execution_date = Column(UtcDateTime, nullable=False) + try_number = Column(Integer, nullable=False) + start_date = Column(UtcDateTime, nullable=False) + end_date = Column(UtcDateTime, nullable=False) + duration = Column(Integer, nullable=False) + reschedule_date = Column(UtcDateTime, nullable=False) + + __table_args__ = ( + Index('idx_task_reschedule_dag_task_date', dag_id, task_id, execution_date, + unique=False), + ForeignKeyConstraint([task_id, dag_id, execution_date], + ['task_instance.task_id', 'task_instance.dag_id', + 'task_instance.execution_date'], + name='task_reschedule_dag_task_date_fkey', + ondelete='CASCADE') + ) + + def __init__(self, task, execution_date, try_number, start_date, end_date, + reschedule_date): + self.dag_id = task.dag_id + self.task_id = task.task_id + self.execution_date = execution_date + self.try_number = try_number + self.start_date = start_date + self.end_date = end_date + self.reschedule_date = reschedule_date + self.duration = (self.end_date - self.start_date).total_seconds() + + @staticmethod + @provide_session + def find_for_task_instance(task_instance, session): + """ + Returns all task reschedules for the task instance and try number, + in ascending order. + + :param task_instance: the task instance to find task reschedules for + :type task_instance: airflow.models.TaskInstance + """ + TR = TaskReschedule + return ( + session + .query(TR) + .filter(TR.dag_id == task_instance.dag_id, + TR.task_id == task_instance.task_id, + TR.execution_date == task_instance.execution_date, + TR.try_number == task_instance.try_number) + .order_by(asc(TR.id)) + .all() + ) diff --git a/airflow/operators/__init__.py b/airflow/operators/__init__.py index efdfd3ebe2dc1..00f34d038ccfd 100644 --- a/airflow/operators/__init__.py +++ b/airflow/operators/__init__.py @@ -19,7 +19,7 @@ import sys import os -from airflow.models import BaseOperator +from airflow.models import BaseOperator # noqa: F401 # ------------------------------------------------------------------------ # diff --git a/airflow/operators/bash_operator.py b/airflow/operators/bash_operator.py index 37a19dbe9bc31..54c2dfba8907a 100644 --- a/airflow/operators/bash_operator.py +++ b/airflow/operators/bash_operator.py @@ -18,25 +18,31 @@ # under the License. -from builtins import bytes import os import signal from subprocess import Popen, STDOUT, PIPE from tempfile import gettempdir, NamedTemporaryFile +from builtins import bytes + from airflow.exceptions import AirflowException from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults from airflow.utils.file import TemporaryDirectory +from airflow.utils.operator_helpers import context_to_airflow_vars class BashOperator(BaseOperator): """ Execute a Bash script, command or set of commands. + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:BashOperator` + :param bash_command: The command, set of commands or reference to a bash script (must be '.sh') to be executed. (templated) - :type bash_command: string + :type bash_command: str :param xcom_push: If xcom_push is True, the last line written to stdout will also be pushed to an XCom when the bash command completes. 
:type xcom_push: bool @@ -45,7 +51,8 @@ class BashOperator(BaseOperator): of inheriting the current process environment, which is the default behavior. (templated) :type env: dict - :type output_encoding: output encoding of bash command + :param output_encoding: Output encoding of bash command + :type output_encoding: str """ template_fields = ('bash_command', 'env') template_ext = ('.sh', '.bash',) @@ -73,6 +80,16 @@ def execute(self, context): """ self.log.info("Tmp dir root location: \n %s", gettempdir()) + # Prepare env for child process. + if self.env is None: + self.env = os.environ.copy() + airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True) + self.log.info('Exporting the following env vars:\n%s', + '\n'.join(["{}={}".format(k, v) + for k, v in + airflow_context_vars.items()])) + self.env.update(airflow_context_vars) + self.lineage_data = self.bash_command with TemporaryDirectory(prefix='airflowtmp') as tmp_dir: diff --git a/airflow/operators/check_operator.py b/airflow/operators/check_operator.py index 5a31737fd5a6c..e611ce0af683d 100644 --- a/airflow/operators/check_operator.py +++ b/airflow/operators/check_operator.py @@ -19,6 +19,7 @@ from builtins import zip from builtins import str +from typing import Iterable from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook @@ -59,11 +60,11 @@ class CheckOperator(BaseOperator): single record from an external source. :param sql: the sql to be executed. (templated) - :type sql: string + :type sql: str """ - template_fields = ('sql',) - template_ext = ('.hql', '.sql',) + template_fields = ('sql',) # type: Iterable[str] + template_ext = ('.hql', '.sql',) # type: Iterable[str] ui_color = '#fff7e6' @apply_defaults @@ -114,14 +115,14 @@ class ValueCheckOperator(BaseOperator): single record from an external source. :param sql: the sql to be executed. (templated) - :type sql: string + :type sql: str """ __mapper_args__ = { 'polymorphic_identity': 'ValueCheckOperator' } - template_fields = ('sql', 'pass_value',) - template_ext = ('.hql', '.sql',) + template_fields = ('sql', 'pass_value',) # type: Iterable[str] + template_ext = ('.hql', '.sql',) # type: Iterable[str] ui_color = '#fff7e6' @apply_defaults @@ -147,7 +148,7 @@ def execute(self, context=None): is_numeric_value_check = isinstance(pass_value_conv, float) tolerance_pct_str = None - if (self.tol is not None): + if self.tol is not None: tolerance_pct_str = str(self.tol * 100) + '%' except_temp = ("Test failed.\nPass value:{pass_value_conv}\n" @@ -158,7 +159,7 @@ def execute(self, context=None): elif is_numeric_value_check: try: num_rec = [float(r) for r in records] - except (ValueError, TypeError) as e: + except (ValueError, TypeError): cvestr = "Converting a result to float failed.\n" raise AirflowException(cvestr + except_temp.format(**locals())) if self.has_tolerance: @@ -196,8 +197,8 @@ class IntervalCheckOperator(BaseOperator): __mapper_args__ = { 'polymorphic_identity': 'IntervalCheckOperator' } - template_fields = ('sql1', 'sql2') - template_ext = ('.hql', '.sql',) + template_fields = ('sql1', 'sql2') # type: Iterable[str] + template_ext = ('.hql', '.sql',) # type: Iterable[str] ui_color = '#fff7e6' @apply_defaults @@ -233,29 +234,25 @@ def execute(self, context=None): reference = dict(zip(self.metrics_sorted, row2)) ratios = {} test_results = {} - rlog = "Ratio for {0}: {1} \n Ratio threshold : {2}" - fstr = "'{k}' check failed. 
{r} is above {tr}" - estr = "The following tests have failed:\n {0}" - countstr = "The following {j} tests out of {n} failed:" for m in self.metrics_sorted: if current[m] == 0 or reference[m] == 0: ratio = None else: ratio = float(max(current[m], reference[m])) / \ min(current[m], reference[m]) - self.log.info(rlog.format(m, ratio, self.metrics_thresholds[m])) + self.log.info("Ratio for %s: %s \n Ratio threshold : %s", m, ratio, self.metrics_thresholds[m]) ratios[m] = ratio test_results[m] = ratio < self.metrics_thresholds[m] if not all(test_results.values()): failed_tests = [it[0] for it in test_results.items() if not it[1]] j = len(failed_tests) n = len(self.metrics_sorted) - self.log.warning(countstr.format(**locals())) + self.log.warning("The following %s tests out of %s failed:", j, n) for k in failed_tests: self.log.warning( - fstr.format(k=k, r=ratios[k], tr=self.metrics_thresholds[k]) + "'%s' check failed. %s is above %s", k, ratios[k], self.metrics_thresholds[k] ) - raise AirflowException(estr.format(", ".join(failed_tests))) + raise AirflowException("The following tests have failed:\n {0}".format(", ".join(failed_tests))) self.log.info("All tests have passed") def get_db_hook(self): diff --git a/airflow/operators/dagrun_operator.py b/airflow/operators/dagrun_operator.py index 53814af41024d..1bc1979a1afb4 100644 --- a/airflow/operators/dagrun_operator.py +++ b/airflow/operators/dagrun_operator.py @@ -17,6 +17,8 @@ # specific language governing permissions and limitations # under the License. +import datetime +import six from airflow.models import BaseOperator from airflow.utils import timezone from airflow.utils.decorators import apply_defaults @@ -35,7 +37,7 @@ class TriggerDagRunOperator(BaseOperator): """ Triggers a DAG run for a specified ``dag_id`` - :param trigger_dag_id: the dag_id to trigger + :param trigger_dag_id: the dag_id to trigger (templated) :type trigger_dag_id: str :param python_callable: a reference to a python function that will be called while passing it the ``context`` object and a placeholder @@ -47,11 +49,10 @@ class TriggerDagRunOperator(BaseOperator): to your tasks while executing that DAG run. Your function header should look like ``def foo(context, dag_run_obj):`` :type python_callable: python callable - :param execution_date: Execution date for the dag - :type execution_date: datetime.datetime + :param execution_date: Execution date for the dag (templated) + :type execution_date: str or datetime.datetime """ - template_fields = tuple() - template_ext = tuple() + template_fields = ('trigger_dag_id', 'execution_date') ui_color = '#ffefeb' @apply_defaults @@ -64,10 +65,26 @@ def __init__( super(TriggerDagRunOperator, self).__init__(*args, **kwargs) self.python_callable = python_callable self.trigger_dag_id = trigger_dag_id - self.execution_date = execution_date + + if isinstance(execution_date, datetime.datetime): + self.execution_date = execution_date.isoformat() + elif isinstance(execution_date, six.string_types): + self.execution_date = execution_date + elif execution_date is None: + self.execution_date = execution_date + else: + raise TypeError( + 'Expected str or datetime.datetime type ' + 'for execution_date. 
Got {}'.format( + type(execution_date))) def execute(self, context): - dro = DagRunOrder(run_id='trig__' + timezone.utcnow().isoformat()) + if self.execution_date is not None: + run_id = 'trig__{}'.format(self.execution_date) + self.execution_date = timezone.parse(self.execution_date) + else: + run_id = 'trig__' + timezone.utcnow().isoformat() + dro = DagRunOrder(run_id=run_id) if self.python_callable is not None: dro = self.python_callable(context, dro) if dro: diff --git a/airflow/operators/docker_operator.py b/airflow/operators/docker_operator.py index 69dc1ebef71e6..1d550cec8804a 100644 --- a/airflow/operators/docker_operator.py +++ b/airflow/operators/docker_operator.py @@ -43,22 +43,31 @@ class DockerOperator(BaseOperator): be provided with the parameter ``docker_conn_id``. :param image: Docker image from which to create the container. + If image tag is omitted, "latest" will be used. :type image: str :param api_version: Remote API version. Set to ``auto`` to automatically detect the server's version. :type api_version: str + :param auto_remove: Auto-removal of the container on daemon side when the + container's process exits. + The default is False. + :type auto_remove: bool :param command: Command to be run in the container. (templated) :type command: str or list :param cpus: Number of CPUs to assign to the container. This value gets multiplied with 1024. See https://docs.docker.com/engine/reference/run/#cpu-share-constraint :type cpus: float + :param dns: Docker custom DNS servers + :type dns: list[str] + :param dns_search: Docker custom DNS search domain + :type dns_search: list[str] :param docker_url: URL of the host running the docker daemon. Default is unix://var/run/docker.sock :type docker_url: str :param environment: Environment variables to set in the container. (templated) :type environment: dict - :param force_pull: Pull the docker image on every run. Default is false. + :param force_pull: Pull the docker image on every run. Default is False. :type force_pull: bool :param mem_limit: Maximum amount of memory the container can use. Either a float value, which represents the limit in bytes, @@ -99,6 +108,9 @@ class DockerOperator(BaseOperator): :type xcom_all: bool :param docker_conn_id: ID of the Airflow connection to use :type docker_conn_id: str + :param shm_size: Size of ``/dev/shm`` in bytes. The size must be + greater than 0. If omitted uses system default. 
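With the hunk above, `TriggerDagRunOperator` templates both `trigger_dag_id` and `execution_date`, and accepts the latter as a string or a `datetime`. A hedged usage sketch passing the current run's execution date through to the triggered DAG; the DAG ids and schedule are illustrative:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.dagrun_operator import TriggerDagRunOperator

dag = DAG(dag_id='trigger_example',
          start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

trigger = TriggerDagRunOperator(
    task_id='trigger_target',
    trigger_dag_id='target_dag',              # templated since this change
    execution_date='{{ execution_date }}',    # str or datetime are both accepted
    dag=dag,
)
```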
+ :type shm_size: int """ template_fields = ('command', 'environment',) template_ext = ('.sh', '.bash',) @@ -127,13 +139,20 @@ def __init__( xcom_push=False, xcom_all=False, docker_conn_id=None, + dns=None, + dns_search=None, + auto_remove=False, + shm_size=None, *args, **kwargs): super(DockerOperator, self).__init__(*args, **kwargs) self.api_version = api_version + self.auto_remove = auto_remove self.command = command self.cpus = cpus + self.dns = dns + self.dns_search = dns_search self.docker_url = docker_url self.environment = environment or {} self.force_pull = force_pull @@ -152,7 +171,7 @@ def __init__( self.xcom_push_flag = xcom_push self.xcom_all = xcom_all self.docker_conn_id = docker_conn_id - self.shm_size = kwargs.get('shm_size') + self.shm_size = shm_size self.cli = None self.container = None @@ -179,18 +198,12 @@ def execute(self, context): tls=tls_config ) - if ':' not in self.image: - image = self.image + ':latest' - else: - image = self.image - - if self.force_pull or len(self.cli.images(name=image)) == 0: - self.log.info('Pulling docker image %s', image) - for l in self.cli.pull(image, stream=True): - output = json.loads(l.decode('utf-8')) - self.log.info("%s", output['status']) - - cpu_shares = int(round(self.cpus * 1024)) + if self.force_pull or len(self.cli.images(name=self.image)) == 0: + self.log.info('Pulling docker image %s', self.image) + for l in self.cli.pull(self.image, stream=True): + output = json.loads(l.decode('utf-8').strip()) + if 'status' in output: + self.log.info("%s", output['status']) with TemporaryDirectory(prefix='airflowtmp') as host_tmp_dir: self.environment['AIRFLOW_TMP_DIR'] = self.tmp_dir @@ -198,14 +211,17 @@ def execute(self, context): self.container = self.cli.create_container( command=self.get_command(), - cpu_shares=cpu_shares, environment=self.environment, host_config=self.cli.create_host_config( + auto_remove=self.auto_remove, binds=self.volumes, network_mode=self.network_mode, - shm_size=self.shm_size), - image=image, - mem_limit=self.mem_limit, + shm_size=self.shm_size, + dns=self.dns, + dns_search=self.dns_search, + cpu_shares=int(round(self.cpus * 1024)), + mem_limit=self.mem_limit), + image=self.image, user=self.user, working_dir=self.working_dir ) @@ -218,9 +234,9 @@ def execute(self, context): line = line.decode('utf-8') self.log.info(line) - exit_code = self.cli.wait(self.container['Id']) - if exit_code != 0: - raise AirflowException('docker container failed') + result = self.cli.wait(self.container['Id']) + if result['StatusCode'] != 0: + raise AirflowException('docker container failed: ' + repr(result)) if self.xcom_push_flag: return self.cli.logs(container=self.container['Id']) \ diff --git a/airflow/operators/druid_check_operator.py b/airflow/operators/druid_check_operator.py index 73f7915ca5f40..ac18f0567d4ad 100644 --- a/airflow/operators/druid_check_operator.py +++ b/airflow/operators/druid_check_operator.py @@ -47,13 +47,13 @@ class DruidCheckOperator(CheckOperator): This operator can be used as a data quality check in your pipeline, and depending on where you put it in your DAG, you have the choice to stop the critical path, preventing from - publishing dubious data, or on the side and receive email alterts + publishing dubious data, or on the side and receive email alerts without stopping the progress of the DAG. 
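The `DockerOperator` changes above add `auto_remove`, `dns`, `dns_search` and a first-class `shm_size`, and fail on the container's `StatusCode` rather than a bare exit code. A hedged usage sketch; the image, DNS server and sizes are illustrative values:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.docker_operator import DockerOperator

dag = DAG(dag_id='docker_example',
          start_date=datetime(2019, 1, 1),
          schedule_interval=None)

run_in_container = DockerOperator(
    task_id='run_in_container',
    image='python:3.6-slim',
    command='python --version',
    auto_remove=True,                 # remove the container once it exits
    dns=['8.8.8.8'],                  # custom DNS servers for the container
    shm_size=2 * 1024 * 1024 * 1024,  # 2 GiB /dev/shm, in bytes
    force_pull=False,
    dag=dag,
)
```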
:param sql: the sql to be executed - :type sql: string + :type sql: str :param druid_broker_conn_id: reference to the druid broker - :type druid_broker_conn_id: string + :type druid_broker_conn_id: str """ @apply_defaults @@ -83,9 +83,9 @@ def get_first(self, sql): return cur.fetchone() def execute(self, context=None): - self.log.info('Executing SQL check: {}'.format(self.sql)) + self.log.info('Executing SQL check: %s', self.sql) record = self.get_first(self.sql) - self.log.info("Record: {}".format(str(record))) + self.log.info("Record: %s", str(record)) if not record: raise AirflowException("The query returned None") self.log.info("Success.") diff --git a/airflow/operators/dummy_operator.py b/airflow/operators/dummy_operator.py index 025a242fd1b94..222c853b1d1fb 100644 --- a/airflow/operators/dummy_operator.py +++ b/airflow/operators/dummy_operator.py @@ -27,7 +27,6 @@ class DummyOperator(BaseOperator): DAG. """ - template_fields = tuple() ui_color = '#e8f7e4' @apply_defaults diff --git a/airflow/operators/email_operator.py b/airflow/operators/email_operator.py index ae176caecff2a..0ea7fc0cc476a 100644 --- a/airflow/operators/email_operator.py +++ b/airflow/operators/email_operator.py @@ -29,10 +29,10 @@ class EmailOperator(BaseOperator): :param to: list of emails to send the email to. (templated) :type to: list or string (comma or semicolon delimited) :param subject: subject line for the email. (templated) - :type subject: string + :type subject: str :param html_content: content of the email, html markup is allowed. (templated) - :type html_content: string + :type html_content: str :param files: file names to attach in email :type files: list :param cc: list of recipients to be added in CC field @@ -40,10 +40,10 @@ class EmailOperator(BaseOperator): :param bcc: list of recipients to be added in BCC field :type bcc: list or string (comma or semicolon delimited) :param mime_subtype: MIME sub content type - :type mime_subtype: string + :type mime_subtype: str :param mime_charset: character set parameter added to the Content-Type header. - :type mime_charset: string + :type mime_charset: str """ template_fields = ('to', 'subject', 'html_content') diff --git a/airflow/operators/generic_transfer.py b/airflow/operators/generic_transfer.py index 7b1a64ee753d5..61d6692e5ff6b 100644 --- a/airflow/operators/generic_transfer.py +++ b/airflow/operators/generic_transfer.py @@ -41,7 +41,7 @@ class GenericTransfer(BaseOperator): :type destination_conn_id: str :param preoperator: sql statement or list of statements to be executed prior to loading the data. (templated) - :type preoperator: str or list of str + :type preoperator: str or list[str] """ template_fields = ('sql', 'destination_table', 'preoperator') diff --git a/airflow/operators/hive_operator.py b/airflow/operators/hive_operator.py index bd727039b9f52..9cebc386c3192 100644 --- a/airflow/operators/hive_operator.py +++ b/airflow/operators/hive_operator.py @@ -34,9 +34,9 @@ class HiveOperator(BaseOperator): :param hql: the hql to be executed. Note that you may also use a relative path from the dag file of a (template) hive script. (templated) - :type hql: string + :type hql: str :param hive_cli_conn_id: reference to the Hive database. 
(templated) - :type hive_cli_conn_id: string + :type hive_cli_conn_id: str :param hiveconfs: if defined, these key value pairs will be passed to hive as ``-hiveconf "key"="value"`` :type hiveconfs: dict @@ -46,18 +46,18 @@ class HiveOperator(BaseOperator): Note that you may want to use this along with the ``DAG(user_defined_macros=myargs)`` parameter. View the DAG object documentation for more details. - :type hiveconf_jinja_translate: boolean + :type hiveconf_jinja_translate: bool :param script_begin_tag: If defined, the operator will get rid of the part of the script before the first occurrence of `script_begin_tag` :type script_begin_tag: str :param mapred_queue: queue used by the Hadoop CapacityScheduler. (templated) - :type mapred_queue: string + :type mapred_queue: str :param mapred_queue_priority: priority within CapacityScheduler queue. Possible settings include: VERY_HIGH, HIGH, NORMAL, LOW, VERY_LOW - :type mapred_queue_priority: string + :type mapred_queue_priority: str :param mapred_job_name: This name will appear in the jobtracker. This can make monitoring easier. - :type mapred_job_name: string + :type mapred_job_name: str """ template_fields = ('hql', 'schema', 'hive_cli_conn_id', 'mapred_queue', @@ -110,7 +110,7 @@ def get_hook(self): def prepare_template(self): if self.hiveconf_jinja_translate: self.hql = re.sub( - "(\$\{(hiveconf:)?([ a-zA-Z0-9_]*)\})", "{{ \g<3> }}", self.hql) + r"(\$\{(hiveconf:)?([ a-zA-Z0-9_]*)\})", r"{{ \g<3> }}", self.hql) if self.script_begin_tag and self.script_begin_tag in self.hql: self.hql = "\n".join(self.hql.split(self.script_begin_tag)[1:]) diff --git a/airflow/operators/hive_stats_operator.py b/airflow/operators/hive_stats_operator.py index fe83284193757..b0bb874956a92 100644 --- a/airflow/operators/hive_stats_operator.py +++ b/airflow/operators/hive_stats_operator.py @@ -91,8 +91,7 @@ def __init__( def get_default_exprs(self, col, col_type): if col in self.col_blacklist: return {} - d = {} - d[(col, 'non_null')] = "COUNT({col})" + d = {(col, 'non_null'): "COUNT({col})"} if col_type in ['double', 'int', 'bigint', 'float', 'double']: d[(col, 'sum')] = 'SUM({col})' d[(col, 'min')] = 'MIN({col})' diff --git a/airflow/operators/hive_to_druid.py b/airflow/operators/hive_to_druid.py index 9b4a881c5c3d4..7c2fa19c033ed 100644 --- a/airflow/operators/hive_to_druid.py +++ b/airflow/operators/hive_to_druid.py @@ -48,13 +48,15 @@ class HiveToDruidTransfer(BaseOperator): :type metastore_conn_id: str :param hadoop_dependency_coordinates: list of coordinates to squeeze int the ingest json - :type hadoop_dependency_coordinates: list of str + :type hadoop_dependency_coordinates: list[str] :param intervals: list of time intervals that defines segments, this is passed as is to the json object. 
(templated) :type intervals: list :param hive_tblproperties: additional properties for tblproperties in hive for the staging table :type hive_tblproperties: dict + :param job_properties: additional properties for job + :type job_properties: dict """ template_fields = ('sql', 'intervals') @@ -77,6 +79,7 @@ def __init__( query_granularity="NONE", segment_granularity="DAY", hive_tblproperties=None, + job_properties=None, *args, **kwargs): super(HiveToDruidTransfer, self).__init__(*args, **kwargs) self.sql = sql @@ -95,6 +98,7 @@ def __init__( self.druid_ingest_conn_id = druid_ingest_conn_id self.metastore_conn_id = metastore_conn_id self.hive_tblproperties = hive_tblproperties + self.job_properties = job_properties def execute(self, context): hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id) @@ -162,7 +166,7 @@ def construct_ingest_query(self, static_path, columns): :type columns: list """ - # backward compatibilty for num_shards, + # backward compatibility for num_shards, # but target_partition_size is the default setting # and overwrites the num_shards num_shards = self.num_shards @@ -231,6 +235,10 @@ def construct_ingest_query(self, static_path, columns): } } + if self.job_properties: + ingest_query_dict['spec']['tuningConfig']['jobProperties'] \ + .update(self.job_properties) + if self.hadoop_dependency_coordinates: ingest_query_dict['hadoopDependencyCoordinates'] \ = self.hadoop_dependency_coordinates diff --git a/airflow/operators/hive_to_mysql.py b/airflow/operators/hive_to_mysql.py index 4dc25a6e8f8c5..882a9d854016a 100644 --- a/airflow/operators/hive_to_mysql.py +++ b/airflow/operators/hive_to_mysql.py @@ -17,12 +17,13 @@ # specific language governing permissions and limitations # under the License. +from tempfile import NamedTemporaryFile + from airflow.hooks.hive_hooks import HiveServer2Hook from airflow.hooks.mysql_hook import MySqlHook from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults - -from tempfile import NamedTemporaryFile +from airflow.utils.operator_helpers import context_to_airflow_vars class HiveToMySqlTransfer(BaseOperator): @@ -88,7 +89,8 @@ def execute(self, context): if self.bulk_load: tmpfile = NamedTemporaryFile() hive.to_csv(self.sql, tmpfile.name, delimiter='\t', - lineterminator='\n', output_header=False) + lineterminator='\n', output_header=False, + hive_conf=context_to_airflow_vars(context)) else: results = hive.get_records(self.sql) diff --git a/airflow/operators/hive_to_samba_operator.py b/airflow/operators/hive_to_samba_operator.py index f6978ac30e2e6..7963524a106e3 100644 --- a/airflow/operators/hive_to_samba_operator.py +++ b/airflow/operators/hive_to_samba_operator.py @@ -23,6 +23,7 @@ from airflow.hooks.samba_hook import SambaHook from airflow.models import BaseOperator from airflow.utils.decorators import apply_defaults +from airflow.utils.operator_helpers import context_to_airflow_vars class Hive2SambaOperator(BaseOperator): @@ -31,11 +32,11 @@ class Hive2SambaOperator(BaseOperator): results of the query as a csv to a Samba location. :param hql: the hql to be exported. 
(templated) - :type hql: string + :type hql: str :param hiveserver2_conn_id: reference to the hiveserver2 service - :type hiveserver2_conn_id: string + :type hiveserver2_conn_id: str :param samba_conn_id: reference to the samba destination - :type samba_conn_id: string + :type samba_conn_id: str """ template_fields = ('hql', 'destination_filepath') @@ -60,6 +61,7 @@ def execute(self, context): hive = HiveServer2Hook(hiveserver2_conn_id=self.hiveserver2_conn_id) tmpfile = tempfile.NamedTemporaryFile() self.log.info("Fetching file from Hive") - hive.to_csv(hql=self.hql, csv_filepath=tmpfile.name) + hive.to_csv(hql=self.hql, csv_filepath=tmpfile.name, + hive_conf=context_to_airflow_vars(context)) self.log.info("Pushing to samba") samba.push_from_local(self.destination_filepath, tmpfile.name) diff --git a/airflow/operators/http_operator.py b/airflow/operators/http_operator.py index 2cfc9c0057c7c..d382b012ab090 100644 --- a/airflow/operators/http_operator.py +++ b/airflow/operators/http_operator.py @@ -27,12 +27,12 @@ class SimpleHttpOperator(BaseOperator): """ Calls an endpoint on an HTTP system to execute an action - :param http_conn_id: The connection to run the sensor against - :type http_conn_id: string + :param http_conn_id: The connection to run the operator against + :type http_conn_id: str :param endpoint: The relative part of the full url. (templated) - :type endpoint: string + :type endpoint: str :param method: The HTTP method to use, default = "POST" - :type method: string + :type method: str :param data: The data to pass. POST-data in POST/PUT and params in the URL for a GET request. (templated) :type data: For POST/PUT, depends on the content-type parameter, @@ -46,6 +46,12 @@ class SimpleHttpOperator(BaseOperator): 'requests' documentation (options to modify timeout, ssl, etc.) :type extra_options: A dictionary of options, where key is string and value depends on the option that's being modified. + :param xcom_push: Push the response to Xcom (default: False). + If xcom_push is True, response of an HTTP request will also + be pushed to an XCom. + :type xcom_push: bool + :param log_response: Log the response (default: False) + :type log_response: bool """ template_fields = ('endpoint', 'data',) @@ -61,11 +67,9 @@ def __init__(self, response_check=None, extra_options=None, xcom_push=False, - http_conn_id='http_default', *args, **kwargs): - """ - If xcom_push is True, response of an HTTP request will also - be pushed to an XCom. - """ + http_conn_id='http_default', + log_response=False, + *args, **kwargs): super(SimpleHttpOperator, self).__init__(*args, **kwargs) self.http_conn_id = http_conn_id self.method = method @@ -75,6 +79,7 @@ def __init__(self, self.response_check = response_check self.extra_options = extra_options or {} self.xcom_push_flag = xcom_push + self.log_response = log_response def execute(self, context): http = HttpHook(self.method, http_conn_id=self.http_conn_id) @@ -85,6 +90,8 @@ def execute(self, context): self.data, self.headers, self.extra_options) + if self.log_response: + self.log.info(response.text) if self.response_check: if not self.response_check(response): raise AirflowException("Response check returned False.") diff --git a/airflow/operators/jdbc_operator.py b/airflow/operators/jdbc_operator.py index 9e7f24dbf6fa3..1c9423d5abd8b 100644 --- a/airflow/operators/jdbc_operator.py +++ b/airflow/operators/jdbc_operator.py @@ -28,12 +28,17 @@ class JdbcOperator(BaseOperator): Requires jaydebeapi. 
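`SimpleHttpOperator` gains a `log_response` flag, while `xcom_push` keeps pushing the response body to XCom. A hedged usage sketch; the connection id and endpoint are illustrative and must exist in your environment:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.http_operator import SimpleHttpOperator

dag = DAG(dag_id='http_example',
          start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

check_api = SimpleHttpOperator(
    task_id='check_api',
    http_conn_id='http_default',
    endpoint='api/v1/health',
    method='GET',
    log_response=True,        # new in this change: log the response body
    xcom_push=True,           # response text also lands in XCom
    response_check=lambda response: response.status_code == 200,
    dag=dag,
)
```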
- :param jdbc_conn_id: reference to a predefined database - :type jdbc_conn_id: string :param sql: the sql code to be executed. (templated) :type sql: Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' + :param jdbc_conn_id: reference to a predefined database + :type jdbc_conn_id: str + :param autocommit: if True, each command is automatically committed. + (default value: False) + :type autocommit: bool + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable """ template_fields = ('sql',) diff --git a/airflow/operators/latest_only_operator.py b/airflow/operators/latest_only_operator.py index 113977491cc4b..6fee50965d42a 100644 --- a/airflow/operators/latest_only_operator.py +++ b/airflow/operators/latest_only_operator.py @@ -17,7 +17,8 @@ # specific language governing permissions and limitations # under the License. -from airflow.models import BaseOperator, SkipMixin +from airflow.models import BaseOperator +from airflow.models.skipmixin import SkipMixin from airflow.utils import timezone diff --git a/airflow/operators/mssql_operator.py b/airflow/operators/mssql_operator.py index 1309be94e4520..086d1e39a9d28 100644 --- a/airflow/operators/mssql_operator.py +++ b/airflow/operators/mssql_operator.py @@ -26,13 +26,18 @@ class MsSqlOperator(BaseOperator): """ Executes sql code in a specific Microsoft SQL database - :param mssql_conn_id: reference to a specific mssql database - :type mssql_conn_id: string :param sql: the sql code to be executed - :type sql: string or string pointing to a template file with .sql + :type sql: str or string pointing to a template file with .sql extension. (templated) + :param mssql_conn_id: reference to a specific mssql database + :type mssql_conn_id: str + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable + :param autocommit: if True, each command is automatically committed. + (default value: False) + :type autocommit: bool :param database: name of database which overwrite defined one in connection - :type database: string + :type database: str """ template_fields = ('sql',) diff --git a/airflow/operators/mysql_operator.py b/airflow/operators/mysql_operator.py index 2b940c785be35..905a43d64b964 100644 --- a/airflow/operators/mysql_operator.py +++ b/airflow/operators/mysql_operator.py @@ -26,14 +26,20 @@ class MySqlOperator(BaseOperator): """ Executes sql code in a specific MySQL database - :param mysql_conn_id: reference to a specific mysql database - :type mysql_conn_id: string - :param sql: the sql code to be executed. (templated) - :type sql: Can receive a str representing a sql statement, - a list of str (sql statements), or reference to a template file. + :param sql: the sql code to be executed. Can receive a str representing a + sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' + (templated) + :type sql: str or list[str] + :param mysql_conn_id: reference to a specific mysql database + :type mysql_conn_id: str + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable + :param autocommit: if True, each command is automatically committed. 
+ (default value: False) + :type autocommit: bool :param database: name of database which overwrite defined one in connection - :type database: string + :type database: str """ template_fields = ('sql',) diff --git a/airflow/operators/mysql_to_hive.py b/airflow/operators/mysql_to_hive.py index 94d660801ff82..edffe9164e577 100644 --- a/airflow/operators/mysql_to_hive.py +++ b/airflow/operators/mysql_to_hive.py @@ -101,6 +101,7 @@ def type_map(cls, mysql_type): d = { t.BIT: 'INT', t.DECIMAL: 'DOUBLE', + t.NEWDECIMAL: 'DOUBLE', t.DOUBLE: 'DOUBLE', t.FLOAT: 'DOUBLE', t.INT24: 'INT', @@ -122,7 +123,8 @@ def execute(self, context): cursor = conn.cursor() cursor.execute(self.sql) with NamedTemporaryFile("wb") as f: - csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8") + csv_writer = csv.writer(f, delimiter=self.delimiter, + encoding="utf-8") field_dict = OrderedDict() for field in cursor.description: field_dict[field[0]] = self.type_map(field[1]) diff --git a/airflow/operators/oracle_operator.py b/airflow/operators/oracle_operator.py index 84820c079013e..5cb62ef8a3455 100644 --- a/airflow/operators/oracle_operator.py +++ b/airflow/operators/oracle_operator.py @@ -25,12 +25,19 @@ class OracleOperator(BaseOperator): """ Executes sql code in a specific Oracle database - :param oracle_conn_id: reference to a specific Oracle database - :type oracle_conn_id: string - :param sql: the sql code to be executed. (templated) - :type sql: Can receive a str representing a sql statement, + + :param sql: the sql code to be executed. Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' + (templated) + :type sql: str or list[str] + :param oracle_conn_id: reference to a specific Oracle database + :type oracle_conn_id: str + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable + :param autocommit: if True, each command is automatically committed. + (default value: False) + :type autocommit: bool """ template_fields = ('sql',) diff --git a/airflow/operators/pig_operator.py b/airflow/operators/pig_operator.py index d22f19f70921e..3426ec522e4b6 100644 --- a/airflow/operators/pig_operator.py +++ b/airflow/operators/pig_operator.py @@ -29,15 +29,15 @@ class PigOperator(BaseOperator): Executes pig script. :param pig: the pig latin script to be executed. (templated) - :type pig: string + :type pig: str :param pig_cli_conn_id: reference to the Hive database - :type pig_cli_conn_id: string + :type pig_cli_conn_id: str :param pigparams_jinja_translate: when True, pig params-type templating ${var} gets translated into jinja-type templating {{ var }}. Note that you may want to use this along with the ``DAG(user_defined_macros=myargs)`` parameter. View the DAG object documentation for more details. 
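The SQL operator docstrings above now spell out `parameters` and `autocommit`. A hedged sketch of `MySqlOperator` combining a templated statement with bound parameters; the table, retention window and connection id are illustrative:

```python
from datetime import datetime

from airflow import DAG
from airflow.operators.mysql_operator import MySqlOperator

dag = DAG(dag_id='mysql_example',
          start_date=datetime(2019, 1, 1),
          schedule_interval='@daily')

purge_old_rows = MySqlOperator(
    task_id='purge_old_rows',
    mysql_conn_id='mysql_default',
    # ``sql`` is templated; the %(...)s placeholder is bound from ``parameters``.
    sql="DELETE FROM event_log "
        "WHERE created_at < DATE_SUB('{{ ds }}', INTERVAL %(retention_days)s DAY)",
    parameters={'retention_days': 30},
    autocommit=True,
    dag=dag,
)
```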
- :type pigparams_jinja_translate: boolean + :type pigparams_jinja_translate: bool """ template_fields = ('pig',) @@ -62,7 +62,7 @@ def get_hook(self): def prepare_template(self): if self.pigparams_jinja_translate: self.pig = re.sub( - "(\$([a-zA-Z_][a-zA-Z0-9_]*))", "{{ \g<2> }}", self.pig) + r"(\$([a-zA-Z_][a-zA-Z0-9_]*))", r"{{ \g<2> }}", self.pig) def execute(self, context): self.log.info('Executing: %s', self.pig) diff --git a/airflow/operators/postgres_operator.py b/airflow/operators/postgres_operator.py index 5ff6e9e643278..8f2aaf553dc1c 100644 --- a/airflow/operators/postgres_operator.py +++ b/airflow/operators/postgres_operator.py @@ -25,14 +25,19 @@ class PostgresOperator(BaseOperator): """ Executes sql code in a specific Postgres database - :param postgres_conn_id: reference to a specific postgres database - :type postgres_conn_id: string :param sql: the sql code to be executed. (templated) :type sql: Can receive a str representing a sql statement, a list of str (sql statements), or reference to a template file. Template reference are recognized by str ending in '.sql' + :param postgres_conn_id: reference to a specific postgres database + :type postgres_conn_id: str + :param autocommit: if True, each command is automatically committed. + (default value: False) + :type autocommit: bool + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable :param database: name of database which overwrite defined one in connection - :type database: string + :type database: str """ template_fields = ('sql',) diff --git a/airflow/operators/presto_check_operator.py b/airflow/operators/presto_check_operator.py index 608aebfe2966a..d70dcaa7d25af 100644 --- a/airflow/operators/presto_check_operator.py +++ b/airflow/operators/presto_check_operator.py @@ -48,13 +48,13 @@ class PrestoCheckOperator(CheckOperator): This operator can be used as a data quality check in your pipeline, and depending on where you put it in your DAG, you have the choice to stop the critical path, preventing from - publishing dubious data, or on the side and receive email alterts + publishing dubious data, or on the side and receive email alerts without stopping the progress of the DAG. :param sql: the sql to be executed - :type sql: string + :type sql: str :param presto_conn_id: reference to the Presto database - :type presto_conn_id: string + :type presto_conn_id: str """ @apply_defaults @@ -76,9 +76,9 @@ class PrestoValueCheckOperator(ValueCheckOperator): Performs a simple value check using sql code. :param sql: the sql to be executed - :type sql: string + :type sql: str :param presto_conn_id: reference to the Presto database - :type presto_conn_id: string + :type presto_conn_id: str """ @apply_defaults @@ -108,7 +108,7 @@ class PrestoIntervalCheckOperator(IntervalCheckOperator): :param metrics_threshold: a dictionary of ratios indexed by metrics :type metrics_threshold: dict :param presto_conn_id: reference to the Presto database - :type presto_conn_id: string + :type presto_conn_id: str """ @apply_defaults diff --git a/airflow/operators/python_operator.py b/airflow/operators/python_operator.py index 88f3b1a8a6347..67cf5492656e2 100644 --- a/airflow/operators/python_operator.py +++ b/airflow/operators/python_operator.py @@ -17,35 +17,42 @@ # specific language governing permissions and limitations # under the License. 
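The raw-string fix to `prepare_template` above is the `${var}` to `{{ var }}` translation applied when `pigparams_jinja_translate` (and its Hive counterpart) is enabled. A standalone demonstration of the pig-side regex, runnable with only the standard library:

```python
import re

# Same pattern as PigOperator.prepare_template in the hunk above.
PIG_PARAM = r"(\$([a-zA-Z_][a-zA-Z0-9_]*))"


def pigparams_to_jinja(script):
    """Rewrite pig ``$param`` references into jinja ``{{ param }}`` templating."""
    return re.sub(PIG_PARAM, r"{{ \g<2> }}", script)


if __name__ == "__main__":
    pig = "data = LOAD '$input_path' USING PigStorage(',');"
    print(pigparams_to_jinja(pig))
    # -> data = LOAD '{{ input_path }}' USING PigStorage(',');
```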
-from builtins import str -import dill import inspect import os import pickle import subprocess import sys import types +from builtins import str +from textwrap import dedent + +import dill +import six from airflow.exceptions import AirflowException -from airflow.models import BaseOperator, SkipMixin +from airflow.models import BaseOperator +from airflow.models.skipmixin import SkipMixin from airflow.utils.decorators import apply_defaults from airflow.utils.file import TemporaryDirectory - -from textwrap import dedent +from airflow.utils.operator_helpers import context_to_airflow_vars class PythonOperator(BaseOperator): """ Executes a Python callable + .. seealso:: + For more information on how to use this operator, take a look at the guide: + :ref:`howto/operator:PythonOperator` + :param python_callable: A reference to an object that is callable :type python_callable: python callable :param op_kwargs: a dictionary of keyword arguments that will get unpacked in your function - :type op_kwargs: dict + :type op_kwargs: dict (templated) :param op_args: a list of positional arguments that will get unpacked when calling your callable - :type op_args: list + :type op_args: list (templated) :param provide_context: if set to true, Airflow will pass a set of keyword arguments that can be used in your function. This set of kwargs correspond exactly to what you can use in your jinja @@ -56,15 +63,18 @@ class PythonOperator(BaseOperator): will get templated by the Airflow engine sometime between ``__init__`` and ``execute`` takes place and are made available in your callable's context after the template has been applied. (templated) - :type templates_dict: dict of str + :type templates_dict: dict[str] :param templates_exts: a list of file extensions to resolve while processing templated fields, for examples ``['.sql', '.hql']`` - :type templates_exts: list(str) + :type templates_exts: list[str] """ - template_fields = ('templates_dict',) - template_ext = tuple() + template_fields = ('templates_dict', 'op_args', 'op_kwargs') ui_color = '#ffefeb' + # since we won't mutate the arguments, we should just do the shallow copy + # there are some cases we can't deepcopy the objects(e.g protobuf). + shallow_copy_attrs = ('python_callable', 'op_kwargs',) + @apply_defaults def __init__( self, @@ -87,6 +97,13 @@ def __init__( self.template_ext = templates_exts def execute(self, context): + # Export context to make it available for callables to use. + airflow_context_vars = context_to_airflow_vars(context, in_env_var_format=True) + self.log.info("Exporting the following env vars:\n%s", + '\n'.join(["{}={}".format(k, v) + for k, v in airflow_context_vars.items()])) + os.environ.update(airflow_context_vars) + if self.provide_context: context.update(self.op_kwargs) context['templates_dict'] = self.templates_dict @@ -102,14 +119,14 @@ def execute_callable(self): class BranchPythonOperator(PythonOperator, SkipMixin): """ - Allows a workflow to "branch" or follow a single path following the - execution of this task. + Allows a workflow to "branch" or follow a path following the execution + of this task. It derives the PythonOperator and expects a Python function that returns - the task_id to follow. The task_id returned should point to a task - directly downstream from {self}. All other "branches" or - directly downstream tasks are marked with a state of ``skipped`` so that - these paths can't move forward. The ``skipped`` states are propageted + a single task_id or list of task_ids to follow. 
The task_id(s) returned + should point to a task directly downstream from {self}. All other "branches" + or directly downstream tasks are marked with a state of ``skipped`` so that + these paths can't move forward. The ``skipped`` states are propagated downstream to allow for the DAG state to fill up and the DAG run's state to be inferred. @@ -121,14 +138,26 @@ class BranchPythonOperator(PythonOperator, SkipMixin): """ def execute(self, context): branch = super(BranchPythonOperator, self).execute(context) + if isinstance(branch, six.string_types): + branch = [branch] self.log.info("Following branch %s", branch) self.log.info("Marking other directly downstream tasks as skipped") downstream_tasks = context['task'].downstream_list self.log.debug("Downstream task_ids %s", downstream_tasks) - skip_tasks = [t for t in downstream_tasks if t.task_id != branch] if downstream_tasks: + # Also check downstream tasks of the branch task. In case the task to skip + # is a downstream task of the branch task, we exclude it from skipping. + branch_downstream_task_ids = set() + for b in branch: + branch_downstream_task_ids.update(context["dag"]. + get_task(b). + get_flat_relative_ids(upstream=False)) + skip_tasks = [t + for t in downstream_tasks + if t.task_id not in branch and + t.task_id not in branch_downstream_task_ids] self.skip(context['dag_run'], context['ti'].execution_date, skip_tasks) self.log.info("Done.") @@ -176,7 +205,6 @@ class PythonVirtualenvOperator(PythonOperator): variable named virtualenv_string_args will be available (populated by string_args). In addition, one can pass stuff through op_args and op_kwargs, and one can use a return value. - Note that if your virtualenv runs in a different Python major version than Airflow, you cannot use return values, op_args, or op_kwargs. You can use string_args though. @@ -184,7 +212,7 @@ class PythonVirtualenvOperator(PythonOperator): defined with def, which will be run in a virtualenv :type python_callable: function :param requirements: A list of requirements as specified in a pip install command - :type requirements: list(str) + :type requirements: list[str] :param python_version: The Python version to run the virtualenv with. Note that both 2 and 2.7 are acceptable forms. :type python_version: str @@ -200,10 +228,16 @@ class PythonVirtualenvOperator(PythonOperator): :type op_kwargs: list :param op_kwargs: A dict of keyword arguments to pass to python_callable. :type op_kwargs: dict + :param provide_context: if set to true, Airflow will pass a set of + keyword arguments that can be used in your function. This set of + kwargs correspond exactly to what you can use in your jinja + templates. For this to work, you need to define `**kwargs` in your + function header. + :type provide_context: bool :param string_args: Strings that are present in the global var virtualenv_string_args, - available to python_callable at runtime as a list(str). Note that args are split + available to python_callable at runtime as a list[str]. Note that args are split by newline. 
- :type string_args: list(str) + :type string_args: list[str] :param templates_dict: a dictionary where the values are templates that will get templated by the Airflow engine sometime between ``__init__`` and ``execute`` takes place and are made available @@ -211,21 +245,23 @@ class PythonVirtualenvOperator(PythonOperator): :type templates_dict: dict of str :param templates_exts: a list of file extensions to resolve while processing templated fields, for examples ``['.sql', '.hql']`` - :type templates_exts: list(str) + :type templates_exts: list[str] """ + @apply_defaults def __init__(self, python_callable, requirements=None, python_version=None, use_dill=False, system_site_packages=True, - op_args=None, op_kwargs=None, string_args=None, - templates_dict=None, templates_exts=None, *args, **kwargs): + op_args=None, op_kwargs=None, provide_context=False, + string_args=None, templates_dict=None, templates_exts=None, + *args, **kwargs): super(PythonVirtualenvOperator, self).__init__( python_callable=python_callable, op_args=op_args, op_kwargs=op_kwargs, templates_dict=templates_dict, templates_exts=templates_exts, - provide_context=False, + provide_context=provide_context, *args, **kwargs) self.requirements = requirements or [] @@ -289,14 +325,14 @@ def _pass_op_args(self): def _execute_in_subprocess(self, cmd): try: - self.log.info("Executing cmd\n{}".format(cmd)) + self.log.info("Executing cmd\n%s", cmd) output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, close_fds=True) if output: - self.log.info("Got output\n{}".format(output)) + self.log.info("Got output\n%s", output) except subprocess.CalledProcessError as e: - self.log.info("Got error output\n{}".format(e.output)) + self.log.info("Got error output\n%s", e.output) raise def _write_string_args(self, filename): @@ -351,7 +387,8 @@ def _generate_pip_install_cmd(self, tmp_dir): cmd = ['{}/bin/pip'.format(tmp_dir), 'install'] return cmd + self.requirements - def _generate_python_cmd(self, tmp_dir, script_filename, + @staticmethod + def _generate_python_cmd(tmp_dir, script_filename, input_filename, output_filename, string_args_filename): # direct path alleviates need to activate return ['{}/bin/python'.format(tmp_dir), script_filename, diff --git a/airflow/operators/redshift_to_s3_operator.py b/airflow/operators/redshift_to_s3_operator.py index 9c1b621dae965..c0c4db3503467 100644 --- a/airflow/operators/redshift_to_s3_operator.py +++ b/airflow/operators/redshift_to_s3_operator.py @@ -28,17 +28,28 @@ class RedshiftToS3Transfer(BaseOperator): Executes an UNLOAD command to s3 as a CSV with headers :param schema: reference to a specific schema in redshift database - :type schema: string + :type schema: str :param table: reference to a specific table in redshift database - :type table: string + :type table: str :param s3_bucket: reference to a specific S3 bucket - :type s3_bucket: string + :type s3_bucket: str :param s3_key: reference to a specific S3 key - :type s3_key: string + :type s3_key: str :param redshift_conn_id: reference to a specific redshift database - :type redshift_conn_id: string + :type redshift_conn_id: str :param aws_conn_id: reference to a specific S3 connection - :type aws_conn_id: string + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. 
SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str :param unload_options: reference to a list of UNLOAD options :type unload_options: list """ @@ -56,9 +67,9 @@ def __init__( s3_key, redshift_conn_id='redshift_default', aws_conn_id='aws_default', + verify=None, unload_options=tuple(), autocommit=False, - parameters=None, include_header=False, *args, **kwargs): super(RedshiftToS3Transfer, self).__init__(*args, **kwargs) @@ -68,9 +79,9 @@ def __init__( self.s3_key = s3_key self.redshift_conn_id = redshift_conn_id self.aws_conn_id = aws_conn_id + self.verify = verify self.unload_options = unload_options self.autocommit = autocommit - self.parameters = parameters self.include_header = include_header if self.include_header and \ @@ -79,7 +90,7 @@ def __init__( def execute(self, context): self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) - self.s3 = S3Hook(aws_conn_id=self.aws_conn_id) + self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) credentials = self.s3.get_credentials() unload_options = '\n\t\t\t'.join(self.unload_options) diff --git a/airflow/operators/s3_file_transform_operator.py b/airflow/operators/s3_file_transform_operator.py index 84a6eda0c8101..74c351db86356 100644 --- a/airflow/operators/s3_file_transform_operator.py +++ b/airflow/operators/s3_file_transform_operator.py @@ -19,6 +19,7 @@ from tempfile import NamedTemporaryFile import subprocess +import sys from airflow.exceptions import AirflowException from airflow.hooks.S3_hook import S3Hook @@ -47,6 +48,19 @@ class S3FileTransformOperator(BaseOperator): :type source_s3_key: str :param source_aws_conn_id: source s3 connection :type source_aws_conn_id: str + :param source_verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + + This is also applicable to ``dest_verify``. + :type source_verify: bool or str :param dest_s3_key: The key to be written from S3. 
(templated) :type dest_s3_key: str :param dest_aws_conn_id: destination s3 connection @@ -71,25 +85,32 @@ def __init__( transform_script=None, select_expression=None, source_aws_conn_id='aws_default', + source_verify=None, dest_aws_conn_id='aws_default', + dest_verify=None, replace=False, *args, **kwargs): super(S3FileTransformOperator, self).__init__(*args, **kwargs) self.source_s3_key = source_s3_key self.source_aws_conn_id = source_aws_conn_id + self.source_verify = source_verify self.dest_s3_key = dest_s3_key self.dest_aws_conn_id = dest_aws_conn_id + self.dest_verify = dest_verify self.replace = replace self.transform_script = transform_script self.select_expression = select_expression + self.output_encoding = sys.getdefaultencoding() def execute(self, context): if self.transform_script is None and self.select_expression is None: raise AirflowException( "Either transform_script or select_expression must be specified") - source_s3 = S3Hook(aws_conn_id=self.source_aws_conn_id) - dest_s3 = S3Hook(aws_conn_id=self.dest_aws_conn_id) + source_s3 = S3Hook(aws_conn_id=self.source_aws_conn_id, + verify=self.source_verify) + dest_s3 = S3Hook(aws_conn_id=self.dest_aws_conn_id, + verify=self.dest_verify) self.log.info("Downloading source S3 file %s", self.source_s3_key) if not source_s3.check_for_key(self.source_s3_key): @@ -114,15 +135,23 @@ def execute(self, context): f_source.flush() if self.transform_script is not None: - transform_script_process = subprocess.Popen( + process = subprocess.Popen( [self.transform_script, f_source.name, f_dest.name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) - (transform_script_stdoutdata, transform_script_stderrdata) = \ - transform_script_process.communicate() - self.log.info("Transform script stdout %s", transform_script_stdoutdata) - if transform_script_process.returncode > 0: + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + close_fds=True + ) + + self.log.info("Output:") + for line in iter(process.stdout.readline, b''): + self.log.info(line.decode(self.output_encoding).rstrip()) + + process.wait() + + if process.returncode > 0: raise AirflowException( - "Transform script failed %s", transform_script_stderrdata) + "Transform script failed: {0}".format(process.returncode) + ) else: self.log.info( "Transform script successful. Output temporarily located at %s", diff --git a/airflow/operators/s3_to_hive_operator.py b/airflow/operators/s3_to_hive_operator.py index 09eb8363c0dc5..33aa5b0103627 100644 --- a/airflow/operators/s3_to_hive_operator.py +++ b/airflow/operators/s3_to_hive_operator.py @@ -78,6 +78,17 @@ class S3ToHiveTransfer(BaseOperator): :type delimiter: str :param aws_conn_id: source s3 connection :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. 
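To illustrate the new source_verify/dest_verify arguments on S3FileTransformOperator (whose transform-script output is now streamed to the task log line by line), a hedged sketch; the bucket names, keys, script path and CA bundle location are assumptions, not part of this patch.

from airflow import DAG
from airflow.operators.s3_file_transform_operator import S3FileTransformOperator
from airflow.utils.dates import days_ago

with DAG('s3_transform_example', start_date=days_ago(1), schedule_interval='@daily') as dag:
    transform = S3FileTransformOperator(
        task_id='transform_report',
        source_s3_key='s3://my-source-bucket/raw/{{ ds }}/report.csv',
        dest_s3_key='s3://my-dest-bucket/clean/{{ ds }}/report.csv',
        transform_script='/usr/local/bin/clean_csv.sh',
        # New in this change: per-connection SSL verification control.
        source_verify='/etc/ssl/certs/internal-ca.pem',  # custom CA bundle (assumed path)
        dest_verify=False,                               # skip certificate verification
        replace=True,
    )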
+ :type verify: bool or str :param hive_cli_conn_id: destination hive connection :type hive_cli_conn_id: str :param input_compressed: Boolean to determine if file decompression is @@ -107,6 +118,7 @@ def __init__( check_headers=False, wildcard_match=False, aws_conn_id='aws_default', + verify=None, hive_cli_conn_id='hive_cli_default', input_compressed=False, tblproperties=None, @@ -125,6 +137,7 @@ def __init__( self.wildcard_match = wildcard_match self.hive_cli_conn_id = hive_cli_conn_id self.aws_conn_id = aws_conn_id + self.verify = verify self.input_compressed = input_compressed self.tblproperties = tblproperties self.select_expression = select_expression @@ -136,7 +149,7 @@ def __init__( def execute(self, context): # Downloading file from S3 - self.s3 = S3Hook(aws_conn_id=self.aws_conn_id) + self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id) self.log.info("Downloading S3 file") @@ -153,7 +166,7 @@ def execute(self, context): root, file_ext = os.path.splitext(s3_key_object.key) if (self.select_expression and self.input_compressed and - file_ext != '.gz'): + file_ext.lower() != '.gz'): raise AirflowException("GZIP is the only compression " + "format Amazon S3 Select supports") @@ -161,8 +174,9 @@ def execute(self, context): NamedTemporaryFile(mode="wb", dir=tmp_dir, suffix=file_ext) as f: - self.log.info("Dumping S3 key {0} contents to local file {1}" - .format(s3_key_object.key, f.name)) + self.log.info( + "Dumping S3 key %s contents to local file %s", s3_key_object.key, f.name + ) if self.select_expression: option = {} if self.headers: @@ -245,24 +259,23 @@ def _match_headers(self, header_list): raise AirflowException("Unable to retrieve header row from file") field_names = self.field_dict.keys() if len(field_names) != len(header_list): - self.log.warning("Headers count mismatch" - "File headers:\n {header_list}\n" - "Field names: \n {field_names}\n" - .format(**locals())) + self.log.warning( + "Headers count mismatch File headers:\n %s\nField names: \n %s\n", header_list, field_names + ) return False test_field_match = [h1.lower() == h2.lower() for h1, h2 in zip(header_list, field_names)] if not all(test_field_match): - self.log.warning("Headers do not match field names" - "File headers:\n {header_list}\n" - "Field names: \n {field_names}\n" - .format(**locals())) + self.log.warning( + "Headers do not match field names File headers:\n %s\nField names: \n %s\n", + header_list, field_names + ) return False else: return True + @staticmethod def _delete_top_row_and_compress( - self, input_file_name, output_file_ext, dest_dir): @@ -275,7 +288,7 @@ def _delete_top_row_and_compress( os_fh_output, fn_output = \ tempfile.mkstemp(suffix=output_file_ext, dir=dest_dir) - with open(input_file_name, 'rb') as f_in,\ + with open(input_file_name, 'rb') as f_in, \ open_fn(fn_output, 'wb') as f_out: f_in.seek(0) next(f_in) diff --git a/airflow/operators/s3_to_redshift_operator.py b/airflow/operators/s3_to_redshift_operator.py index 0d7921e9ed0f1..65bc679f243ab 100644 --- a/airflow/operators/s3_to_redshift_operator.py +++ b/airflow/operators/s3_to_redshift_operator.py @@ -28,17 +28,28 @@ class S3ToRedshiftTransfer(BaseOperator): Executes an COPY command to load files from s3 to Redshift :param schema: reference to a specific schema in redshift database - :type schema: string + :type schema: str :param table: reference to a specific table in redshift database - :type table: string + :type table: str :param s3_bucket: reference 
to a specific S3 bucket - :type s3_bucket: string + :type s3_bucket: str :param s3_key: reference to a specific S3 key - :type s3_key: string + :type s3_key: str :param redshift_conn_id: reference to a specific redshift database - :type redshift_conn_id: string + :type redshift_conn_id: str :param aws_conn_id: reference to a specific S3 connection - :type aws_conn_id: string + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str :param copy_options: reference to a list of COPY options :type copy_options: list """ @@ -56,6 +67,7 @@ def __init__( s3_key, redshift_conn_id='redshift_default', aws_conn_id='aws_default', + verify=None, copy_options=tuple(), autocommit=False, parameters=None, @@ -67,13 +79,14 @@ def __init__( self.s3_key = s3_key self.redshift_conn_id = redshift_conn_id self.aws_conn_id = aws_conn_id + self.verify = verify self.copy_options = copy_options self.autocommit = autocommit self.parameters = parameters def execute(self, context): self.hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) - self.s3 = S3Hook(aws_conn_id=self.aws_conn_id) + self.s3 = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) credentials = self.s3.get_credentials() copy_options = '\n\t\t\t'.join(self.copy_options) diff --git a/airflow/operators/slack_operator.py b/airflow/operators/slack_operator.py index c5a69456fbe86..b6fb75a2360e3 100644 --- a/airflow/operators/slack_operator.py +++ b/airflow/operators/slack_operator.py @@ -32,11 +32,11 @@ class SlackAPIOperator(BaseOperator): In the future additional Slack API Operators will be derived from this class as well :param slack_conn_id: Slack connection ID which its password is Slack API token - :type slack_conn_id: string + :type slack_conn_id: str :param token: Slack API token (https://api.slack.com/web) - :type token: string + :type token: str :param method: The Slack API Method to Call (https://api.slack.com/methods) - :type method: string + :type method: str :param api_params: API Method call parameters (https://api.slack.com/methods) :type api_params: dict """ @@ -93,13 +93,13 @@ class SlackAPIPostOperator(SlackAPIOperator): :param channel: channel in which to post message on slack name (#general) or ID (C12318391). (templated) - :type channel: string + :type channel: str :param username: Username that airflow will be posting to Slack as. (templated) - :type username: string + :type username: str :param text: message to send to slack. (templated) - :type text: string + :type text: str :param icon_url: url to icon used for this message - :type icon_url: string + :type icon_url: str :param attachments: extra formatting details. (templated) - see https://api.slack.com/docs/attachments. 
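The same verify option is wired into the Redshift transfer operators above. A brief sketch of S3ToRedshiftTransfer with a custom CA bundle; the schema, table, bucket and bundle path are invented for the example.

from airflow import DAG
from airflow.operators.s3_to_redshift_operator import S3ToRedshiftTransfer
from airflow.utils.dates import days_ago

with DAG('s3_to_redshift_example', start_date=days_ago(1), schedule_interval='@daily') as dag:
    load_events = S3ToRedshiftTransfer(
        task_id='load_events',
        schema='analytics',
        table='events',
        s3_bucket='my-data-bucket',
        s3_key='events',
        copy_options=['CSV', 'IGNOREHEADER 1'],
        # New: forwarded to S3Hook; False disables SSL verification,
        # a path points at an alternative CA bundle.
        verify='/etc/ssl/certs/corp-ca.pem',
    )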
:type attachments: array of hashes @@ -115,8 +115,8 @@ def __init__(self, text='No message has been set.\n' 'Here is a cat video instead\n' 'https://www.youtube.com/watch?v=J---aiyznGQ', - icon_url='https://raw.githubusercontent.com' - '/airbnb/airflow/master/airflow/www/static/pin_100.png', + icon_url='https://raw.githubusercontent.com/apache/' + 'airflow/master/airflow/www/static/pin_100.jpg', attachments=None, *args, **kwargs): self.method = 'chat.postMessage' diff --git a/airflow/operators/sqlite_operator.py b/airflow/operators/sqlite_operator.py index 5b7213ed816d8..fb4a30e350250 100644 --- a/airflow/operators/sqlite_operator.py +++ b/airflow/operators/sqlite_operator.py @@ -26,11 +26,13 @@ class SqliteOperator(BaseOperator): """ Executes sql code in a specific Sqlite database - :param sqlite_conn_id: reference to a specific sqlite database - :type sqlite_conn_id: string :param sql: the sql code to be executed. (templated) - :type sql: string or string pointing to a template file. File must have + :type sql: str or string pointing to a template file. File must have a '.sql' extensions. + :param sqlite_conn_id: reference to a specific sqlite database + :type sqlite_conn_id: str + :param parameters: (optional) the parameters to render the SQL query with. + :type parameters: mapping or iterable """ template_fields = ('sql',) diff --git a/airflow/operators/subdag_operator.py b/airflow/operators/subdag_operator.py index 052095e2a618f..812f486e16428 100644 --- a/airflow/operators/subdag_operator.py +++ b/airflow/operators/subdag_operator.py @@ -25,8 +25,19 @@ class SubDagOperator(BaseOperator): + """ + This runs a sub dag. By convention, a sub dag's dag_id + should be prefixed by its parent and a dot. As in `parent.child`. + + :param subdag: the DAG object to run as a subdag of the current DAG. + :type subdag: airflow.models.DAG + :param dag: the parent DAG for the subdag. + :type dag: airflow.models.DAG + :param executor: the executor for this subdag. Default to use SequentialExecutor. + Please find AIRFLOW-74 for more details. + :type executor: airflow.executors.base_executor.BaseExecutor + """ - template_fields = tuple() ui_color = '#555' ui_fgcolor = '#fff' @@ -37,18 +48,6 @@ def __init__( subdag, executor=SequentialExecutor(), *args, **kwargs): - """ - This runs a sub dag. By convention, a sub dag's dag_id - should be prefixed by its parent and a dot. As in `parent.child`. - - :param subdag: the DAG object to run as a subdag of the current DAG. - :type subdag: airflow.DAG. - :param dag: the parent DAG for the subdag. - :type dag: airflow.DAG. - :param executor: the executor for this subdag. Default to use SequentialExecutor. - Please find AIRFLOW-74 for more details. - :type executor: airflow.executors. - """ import airflow.models dag = kwargs.get('dag') or airflow.models._CONTEXT_MANAGER_DAG if not dag: diff --git a/airflow/plugins_manager.py b/airflow/plugins_manager.py index 735f2de1e8c79..4277b33432077 100644 --- a/airflow/plugins_manager.py +++ b/airflow/plugins_manager.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -27,48 +27,99 @@ import inspect import os import re -import sys +import pkg_resources +from typing import List, Any -from airflow import configuration +from airflow import settings from airflow.utils.log.logging_mixin import LoggingMixin log = LoggingMixin().log +import_errors = {} + + class AirflowPluginException(Exception): pass class AirflowPlugin(object): - name = None - operators = [] - sensors = [] - hooks = [] - executors = [] - macros = [] - admin_views = [] - flask_blueprints = [] - menu_links = [] + name = None # type: str + operators = [] # type: List[Any] + sensors = [] # type: List[Any] + hooks = [] # type: List[Any] + executors = [] # type: List[Any] + macros = [] # type: List[Any] + admin_views = [] # type: List[Any] + flask_blueprints = [] # type: List[Any] + menu_links = [] # type: List[Any] + appbuilder_views = [] # type: List[Any] + appbuilder_menu_items = [] # type: List[Any] @classmethod def validate(cls): if not cls.name: raise AirflowPluginException("Your plugin needs a name.") - -plugins_folder = configuration.conf.get('core', 'plugins_folder') -if not plugins_folder: - plugins_folder = configuration.conf.get('core', 'airflow_home') + '/plugins' -plugins_folder = os.path.expanduser(plugins_folder) - -if plugins_folder not in sys.path: - sys.path.append(plugins_folder) - -plugins = [] + @classmethod + def on_load(cls, *args, **kwargs): + """ + Executed when the plugin is loaded. + This method is only called once during runtime. + + :param args: If future arguments are passed in on call. + :param kwargs: If future arguments are passed in on call. + """ + pass + + +def load_entrypoint_plugins(entry_points, airflow_plugins): + """ + Load AirflowPlugin subclasses from the entrypoints + provided. The entry_point group should be 'airflow.plugins'. + + :param entry_points: A collection of entrypoints to search for plugins + :type entry_points: Generator[setuptools.EntryPoint, None, None] + :param airflow_plugins: A collection of existing airflow plugins to + ensure we don't load duplicates + :type airflow_plugins: list[type[airflow.plugins_manager.AirflowPlugin]] + :rtype: list[airflow.plugins_manager.AirflowPlugin] + """ + for entry_point in entry_points: + log.debug('Importing entry_point plugin %s', entry_point.name) + plugin_obj = entry_point.load() + if is_valid_plugin(plugin_obj, airflow_plugins): + if callable(getattr(plugin_obj, 'on_load', None)): + plugin_obj.on_load() + airflow_plugins.append(plugin_obj) + return airflow_plugins + + +def is_valid_plugin(plugin_obj, existing_plugins): + """ + Check whether a potential object is a subclass of + the AirflowPlugin class. 
+ + :param plugin_obj: potential subclass of AirflowPlugin + :param existing_plugins: Existing list of AirflowPlugin subclasses + :return: Whether or not the obj is a valid subclass of + AirflowPlugin + """ + if ( + inspect.isclass(plugin_obj) and + issubclass(plugin_obj, AirflowPlugin) and + (plugin_obj is not AirflowPlugin) + ): + plugin_obj.validate() + return plugin_obj not in existing_plugins + return False + + +plugins = [] # type: List[AirflowPlugin] norm_pattern = re.compile(r'[/|.]') # Crawl through the plugins folder to find AirflowPlugin derivatives -for root, dirs, files in os.walk(plugins_folder, followlinks=True): +for root, dirs, files in os.walk(settings.PLUGINS_FOLDER, followlinks=True): for f in files: try: filepath = os.path.join(root, f) @@ -85,17 +136,18 @@ def validate(cls): m = imp.load_source(namespace, filepath) for obj in list(m.__dict__.values()): - if ( - inspect.isclass(obj) and - issubclass(obj, AirflowPlugin) and - obj is not AirflowPlugin): - obj.validate() - if obj not in plugins: - plugins.append(obj) + if is_valid_plugin(obj, plugins): + plugins.append(obj) except Exception as e: log.exception(e) log.error('Failed to import plugin %s', filepath) + import_errors[filepath] = str(e) + +plugins = load_entrypoint_plugins( + pkg_resources.iter_entry_points('airflow.plugins'), + plugins +) def make_module(name, objects): @@ -107,6 +159,7 @@ def make_module(name, objects): module.__dict__.update((o.__name__, o) for o in objects) return module + # Plugin components to integrate as modules operators_modules = [] sensors_modules = [] @@ -115,9 +168,11 @@ def make_module(name, objects): macros_modules = [] # Plugin components to integrate directly -admin_views = [] -flask_blueprints = [] -menu_links = [] +admin_views = [] # type: List[Any] +flask_blueprints = [] # type: List[Any] +menu_links = [] # type: List[Any] +flask_appbuilder_views = [] # type: List[Any] +flask_appbuilder_menu_links = [] # type: List[Any] for p in plugins: operators_modules.append( @@ -131,5 +186,10 @@ def make_module(name, objects): macros_modules.append(make_module('airflow.macros.' + p.name, p.macros)) admin_views.extend(p.admin_views) - flask_blueprints.extend(p.flask_blueprints) menu_links.extend(p.menu_links) + flask_appbuilder_views.extend(p.appbuilder_views) + flask_appbuilder_menu_links.extend(p.appbuilder_menu_items) + flask_blueprints.extend([{ + 'name': p.name, + 'blueprint': bp + } for bp in p.flask_blueprints]) diff --git a/airflow/security/kerberos.py b/airflow/security/kerberos.py index 43c9fcccdca85..3fc6aa345d79f 100644 --- a/airflow/security/kerberos.py +++ b/airflow/security/kerberos.py @@ -27,11 +27,13 @@ log = LoggingMixin().log -def renew_from_kt(): +def renew_from_kt(principal, keytab): # The config is specified in seconds. But we ask for that same amount in # minutes to give ourselves a large renewal buffer. 
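With load_entrypoint_plugins above, plugins can now be shipped as installable packages rather than files dropped into the plugins folder. A sketch of such a package follows; the module and distribution names are hypothetical, and only the 'airflow.plugins' entry-point group and the on_load() hook come from the patch.

# my_plugin/plugin.py -- hypothetical module inside an installable package
from airflow.plugins_manager import AirflowPlugin


class MyCompanyPlugin(AirflowPlugin):
    name = 'my_company_plugin'
    # operators, hooks, macros, appbuilder_views, ... can be listed here as usual

    @classmethod
    def on_load(cls, *args, **kwargs):
        # Runs once when the plugin is discovered, via the new on_load() hook.
        print('my_company_plugin loaded')


# setup.py of the same package -- the class is registered under the
# 'airflow.plugins' entry-point group that load_entrypoint_plugins() scans.
from setuptools import setup

setup(
    name='my-airflow-plugin',
    version='0.1.0',
    packages=['my_plugin'],
    entry_points={
        'airflow.plugins': [
            'my_company_plugin = my_plugin.plugin:MyCompanyPlugin',
        ],
    },
)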
+ renewal_lifetime = "%sm" % configuration.conf.getint('kerberos', 'reinit_frequency') - principal = configuration.conf.get('kerberos', 'principal').replace( + + cmd_principal = principal or configuration.conf.get('kerberos', 'principal').replace( "_HOST", socket.getfqdn() ) @@ -39,11 +41,11 @@ def renew_from_kt(): configuration.conf.get('kerberos', 'kinit_path'), "-r", renewal_lifetime, "-k", # host ticket - "-t", configuration.conf.get('kerberos', 'keytab'), # specify keytab + "-t", keytab, # specify keytab "-c", configuration.conf.get('kerberos', 'ccache'), # specify credentials cache - principal + cmd_principal ] - log.info("Reinitting kerberos from keytab: " + " ".join(cmdv)) + log.info("Reinitting kerberos from keytab: %s", " ".join(cmdv)) subp = subprocess.Popen(cmdv, stdout=subprocess.PIPE, @@ -53,10 +55,10 @@ def renew_from_kt(): universal_newlines=True) subp.wait() if subp.returncode != 0: - log.error("Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" % ( - subp.returncode, - b"\n".join(subp.stdout.readlines()), - b"\n".join(subp.stderr.readlines()))) + log.error( + "Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s", + subp.returncode, "\n".join(subp.stdout.readlines()), "\n".join(subp.stderr.readlines()) + ) sys.exit(subp.returncode) global NEED_KRB181_WORKAROUND @@ -66,34 +68,32 @@ def renew_from_kt(): # (From: HUE-640). Kerberos clock have seconds level granularity. Make sure we # renew the ticket after the initial valid time. time.sleep(1.5) - perform_krb181_workaround() + perform_krb181_workaround(principal) -def perform_krb181_workaround(): +def perform_krb181_workaround(principal): cmdv = [configuration.conf.get('kerberos', 'kinit_path'), "-c", configuration.conf.get('kerberos', 'ccache'), "-R"] # Renew ticket_cache - log.info("Renewing kerberos ticket to work around kerberos 1.8.1: " + - " ".join(cmdv)) + log.info( + "Renewing kerberos ticket to work around kerberos 1.8.1: %s", " ".join(cmdv) + ) ret = subprocess.call(cmdv, close_fds=True) if ret != 0: - principal = "%s/%s" % ( - configuration.conf.get('kerberos', 'principal'), - socket.getfqdn() + principal = "%s/%s" % (principal or configuration.conf.get('kerberos', 'principal'), + socket.getfqdn()) + princ = principal + ccache = configuration.conf.get('kerberos', 'principal') + log.error( + "Couldn't renew kerberos ticket in order to work around Kerberos 1.8.1 issue. Please check that " + "the ticket for '%s' is still renewable:\n $ kinit -f -c %s\nIf the 'renew until' date is the " + "same as the 'valid starting' date, the ticket cannot be renewed. Please check your KDC " + "configuration, and the ticket renewal policy (maxrenewlife) for the '%s' and `krbtgt' " + "principals.", princ, ccache, princ ) - fmt_dict = dict(princ=principal, - ccache=configuration.conf.get('kerberos', 'principal')) - log.error("Couldn't renew kerberos ticket in order to work around " - "Kerberos 1.8.1 issue. Please check that the ticket for " - "'%(princ)s' is still renewable:\n" - " $ kinit -f -c %(ccache)s\n" - "If the 'renew until' date is the same as the 'valid starting' " - "date, the ticket cannot be renewed. Please check your KDC " - "configuration, and the ticket renewal policy (maxrenewlife) " - "for the '%(princ)s' and `krbtgt' principals." 
% fmt_dict) sys.exit(ret) @@ -110,11 +110,11 @@ def detect_conf_var(): return b'X-CACHECONF:' in f.read() -def run(): - if configuration.conf.get('kerberos', 'keytab') is None: +def run(principal, keytab): + if not keytab: log.debug("Keytab renewer not starting, no keytab configured") sys.exit(0) while True: - renew_from_kt() + renew_from_kt(principal, keytab) time.sleep(configuration.conf.getint('kerberos', 'reinit_frequency')) diff --git a/airflow/security/utils.py b/airflow/security/utils.py index 8e4fcbd4bfc0e..e10510e24eb71 100644 --- a/airflow/security/utils.py +++ b/airflow/security/utils.py @@ -18,24 +18,9 @@ import re import socket -import airflow.configuration as conf from airflow.utils.net import get_hostname -# Pattern to replace with hostname -HOSTNAME_PATTERN = '_HOST' - - -def get_kerberos_principal(principal, host): - components = get_components(principal) - if not components or len(components) != 3 or components[1] != HOSTNAME_PATTERN: - return principal - else: - if not host: - raise IOError("Can't replace %s pattern " - "since host is null." % HOSTNAME_PATTERN) - return replace_hostname_pattern(components, host) - def get_components(principal): """ @@ -45,39 +30,33 @@ def get_components(principal): """ if not principal: return None - return re.split('[\/@]', str(principal)) + return re.split(r'[\/@]', str(principal)) def replace_hostname_pattern(components, host=None): fqdn = host if not fqdn or fqdn == '0.0.0.0': - fqdn = get_localhost_name() + fqdn = get_hostname() return '%s/%s@%s' % (components[0], fqdn.lower(), components[2]) -def get_localhost_name(): - return get_hostname() - - def get_fqdn(hostname_or_ip=None): # Get hostname try: if hostname_or_ip: fqdn = socket.gethostbyaddr(hostname_or_ip)[0] + if fqdn == 'localhost': + fqdn = get_hostname() else: - fqdn = get_localhost_name() + fqdn = get_hostname() except IOError: fqdn = hostname_or_ip - if fqdn == 'localhost': - fqdn = get_localhost_name() - return fqdn -def principal_from_username(username): - realm = conf.get("security", "default_realm") - if '@' not in username and realm: +def principal_from_username(username, realm): + if ('@' not in username) and realm: username = "{}@{}".format(username, realm) return username diff --git a/airflow/sensors/base_sensor_operator.py b/airflow/sensors/base_sensor_operator.py index 74b0e0fe1ca5b..fe96a36d8b3ec 100644 --- a/airflow/sensors/base_sensor_operator.py +++ b/airflow/sensors/base_sensor_operator.py @@ -19,20 +19,24 @@ from time import sleep +from datetime import timedelta from airflow.exceptions import AirflowException, AirflowSensorTimeout, \ - AirflowSkipException -from airflow.models import BaseOperator, SkipMixin + AirflowSkipException, AirflowRescheduleException +from airflow.models import BaseOperator +from airflow.models.skipmixin import SkipMixin +from airflow.models.taskreschedule import TaskReschedule from airflow.utils import timezone from airflow.utils.decorators import apply_defaults +from airflow.ti_deps.deps.ready_to_reschedule import ReadyToRescheduleDep class BaseSensorOperator(BaseOperator, SkipMixin): """ - Sensor operators are derived from this class an inherit these attributes. + Sensor operators are derived from this class and inherit these attributes. Sensor operators keep executing at a time interval and succeed when - a criteria is met and fail if and when they time out. + a criteria is met and fail if and when they time out. 
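A few illustrative calls against the refactored helpers in airflow/security/utils.py (the principals and realm below are made up); note that principal_from_username now receives the realm explicitly instead of reading default_realm from the configuration.

from airflow.security import utils

utils.get_components('airflow/_HOST@EXAMPLE.COM')
# -> ['airflow', '_HOST', 'EXAMPLE.COM']

utils.replace_hostname_pattern(
    ['airflow', '_HOST', 'EXAMPLE.COM'], host='worker-1.example.com')
# -> 'airflow/worker-1.example.com@EXAMPLE.COM'

utils.principal_from_username('alice', realm='EXAMPLE.COM')
# -> 'alice@EXAMPLE.COM'
utils.principal_from_username('bob@OTHER.COM', realm='EXAMPLE.COM')
# -> 'bob@OTHER.COM' (already fully qualified, returned unchanged)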
:param soft_fail: Set to true to mark the task as SKIPPED on failure :type soft_fail: bool @@ -41,20 +45,51 @@ class BaseSensorOperator(BaseOperator, SkipMixin): :type poke_interval: int :param timeout: Time, in seconds before the task times out and fails. :type timeout: int + :param mode: How the sensor operates. + Options are: ``{ poke | reschedule }``, default is ``poke``. + When set to ``poke`` the sensor is taking up a worker slot for its + whole execution time and sleeps between pokes. Use this mode if the + expected runtime of the sensor is short or if a short poke interval + is requried. + When set to ``reschedule`` the sensor task frees the worker slot when + the criteria is not yet met and it's rescheduled at a later time. Use + this mode if the expected time until the criteria is met is. The poke + interval should be more than one minute to prevent too much load on + the scheduler. + :type mode: str """ ui_color = '#e6f1f2' + valid_modes = ['poke', 'reschedule'] @apply_defaults def __init__(self, poke_interval=60, timeout=60 * 60 * 24 * 7, soft_fail=False, + mode='poke', *args, **kwargs): super(BaseSensorOperator, self).__init__(*args, **kwargs) self.poke_interval = poke_interval self.soft_fail = soft_fail self.timeout = timeout + self.mode = mode + self._validate_input_values() + + def _validate_input_values(self): + if not isinstance(self.poke_interval, (int, float)) or self.poke_interval < 0: + raise AirflowException( + "The poke_interval must be a non-negative number") + if not isinstance(self.timeout, (int, float)) or self.timeout < 0: + raise AirflowException( + "The timeout must be a non-negative number") + if self.mode not in self.valid_modes: + raise AirflowException( + "The mode must be one of {valid_modes}," + "'{d}.{t}'; received '{m}'." + .format(valid_modes=self.valid_modes, + d=self.dag.dag_id if self.dag else "", + t=self.task_id, m=self.mode)) def poke(self, context): """ @@ -65,6 +100,11 @@ def poke(self, context): def execute(self, context): started_at = timezone.utcnow() + if self.reschedule: + # If reschedule, use first start date of current try + task_reschedules = TaskReschedule.find_for_task_instance(context['ti']) + if task_reschedules: + started_at = task_reschedules[0].start_date while not self.poke(context): if (timezone.utcnow() - started_at).total_seconds() > self.timeout: # If sensor is in soft fail mode but will be retried then @@ -75,7 +115,12 @@ def execute(self, context): raise AirflowSkipException('Snap. Time is OUT.') else: raise AirflowSensorTimeout('Snap. Time is OUT.') - sleep(self.poke_interval) + if self.reschedule: + reschedule_date = timezone.utcnow() + timedelta( + seconds=self.poke_interval) + raise AirflowRescheduleException(reschedule_date) + else: + sleep(self.poke_interval) self.log.info("Success criteria met. Exiting.") def _do_skip_downstream_tasks(self, context): @@ -83,3 +128,15 @@ def _do_skip_downstream_tasks(self, context): self.log.debug("Downstream task_ids %s", downstream_tasks) if downstream_tasks: self.skip(context['dag_run'], context['ti'].execution_date, downstream_tasks) + + @property + def reschedule(self): + return self.mode == 'reschedule' + + @property + def deps(self): + """ + Adds one additional dependency for all sensor operators that + checks if a sensor task instance can be rescheduled. 
+ """ + return BaseOperator.deps.fget(self) | {ReadyToRescheduleDep()} diff --git a/airflow/sensors/external_task_sensor.py b/airflow/sensors/external_task_sensor.py index eda1a2d9d2d28..ca98cbf50e736 100644 --- a/airflow/sensors/external_task_sensor.py +++ b/airflow/sensors/external_task_sensor.py @@ -17,7 +17,10 @@ # specific language governing permissions and limitations # under the License. -from airflow.models import TaskInstance +import os + +from airflow.exceptions import AirflowException +from airflow.models import TaskInstance, DagBag, DagModel, DagRun from airflow.sensors.base_sensor_operator import BaseSensorOperator from airflow.utils.db import provide_session from airflow.utils.decorators import apply_defaults @@ -26,18 +29,19 @@ class ExternalTaskSensor(BaseSensorOperator): """ - Waits for a task to complete in a different DAG + Waits for a different DAG or a task in a different DAG to complete for a + specific execution_date :param external_dag_id: The dag_id that contains the task you want to wait for - :type external_dag_id: string + :type external_dag_id: str :param external_task_id: The task_id that contains the task you want to - wait for - :type external_task_id: string + wait for. If ``None`` the sensor waits for the DAG + :type external_task_id: str :param allowed_states: list of allowed states, default is ``['success']`` :type allowed_states: list :param execution_delta: time difference with the previous execution to - look at, the default is the same execution_date as the current task. + look at, the default is the same execution_date as the current task or DAG. For yesterday, use [positive!] datetime.timedelta(days=1). Either execution_delta or execution_date_fn can be passed to ExternalTaskSensor, but not both. @@ -46,6 +50,11 @@ class ExternalTaskSensor(BaseSensorOperator): and returns the desired execution dates to query. Either execution_delta or execution_date_fn can be passed to ExternalTaskSensor, but not both. :type execution_date_fn: callable + :param check_existence: Set to `True` to check if the external task exists (when + external_task_id is not None) or check if the DAG to wait for exists (when + external_task_id is None), and immediately cease waiting if the external task + or DAG does not exist (default value: False). 
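Putting the new sensor mode option together, a minimal sketch of a toy sensor run in reschedule mode; the FileExistsSensor class, DAG id and paths are purely illustrative and not part of this patch.

import os

from airflow import DAG
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.dates import days_ago
from airflow.utils.decorators import apply_defaults


class FileExistsSensor(BaseSensorOperator):
    """Toy sensor that succeeds once the given path exists."""
    template_fields = ('filepath',)

    @apply_defaults
    def __init__(self, filepath, *args, **kwargs):
        super(FileExistsSensor, self).__init__(*args, **kwargs)
        self.filepath = filepath

    def poke(self, context):
        self.log.info('Checking for %s', self.filepath)
        return os.path.exists(self.filepath)


with DAG('reschedule_sensor_example', start_date=days_ago(1), schedule_interval='@daily') as dag:
    wait_for_file = FileExistsSensor(
        task_id='wait_for_file',
        filepath='/data/incoming/{{ ds }}/_SUCCESS',
        mode='reschedule',      # free the worker slot between pokes
        poke_interval=5 * 60,   # keep well above one minute in reschedule mode
        timeout=6 * 60 * 60,    # still times out (or skips with soft_fail) as before
    )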
+ :type check_existence: bool """ template_fields = ['external_dag_id', 'external_task_id'] ui_color = '#19647e' @@ -57,19 +66,34 @@ def __init__(self, allowed_states=None, execution_delta=None, execution_date_fn=None, + check_existence=False, *args, **kwargs): super(ExternalTaskSensor, self).__init__(*args, **kwargs) self.allowed_states = allowed_states or [State.SUCCESS] + if external_task_id: + if not set(self.allowed_states) <= set(State.task_states): + raise ValueError( + 'Valid values for `allowed_states` ' + 'when `external_task_id` is not `None`: {}'.format(State.task_states) + ) + else: + if not set(self.allowed_states) <= set(State.dag_states): + raise ValueError( + 'Valid values for `allowed_states` ' + 'when `external_task_id` is `None`: {}'.format(State.dag_states) + ) + if execution_delta is not None and execution_date_fn is not None: raise ValueError( - 'Only one of `execution_date` or `execution_date_fn` may' + 'Only one of `execution_delta` or `execution_date_fn` may ' 'be provided to ExternalTaskSensor; not both.') self.execution_delta = execution_delta self.execution_date_fn = execution_date_fn self.external_dag_id = external_dag_id self.external_task_id = external_task_id + self.check_existence = check_existence @provide_session def poke(self, context, session=None): @@ -85,17 +109,46 @@ def poke(self, context, session=None): [datetime.isoformat() for datetime in dttm_filter]) self.log.info( - 'Poking for ' - '{self.external_dag_id}.' - '{self.external_task_id} on ' - '{} ... '.format(serialized_dttm_filter, **locals())) + 'Poking for %s.%s on %s ... ', + self.external_dag_id, self.external_task_id, serialized_dttm_filter + ) + + DM = DagModel TI = TaskInstance + DR = DagRun + if self.check_existence: + dag_to_wait = session.query(DM).filter( + DM.dag_id == self.external_dag_id + ).first() + + if not dag_to_wait: + raise AirflowException('The external DAG ' + '{} does not exist.'.format(self.external_dag_id)) + else: + if not os.path.exists(dag_to_wait.fileloc): + raise AirflowException('The external DAG ' + '{} was deleted.'.format(self.external_dag_id)) + + if self.external_task_id: + refreshed_dag_info = DagBag(dag_to_wait.fileloc).get_dag(self.external_dag_id) + if not refreshed_dag_info.has_task(self.external_task_id): + raise AirflowException('The external task' + '{} in DAG {} does not exist.'.format(self.external_task_id, + self.external_dag_id)) + + if self.external_task_id: + count = session.query(TI).filter( + TI.dag_id == self.external_dag_id, + TI.task_id == self.external_task_id, + TI.state.in_(self.allowed_states), + TI.execution_date.in_(dttm_filter), + ).count() + else: + count = session.query(DR).filter( + DR.dag_id == self.external_dag_id, + DR.state.in_(self.allowed_states), + DR.execution_date.in_(dttm_filter), + ).count() - count = session.query(TI).filter( - TI.dag_id == self.external_dag_id, - TI.task_id == self.external_task_id, - TI.state.in_(self.allowed_states), - TI.execution_date.in_(dttm_filter), - ).count() session.commit() return count == len(dttm_filter) diff --git a/airflow/sensors/hdfs_sensor.py b/airflow/sensors/hdfs_sensor.py index c9bac08ecbfe7..bc3a3c63fd64e 100644 --- a/airflow/sensors/hdfs_sensor.py +++ b/airflow/sensors/hdfs_sensor.py @@ -39,13 +39,15 @@ class HdfsSensor(BaseSensorOperator): def __init__(self, filepath, hdfs_conn_id='hdfs_default', - ignored_ext=['_COPYING_'], + ignored_ext=None, ignore_copying=True, file_size=None, hook=HDFSHook, *args, **kwargs): super(HdfsSensor, self).__init__(*args, **kwargs) + if 
ignored_ext is None: + ignored_ext = ['_COPYING_'] self.filepath = filepath self.hdfs_conn_id = hdfs_conn_id self.file_size = file_size @@ -78,26 +80,30 @@ def filter_for_ignored_ext(result, ignored_ext, ignore_copying): """ Will filter if instructed to do so the result to remove matching criteria - :param result: (list) of dicts returned by Snakebite ls - :param ignored_ext: (list) of ignored extensions - :param ignore_copying: (bool) shall we ignore ? - :return: (list) of dicts which were not removed + :param result: list of dicts returned by Snakebite ls + :type result: list[dict] + :param ignored_ext: list of ignored extensions + :type ignored_ext: list + :param ignore_copying: shall we ignore ? + :type ignore_copying: bool + :return: list of dicts which were not removed + :rtype: list[dict] """ if ignore_copying: log = LoggingMixin().log - regex_builder = "^.*\.(%s$)$" % '$|'.join(ignored_ext) - ignored_extentions_regex = re.compile(regex_builder) + regex_builder = r"^.*\.(%s$)$" % '$|'.join(ignored_ext) + ignored_extensions_regex = re.compile(regex_builder) log.debug( 'Filtering result for ignored extensions: %s in files %s', - ignored_extentions_regex.pattern, map(lambda x: x['path'], result) + ignored_extensions_regex.pattern, map(lambda x: x['path'], result) ) - result = [x for x in result if not ignored_extentions_regex.match(x['path'])] + result = [x for x in result if not ignored_extensions_regex.match(x['path'])] log.debug('HdfsSensor.poke: after ext filter result is %s', result) return result def poke(self, context): sb = self.hook(self.hdfs_conn_id).get_conn() - self.log.info('Poking for file {self.filepath}'.format(**locals())) + self.log.info('Poking for file %s', self.filepath) try: # IMOO it's not right here, as there no raise of any kind. # if the filepath is let's say '/data/mydirectory', diff --git a/airflow/sensors/hive_partition_sensor.py b/airflow/sensors/hive_partition_sensor.py index c8eddc3443fc6..5165540bb4346 100644 --- a/airflow/sensors/hive_partition_sensor.py +++ b/airflow/sensors/hive_partition_sensor.py @@ -31,12 +31,12 @@ class HivePartitionSensor(BaseSensorOperator): :param table: The name of the table to wait for, supports the dot notation (my_database.my_table) - :type table: string + :type table: str :param partition: The partition clause to wait for. This is passed as is to the metastore Thrift client ``get_partitions_by_filter`` method, and apparently supports SQL like notation as in ``ds='2015-01-01' AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"`` - :type partition: string + :type partition: str :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str @@ -65,8 +65,8 @@ def poke(self, context): if '.' 
in self.table: self.schema, self.table = self.table.split('.') self.log.info( - 'Poking for table {self.schema}.{self.table}, ' - 'partition {self.partition}'.format(**locals())) + 'Poking for table %s.%s, partition %s', self.schema, self.table, self.partition + ) if not hasattr(self, 'hook'): from airflow.hooks.hive_hooks import HiveMetastoreHook self.hook = HiveMetastoreHook( diff --git a/airflow/sensors/http_sensor.py b/airflow/sensors/http_sensor.py index 33f8531368ec1..4a1c75be30bb4 100644 --- a/airflow/sensors/http_sensor.py +++ b/airflow/sensors/http_sensor.py @@ -32,11 +32,11 @@ class HttpSensor(BaseSensorOperator): 404 not found or response_check function returned False :param http_conn_id: The connection to run the sensor against - :type http_conn_id: string + :type http_conn_id: str :param method: The HTTP request method to use - :type method: string + :type method: str :param endpoint: The relative part of the full url - :type endpoint: string + :type endpoint: str :param request_params: The parameters to be added to the GET url :type request_params: a dictionary of string key/value pairs :param headers: The HTTP headers to be added to the GET request diff --git a/airflow/sensors/named_hive_partition_sensor.py b/airflow/sensors/named_hive_partition_sensor.py index 4a076a3dd6870..6f25458965854 100644 --- a/airflow/sensors/named_hive_partition_sensor.py +++ b/airflow/sensors/named_hive_partition_sensor.py @@ -34,7 +34,7 @@ class NamedHivePartitionSensor(BaseSensorOperator): Thrift client ``get_partitions_by_name`` method. Note that you cannot use logical or comparison operators as in HivePartitionSensor. - :type partition_names: list of strings + :type partition_names: list[str] :param metastore_conn_id: reference to the metastore thrift service connection id :type metastore_conn_id: str @@ -61,9 +61,9 @@ def __init__(self, self.partition_names = partition_names self.hook = hook if self.hook and metastore_conn_id != 'metastore_default': - self.log.warning('A hook was passed but a non default' - 'metastore_conn_id=' - '{} was used'.format(metastore_conn_id)) + self.log.warning( + 'A hook was passed but a non defaul metastore_conn_id=%s was used', metastore_conn_id + ) @staticmethod def parse_partition_name(partition): @@ -89,9 +89,7 @@ def poke_partition(self, partition): schema, table, partition = self.parse_partition_name(partition) - self.log.info( - 'Poking for {schema}.{table}/{partition}'.format(**locals()) - ) + self.log.info('Poking for %s.%s/%s', schema, table, partition) return self.hook.check_for_named_partition( schema, table, partition) diff --git a/airflow/sensors/s3_key_sensor.py b/airflow/sensors/s3_key_sensor.py index fa2eb786ffa03..a72b1c7a87d46 100644 --- a/airflow/sensors/s3_key_sensor.py +++ b/airflow/sensors/s3_key_sensor.py @@ -41,6 +41,17 @@ class S3KeySensor(BaseSensorOperator): :type wildcard_match: bool :param aws_conn_id: a reference to the s3 connection :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. 
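Returning to the ExternalTaskSensor changes shown earlier, a hedged sketch of waiting for a whole upstream DAG run with check_existence enabled; the upstream dag_id and the one-hour schedule offset are assumptions for illustration only.

from datetime import timedelta

from airflow import DAG
from airflow.sensors.external_task_sensor import ExternalTaskSensor
from airflow.utils.dates import days_ago

with DAG('wait_for_upstream_example', start_date=days_ago(1), schedule_interval='@daily') as dag:
    wait_for_upstream = ExternalTaskSensor(
        task_id='wait_for_upstream',
        external_dag_id='upstream_etl',      # assumed dag_id
        external_task_id=None,               # wait for the whole DAG run, not a single task
        allowed_states=['success'],          # must be DAG states when external_task_id is None
        execution_delta=timedelta(hours=1),  # upstream is assumed to run one hour earlier
        check_existence=True,                # fail immediately if 'upstream_etl' does not exist
        mode='reschedule',
    )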
+ :type verify: bool or str """ template_fields = ('bucket_key', 'bucket_name') @@ -50,6 +61,7 @@ def __init__(self, bucket_name=None, wildcard_match=False, aws_conn_id='aws_default', + verify=None, *args, **kwargs): super(S3KeySensor, self).__init__(*args, **kwargs) @@ -68,12 +80,12 @@ def __init__(self, self.bucket_key = bucket_key self.wildcard_match = wildcard_match self.aws_conn_id = aws_conn_id + self.verify = verify def poke(self, context): from airflow.hooks.S3_hook import S3Hook - hook = S3Hook(aws_conn_id=self.aws_conn_id) - full_url = "s3://" + self.bucket_name + "/" + self.bucket_key - self.log.info('Poking for key : {full_url}'.format(**locals())) + hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) + self.log.info('Poking for key : s3://%s/%s', self.bucket_name, self.bucket_key) if self.wildcard_match: return hook.check_for_wildcard_key(self.bucket_key, self.bucket_name) diff --git a/airflow/sensors/s3_prefix_sensor.py b/airflow/sensors/s3_prefix_sensor.py index 917dd46e26c28..16980e7b4ac31 100644 --- a/airflow/sensors/s3_prefix_sensor.py +++ b/airflow/sensors/s3_prefix_sensor.py @@ -38,6 +38,19 @@ class S3PrefixSensor(BaseSensorOperator): :param delimiter: The delimiter intended to show hierarchy. Defaults to '/'. :type delimiter: str + :param aws_conn_id: a reference to the s3 connection + :type aws_conn_id: str + :param verify: Whether or not to verify SSL certificates for S3 connection. + By default SSL certificates are verified. + You can provide the following values: + + - ``False``: do not validate SSL certificates. SSL will still be used + (unless use_ssl is False), but SSL certificates will not be + verified. + - ``path/to/cert/bundle.pem``: A filename of the CA cert bundle to uses. + You can specify this argument if you want to use a different + CA cert bundle than the one used by botocore. + :type verify: bool or str """ template_fields = ('prefix', 'bucket_name') @@ -47,6 +60,7 @@ def __init__(self, prefix, delimiter='/', aws_conn_id='aws_default', + verify=None, *args, **kwargs): super(S3PrefixSensor, self).__init__(*args, **kwargs) @@ -56,12 +70,12 @@ def __init__(self, self.delimiter = delimiter self.full_url = "s3://" + bucket_name + '/' + prefix self.aws_conn_id = aws_conn_id + self.verify = verify def poke(self, context): - self.log.info('Poking for prefix : {self.prefix}\n' - 'in bucket s3://{self.bucket_name}'.format(**locals())) + self.log.info('Poking for prefix : %s in bucket s3://%s', self.prefix, self.bucket_name) from airflow.hooks.S3_hook import S3Hook - hook = S3Hook(aws_conn_id=self.aws_conn_id) + hook = S3Hook(aws_conn_id=self.aws_conn_id, verify=self.verify) return hook.check_for_prefix( prefix=self.prefix, delimiter=self.delimiter, diff --git a/airflow/sensors/sql_sensor.py b/airflow/sensors/sql_sensor.py index e9208fa89042d..ba4e6bb03d426 100644 --- a/airflow/sensors/sql_sensor.py +++ b/airflow/sensors/sql_sensor.py @@ -18,7 +18,9 @@ # under the License. from builtins import str +from typing import Iterable +from airflow.exceptions import AirflowException from airflow.hooks.base_hook import BaseHook from airflow.sensors.base_sensor_operator import BaseSensorOperator from airflow.utils.decorators import apply_defaults @@ -30,25 +32,37 @@ class SqlSensor(BaseSensorOperator): sql returns no row, or if the first cell in (0, '0', ''). :param conn_id: The connection to run the sensor against - :type conn_id: string + :type conn_id: str :param sql: The sql to run. 
To pass, it needs to return at least one cell that contains a non-zero / empty string value. + :type sql: str + :param parameters: The parameters to render the SQL query with (optional). + :type parameters: mapping or iterable """ - template_fields = ('sql',) - template_ext = ('.hql', '.sql',) + template_fields = ('sql',) # type: Iterable[str] + template_ext = ('.hql', '.sql',) # type: Iterable[str] ui_color = '#7c7287' @apply_defaults - def __init__(self, conn_id, sql, *args, **kwargs): - self.sql = sql + def __init__(self, conn_id, sql, parameters=None, *args, **kwargs): self.conn_id = conn_id + self.sql = sql + self.parameters = parameters super(SqlSensor, self).__init__(*args, **kwargs) def poke(self, context): - hook = BaseHook.get_connection(self.conn_id).get_hook() + conn = BaseHook.get_connection(self.conn_id) + + allowed_conn_type = {'google_cloud_platform', 'jdbc', 'mssql', + 'mysql', 'oracle', 'postgres', + 'presto', 'sqlite', 'vertica'} + if conn.conn_type not in allowed_conn_type: + raise AirflowException("The connection type is not supported by SqlSensor. " + + "Supported connection types: {}".format(list(allowed_conn_type))) + hook = conn.get_hook() - self.log.info('Poking: %s', self.sql) - records = hook.get_records(self.sql) + self.log.info('Poking: %s (with parameters %s)', self.sql, self.parameters) + records = hook.get_records(self.sql, self.parameters) if not records: return False else: diff --git a/airflow/sensors/web_hdfs_sensor.py b/airflow/sensors/web_hdfs_sensor.py index 67b1d385b40a2..f31a307a67ff9 100644 --- a/airflow/sensors/web_hdfs_sensor.py +++ b/airflow/sensors/web_hdfs_sensor.py @@ -40,5 +40,5 @@ def __init__(self, def poke(self, context): from airflow.hooks.webhdfs_hook import WebHDFSHook c = WebHDFSHook(self.webhdfs_conn_id) - self.log.info('Poking for file {self.filepath}'.format(**locals())) + self.log.info('Poking for file %s', self.filepath) return c.check_for_path(hdfs_path=self.filepath) diff --git a/airflow/settings.py b/airflow/settings.py index 7c0376d12f9b5..fcf54a869aeb0 100644 --- a/airflow/settings.py +++ b/airflow/settings.py @@ -26,12 +26,14 @@ import logging import os import pendulum +import sys +from typing import Any -from sqlalchemy import create_engine +from sqlalchemy import create_engine, exc from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.pool import NullPool -from airflow import configuration as conf +from airflow.configuration import conf, AIRFLOW_HOME, WEBSERVER_CONFIG # NOQA F401 from airflow.logging_config import configure_logging from airflow.utils.sqlalchemy import setup_event_handlers @@ -46,7 +48,7 @@ TIMEZONE = pendulum.local_timezone() else: TIMEZONE = pendulum.timezone(tz) -except: +except Exception: pass log.info("Configured default timezone %s" % TIMEZONE) @@ -69,7 +71,7 @@ def timing(cls, stat, dt): pass -Stats = DummyStatsLogger +Stats = DummyStatsLogger # type: Any if conf.getboolean('scheduler', 'statsd_on'): from statsd import StatsClient @@ -82,13 +84,13 @@ def timing(cls, stat, dt): else: Stats = DummyStatsLogger -HEADER = """\ - ____________ _____________ - ____ |__( )_________ __/__ /________ __ -____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / / -___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ / - _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/ - """ +HEADER = '\n'.join([ + r' ____________ _____________', + r' ____ |__( )_________ __/__ /________ __', + r'____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / /', + r'___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ /', + r' _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/', +]) 
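For the SqlSensor changes above, a short usage sketch; the connection id, table and column names are invented. The sql field is templated, while parameters are handed to the hook as bind values, and the connection's type must be one of the whitelisted types.

from airflow import DAG
from airflow.sensors.sql_sensor import SqlSensor
from airflow.utils.dates import days_ago

with DAG('sql_sensor_example', start_date=days_ago(1), schedule_interval='@daily') as dag:
    wait_for_rows = SqlSensor(
        task_id='wait_for_rows',
        conn_id='my_postgres',  # conn_type 'postgres' is in the allowed set
        # Jinja goes into ``sql`` (templated); bind values go into ``parameters``.
        sql="SELECT COUNT(*) FROM staging.events "
            "WHERE ds = '{{ ds }}' AND status = %(status)s",
        parameters={'status': 'ready'},
        poke_interval=2 * 60,
    )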
LOGGING_LEVEL = logging.INFO @@ -98,9 +100,10 @@ def timing(cls, stat, dt): LOG_FORMAT = conf.get('core', 'log_format') SIMPLE_LOG_FORMAT = conf.get('core', 'simple_log_format') -AIRFLOW_HOME = None SQL_ALCHEMY_CONN = None DAGS_FOLDER = None +PLUGINS_FOLDER = None +LOGGING_CLASS_PATH = None engine = None Session = None @@ -134,13 +137,18 @@ def policy(task_instance): def configure_vars(): - global AIRFLOW_HOME global SQL_ALCHEMY_CONN global DAGS_FOLDER - AIRFLOW_HOME = os.path.expanduser(conf.get('core', 'AIRFLOW_HOME')) + global PLUGINS_FOLDER SQL_ALCHEMY_CONN = conf.get('core', 'SQL_ALCHEMY_CONN') DAGS_FOLDER = os.path.expanduser(conf.get('core', 'DAGS_FOLDER')) + PLUGINS_FOLDER = conf.get( + 'core', + 'plugins_folder', + fallback=os.path.join(AIRFLOW_HOME, 'plugins') + ) + def configure_orm(disable_connection_pool=False): log.debug("Setting up DB connection pool (PID %s)" % os.getpid()) @@ -153,7 +161,7 @@ def configure_orm(disable_connection_pool=False): engine_args['poolclass'] = NullPool log.debug("settings.configure_orm(): Using NullPool") elif 'sqlite' not in SQL_ALCHEMY_CONN: - # Engine args not supported by sqlite. + # Pool size engine args not supported by sqlite. # If no config value is defined for the pool size, select a reasonable value. # 0 means no limit, which could lead to exceeding the Database connection limit. try: @@ -170,17 +178,27 @@ def configure_orm(disable_connection_pool=False): except conf.AirflowConfigException: pool_recycle = 1800 - log.info("setting.configure_orm(): Using pool settings. pool_size={}, " - "pool_recycle={}".format(pool_size, pool_recycle)) + log.info("settings.configure_orm(): Using pool settings. pool_size={}, " + "pool_recycle={}, pid={}".format(pool_size, pool_recycle, os.getpid())) engine_args['pool_size'] = pool_size engine_args['pool_recycle'] = pool_recycle + # Allow the user to specify an encoding for their DB otherwise default + # to utf-8 so jobs & users with non-latin1 characters can still use + # us. + engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING', fallback='utf-8') + # For Python2 we get back a newstr and need a str + engine_args['encoding'] = engine_args['encoding'].__str__() + engine = create_engine(SQL_ALCHEMY_CONN, **engine_args) reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT') setup_event_handlers(engine, reconnect_timeout) Session = scoped_session( - sessionmaker(autocommit=False, autoflush=False, bind=engine)) + sessionmaker(autocommit=False, + autoflush=False, + bind=engine, + expire_on_commit=False)) def dispose_orm(): @@ -211,30 +229,70 @@ def configure_adapters(): pass +def validate_session(): + worker_precheck = conf.getboolean('core', 'worker_precheck', fallback=False) + if not worker_precheck: + return True + else: + check_session = sessionmaker(bind=engine) + session = check_session() + try: + session.execute("select 1") + conn_status = True + except exc.DBAPIError as err: + log.error(err) + conn_status = False + session.close() + return conn_status + + def configure_action_logging(): """ Any additional configuration (register callback) for airflow.utils.action_loggers module - :return: None + :rtype: None """ pass +def prepare_classpath(): + """ + Ensures that certain subfolders of AIRFLOW_HOME are on the classpath + """ + + if DAGS_FOLDER not in sys.path: + sys.path.append(DAGS_FOLDER) + + # Add ./config/ for loading custom log parsers etc, or + # airflow_local_settings etc. 
+ config_path = os.path.join(AIRFLOW_HOME, 'config') + if config_path not in sys.path: + sys.path.append(config_path) + + if PLUGINS_FOLDER not in sys.path: + sys.path.append(PLUGINS_FOLDER) + + try: - from airflow_local_settings import * + from airflow_local_settings import * # noqa F403 F401 log.info("Loaded airflow_local_settings.") -except: +except Exception: pass -configure_logging() -configure_vars() -configure_adapters() -# The webservers import this file from models.py with the default settings. -configure_orm() -configure_action_logging() -# Ensure we close DB connections at scheduler and gunicon worker terminations -atexit.register(dispose_orm) +def initialize(): + configure_vars() + prepare_classpath() + global LOGGING_CLASS_PATH + LOGGING_CLASS_PATH = configure_logging() + configure_adapters() + # The webservers import this file from models.py with the default settings. + configure_orm() + configure_action_logging() + + # Ensure we close DB connections at scheduler and gunicorn worker terminations + atexit.register(dispose_orm) + # Const stuff diff --git a/airflow/task/task_runner/__init__.py b/airflow/task/task_runner/__init__.py index 0edc020d41e3d..960a9e9e8e426 100644 --- a/airflow/task/task_runner/__init__.py +++ b/airflow/task/task_runner/__init__.py @@ -18,7 +18,7 @@ # under the License. from airflow import configuration -from airflow.task.task_runner.bash_task_runner import BashTaskRunner +from airflow.task.task_runner.standard_task_runner import StandardTaskRunner from airflow.exceptions import AirflowException _TASK_RUNNER = configuration.conf.get('core', 'TASK_RUNNER') @@ -29,13 +29,13 @@ def get_task_runner(local_task_job): Get the task runner that can be used to run the given job. :param local_task_job: The LocalTaskJob associated with the TaskInstance - that needs to be executed. + that needs to be executed. :type local_task_job: airflow.jobs.LocalTaskJob :return: The task runner to use to run the task. :rtype: airflow.task.task_runner.base_task_runner.BaseTaskRunner """ - if _TASK_RUNNER == "BashTaskRunner": - return BashTaskRunner(local_task_job) + if _TASK_RUNNER == "StandardTaskRunner": + return StandardTaskRunner(local_task_job) elif _TASK_RUNNER == "CgroupTaskRunner": from airflow.contrib.task_runner.cgroup_task_runner import CgroupTaskRunner return CgroupTaskRunner(local_task_job) diff --git a/airflow/task/task_runner/base_task_runner.py b/airflow/task/task_runner/base_task_runner.py index 337f6a63baec6..f215c91b90a29 100644 --- a/airflow/task/task_runner/base_task_runner.py +++ b/airflow/task/task_runner/base_task_runner.py @@ -60,12 +60,6 @@ def __init__(self, local_task_job): # Always provide a copy of the configuration file settings cfg_path = tmp_configuration_copy() - # The following command should always work since the user doing chmod is the same - # as the one who just created the file. - subprocess.call( - ['chmod', '600', cfg_path], - close_fds=True - ) # Add sudo commands to change user if we need to. Needed to handle SubDagOperator # case using a SequentialExecutor. @@ -106,21 +100,22 @@ def _read_task_logs(self, stream): self._task_instance.job_id, self._task_instance.task_id, line.rstrip('\n')) - def run_command(self, run_with, join_args=False): + def run_command(self, run_with=None, join_args=False): """ - Run the task command + Run the task command. - :param run_with: list of tokens to run the task command with - E.g. ['bash', '-c'] + :param run_with: list of tokens to run the task command with e.g. 
``['bash', '-c']`` :type run_with: list - :param join_args: whether to concatenate the list of command tokens - E.g. ['airflow', 'run'] vs ['airflow run'] + :param join_args: whether to concatenate the list of command tokens e.g. ``['airflow', 'run']`` vs + ``['airflow run']`` :param join_args: bool :return: the process that was run :rtype: subprocess.Popen """ + run_with = run_with or [] cmd = [" ".join(self._command)] if join_args else self._command full_cmd = run_with + cmd + self.log.info('Running: %s', full_cmd) proc = subprocess.Popen( full_cmd, @@ -150,8 +145,8 @@ def start(self): def return_code(self): """ :return: The return code associated with running the task instance or - None if the task is not yet done. - :rtype int: + None if the task is not yet done. + :rtype: int """ raise NotImplementedError() @@ -166,4 +161,7 @@ def on_finish(self): A callback that should be called when this is done running. """ if self._cfg_path and os.path.isfile(self._cfg_path): - subprocess.call(['sudo', 'rm', self._cfg_path], close_fds=True) + if self.run_as_user: + subprocess.call(['sudo', 'rm', self._cfg_path], close_fds=True) + else: + os.remove(self._cfg_path) diff --git a/airflow/task/task_runner/bash_task_runner.py b/airflow/task/task_runner/standard_task_runner.py similarity index 85% rename from airflow/task/task_runner/bash_task_runner.py rename to airflow/task/task_runner/standard_task_runner.py index 4ddcac5982666..6b65fc35c555a 100644 --- a/airflow/task/task_runner/bash_task_runner.py +++ b/airflow/task/task_runner/standard_task_runner.py @@ -23,15 +23,15 @@ from airflow.utils.helpers import reap_process_group -class BashTaskRunner(BaseTaskRunner): +class StandardTaskRunner(BaseTaskRunner): """ Runs the raw Airflow task by invoking through the Bash shell. """ def __init__(self, local_task_job): - super(BashTaskRunner, self).__init__(local_task_job) + super(StandardTaskRunner, self).__init__(local_task_job) def start(self): - self.process = self.run_command(['bash', '-c'], join_args=True) + self.process = self.run_command() def return_code(self): return self.process.poll() @@ -41,4 +41,4 @@ def terminate(self): reap_process_group(self.process.pid, self.log) def on_finish(self): - super(BashTaskRunner, self).on_finish() + super(StandardTaskRunner, self).on_finish() diff --git a/airflow/ti_deps/dep_context.py b/airflow/ti_deps/dep_context.py index 6d39998988f83..256063f3048e9 100644 --- a/airflow/ti_deps/dep_context.py +++ b/airflow/ti_deps/dep_context.py @@ -33,10 +33,11 @@ class DepContext(object): """ A base class for contexts that specifies which dependencies should be evaluated in the context for a task instance to satisfy the requirements of the context. Also - stores state related to the context that can be used by dependendency classes. + stores state related to the context that can be used by dependency classes. For example there could be a SomeRunContext that subclasses this class which has dependencies for: + - Making sure there are slots available on the infrastructure to run the task instance - A task-instance's task-specific dependencies are met (e.g. the previous task instance completed successfully) @@ -44,25 +45,27 @@ class DepContext(object): :param deps: The context-specific dependencies that need to be evaluated for a task instance to run in this execution context. 
- :type deps: set(BaseTIDep) + :type deps: set(airflow.ti_deps.deps.base_ti_dep.BaseTIDep) :param flag_upstream_failed: This is a hack to generate the upstream_failed state creation while checking to see whether the task instance is runnable. It was the shortest path to add the feature. This is bad since this class should be pure (no side effects). - :type flag_upstream_failed: boolean + :type flag_upstream_failed: bool :param ignore_all_deps: Whether or not the context should ignore all ignoreable dependencies. Overrides the other ignore_* parameters - :type ignore_all_deps: boolean + :type ignore_all_deps: bool :param ignore_depends_on_past: Ignore depends_on_past parameter of DAGs (e.g. for Backfills) - :type ignore_depends_on_past: boolean + :type ignore_depends_on_past: bool :param ignore_in_retry_period: Ignore the retry period for task instances - :type ignore_in_retry_period: boolean + :type ignore_in_retry_period: bool + :param ignore_in_reschedule_period: Ignore the reschedule period for task instances + :type ignore_in_reschedule_period: bool :param ignore_task_deps: Ignore task-specific dependencies such as depends_on_past and trigger rule - :type ignore_task_deps: boolean + :type ignore_task_deps: bool :param ignore_ti_state: Ignore the task instance's previous failure/success - :type ignore_ti_state: boolean + :type ignore_ti_state: bool """ def __init__( self, @@ -71,6 +74,7 @@ def __init__( ignore_all_deps=False, ignore_depends_on_past=False, ignore_in_retry_period=False, + ignore_in_reschedule_period=False, ignore_task_deps=False, ignore_ti_state=False): self.deps = deps or set() @@ -78,6 +82,7 @@ def __init__( self.ignore_all_deps = ignore_all_deps self.ignore_depends_on_past = ignore_depends_on_past self.ignore_in_retry_period = ignore_in_retry_period + self.ignore_in_reschedule_period = ignore_in_reschedule_period self.ignore_task_deps = ignore_task_deps self.ignore_ti_state = ignore_ti_state @@ -91,6 +96,7 @@ def __init__( State.SKIPPED, State.UPSTREAM_FAILED, State.UP_FOR_RETRY, + State.UP_FOR_RESCHEDULE, } # Context to get the dependencies that need to be met in order for a task instance to diff --git a/airflow/ti_deps/deps/base_ti_dep.py b/airflow/ti_deps/deps/base_ti_dep.py index 90c156cb760b9..1be9bd7d67650 100644 --- a/airflow/ti_deps/deps/base_ti_dep.py +++ b/airflow/ti_deps/deps/base_ti_dep.py @@ -66,9 +66,9 @@ def _get_dep_statuses(self, ti, session, dep_context=None): representing if each of the passed in task's upstream tasks succeeded or not. :param ti: the task instance to get the dependency status for - :type ti: TaskInstance + :type ti: airflow.models.TaskInstance :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session :param dep_context: the context for which this dependency should be evaluated for :type dep_context: DepContext """ @@ -81,9 +81,9 @@ def get_dep_statuses(self, ti, session, dep_context=None): checks for all dependencies. :param ti: the task instance to get the dependency status for - :type ti: TaskInstance + :type ti: airflow.models.TaskInstance :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session :param dep_context: the context for which this dependency should be evaluated for :type dep_context: DepContext """ @@ -114,9 +114,9 @@ def is_met(self, ti, session, dep_context=None): passing. 
:param ti: the task instance to see if this dependency is met for - :type ti: TaskInstance + :type ti: airflow.models.TaskInstance :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session :param dep_context: The context this dependency is being checked under that stores state that can be used by this dependency. :type dep_context: BaseDepContext @@ -130,9 +130,9 @@ def get_failure_reasons(self, ti, session, dep_context=None): Returns an iterable of strings that explain why this dependency wasn't met. :param ti: the task instance to see if this dependency is met for - :type ti: TaskInstance + :type ti: airflow.models.TaskInstance :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session :param dep_context: The context this dependency is being checked under that stores state that can be used by this dependency. :type dep_context: BaseDepContext diff --git a/airflow/ti_deps/deps/ready_to_reschedule.py b/airflow/ti_deps/deps/ready_to_reschedule.py new file mode 100644 index 0000000000000..348cf00f5feb6 --- /dev/null +++ b/airflow/ti_deps/deps/ready_to_reschedule.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from airflow.models.taskreschedule import TaskReschedule +from airflow.ti_deps.deps.base_ti_dep import BaseTIDep +from airflow.utils import timezone +from airflow.utils.db import provide_session +from airflow.utils.state import State + + +class ReadyToRescheduleDep(BaseTIDep): + NAME = "Ready To Reschedule" + IGNOREABLE = True + IS_TASK_DEP = True + RESCHEDULEABLE_STATES = {State.UP_FOR_RESCHEDULE, State.NONE} + + @provide_session + def _get_dep_statuses(self, ti, session, dep_context): + """ + Determines whether a task is ready to be rescheduled. Only tasks in + NONE state with at least one row in task_reschedule table are + handled by this dependency class, otherwise this dependency is + considered as passed. This dependency fails if the latest reschedule + request's reschedule date is still in future. 
+ """ + if dep_context.ignore_in_reschedule_period: + yield self._passing_status( + reason="The context specified that being in a reschedule period was " + "permitted.") + return + + if ti.state not in self.RESCHEDULEABLE_STATES: + yield self._passing_status( + reason="The task instance is not in State_UP_FOR_RESCHEDULE or NONE state.") + return + + task_reschedules = TaskReschedule.find_for_task_instance(task_instance=ti) + if not task_reschedules: + yield self._passing_status( + reason="There is no reschedule request for this task instance.") + return + + now = timezone.utcnow() + next_reschedule_date = task_reschedules[-1].reschedule_date + if now >= next_reschedule_date: + yield self._passing_status( + reason="Task instance id ready for reschedule.") + return + + yield self._failing_status( + reason="Task is not ready for reschedule yet but will be rescheduled " + "automatically. Current date is {0} and task will be rescheduled " + "at {1}.".format(now.isoformat(), next_reschedule_date.isoformat())) diff --git a/airflow/ti_deps/deps/trigger_rule_dep.py b/airflow/ti_deps/deps/trigger_rule_dep.py index 76e5e1324ffd7..833e5f1d5fc34 100644 --- a/airflow/ti_deps/deps/trigger_rule_dep.py +++ b/airflow/ti_deps/deps/trigger_rule_dep.py @@ -103,24 +103,24 @@ def _evaluate_trigger_rule( rule was met. :param ti: the task instance to evaluate the trigger rule of - :type ti: TaskInstance + :type ti: airflow.models.TaskInstance :param successes: Number of successful upstream tasks - :type successes: boolean + :type successes: bool :param skipped: Number of skipped upstream tasks - :type skipped: boolean + :type skipped: bool :param failed: Number of failed upstream tasks - :type failed: boolean + :type failed: bool :param upstream_failed: Number of upstream_failed upstream tasks - :type upstream_failed: boolean + :type upstream_failed: bool :param done: Number of completed upstream tasks - :type done: boolean + :type done: bool :param flag_upstream_failed: This is a hack to generate the upstream_failed state creation while checking to see whether the task instance is runnable. It was the shortest path to add the feature - :type flag_upstream_failed: boolean + :type flag_upstream_failed: bool :param session: database session - :type session: Session + :type session: sqlalchemy.orm.session.Session """ TR = airflow.models.TriggerRule @@ -152,6 +152,14 @@ def _evaluate_trigger_rule( elif tr == TR.ONE_FAILED: if upstream_done and not (failed or upstream_failed): ti.set_state(State.SKIPPED, session) + elif tr == TR.NONE_FAILED: + if upstream_failed or failed: + ti.set_state(State.UPSTREAM_FAILED, session) + elif skipped == upstream: + ti.set_state(State.SKIPPED, session) + elif tr == TR.NONE_SKIPPED: + if skipped: + ti.set_state(State.SKIPPED, session) if tr == TR.ONE_SUCCESS: if successes <= 0: @@ -194,6 +202,23 @@ def _evaluate_trigger_rule( "upstream_task_ids={3}" .format(tr, upstream_done, upstream_tasks_state, task.upstream_task_ids)) + elif tr == TR.NONE_FAILED: + num_failures = upstream - successes - skipped + if num_failures > 0: + yield self._failing_status( + reason="Task's trigger rule '{0}' requires all upstream " + "tasks to have succeeded or been skipped, but found {1} non-success(es). 
" + "upstream_tasks_state={2}, upstream_task_ids={3}" + .format(tr, num_failures, upstream_tasks_state, + task.upstream_task_ids)) + elif tr == TR.NONE_SKIPPED: + if skipped > 0: + yield self._failing_status( + reason="Task's trigger rule '{0}' requires all upstream " + "tasks to not have been skipped, but found {1} task(s) skipped. " + "upstream_tasks_state={2}, upstream_task_ids={3}" + .format(tr, skipped, upstream_tasks_state, + task.upstream_task_ids)) else: yield self._failing_status( reason="No strategy to evaluate trigger rule '{0}'.".format(tr)) diff --git a/airflow/utils/__init__.py b/airflow/utils/__init__.py index cc4e12be8033a..e8e889c59b7a5 100644 --- a/airflow/utils/__init__.py +++ b/airflow/utils/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/asciiart.py b/airflow/utils/asciiart.py index faf796c18d4fd..491d79aa3e74f 100644 --- a/airflow/utils/asciiart.py +++ b/airflow/utils/asciiart.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. # -bug = r"""\ +bug = r""" =, .= =.| ,---. 
|.= =.| "-(:::::)-" |.= diff --git a/airflow/utils/cli.py b/airflow/utils/cli.py index 4a1e57a062b4b..6c3bf42c9458c 100644 --- a/airflow/utils/cli.py +++ b/airflow/utils/cli.py @@ -30,7 +30,7 @@ from argparse import Namespace from datetime import datetime -import airflow.models +from airflow.models.log import Log from airflow.utils import cli_action_loggers @@ -46,7 +46,7 @@ def action_logging(f): end_datetime : end datetime instance by utc full_command : full command line arguments user : current user - log : airflow.models.Log ORM instance + log : airflow.models.log.Log ORM instance dag_id : dag id (optional) task_id : task_id (optional) execution_date : execution date (optional) @@ -94,10 +94,8 @@ def _build_metrics(func_name, namespace): :return: dict with metrics """ - metrics = {'sub_command': func_name} - metrics['start_datetime'] = datetime.utcnow() - metrics['full_command'] = '{}'.format(list(sys.argv)) - metrics['user'] = getpass.getuser() + metrics = {'sub_command': func_name, 'start_datetime': datetime.utcnow(), + 'full_command': '{}'.format(list(sys.argv)), 'user': getpass.getuser()} assert isinstance(namespace, Namespace) tmp_dic = vars(namespace) @@ -107,7 +105,7 @@ def _build_metrics(func_name, namespace): metrics['host_name'] = socket.gethostname() extra = json.dumps(dict((k, metrics[k]) for k in ('host_name', 'full_command'))) - log = airflow.models.Log( + log = Log( event='cli_{}'.format(func_name), task_instance=None, owner=metrics['user'], diff --git a/airflow/utils/cli_action_loggers.py b/airflow/utils/cli_action_loggers.py index e61642d30012c..1c96e5f55c89e 100644 --- a/airflow/utils/cli_action_loggers.py +++ b/airflow/utils/cli_action_loggers.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -24,8 +24,9 @@ from __future__ import absolute_import import logging +from typing import List, Callable -import airflow.settings +from airflow.utils.db import create_session def register_pre_exec_callback(action_logger): @@ -37,7 +38,7 @@ def register_pre_exec_callback(action_logger): :param action_logger: An action logger function :return: None """ - logging.debug("Adding {} to pre execution callback".format(action_logger)) + logging.debug("Adding %s to pre execution callback", action_logger) __pre_exec_callbacks.append(action_logger) @@ -50,7 +51,7 @@ def register_post_exec_callback(action_logger): :param action_logger: An action logger function :return: None """ - logging.debug("Adding {} to post execution callback".format(action_logger)) + logging.debug("Adding %s to post execution callback", action_logger) __post_exec_callbacks.append(action_logger) @@ -61,12 +62,12 @@ def on_pre_execution(**kwargs): :param kwargs: :return: None """ - logging.debug("Calling callbacks: {}".format(__pre_exec_callbacks)) + logging.debug("Calling callbacks: %s", __pre_exec_callbacks) for cb in __pre_exec_callbacks: try: cb(**kwargs) except Exception: - logging.exception('Failed on pre-execution callback using {}'.format(cb)) + logging.exception('Failed on pre-execution callback using %s', cb) def on_post_execution(**kwargs): @@ -78,12 +79,12 @@ def on_post_execution(**kwargs): :param kwargs: :return: None """ - logging.debug("Calling callbacks: {}".format(__post_exec_callbacks)) + logging.debug("Calling callbacks: %s", __post_exec_callbacks) for cb in __post_exec_callbacks: try: cb(**kwargs) except Exception: - logging.exception('Failed on post-execution callback using {}'.format(cb)) + logging.exception('Failed on post-execution callback using %s', cb) def default_action_log(log, **_): @@ -94,13 +95,12 @@ def default_action_log(log, **_): :param **_: other keyword arguments that is not being used by this function :return: None """ - session = airflow.settings.Session() - session.add(log) - session.commit() + with create_session() as session: + session.add(log) -__pre_exec_callbacks = [] -__post_exec_callbacks = [] +__pre_exec_callbacks = [] # type: List[Callable] +__post_exec_callbacks = [] # type: List[Callable] # By default, register default action log into pre-execution callback register_pre_exec_callback(default_action_log) diff --git a/airflow/utils/compression.py b/airflow/utils/compression.py index f6e923a15619f..2565299175bb0 100644 --- a/airflow/utils/compression.py +++ b/airflow/utils/compression.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/configuration.py b/airflow/utils/configuration.py index 18a338c23f6ff..6a621d5fa9c18 100644 --- a/airflow/utils/configuration.py +++ b/airflow/utils/configuration.py @@ -26,16 +26,18 @@ from airflow import configuration as conf -def tmp_configuration_copy(): +def tmp_configuration_copy(chmod=0o600): """ Returns a path for a temporary file including a full copy of the configuration settings. :return: a path to a temporary file """ - cfg_dict = conf.as_dict(display_sensitive=True) + cfg_dict = conf.as_dict(display_sensitive=True, raw=True) temp_fd, cfg_path = mkstemp() with os.fdopen(temp_fd, 'w') as temp_file: + if chmod is not None: + os.fchmod(temp_fd, chmod) json.dump(cfg_dict, temp_file) return cfg_path diff --git a/airflow/utils/dag_processing.py b/airflow/utils/dag_processing.py index 543eb41692e03..b2055a5d7f6fc 100644 --- a/airflow/utils/dag_processing.py +++ b/airflow/utils/dag_processing.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,17 +22,37 @@ from __future__ import print_function from __future__ import unicode_literals +import logging +import multiprocessing import os import re +import signal +import sys import time import zipfile from abc import ABCMeta, abstractmethod from collections import defaultdict - +from collections import namedtuple +from datetime import timedelta +from importlib import import_module +import enum + +import psutil +from six.moves import range, reload_module +from sqlalchemy import or_ +from tabulate import tabulate + +# To avoid circular imports +import airflow.models +from airflow import configuration as conf from airflow.dag.base_dag import BaseDag, BaseDagBag from airflow.exceptions import AirflowException +from airflow.settings import Stats +from airflow.models import errors from airflow.utils import timezone +from airflow.utils.db import provide_session from airflow.utils.log.logging_mixin import LoggingMixin +from airflow.utils.state import State class SimpleDag(BaseDag): @@ -44,7 +64,7 @@ class SimpleDag(BaseDag): def __init__(self, dag, pickle_id=None): """ :param dag: the DAG - :type dag: DAG + :type dag: airflow.models.DAG :param pickle_id: ID associated with the pickled version of this DAG. 
:type pickle_id: unicode """ @@ -121,6 +141,103 @@ def get_task_special_arg(self, task_id, special_arg_name): return None +class SimpleTaskInstance(object): + def __init__(self, ti): + self._dag_id = ti.dag_id + self._task_id = ti.task_id + self._execution_date = ti.execution_date + self._start_date = ti.start_date + self._end_date = ti.end_date + self._try_number = ti.try_number + self._state = ti.state + self._executor_config = ti.executor_config + if hasattr(ti, 'run_as_user'): + self._run_as_user = ti.run_as_user + else: + self._run_as_user = None + if hasattr(ti, 'pool'): + self._pool = ti.pool + else: + self._pool = None + if hasattr(ti, 'priority_weight'): + self._priority_weight = ti.priority_weight + else: + self._priority_weight = None + self._queue = ti.queue + self._key = ti.key + + @property + def dag_id(self): + return self._dag_id + + @property + def task_id(self): + return self._task_id + + @property + def execution_date(self): + return self._execution_date + + @property + def start_date(self): + return self._start_date + + @property + def end_date(self): + return self._end_date + + @property + def try_number(self): + return self._try_number + + @property + def state(self): + return self._state + + @property + def pool(self): + return self._pool + + @property + def priority_weight(self): + return self._priority_weight + + @property + def queue(self): + return self._queue + + @property + def key(self): + return self._key + + @property + def executor_config(self): + return self._executor_config + + @provide_session + def construct_task_instance(self, session=None, lock_for_update=False): + """ + Construct a TaskInstance from the database based on the primary key + + :param session: DB session. + :param lock_for_update: if True, indicates that the database should + lock the TaskInstance (issuing a FOR UPDATE clause) until the + session is committed. + """ + TI = airflow.models.TaskInstance + + qry = session.query(TI).filter( + TI.dag_id == self._dag_id, + TI.task_id == self._task_id, + TI.execution_date == self._execution_date) + + if lock_for_update: + ti = qry.with_for_update().first() + else: + ti = qry.first() + return ti + + class SimpleDagBag(BaseDagBag): """ A collection of SimpleDag objects with some convenience methods. @@ -131,7 +248,7 @@ def __init__(self, simple_dags): Constructor. :param simple_dags: SimpleDag objects that should be in this - :type: list(SimpleDag) + :type list(airflow.utils.dag_processing.SimpleDagBag) """ self.simple_dags = simple_dags self.dag_id_to_simple_dag = {} @@ -153,37 +270,57 @@ def get_dag(self, dag_id): :type dag_id: unicode :return: if the given DAG ID exists in the bag, return the BaseDag corresponding to that ID. Otherwise, throw an Exception - :rtype: SimpleDag + :rtype: airflow.utils.dag_processing.SimpleDag """ if dag_id not in self.dag_id_to_simple_dag: raise AirflowException("Unknown DAG ID {}".format(dag_id)) return self.dag_id_to_simple_dag[dag_id] -def list_py_file_paths(directory, safe_mode=True): +def list_py_file_paths(directory, safe_mode=True, + include_examples=None): """ Traverse a directory and look for Python files. 
:param directory: the directory to traverse :type directory: unicode :param safe_mode: whether to use a heuristic to determine whether a file - contains Airflow DAG definitions + contains Airflow DAG definitions :return: a list of paths to Python files in the specified directory :rtype: list[unicode] """ + if include_examples is None: + include_examples = conf.getboolean('core', 'LOAD_EXAMPLES') file_paths = [] if directory is None: return [] elif os.path.isfile(directory): return [directory] elif os.path.isdir(directory): - patterns = [] + patterns_by_dir = {} for root, dirs, files in os.walk(directory, followlinks=True): - ignore_file = [f for f in files if f == '.airflowignore'] - if ignore_file: - f = open(os.path.join(root, ignore_file[0]), 'r') - patterns += [p for p in f.read().split('\n') if p] - f.close() + patterns = patterns_by_dir.get(root, []) + ignore_file = os.path.join(root, '.airflowignore') + if os.path.isfile(ignore_file): + with open(ignore_file, 'r') as f: + # If we have new patterns create a copy so we don't change + # the previous list (which would affect other subdirs) + patterns = patterns + [p for p in f.read().split('\n') if p] + + # If we can ignore any subdirs entirely we should - fewer paths + # to walk is better. We have to modify the ``dirs`` array in + # place for this to affect os.walk + dirs[:] = [ + d + for d in dirs + if not any(re.search(p, os.path.join(root, d)) for p in patterns) + ] + + # We want patterns defined in a parent folder's .airflowignore to + # apply to subdirs too + for d in dirs: + patterns_by_dir[os.path.join(root, d)] = patterns + for f in files: try: file_path = os.path.join(root, f) @@ -212,6 +349,10 @@ def list_py_file_paths(directory, safe_mode=True): except Exception: log = LoggingMixin().log log.exception("Error while examining %s", f) + if include_examples: + import airflow.example_dags + example_dag_folder = airflow.example_dags.__path__[0] + file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False)) return file_paths @@ -268,7 +409,7 @@ def done(self): def result(self): """ :return: result of running SchedulerJob.process_file() - :rtype: list[SimpleDag] + :rtype: list[airflow.utils.dag_processing.SimpleDag] """ raise NotImplementedError() @@ -291,69 +432,584 @@ def file_path(self): raise NotImplementedError() +DagParsingStat = namedtuple('DagParsingStat', + ['file_paths', 'all_pids', 'done', + 'all_files_processed', 'result_count']) + + +class DagParsingSignal(enum.Enum): + AGENT_HEARTBEAT = 'agent_heartbeat' + MANAGER_DONE = 'manager_done' + TERMINATE_MANAGER = 'terminate_manager' + END_MANAGER = 'end_manager' + + +class DagFileProcessorAgent(LoggingMixin): + """ + Agent for DAG file processing. It is responsible for all DAG parsing + related jobs in scheduler process. Mainly it can spin up DagFileProcessorManager + in a subprocess, collect DAG parsing results from it and communicate + signal/DAG parsing stat with it. + """ + + def __init__(self, + dag_directory, + file_paths, + max_runs, + processor_factory, + async_mode): + """ + :param dag_directory: Directory where DAG definitions are kept. All + files in file_paths should be under this directory + :type dag_directory: unicode + :param file_paths: list of file paths that contain DAG definitions + :type file_paths: list[unicode] + :param max_runs: The number of times to parse and schedule each file. -1 + for unlimited. + :type max_runs: int + :param processor_factory: function that creates processors for DAG + definition files. 
Arguments are (dag_definition_path, log_file_path) + :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor) + :param async_mode: Whether to start agent in async mode + :type async_mode: bool + """ + self._file_paths = file_paths + self._file_path_queue = [] + self._dag_directory = dag_directory + self._max_runs = max_runs + self._processor_factory = processor_factory + self._async_mode = async_mode + # Map from file path to the processor + self._processors = {} + # Map from file path to the last runtime + self._last_runtime = {} + # Map from file path to the last finish time + self._last_finish_time = {} + # Map from file path to the number of runs + self._run_count = defaultdict(int) + # Pids of DAG parse + self._all_pids = [] + # Pipe for communicating signals + self._parent_signal_conn, self._child_signal_conn = multiprocessing.Pipe() + # Pipe for communicating DagParsingStat + self._stat_queue = multiprocessing.Queue() + self._result_queue = multiprocessing.Queue() + self._process = None + self._done = False + # Initialized as true so we do not deactivate w/o any actual DAG parsing. + self._all_files_processed = True + self._result_count = 0 + + def start(self): + """ + Launch DagFileProcessorManager processor and start DAG parsing loop in manager. + """ + self._process = self._launch_process(self._dag_directory, + self._file_paths, + self._max_runs, + self._processor_factory, + self._child_signal_conn, + self._stat_queue, + self._result_queue, + self._async_mode) + self.log.info("Launched DagFileProcessorManager with pid: %s", self._process.pid) + + def heartbeat(self): + """ + Should only be used when launched DAG file processor manager in sync mode. + Send agent heartbeat signal to the manager. + """ + self._parent_signal_conn.send(DagParsingSignal.AGENT_HEARTBEAT) + + def wait_until_finished(self): + """ + Should only be used when launched DAG file processor manager in sync mode. + Wait for done signal from the manager. + """ + while True: + if self._parent_signal_conn.recv() == DagParsingSignal.MANAGER_DONE: + break + + @staticmethod + def _launch_process(dag_directory, + file_paths, + max_runs, + processor_factory, + signal_conn, + _stat_queue, + result_queue, + async_mode): + def helper(): + # Reload configurations and settings to avoid collision with parent process. + # Because this process may need custom configurations that cannot be shared, + # e.g. RotatingFileHandler. And it can cause connection corruption if we + # do not recreate the SQLA connection pool. + os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER'] = 'True' + # Replicating the behavior of how logging module was loaded + # in logging_config.py + reload_module(import_module(airflow.settings.LOGGING_CLASS_PATH.rsplit('.', 1)[0])) + reload_module(airflow.settings) + airflow.settings.initialize() + del os.environ['CONFIG_PROCESSOR_MANAGER_LOGGER'] + processor_manager = DagFileProcessorManager(dag_directory, + file_paths, + max_runs, + processor_factory, + signal_conn, + _stat_queue, + result_queue, + async_mode) + + processor_manager.start() + + p = multiprocessing.Process(target=helper, + args=(), + name="DagFileProcessorManager") + p.start() + return p + + def harvest_simple_dags(self): + """ + Harvest DAG parsing results from result queue and sync metadata from stat queue. + :return: List of parsing result in SimpleDag format. + """ + # Metadata and results to be harvested can be inconsistent, + # but it should not be a big problem. 
+ self._sync_metadata() + # Heartbeating after syncing metadata so we do not restart manager + # if it processed all files for max_run times and exit normally. + self._heartbeat_manager() + simple_dags = [] + # multiprocessing.Queue().qsize will not work on MacOS. + if sys.platform == "darwin": + qsize = self._result_count + else: + qsize = self._result_queue.qsize() + for _ in range(qsize): + simple_dags.append(self._result_queue.get()) + + self._result_count = 0 + + return simple_dags + + def _heartbeat_manager(self): + """ + Heartbeat DAG file processor and start it if it is not alive. + :return: + """ + if self._process and not self._process.is_alive() and not self.done: + self.start() + + def _sync_metadata(self): + """ + Sync metadata from stat queue and only keep the latest stat. + :return: + """ + while not self._stat_queue.empty(): + stat = self._stat_queue.get() + self._file_paths = stat.file_paths + self._all_pids = stat.all_pids + self._done = stat.done + self._all_files_processed = stat.all_files_processed + self._result_count += stat.result_count + + @property + def file_paths(self): + return self._file_paths + + @property + def done(self): + return self._done + + @property + def all_files_processed(self): + return self._all_files_processed + + def terminate(self): + """ + Send termination signal to DAG parsing processor manager + and expect it to terminate all DAG file processors. + """ + self.log.info("Sending termination message to manager.") + self._child_signal_conn.send(DagParsingSignal.TERMINATE_MANAGER) + + def end(self): + """ + Terminate (and then kill) the manager process launched. + :return: + """ + if not self._process: + self.log.warn('Ending without manager process.') + return + this_process = psutil.Process(os.getpid()) + try: + manager_process = psutil.Process(self._process.pid) + except psutil.NoSuchProcess: + self.log.info("Manager process not running.") + return + + # First try SIGTERM + if manager_process.is_running() \ + and manager_process.pid in [x.pid for x in this_process.children()]: + self.log.info("Terminating manager process: %s", manager_process.pid) + manager_process.terminate() + # TODO: Remove magic number + timeout = 5 + self.log.info("Waiting up to %ss for manager process to exit...", timeout) + try: + psutil.wait_procs({manager_process}, timeout) + except psutil.TimeoutExpired: + self.log.debug("Ran out of time while waiting for " + "processes to exit") + + # Then SIGKILL + if manager_process.is_running() \ + and manager_process.pid in [x.pid for x in this_process.children()]: + self.log.info("Killing manager process: %s", manager_process.pid) + manager_process.kill() + manager_process.wait() + + class DagFileProcessorManager(LoggingMixin): """ Given a list of DAG definition files, this kicks off several processors - in parallel to process them. The parallelism is limited and as the + in parallel to process them and put the results to a multiprocessing.Queue + for DagFileProcessorAgent to harvest. The parallelism is limited and as the processors finish, more are launched. The files are processed over and over again, but no more often than the specified interval. 
:type _file_path_queue: list[unicode] :type _processors: dict[unicode, AbstractDagFileProcessor] :type _last_runtime: dict[unicode, float] - :type _last_finish_time: dict[unicode, datetime] + :type _last_finish_time: dict[unicode, datetime.datetime] """ def __init__(self, dag_directory, file_paths, - parallelism, - process_file_interval, - min_file_parsing_loop_time, max_runs, - processor_factory): + processor_factory, + signal_conn, + stat_queue, + result_queue, + async_mode=True): """ :param dag_directory: Directory where DAG definitions are kept. All - files in file_paths should be under this directory + files in file_paths should be under this directory :type dag_directory: unicode :param file_paths: list of file paths that contain DAG definitions :type file_paths: list[unicode] - :param parallelism: maximum number of simultaneous process to run at once - :type parallelism: int - :param process_file_interval: process a file at most once every this - many seconds - :type process_file_interval: float - :param min_file_parsing_loop_time: wait until at least this many seconds have - passed before parsing files once all files have finished parsing. - :type min_file_parsing_loop_time: float :param max_runs: The number of times to parse and schedule each file. -1 - for unlimited. + for unlimited. :type max_runs: int - :type process_file_interval: float :param processor_factory: function that creates processors for DAG - definition files. Arguments are (dag_definition_path) - :type processor_factory: (unicode, unicode) -> (AbstractDagFileProcessor) - + definition files. Arguments are (dag_definition_path) + :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor) + :param signal_conn: connection to communicate signal with processor agent. + :type signal_conn: airflow.models.connection.Connection + :param stat_queue: the queue to use for passing back parsing stat to agent. + :type stat_queue: multiprocessing.Queue + :param result_queue: the queue to use for passing back the result to agent. + :type result_queue: multiprocessing.Queue + :param async_mode: whether to start the manager in async mode + :type async_mode: bool """ self._file_paths = file_paths self._file_path_queue = [] - self._parallelism = parallelism self._dag_directory = dag_directory self._max_runs = max_runs - self._process_file_interval = process_file_interval - self._min_file_parsing_loop_time = min_file_parsing_loop_time self._processor_factory = processor_factory + self._signal_conn = signal_conn + self._stat_queue = stat_queue + self._result_queue = result_queue + self._async_mode = async_mode + + self._parallelism = conf.getint('scheduler', 'max_threads') + if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1: + self.log.error("Cannot use more than 1 thread when using sqlite. " + "Setting parallelism to 1") + self._parallelism = 1 + + # Parse and schedule each file no faster than this interval. + self._file_process_interval = conf.getint('scheduler', + 'min_file_process_interval') + # How often to print out DAG file processing stats to the log. Default to + # 30 seconds. + self.print_stats_interval = conf.getint('scheduler', + 'print_stats_interval') + # How many seconds do we wait for tasks to heartbeat before mark them as zombies. 
+ self._zombie_threshold_secs = ( + conf.getint('scheduler', 'scheduler_zombie_task_threshold')) # Map from file path to the processor self._processors = {} # Map from file path to the last runtime self._last_runtime = {} # Map from file path to the last finish time self._last_finish_time = {} + self._last_zombie_query_time = timezone.utcnow() + # Last time that the DAG dir was traversed to look for files + self.last_dag_dir_refresh_time = timezone.utcnow() + # Last time stats were printed + self.last_stat_print_time = timezone.datetime(2000, 1, 1) + # TODO: Remove magic number + self._zombie_query_interval = 10 # Map from file path to the number of runs self._run_count = defaultdict(int) - # Scheduler heartbeat key. + # Manager heartbeat key. self._heart_beat_key = 'heart-beat' + # How often to scan the DAGs directory for new files. Default to 5 minutes. + self.dag_dir_list_interval = conf.getint('scheduler', + 'dag_dir_list_interval') + + self._log = logging.getLogger('airflow.processor_manager') + + signal.signal(signal.SIGINT, self._exit_gracefully) + signal.signal(signal.SIGTERM, self._exit_gracefully) + + def _exit_gracefully(self, signum, frame): + """ + Helper method to clean up DAG file processors to avoid leaving orphan processes. + """ + self.log.info("Exiting gracefully upon receiving signal %s", signum) + self.terminate() + self.end() + self.log.debug("Finished terminating DAG processors.") + sys.exit(os.EX_OK) + + def start(self): + """ + Use multiple processes to parse and generate tasks for the + DAGs in parallel. By processing them in separate processes, + we can get parallelism and isolation from potentially harmful + user code. + """ + + self.log.info("Processing files using up to %s processes at a time ", self._parallelism) + self.log.info("Process each file at most once every %s seconds", self._file_process_interval) + self.log.info( + "Checking for new files in %s every %s seconds", self._dag_directory, self.dag_dir_list_interval + ) + + if self._async_mode: + self.log.debug("Starting DagFileProcessorManager in async mode") + self.start_in_async() + else: + self.log.debug("Starting DagFileProcessorManager in sync mode") + self.start_in_sync() + + def start_in_async(self): + """ + Parse DAG files repeatedly in a standalone loop. + """ + while True: + loop_start_time = time.time() + + if self._signal_conn.poll(): + agent_signal = self._signal_conn.recv() + if agent_signal == DagParsingSignal.TERMINATE_MANAGER: + self.terminate() + break + elif agent_signal == DagParsingSignal.END_MANAGER: + self.end() + sys.exit(os.EX_OK) + + self._refresh_dag_dir() + + simple_dags = self.heartbeat() + for simple_dag in simple_dags: + self._result_queue.put(simple_dag) + + self._print_stat() + + all_files_processed = all(self.get_last_finish_time(x) is not None + for x in self.file_paths) + max_runs_reached = self.max_runs_reached() + + dag_parsing_stat = DagParsingStat(self._file_paths, + self.get_all_pids(), + max_runs_reached, + all_files_processed, + len(simple_dags)) + self._stat_queue.put(dag_parsing_stat) + + if max_runs_reached: + self.log.info("Exiting dag parsing loop as all files " + "have been processed %s times", self._max_runs) + break + + loop_duration = time.time() - loop_start_time + if loop_duration < 1: + sleep_length = 1 - loop_duration + self.log.debug("Sleeping for %.2f seconds to prevent excessive logging", sleep_length) + time.sleep(sleep_length) + + def start_in_sync(self): + """ + Parse DAG files in a loop controlled by DagParsingSignal. 
+ Actual DAG parsing loop will run once upon receiving one + agent heartbeat message and will report done when finished the loop. + """ + while True: + agent_signal = self._signal_conn.recv() + if agent_signal == DagParsingSignal.TERMINATE_MANAGER: + self.terminate() + break + elif agent_signal == DagParsingSignal.END_MANAGER: + self.end() + sys.exit(os.EX_OK) + elif agent_signal == DagParsingSignal.AGENT_HEARTBEAT: + + self._refresh_dag_dir() + + simple_dags = self.heartbeat() + for simple_dag in simple_dags: + self._result_queue.put(simple_dag) + + self._print_stat() + + all_files_processed = all(self.get_last_finish_time(x) is not None + for x in self.file_paths) + max_runs_reached = self.max_runs_reached() + + dag_parsing_stat = DagParsingStat(self._file_paths, + self.get_all_pids(), + self.max_runs_reached(), + all_files_processed, + len(simple_dags)) + self._stat_queue.put(dag_parsing_stat) + + self.wait_until_finished() + self._signal_conn.send(DagParsingSignal.MANAGER_DONE) + + if max_runs_reached: + self.log.info("Exiting dag parsing loop as all files " + "have been processed %s times", self._max_runs) + self._signal_conn.send(DagParsingSignal.MANAGER_DONE) + break + + def _refresh_dag_dir(self): + """ + Refresh file paths from dag dir if we haven't done it for too long. + """ + elapsed_time_since_refresh = (timezone.utcnow() - + self.last_dag_dir_refresh_time).total_seconds() + if elapsed_time_since_refresh > self.dag_dir_list_interval: + # Build up a list of Python files that could contain DAGs + self.log.info("Searching for files in %s", self._dag_directory) + self._file_paths = list_py_file_paths(self._dag_directory) + self.last_dag_dir_refresh_time = timezone.utcnow() + self.log.info("There are %s files in %s", len(self._file_paths), self._dag_directory) + self.set_file_paths(self._file_paths) + + try: + self.log.debug("Removing old import errors") + self.clear_nonexistent_import_errors() + except Exception: + self.log.exception("Error removing old import errors") + + def _print_stat(self): + """ + Occasionally print out stats about how fast the files are getting processed + """ + if ((timezone.utcnow() - self.last_stat_print_time).total_seconds() > + self.print_stats_interval): + if len(self._file_paths) > 0: + self._log_file_processing_stats(self._file_paths) + self.last_stat_print_time = timezone.utcnow() + + @provide_session + def clear_nonexistent_import_errors(self, session): + """ + Clears import errors for files that no longer exist. + + :param session: session for ORM operations + :type session: sqlalchemy.orm.session.Session + """ + query = session.query(errors.ImportError) + if self._file_paths: + query = query.filter( + ~errors.ImportError.filename.in_(self._file_paths) + ) + query.delete(synchronize_session='fetch') + session.commit() + + def _log_file_processing_stats(self, known_file_paths): + """ + Print out stats about how files are getting processed. + + :param known_file_paths: a list of file paths that may contain Airflow + DAG definitions + :type known_file_paths: list[unicode] + :return: None + """ + + # File Path: Path to the file containing the DAG definition + # PID: PID associated with the process that's processing the file. May + # be empty. + # Runtime: If the process is currently running, how long it's been + # running for in seconds. + # Last Runtime: If the process ran before, how long did it take to + # finish in seconds + # Last Run: When the file finished processing in the previous run. 
+ headers = ["File Path", + "PID", + "Runtime", + "Last Runtime", + "Last Run"] + + rows = [] + for file_path in known_file_paths: + last_runtime = self.get_last_runtime(file_path) + file_name = os.path.basename(file_path) + file_name = os.path.splitext(file_name)[0].replace(os.sep, '.') + if last_runtime: + Stats.gauge( + 'dag_processing.last_runtime.{}'.format(file_name), + last_runtime + ) + + processor_pid = self.get_pid(file_path) + processor_start_time = self.get_start_time(file_path) + runtime = ((timezone.utcnow() - processor_start_time).total_seconds() + if processor_start_time else None) + last_run = self.get_last_finish_time(file_path) + if last_run: + seconds_ago = (timezone.utcnow() - last_run).total_seconds() + Stats.gauge( + 'dag_processing.last_run.seconds_ago.{}'.format(file_name), + seconds_ago + ) + + rows.append((file_path, + processor_pid, + runtime, + last_runtime, + last_run)) + + # Sort by longest last runtime. (Can't sort None values in python3) + rows = sorted(rows, key=lambda x: x[3] or 0.0) + + formatted_rows = [] + for file_path, pid, runtime, last_runtime, last_run in rows: + formatted_rows.append((file_path, + pid, + "{:.2f}s".format(runtime) + if runtime else None, + "{:.2f}s".format(last_runtime) + if last_runtime else None, + last_run.strftime("%Y-%m-%dT%H:%M:%S") + if last_run else None)) + log_str = ("\n" + + "=" * 80 + + "\n" + + "DAG File Processing Stats\n\n" + + tabulate(formatted_rows, headers=headers) + + "\n" + + "=" * 80) + + self.log.info(log_str) + @property def file_paths(self): return self._file_paths @@ -363,7 +1019,7 @@ def get_pid(self, file_path): :param file_path: the path to the file that's being processed :type file_path: unicode :return: the PID of the process processing the given file or None if - the specified file is not being processed + the specified file is not being processed :rtype: int """ if file_path in self._processors: @@ -382,8 +1038,8 @@ def get_runtime(self, file_path): :param file_path: the path to the file that's being processed :type file_path: unicode :return: the current runtime (in seconds) of the process that's - processing the specified file or None if the file is not currently - being processed + processing the specified file or None if the file is not currently + being processed """ if file_path in self._processors: return (timezone.utcnow() - self._processors[file_path].start_time)\ @@ -395,7 +1051,7 @@ def get_last_runtime(self, file_path): :param file_path: the path to the file that was processed :type file_path: unicode :return: the runtime (in seconds) of the process of the last run, or - None if the file was never processed. + None if the file was never processed. :rtype: float """ return self._last_runtime.get(file_path) @@ -405,7 +1061,7 @@ def get_last_finish_time(self, file_path): :param file_path: the path to the file that was processed :type file_path: unicode :return: the finish time of the process of the last run, or None if the - file was never processed. + file was never processed. 
:rtype: datetime """ return self._last_finish_time.get(file_path) @@ -415,7 +1071,7 @@ def get_start_time(self, file_path): :param file_path: the path to the file that's being processed :type file_path: unicode :return: the start time of the process that's processing the - specified file or None if the file is not currently being processed + specified file or None if the file is not currently being processed :rtype: datetime """ if file_path in self._processors: @@ -460,13 +1116,13 @@ def wait_until_finished(self): def heartbeat(self): """ - This should be periodically called by the scheduler. This method will + This should be periodically called by the manager loop. This method will kick off new processes to process DAG definition files and read the results from the finished processors. :return: a list of SimpleDags that were produced by processors that - have finished since the last time this was called - :rtype: list[SimpleDag] + have finished since the last time this was called + :rtype: list[airflow.utils.dag_processing.SimpleDag] """ finished_processors = {} """:type : dict[unicode, AbstractDagFileProcessor]""" @@ -475,7 +1131,7 @@ def heartbeat(self): for file_path, processor in self._processors.items(): if processor.done: - self.log.info("Processor for %s finished", file_path) + self.log.debug("Processor for %s finished", file_path) now = timezone.utcnow() finished_processors[file_path] = processor self._last_runtime[file_path] = (now - @@ -486,7 +1142,7 @@ def heartbeat(self): running_processors[file_path] = processor self._processors = running_processors - self.log.debug("%s/%s scheduler processes running", + self.log.debug("%s/%s DAG parsing processes running", len(self._processors), self._parallelism) self.log.debug("%s file paths queued for processing", @@ -512,24 +1168,12 @@ def heartbeat(self): file_paths_in_progress = self._processors.keys() now = timezone.utcnow() file_paths_recently_processed = [] - - longest_parse_duration = 0 for file_path in self._file_paths: last_finish_time = self.get_last_finish_time(file_path) - if last_finish_time is not None: - duration = now - last_finish_time - longest_parse_duration = max(duration.total_seconds(), - longest_parse_duration) - if duration.total_seconds() < self._process_file_interval: - file_paths_recently_processed.append(file_path) - - sleep_length = max(self._min_file_parsing_loop_time - longest_parse_duration, - 0) - if sleep_length > 0: - self.log.debug("Sleeping for %.2f seconds to prevent excessive " - "logging", - sleep_length) - time.sleep(sleep_length) + if (last_finish_time is not None and + (now - last_finish_time).total_seconds() < + self._file_process_interval): + file_paths_recently_processed.append(file_path) files_paths_at_run_limit = [file_path for file_path, num_runs in self._run_count.items() @@ -553,24 +1197,61 @@ def heartbeat(self): self._file_path_queue.extend(files_paths_to_queue) + zombies = self._find_zombies() + # Start more processors if we have enough slots and files to process while (self._parallelism - len(self._processors) > 0 and len(self._file_path_queue) > 0): file_path = self._file_path_queue.pop(0) - processor = self._processor_factory(file_path) + processor = self._processor_factory(file_path, zombies) processor.start() - self.log.info( + self.log.debug( "Started a process (PID: %s) to generate tasks for %s", processor.pid, file_path ) self._processors[file_path] = processor - # Update scheduler heartbeat count. + # Update heartbeat count. 
self._run_count[self._heart_beat_key] += 1 return simple_dags + @provide_session + def _find_zombies(self, session): + """ + Find zombie task instances, which are tasks haven't heartbeated for too long. + :return: Zombie task instances in SimpleTaskInstance format. + """ + now = timezone.utcnow() + zombies = [] + if (now - self._last_zombie_query_time).total_seconds() \ + > self._zombie_query_interval: + # to avoid circular imports + from airflow.jobs import LocalTaskJob as LJ + self.log.info("Finding 'running' jobs without a recent heartbeat") + TI = airflow.models.TaskInstance + limit_dttm = timezone.utcnow() - timedelta( + seconds=self._zombie_threshold_secs) + self.log.info("Failing jobs without heartbeat after %s", limit_dttm) + + tis = ( + session.query(TI) + .join(LJ, TI.job_id == LJ.id) + .filter(TI.state == State.RUNNING) + .filter( + or_( + LJ.state != State.RUNNING, + LJ.latest_heartbeat < limit_dttm, + ) + ).all() + ) + self._last_zombie_query_time = timezone.utcnow() + for ti in tis: + zombies.append(SimpleTaskInstance(ti)) + + return zombies + def max_runs_reached(self): """ :return: whether all file paths have been processed max_runs times @@ -578,7 +1259,7 @@ def max_runs_reached(self): if self._max_runs == -1: # Unlimited runs. return False for file_path in self._file_paths: - if self._run_count[file_path] != self._max_runs: + if self._run_count[file_path] < self._max_runs: return False if self._run_count[self._heart_beat_key] < self._max_runs: return False @@ -591,3 +1272,40 @@ def terminate(self): """ for processor in self._processors.values(): processor.terminate() + + def end(self): + """ + Kill all child processes on exit since we don't want to leave + them as orphaned. + """ + pids_to_kill = self.get_all_pids() + if len(pids_to_kill) > 0: + # First try SIGTERM + this_process = psutil.Process(os.getpid()) + # Only check child processes to ensure that we don't have a case + # where we kill the wrong process because a child process died + # but the PID got reused. 
+ child_processes = [x for x in this_process.children(recursive=True) + if x.is_running() and x.pid in pids_to_kill] + for child in child_processes: + self.log.info("Terminating child PID: %s", child.pid) + child.terminate() + # TODO: Remove magic number + timeout = 5 + self.log.info("Waiting up to %s seconds for processes to exit...", timeout) + try: + psutil.wait_procs( + child_processes, timeout=timeout, + callback=lambda x: self.log.info('Terminated PID %s', x.pid)) + except psutil.TimeoutExpired: + self.log.debug("Ran out of time while waiting for processes to exit") + + # Then SIGKILL + child_processes = [x for x in this_process.children(recursive=True) + if x.is_running() and x.pid in pids_to_kill] + if len(child_processes) > 0: + self.log.info("SIGKILL processes that did not terminate gracefully") + for child in child_processes: + self.log.info("Killing child PID: %s", child.pid) + child.kill() + child.wait() diff --git a/airflow/utils/dates.py b/airflow/utils/dates.py index c147a65430da2..e3340f557f264 100644 --- a/airflow/utils/dates.py +++ b/airflow/utils/dates.py @@ -24,12 +24,11 @@ from airflow.utils import timezone from datetime import datetime, timedelta -from dateutil.relativedelta import relativedelta # for doctest +from dateutil.relativedelta import relativedelta # noqa: F401 for doctest import six from croniter import croniter - cron_presets = { '@hourly': '0 * * * *', '@daily': '0 0 * * *', @@ -39,15 +38,24 @@ } -def date_range( - start_date, - end_date=None, - num=None, - delta=None): +def date_range(start_date, end_date=None, num=None, delta=None): """ Get a set of dates as a list based on a start, end and delta, delta - can be something that can be added to ``datetime.datetime`` - or a cron expression as a ``str`` + can be something that can be added to `datetime.datetime` + or a cron expression as a `str` + + :Example:: + + date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1)) + [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), + datetime.datetime(2016, 1, 3, 0, 0)] + date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *') + [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), + datetime.datetime(2016, 1, 3, 0, 0)] + date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *") + [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), + datetime.datetime(2016, 3, 1, 0, 0)] + :param start_date: anchor date to start the series from :type start_date: datetime.datetime :param end_date: right boundary for the date range @@ -56,15 +64,6 @@ def date_range( number of entries you want in the range. 
This number can be negative, output will always be sorted regardless :type num: int - >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta=timedelta(1)) - [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), - datetime.datetime(2016, 1, 3, 0, 0)] - >>> date_range(datetime(2016, 1, 1), datetime(2016, 1, 3), delta='0 0 * * *') - [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 1, 2, 0, 0), - datetime.datetime(2016, 1, 3, 0, 0)] - >>> date_range(datetime(2016, 1, 1), datetime(2016, 3, 3), delta="0 0 0 * *") - [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), - datetime.datetime(2016, 3, 1, 0, 0)] """ if not delta: return [] @@ -160,7 +159,7 @@ def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)): # We first search an upper limit for i for which start_date + upper * delta # exceeds dt. upper = 1 - while start_date + upper*delta < dt: + while start_date + upper * delta < dt: # To speed up finding an upper limit we grow this exponentially by a # factor of 2 upper *= 2 @@ -177,20 +176,18 @@ def round_time(dt, delta, start_date=timezone.make_aware(datetime.min)): # Invariant: start + lower * delta < dt <= start + upper * delta # If start_date + (lower + 1)*delta exceeds dt, then either lower or # lower+1 has to be the solution we are searching for - if start_date + (lower + 1)*delta >= dt: + if start_date + (lower + 1) * delta >= dt: # Check if start_date + (lower + 1)*delta or # start_date + lower*delta is closer to dt and return the solution - if ( - (start_date + (lower + 1) * delta) - dt <= - dt - (start_date + lower * delta)): - return start_date + (lower + 1)*delta + if (start_date + (lower + 1) * delta) - dt <= dt - (start_date + lower * delta): + return start_date + (lower + 1) * delta else: return start_date + lower * delta # We intersect the interval and either replace the lower or upper # limit with the candidate candidate = lower + (upper - lower) // 2 - if start_date + candidate*delta >= dt: + if start_date + candidate * delta >= dt: upper = candidate else: lower = candidate @@ -209,11 +206,11 @@ def infer_time_unit(time_seconds_arr): if len(time_seconds_arr) == 0: return 'hours' max_time_seconds = max(time_seconds_arr) - if max_time_seconds <= 60*2: + if max_time_seconds <= 60 * 2: return 'seconds' - elif max_time_seconds <= 60*60*2: + elif max_time_seconds <= 60 * 60 * 2: return 'minutes' - elif max_time_seconds <= 24*60*60*2: + elif max_time_seconds <= 24 * 60 * 60 * 2: return 'hours' else: return 'days' @@ -224,11 +221,11 @@ def scale_time_units(time_seconds_arr, unit): Convert an array of time durations in seconds to the specified time unit. 
""" if unit == 'minutes': - return list(map(lambda x: x*1.0/60, time_seconds_arr)) + return list(map(lambda x: x * 1.0 / 60, time_seconds_arr)) elif unit == 'hours': - return list(map(lambda x: x*1.0/(60*60), time_seconds_arr)) + return list(map(lambda x: x * 1.0 / (60 * 60), time_seconds_arr)) elif unit == 'days': - return list(map(lambda x: x*1.0/(24*60*60), time_seconds_arr)) + return list(map(lambda x: x * 1.0 / (24 * 60 * 60), time_seconds_arr)) return time_seconds_arr diff --git a/airflow/utils/db.py b/airflow/utils/db.py index b5e0c49c61d33..da3cc5c36e9b4 100644 --- a/airflow/utils/db.py +++ b/airflow/utils/db.py @@ -41,9 +41,8 @@ def create_session(): session = settings.Session() try: yield session - session.expunge_all() session.commit() - except: + except Exception: session.rollback() raise finally: @@ -78,9 +77,8 @@ def wrapper(*args, **kwargs): @provide_session def merge_conn(conn, session=None): - from airflow import models - C = models.Connection - if not session.query(C).filter(C.conn_id == conn.conn_id).first(): + from airflow.models.connection import Connection + if not session.query(Connection).filter(Connection.conn_id == conn.conn_id).first(): session.add(conn) session.commit() @@ -89,142 +87,142 @@ def initdb(rbac=False): session = settings.Session() from airflow import models + from airflow.models.connection import Connection upgradedb() merge_conn( - models.Connection( + Connection( conn_id='airflow_db', conn_type='mysql', - host='localhost', login='root', password='', + host='mysql', login='root', password='', schema='airflow')) merge_conn( - models.Connection( - conn_id='airflow_ci', conn_type='mysql', - host='localhost', login='root', extra="{\"local_infile\": true}", - schema='airflow_ci')) - merge_conn( - models.Connection( - conn_id='beeline_default', conn_type='beeline', port="10000", + Connection( + conn_id='beeline_default', conn_type='beeline', port=10000, host='localhost', extra="{\"use_beeline\": true, \"auth\": \"\"}", schema='default')) merge_conn( - models.Connection( + Connection( conn_id='bigquery_default', conn_type='google_cloud_platform', schema='default')) merge_conn( - models.Connection( + Connection( conn_id='local_mysql', conn_type='mysql', host='localhost', login='airflow', password='airflow', schema='airflow')) merge_conn( - models.Connection( + Connection( conn_id='presto_default', conn_type='presto', host='localhost', schema='hive', port=3400)) merge_conn( - models.Connection( + Connection( conn_id='google_cloud_default', conn_type='google_cloud_platform', schema='default',)) merge_conn( - models.Connection( + Connection( conn_id='hive_cli_default', conn_type='hive_cli', schema='default',)) merge_conn( - models.Connection( + Connection( conn_id='hiveserver2_default', conn_type='hiveserver2', host='localhost', schema='default', port=10000)) merge_conn( - models.Connection( + Connection( conn_id='metastore_default', conn_type='hive_metastore', host='localhost', extra="{\"authMechanism\": \"PLAIN\"}", port=9083)) merge_conn( - models.Connection( + Connection( conn_id='mongo_default', conn_type='mongo', - host='localhost', port=27017)) + host='mongo', port=27017)) merge_conn( - models.Connection( + Connection( conn_id='mysql_default', conn_type='mysql', login='root', - host='localhost')) + schema='airflow', + host='mysql')) merge_conn( - models.Connection( + Connection( conn_id='postgres_default', conn_type='postgres', login='postgres', + password='airflow', schema='airflow', - host='localhost')) + host='postgres')) merge_conn( - 
models.Connection( + Connection( conn_id='sqlite_default', conn_type='sqlite', host='/tmp/sqlite_default.db')) merge_conn( - models.Connection( + Connection( conn_id='http_default', conn_type='http', host='https://www.google.com/')) merge_conn( - models.Connection( + Connection( conn_id='mssql_default', conn_type='mssql', host='localhost', port=1433)) merge_conn( - models.Connection( + Connection( conn_id='vertica_default', conn_type='vertica', host='localhost', port=5433)) merge_conn( - models.Connection( + Connection( conn_id='wasb_default', conn_type='wasb', extra='{"sas_token": null}')) merge_conn( - models.Connection( + Connection( conn_id='webhdfs_default', conn_type='hdfs', host='localhost', port=50070)) merge_conn( - models.Connection( + Connection( conn_id='ssh_default', conn_type='ssh', host='localhost')) merge_conn( - models.Connection( + Connection( conn_id='sftp_default', conn_type='sftp', - host='localhost', port=22, login='travis', + host='localhost', port=22, login='airflow', extra=''' - {"private_key": "~/.ssh/id_rsa", "ignore_hostkey_verification": true} + {"key_file": "~/.ssh/id_rsa", "no_host_key_check": true} ''')) merge_conn( - models.Connection( + Connection( conn_id='fs_default', conn_type='fs', extra='{"path": "/"}')) merge_conn( - models.Connection( + Connection( conn_id='aws_default', conn_type='aws', extra='{"region_name": "us-east-1"}')) merge_conn( - models.Connection( + Connection( conn_id='spark_default', conn_type='spark', host='yarn', extra='{"queue": "root.default"}')) merge_conn( - models.Connection( + Connection( conn_id='druid_broker_default', conn_type='druid', host='druid-broker', port=8082, extra='{"endpoint": "druid/v2/sql"}')) merge_conn( - models.Connection( + Connection( conn_id='druid_ingest_default', conn_type='druid', host='druid-overlord', port=8081, extra='{"endpoint": "druid/indexer/v1/task"}')) merge_conn( - models.Connection( + Connection( conn_id='redis_default', conn_type='redis', - host='localhost', port=6379, + host='redis', port=6379, extra='{"db": 0}')) merge_conn( - models.Connection( + Connection( conn_id='sqoop_default', conn_type='sqoop', host='rmdbs', extra='')) merge_conn( - models.Connection( + Connection( conn_id='emr_default', conn_type='emr', extra=''' { "Name": "default_job_flow_name", "LogUri": "s3://my-emr-log-bucket/default_job_flow_location", "ReleaseLabel": "emr-4.6.0", "Instances": { + "Ec2KeyName": "mykey", + "Ec2SubnetId": "somesubnet", "InstanceGroups": [ { "Name": "Master nodes", @@ -240,12 +238,10 @@ def initdb(rbac=False): "InstanceType": "r3.2xlarge", "InstanceCount": 1 } - ] + ], + "TerminationProtected": false, + "KeepJobFlowAliveWhenNoSteps": false }, - "Ec2KeyName": "mykey", - "KeepJobFlowAliveWhenNoSteps": false, - "TerminationProtected": false, - "Ec2SubnetId": "somesubnet", "Applications":[ { "Name": "Spark" } ], @@ -265,25 +261,41 @@ def initdb(rbac=False): } ''')) merge_conn( - models.Connection( + Connection( conn_id='databricks_default', conn_type='databricks', host='localhost')) merge_conn( - models.Connection( + Connection( conn_id='qubole_default', conn_type='qubole', - host= 'localhost')) + host='localhost')) merge_conn( - models.Connection( + Connection( conn_id='segment_default', conn_type='segment', extra='{"write_key": "my-segment-write-key"}')), merge_conn( - models.Connection( + Connection( conn_id='azure_data_lake_default', conn_type='azure_data_lake', extra='{"tenant": "", "account_name": "" }')) merge_conn( - models.Connection( + Connection( + conn_id='azure_cosmos_default', 
conn_type='azure_cosmos', + extra='{"database_name": "", "collection_name": "" }')) + merge_conn( + Connection( + conn_id='azure_container_instances_default', conn_type='azure_container_instances', + extra='{"tenantId": "", "subscriptionId": "" }')) + merge_conn( + Connection( conn_id='cassandra_default', conn_type='cassandra', - host='localhost', port=9042)) + host='cassandra', port=9042)) + merge_conn( + Connection( + conn_id='dingding_default', conn_type='http', + host='', password='')) + merge_conn( + Connection( + conn_id='opsgenie_default', conn_type='http', + host='', password='')) # Known event types KET = models.KnownEventType @@ -341,8 +353,8 @@ def upgradedb(): package_dir = os.path.normpath(os.path.join(current_dir, '..')) directory = os.path.join(package_dir, 'migrations') config = Config(os.path.join(package_dir, 'alembic.ini')) - config.set_main_option('script_location', directory) - config.set_main_option('sqlalchemy.url', settings.SQL_ALCHEMY_CONN) + config.set_main_option('script_location', directory.replace('%', '%%')) + config.set_main_option('sqlalchemy.url', settings.SQL_ALCHEMY_CONN.replace('%', '%%')) command.upgrade(config, 'heads') @@ -357,7 +369,7 @@ def resetdb(rbac): log.info("Dropping tables that exist") - models.Base.metadata.drop_all(settings.engine) + models.base.Base.metadata.drop_all(settings.engine) mc = MigrationContext.configure(settings.engine) if mc._version.exists(settings.engine): mc._version.drop(settings.engine) diff --git a/airflow/utils/decorators.py b/airflow/utils/decorators.py index 966d27e30aeae..15847cf050857 100644 --- a/airflow/utils/decorators.py +++ b/airflow/utils/decorators.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -100,5 +100,22 @@ def wrapper(*args, **kwargs): return wrapper if 'BUILDING_AIRFLOW_DOCS' in os.environ: + # flake8: noqa: F811 # Monkey patch hook to get good function headers while building docs apply_defaults = lambda x: x + + +class cached_property: + """ + A decorator creating a property, the value of which is calculated only once and cached for later use. 
+ """ + def __init__(self, func): + self.func = func + self.__doc__ = getattr(func, '__doc__') + + def __get__(self, instance, cls=None): + if instance is None: + return self + result = self.func(instance) + instance.__dict__[self.func.__name__] = result + return result diff --git a/airflow/utils/email.py b/airflow/utils/email.py index b37e3d471a413..532880161eaf9 100644 --- a/airflow/utils/email.py +++ b/airflow/utils/email.py @@ -22,7 +22,6 @@ from __future__ import print_function from __future__ import unicode_literals -from builtins import str from past.builtins import basestring import importlib @@ -48,6 +47,9 @@ def send_email(to, subject, html_content, path, attr = configuration.conf.get('email', 'EMAIL_BACKEND').rsplit('.', 1) module = importlib.import_module(path) backend = getattr(module, attr) + to = get_email_address_list(to) + to = ", ".join(to) + return backend(to, subject, html_content, files=files, dryrun=dryrun, cc=cc, bcc=bcc, mime_subtype=mime_subtype, mime_charset=mime_charset, **kwargs) @@ -62,13 +64,13 @@ def send_email_smtp(to, subject, html_content, files=None, >>> send_email('test@example.com', 'foo', 'Foo bar', ['/dev/null'], dryrun=True) """ - SMTP_MAIL_FROM = configuration.conf.get('smtp', 'SMTP_MAIL_FROM') + smtp_mail_from = configuration.conf.get('smtp', 'SMTP_MAIL_FROM') to = get_email_address_list(to) msg = MIMEMultipart(mime_subtype) msg['Subject'] = subject - msg['From'] = SMTP_MAIL_FROM + msg['From'] = smtp_mail_from msg['To'] = ", ".join(to) recipients = to if cc: @@ -96,7 +98,7 @@ def send_email_smtp(to, subject, html_content, files=None, part['Content-ID'] = '<%s>' % basename msg.attach(part) - send_MIME_email(SMTP_MAIL_FROM, recipients, msg, dryrun) + send_MIME_email(smtp_mail_from, recipients, msg, dryrun) def send_MIME_email(e_from, e_to, mime_msg, dryrun=False): @@ -129,9 +131,9 @@ def send_MIME_email(e_from, e_to, mime_msg, dryrun=False): def get_email_address_list(address_string): if isinstance(address_string, basestring): if ',' in address_string: - address_string = address_string.split(',') + address_string = [address.strip() for address in address_string.split(',')] elif ';' in address_string: - address_string = address_string.split(';') + address_string = [address.strip() for address in address_string.split(';')] else: address_string = [address_string] diff --git a/airflow/utils/file.py b/airflow/utils/file.py index 88938d422b3af..918375fe5e5b5 100644 --- a/airflow/utils/file.py +++ b/airflow/utils/file.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/helpers.py b/airflow/utils/helpers.py index 911890dc30ab5..06229079338b9 100644 --- a/airflow/utils/helpers.py +++ b/airflow/utils/helpers.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
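The `cached_property` decorator added to airflow/utils/decorators.py above computes a value once per instance and then serves it from the instance `__dict__`. A small usage sketch, assuming an environment where this patched Airflow is importable; the `Report` class and its field are made up for illustration.

```python
from airflow.utils.decorators import cached_property

class Report(object):
    def __init__(self, rows):
        self.rows = rows

    @cached_property
    def total(self):
        print("computing...")   # runs only on the first access per instance
        return sum(self.rows)

r = Report([1, 2, 3])
print(r.total)   # prints "computing..." then 6
print(r.total)   # 6, served from the instance __dict__, no recomputation
```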
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,19 +22,20 @@ from __future__ import print_function from __future__ import unicode_literals +import errno +import imp +import sys +import warnings + import psutil from builtins import input from past.builtins import basestring from datetime import datetime from functools import reduce -import imp import os import re import signal -import subprocess -import sys -import warnings from jinja2 import Template @@ -127,7 +128,7 @@ def chunks(items, chunk_size): """ Yield successive chunks of a given size from a list of items """ - if (chunk_size <= 0): + if chunk_size <= 0: raise ValueError('Chunk size must be a positive integer') for i in range(0, len(items), chunk_size): yield items[i:i + chunk_size] @@ -171,6 +172,37 @@ def chain(*tasks): up_task.set_downstream(down_task) +def cross_downstream(from_tasks, to_tasks): + r""" + Set downstream dependencies for all tasks in from_tasks to all tasks in to_tasks. + E.g.: cross_downstream(from_tasks=[t1, t2, t3], to_tasks=[t4, t5, t6]) + Is equivalent to: + + t1 --> t4 + \ / + t2 -X> t5 + / \ + t3 --> t6 + + t1.set_downstream(t4) + t1.set_downstream(t5) + t1.set_downstream(t6) + t2.set_downstream(t4) + t2.set_downstream(t5) + t2.set_downstream(t6) + t3.set_downstream(t4) + t3.set_downstream(t5) + t3.set_downstream(t6) + + :param from_tasks: List of tasks to start from. + :type from_tasks: List[airflow.models.BaseOperator] + :param to_tasks: List of tasks to set as downstream dependencies. + :type to_tasks: List[airflow.models.BaseOperator] + """ + for task in from_tasks: + task.set_downstream(to_tasks) + + def pprinttable(rows): """Returns a pretty ascii table from tuples @@ -226,6 +258,7 @@ def reap_process_group(pid, log, sig=signal.SIGTERM, :param sig: signal type :param timeout: how much time a process has to terminate """ + def on_terminate(p): log.info("Process %s (%s) terminated with exit code %s", p, p.pid, p.returncode) @@ -237,7 +270,15 @@ def on_terminate(p): children = parent.children(recursive=True) children.append(parent) - log.info("Sending %s to GPID %s", sig, os.getpgid(pid)) + try: + pg = os.getpgid(pid) + except OSError as err: + # Skip if not such process - we experience a race and it just terminated + if err.errno == errno.ESRCH: + return + raise + + log.info("Sending %s to GPID %s", sig, pg) os.killpg(os.getpgid(pid), sig) gone, alive = psutil.wait_procs(children, timeout=timeout, callback=on_terminate) @@ -281,10 +322,10 @@ def __init__(self, parent_module, module_attributes): """ :param parent_module: The string package name of the parent module. For example, 'airflow.operators' - :type parent_module: string + :type parent_module: str :param module_attributes: The file to class mappings for all importable classes. 
- :type module_attributes: string + :type module_attributes: str """ self._parent_module = parent_module self._attribute_modules = self._build_attribute_modules(module_attributes) @@ -374,3 +415,23 @@ def __getattr__(self, attribute): return loaded_attribute raise AttributeError + + +def render_log_filename(ti, try_number, filename_template): + """ + Given task instance, try_number, filename_template, return the rendered log filename + + :param ti: task instance + :param try_number: try_number of the task + :param filename_template: filename template, which can be jinja template or python string template + """ + filename_template, filename_jinja_template = parse_template_string(filename_template) + if filename_jinja_template: + jinja_context = ti.get_template_context() + jinja_context['try_number'] = try_number + return filename_jinja_template.render(**jinja_context) + + return filename_template.format(dag_id=ti.dag_id, + task_id=ti.task_id, + execution_date=ti.execution_date.isoformat(), + try_number=try_number) diff --git a/airflow/utils/json.py b/airflow/utils/json.py index 434926f9d2be8..1767c523d6a3f 100644 --- a/airflow/utils/json.py +++ b/airflow/utils/json.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/log/__init__.py b/airflow/utils/log/__init__.py index 4067cc78ee9a2..114d189da14ab 100644 --- a/airflow/utils/log/__init__.py +++ b/airflow/utils/log/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/log/es_task_handler.py b/airflow/utils/log/es_task_handler.py index 04682318655b1..3d4273cb5d4d3 100644 --- a/airflow/utils/log/es_task_handler.py +++ b/airflow/utils/log/es_task_handler.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -# Using `from elasticsearch import *` would break elasticseach mocking used in unit test. +# Using `from elasticsearch import *` would break elasticsearch mocking used in unit test. import elasticsearch import pendulum from elasticsearch_dsl import Search @@ -87,7 +87,7 @@ def _read(self, ti, try_number, metadata=None): :param try_number: try_number of the task instance :param metadata: log metadata, can be used for steaming log reading and auto-tailing. - :return a list of log documents and metadata. + :return: a list of log documents and metadata. 
""" if not metadata: metadata = {'offset': 0} @@ -147,9 +147,7 @@ def es_read(self, log_id, offset): logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \ .execute() except Exception as e: - msg = 'Could not read log with log_id: {}, ' \ - 'error: {}'.format(log_id, str(e)) - self.log.exception(msg) + self.log.exception('Could not read log with log_id: %s, error: %s', log_id, str(e)) return logs diff --git a/airflow/utils/log/file_processor_handler.py b/airflow/utils/log/file_processor_handler.py index 5e637440a1fda..098b018d47275 100644 --- a/airflow/utils/log/file_processor_handler.py +++ b/airflow/utils/log/file_processor_handler.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,11 +17,10 @@ # specific language governing permissions and limitations # under the License. -import errno import logging import os -from airflow import configuration as conf +from airflow import settings from airflow.utils.helpers import parse_template_string from datetime import datetime @@ -41,7 +40,7 @@ def __init__(self, base_log_folder, filename_template): super(FileProcessorHandler, self).__init__() self.handler = None self.base_log_folder = base_log_folder - self.dag_dir = os.path.expanduser(conf.get('core', 'DAGS_FOLDER')) + self.dag_dir = os.path.expanduser(settings.DAGS_FOLDER) self.filename_template, self.filename_jinja_template = \ parse_template_string(filename_template) @@ -49,9 +48,9 @@ def __init__(self, base_log_folder, filename_template): if not os.path.exists(self._get_log_directory()): try: os.makedirs(self._get_log_directory()) - except OSError as e: + except OSError: # only ignore case where the directory already exist - if e.errno != errno.EEXIST: + if not os.path.isdir(self._get_log_directory()): raise logging.warning("%s already exists", self._get_log_directory()) @@ -116,7 +115,7 @@ def _symlink_latest_log_directory(self): os.unlink(latest_log_directory_path) os.symlink(log_directory, latest_log_directory_path) elif (os.path.isdir(latest_log_directory_path) or - os.path.isfile(latest_log_directory_path)): + os.path.isfile(latest_log_directory_path)): logging.warning( "%s already exists as a dir/file. Skip creating symlink.", latest_log_directory_path @@ -131,14 +130,18 @@ def _init_file(self, filename): """ Create log file and directory if required. 
:param filename: task instance object - :return relative log path of the given task instance + :return: relative log path of the given task instance """ relative_path = self._render_filename(filename) full_path = os.path.join(self._get_log_directory(), relative_path) directory = os.path.dirname(full_path) if not os.path.exists(directory): - os.makedirs(directory) + try: + os.makedirs(directory) + except OSError: + if not os.path.isdir(directory): + raise if not os.path.exists(full_path): open(full_path, "a").close() diff --git a/airflow/utils/log/file_task_handler.py b/airflow/utils/log/file_task_handler.py index 627c8755707ef..111f26c279cf0 100644 --- a/airflow/utils/log/file_task_handler.py +++ b/airflow/utils/log/file_task_handler.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -171,7 +171,7 @@ def _init_file(self, ti): """ Create log directory and give it correct permissions. :param ti: task instance object - :return relative log path of the given task instance + :return: relative log path of the given task instance """ # To handle log writing when tasks are impersonated, the log files need to # be writable by the user that runs the Airflow command and the user diff --git a/airflow/utils/log/gcs_task_handler.py b/airflow/utils/log/gcs_task_handler.py index 8c34792bb2138..ed7edf4a27010 100644 --- a/airflow/utils/log/gcs_task_handler.py +++ b/airflow/utils/log/gcs_task_handler.py @@ -49,8 +49,8 @@ def _build_hook(self): except Exception as e: self.log.error( 'Could not create a GoogleCloudStorageHook with connection id ' - '"{}". {}\n\nPlease make sure that airflow[gcp_api] is installed ' - 'and the GCS connection exists.'.format(remote_conn_id, str(e)) + '"%s". %s\n\nPlease make sure that airflow[gcp_api] is installed ' + 'and the GCS connection exists.', remote_conn_id, str(e) ) @property @@ -126,7 +126,7 @@ def gcs_read(self, remote_log_location): """ Returns the log found at the remote_log_location. :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) """ bkt, blob = self.parse_gcs_url(remote_log_location) return self.hook.download(bkt, blob).decode() @@ -136,9 +136,9 @@ def gcs_write(self, log, remote_log_location, append=True): Writes the log to the remote_log_location. Fails silently if no hook was created. :param log: the log to write to the remote_log_location - :type log: string + :type log: str :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) :param append: if False, any existing log file is overwritten. If True, the new log is appended to any existing logs. :type append: bool @@ -164,7 +164,8 @@ def gcs_write(self, log, remote_log_location, append=True): except Exception as e: self.log.error('Could not write logs to %s: %s', remote_log_location, e) - def parse_gcs_url(self, gsurl): + @staticmethod + def parse_gcs_url(gsurl): """ Given a Google Cloud Storage URL (gs:///), returns a tuple containing the corresponding bucket and blob. 
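Making `parse_gcs_url` a `@staticmethod` above means callers no longer need a configured handler instance just to split a `gs://<bucket>/<blob>` URL into its parts. The snippet below is a standalone equivalent of what such a parser does, shown for illustration only; it is not the handler's exact implementation.

```python
from six.moves.urllib.parse import urlparse

def parse_gcs_url(gsurl):
    """Split gs://<bucket>/<blob> into (bucket, blob)."""
    parsed = urlparse(gsurl)
    if not parsed.netloc:
        raise ValueError("Please provide a bucket name: {}".format(gsurl))
    return parsed.netloc, parsed.path.lstrip("/")

print(parse_gcs_url("gs://my-log-bucket/dag_id/task_id/attempt=1.log"))
# ('my-log-bucket', 'dag_id/task_id/attempt=1.log')
```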
diff --git a/airflow/utils/log/logging_mixin.py b/airflow/utils/log/logging_mixin.py index 3f696931c955b..442fecd0363c8 100644 --- a/airflow/utils/log/logging_mixin.py +++ b/airflow/utils/log/logging_mixin.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -92,7 +92,7 @@ def write(self, message): self._buffer += message else: self._buffer += message - self.logger.log(self.level, self._buffer) + self.logger.log(self.level, self._buffer.rstrip()) self._buffer = str() def flush(self): diff --git a/airflow/utils/log/s3_task_handler.py b/airflow/utils/log/s3_task_handler.py index 07b9b3ec11e8b..196aec50f1bd4 100644 --- a/airflow/utils/log/s3_task_handler.py +++ b/airflow/utils/log/s3_task_handler.py @@ -42,7 +42,7 @@ def _build_hook(self): try: from airflow.hooks.S3_hook import S3Hook return S3Hook(remote_conn_id) - except: + except Exception: self.log.error( 'Could not create an S3Hook with connection id "%s". ' 'Please make sure that airflow[s3] is installed and ' @@ -132,14 +132,14 @@ def s3_read(self, remote_log_location, return_error=False): Returns the log found at the remote_log_location. Returns '' if no logs are found or there is an error. :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) :param return_error: if True, returns a string error message if an error occurs. Otherwise returns '' when an error occurs. :type return_error: bool """ try: return self.hook.read_key(remote_log_location) - except: + except Exception: msg = 'Could not read logs from {}'.format(remote_log_location) self.log.exception(msg) # return error if needed @@ -151,9 +151,9 @@ def s3_write(self, log, remote_log_location, append=True): Writes the log to the remote_log_location. Fails silently if no hook was created. :param log: the log to write to the remote_log_location - :type log: string + :type log: str :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) :param append: if False, any existing log file is overwritten. If True, the new log is appended to any existing logs. 
:type append: bool @@ -169,5 +169,5 @@ def s3_write(self, log, remote_log_location, append=True): replace=True, encrypt=configuration.conf.getboolean('core', 'ENCRYPT_S3_LOGS'), ) - except: + except Exception: self.log.exception('Could not write logs to %s', remote_log_location) diff --git a/airflow/utils/log/wasb_task_handler.py b/airflow/utils/log/wasb_task_handler.py index a2a0c0daee999..71e1ae5d16dfd 100644 --- a/airflow/utils/log/wasb_task_handler.py +++ b/airflow/utils/log/wasb_task_handler.py @@ -20,7 +20,6 @@ import shutil from airflow import configuration -from airflow.contrib.hooks.wasb_hook import WasbHook from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.log.file_task_handler import FileTaskHandler from azure.common import AzureHttpError @@ -47,6 +46,7 @@ def __init__(self, base_log_folder, wasb_log_folder, wasb_container, def _build_hook(self): remote_conn_id = configuration.get('core', 'REMOTE_LOG_CONN_ID') try: + from airflow.contrib.hooks.wasb_hook import WasbHook return WasbHook(remote_conn_id) except AzureHttpError: self.log.error( @@ -140,7 +140,7 @@ def wasb_read(self, remote_log_location, return_error=False): Returns the log found at the remote_log_location. Returns '' if no logs are found or there is an error. :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) :param return_error: if True, returns a string error message if an error occurs. Otherwise returns '' when an error occurs. :type return_error: bool @@ -159,9 +159,9 @@ def wasb_write(self, log, remote_log_location, append=True): Writes the log to the remote_log_location. Fails silently if no hook was created. :param log: the log to write to the remote_log_location - :type log: string + :type log: str :param remote_log_location: the log's location in remote storage - :type remote_log_location: string (path) + :type remote_log_location: str (path) :param append: if False, any existing log file is overwritten. If True, the new log is appended to any existing logs. :type append: bool diff --git a/airflow/utils/module_loading.py b/airflow/utils/module_loading.py index 6e638b00d3318..d5b4971b736dd 100644 --- a/airflow/utils/module_loading.py +++ b/airflow/utils/module_loading.py @@ -16,24 +16,10 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -import os -import sys -from airflow import configuration as conf from importlib import import_module -def prepare_classpath(): - """ - Ensures that the Airflow home directory is on the classpath - """ - config_path = os.path.join(conf.get('core', 'airflow_home'), 'config') - config_path = os.path.expanduser(config_path) - - if config_path not in sys.path: - sys.path.append(config_path) - - def import_string(dotted_path): """ Import a dotted module path and return the attribute/class designated by the @@ -48,7 +34,7 @@ def import_string(dotted_path): try: return getattr(module, class_name) - except AttributeError as err: + except AttributeError: raise ImportError('Module "{}" does not define a "{}" attribute/class'.format( module_path, class_name) ) diff --git a/airflow/utils/net.py b/airflow/utils/net.py index 03e8fde9c90cb..c435496ffc76a 100644 --- a/airflow/utils/net.py +++ b/airflow/utils/net.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
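The slimmed-down airflow/utils/module_loading.py above (with `prepare_classpath` removed) keeps only `import_string`, which resolves a dotted path to the object it names. A usage sketch, assuming Airflow is importable; `datetime.timedelta` is just an arbitrary example path.

```python
from airflow.utils.module_loading import import_string

timedelta_cls = import_string('datetime.timedelta')
print(timedelta_cls(minutes=5))        # 0:05:00

# A missing attribute surfaces as ImportError with a descriptive message.
try:
    import_string('datetime.no_such_name')
except ImportError as err:
    print(err)
```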
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/utils/operator_helpers.py b/airflow/utils/operator_helpers.py index 356aa650a3a40..e015f1d5ef072 100644 --- a/airflow/utils/operator_helpers.py +++ b/airflow/utils/operator_helpers.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,32 +18,49 @@ # under the License. # +AIRFLOW_VAR_NAME_FORMAT_MAPPING = { + 'AIRFLOW_CONTEXT_DAG_ID': {'default': 'airflow.ctx.dag_id', + 'env_var_format': 'AIRFLOW_CTX_DAG_ID'}, + 'AIRFLOW_CONTEXT_TASK_ID': {'default': 'airflow.ctx.task_id', + 'env_var_format': 'AIRFLOW_CTX_TASK_ID'}, + 'AIRFLOW_CONTEXT_EXECUTION_DATE': {'default': 'airflow.ctx.execution_date', + 'env_var_format': 'AIRFLOW_CTX_EXECUTION_DATE'}, + 'AIRFLOW_CONTEXT_DAG_RUN_ID': {'default': 'airflow.ctx.dag_run_id', + 'env_var_format': 'AIRFLOW_CTX_DAG_RUN_ID'} +} + -def context_to_airflow_vars(context): +def context_to_airflow_vars(context, in_env_var_format=False): """ Given a context, this function provides a dictionary of values that can be used to externally reconstruct relations between dags, dag_runs, tasks and task_instances. + Default to abc.def.ghi format and can be made to ABC_DEF_GHI format if + in_env_var_format is set to True. - :param context: The context for the task_instance of interest + :param context: The context for the task_instance of interest. :type context: dict + :param in_env_var_format: If returned vars should be in ABC_DEF_GHI format. + :type in_env_var_format: bool + :return: task_instance context as dict. 
""" - params = {} - dag = context.get('dag') - if dag and dag.dag_id: - params['airflow.ctx.dag.dag_id'] = dag.dag_id - - dag_run = context.get('dag_run') - if dag_run and dag_run.execution_date: - params['airflow.ctx.dag_run.execution_date'] = dag_run.execution_date.isoformat() - - task = context.get('task') - if task and task.task_id: - params['airflow.ctx.task.task_id'] = task.task_id - + params = dict() + if in_env_var_format: + name_format = 'env_var_format' + else: + name_format = 'default' task_instance = context.get('task_instance') + if task_instance and task_instance.dag_id: + params[AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_ID'][ + name_format]] = task_instance.dag_id + if task_instance and task_instance.task_id: + params[AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_TASK_ID'][ + name_format]] = task_instance.task_id if task_instance and task_instance.execution_date: - params['airflow.ctx.task_instance.execution_date'] = ( - task_instance.execution_date.isoformat() - ) - + params[ + AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_EXECUTION_DATE'][ + name_format]] = task_instance.execution_date.isoformat() + dag_run = context.get('dag_run') + if dag_run and dag_run.run_id: + params[AIRFLOW_VAR_NAME_FORMAT_MAPPING['AIRFLOW_CONTEXT_DAG_RUN_ID'][ + name_format]] = dag_run.run_id return params diff --git a/airflow/utils/operator_resources.py b/airflow/utils/operator_resources.py index 47ff336991b95..44df83eafb508 100644 --- a/airflow/utils/operator_resources.py +++ b/airflow/utils/operator_resources.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -33,10 +33,10 @@ class Resource(object): Represents a resource requirement in an execution environment for an operator. :param name: Name of the resource - :type name: string + :type name: str :param units_str: The string representing the units of a resource (e.g. MB for a CPU resource) to be used for display purposes - :type units_str: string + :type units_str: str :param qty: The number of units of the specified resource that are required for execution of the operator. :type qty: long diff --git a/airflow/utils/sqlalchemy.py b/airflow/utils/sqlalchemy.py index a00fe17c18a48..62488ef4e2077 100644 --- a/airflow/utils/sqlalchemy.py +++ b/airflow/utils/sqlalchemy.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -22,23 +22,27 @@ from __future__ import print_function from __future__ import unicode_literals +import datetime import os +import json +import pendulum import time import random +from dateutil import relativedelta from sqlalchemy import event, exc, select +from sqlalchemy.types import Text, DateTime, TypeDecorator from airflow.utils.log.logging_mixin import LoggingMixin log = LoggingMixin().log +utc = pendulum.timezone('UTC') -def setup_event_handlers( - engine, - reconnect_timeout_seconds, - initial_backoff_seconds=0.2, - max_backoff_seconds=120): - +def setup_event_handlers(engine, + reconnect_timeout_seconds, + initial_backoff_seconds=0.2, + max_backoff_seconds=120): @event.listens_for(engine, "engine_connect") def ping_connection(connection, branch): """ @@ -64,7 +68,7 @@ def ping_connection(connection, branch): try: connection.scalar(select([1])) - # If we made it here then the connection appears to be healty + # If we made it here then the connection appears to be healthy break except exc.DBAPIError as err: if time.time() - start >= reconnect_timeout_seconds: @@ -96,11 +100,24 @@ def ping_connection(connection, branch): # restore "close with result" connection.should_close_with_result = save_should_close_with_result - @event.listens_for(engine, "connect") def connect(dbapi_connection, connection_record): connection_record.info['pid'] = os.getpid() + if engine.dialect.name == "sqlite": + @event.listens_for(engine, "connect") + def set_sqlite_pragma(dbapi_connection, connection_record): + cursor = dbapi_connection.cursor() + cursor.execute("PRAGMA foreign_keys=ON") + cursor.close() + + # this ensures sanity in mysql when storing datetimes (not required for postgres) + if engine.dialect.name == "mysql": + @event.listens_for(engine, "connect") + def set_mysql_timezone(dbapi_connection, connection_record): + cursor = dbapi_connection.cursor() + cursor.execute("SET time_zone = '+00:00'") + cursor.close() @event.listens_for(engine, "checkout") def checkout(dbapi_connection, connection_record, connection_proxy): @@ -111,3 +128,79 @@ def checkout(dbapi_connection, connection_record, connection_proxy): "Connection record belongs to pid {}, " "attempting to check out in pid {}".format(connection_record.info['pid'], pid) ) + + +class UtcDateTime(TypeDecorator): + """ + Almost equivalent to :class:`~sqlalchemy.types.DateTime` with + ``timezone=True`` option, but it differs from that by: + + - Never silently take naive :class:`~datetime.datetime`, instead it + always raise :exc:`ValueError` unless time zone aware value. + - :class:`~datetime.datetime` value's :attr:`~datetime.datetime.tzinfo` + is always converted to UTC. + - Unlike SQLAlchemy's built-in :class:`~sqlalchemy.types.DateTime`, + it never return naive :class:`~datetime.datetime`, but time zone + aware value, even with SQLite or MySQL. 
+ - Always returns DateTime in UTC + + """ + + impl = DateTime(timezone=True) + + def process_bind_param(self, value, dialect): + if value is not None: + if not isinstance(value, datetime.datetime): + raise TypeError('expected datetime.datetime, not ' + + repr(value)) + elif value.tzinfo is None: + raise ValueError('naive datetime is disallowed') + + return value.astimezone(utc) + + def process_result_value(self, value, dialect): + """ + Processes DateTimes from the DB making sure it is always + returning UTC. Not using timezone.convert_to_utc as that + converts to configured TIMEZONE while the DB might be + running with some other setting. We assume UTC datetimes + in the database. + """ + if value is not None: + if value.tzinfo is None: + value = value.replace(tzinfo=utc) + else: + value = value.astimezone(utc) + + return value + + +class Interval(TypeDecorator): + + impl = Text + + attr_keys = { + datetime.timedelta: ('days', 'seconds', 'microseconds'), + relativedelta.relativedelta: ( + 'years', 'months', 'days', 'leapdays', 'hours', 'minutes', 'seconds', 'microseconds', + 'year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond', + ), + } + + def process_bind_param(self, value, dialect): + if type(value) in self.attr_keys: + attrs = { + key: getattr(value, key) + for key in self.attr_keys[type(value)] + } + return json.dumps({'type': type(value).__name__, 'attrs': attrs}) + return json.dumps(value) + + def process_result_value(self, value, dialect): + if not value: + return value + data = json.loads(value) + if isinstance(data, dict): + type_map = {key.__name__: key for key in self.attr_keys} + return type_map[data['type']](**data['attrs']) + return data diff --git a/airflow/utils/state.py b/airflow/utils/state.py index 9da98510eb03d..320b996d5d500 100644 --- a/airflow/utils/state.py +++ b/airflow/utils/state.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
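To make the new `Interval` column type above concrete: it stores `timedelta`/`relativedelta` values as JSON text and rebuilds them on the way out. Calling its two hooks directly exercises the round trip without a database; this assumes SQLAlchemy, dateutil, and the patched module are importable, and the exact JSON key order may vary by interpreter.

```python
from datetime import timedelta

from airflow.utils.sqlalchemy import Interval

interval = Interval()
stored = interval.process_bind_param(timedelta(days=1, seconds=30), dialect=None)
print(stored)
# e.g. {"type": "timedelta", "attrs": {"days": 1, "seconds": 30, "microseconds": 0}}

restored = interval.process_result_value(stored, dialect=None)
print(restored == timedelta(days=1, seconds=30))   # True
```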
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -43,6 +43,7 @@ class State(object): SHUTDOWN = "shutdown" # External request to shut down FAILED = "failed" UP_FOR_RETRY = "up_for_retry" + UP_FOR_RESCHEDULE = "up_for_reschedule" UPSTREAM_FAILED = "upstream_failed" SKIPPED = "skipped" @@ -51,7 +52,9 @@ class State(object): RUNNING, FAILED, UPSTREAM_FAILED, + SKIPPED, UP_FOR_RETRY, + UP_FOR_RESCHEDULE, QUEUED, NONE, SCHEDULED, @@ -70,6 +73,7 @@ class State(object): SHUTDOWN: 'blue', FAILED: 'red', UP_FOR_RETRY: 'gold', + UP_FOR_RESCHEDULE: 'turquoise', UPSTREAM_FAILED: 'orange', SKIPPED: 'pink', REMOVED: 'lightgrey', @@ -79,18 +83,14 @@ class State(object): @classmethod def color(cls, state): - if state in cls.state_color: - return cls.state_color[state] - else: - return 'white' + return cls.state_color.get(state, 'white') @classmethod def color_fg(cls, state): color = cls.color(state) if color in ['green', 'red']: return 'white' - else: - return 'black' + return 'black' @classmethod def finished(cls): @@ -101,7 +101,6 @@ def finished(cls): """ return [ cls.SUCCESS, - cls.SHUTDOWN, cls.FAILED, cls.SKIPPED, ] @@ -117,5 +116,7 @@ def unfinished(cls): cls.SCHEDULED, cls.QUEUED, cls.RUNNING, - cls.UP_FOR_RETRY + cls.SHUTDOWN, + cls.UP_FOR_RETRY, + cls.UP_FOR_RESCHEDULE ] diff --git a/airflow/utils/tests.py b/airflow/utils/tests.py index 6f29ffc25b2eb..33157f5fba9fc 100644 --- a/airflow/utils/tests.py +++ b/airflow/utils/tests.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -35,5 +35,5 @@ def skipUnlessImported(module, obj): def assertEqualIgnoreMultipleSpaces(case, first, second, msg=None): def _trim(s): - re.sub("\s+", " ", s.strip()) + return re.sub(r"\s+", " ", s.strip()) return case.assertEqual(_trim(first), _trim(second), msg) diff --git a/airflow/utils/timeout.py b/airflow/utils/timeout.py index c5ca5e2f1724f..362baa197f33d 100644 --- a/airflow/utils/timeout.py +++ b/airflow/utils/timeout.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -23,6 +23,7 @@ from __future__ import unicode_literals import signal +import os from airflow.exceptions import AirflowTaskTimeout from airflow.utils.log.logging_mixin import LoggingMixin @@ -35,10 +36,10 @@ class timeout(LoggingMixin): def __init__(self, seconds=1, error_message='Timeout'): self.seconds = seconds - self.error_message = error_message + self.error_message = error_message + ', PID: ' + str(os.getpid()) def handle_timeout(self, signum, frame): - self.log.error("Process timed out") + self.log.error("Process timed out, PID: %s", str(os.getpid())) raise AirflowTaskTimeout(self.error_message) def __enter__(self): diff --git a/airflow/utils/timezone.py b/airflow/utils/timezone.py index 6d49fbcbb3e85..5adaa2f5c4a5a 100644 --- a/airflow/utils/timezone.py +++ b/airflow/utils/timezone.py @@ -164,9 +164,9 @@ def datetime(*args, **kwargs): return dt.datetime(*args, **kwargs) -def parse(string): +def parse(string, timezone=None): """ Parse a time string and return an aware datetime :param string: time string """ - return pendulum.parse(string, tz=TIMEZONE) + return pendulum.parse(string, tz=timezone or TIMEZONE) diff --git a/airflow/utils/trigger_rule.py b/airflow/utils/trigger_rule.py index 7bad8ecb78ab5..a1e14c2e61568 100644 --- a/airflow/utils/trigger_rule.py +++ b/airflow/utils/trigger_rule.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,6 +20,7 @@ from __future__ import unicode_literals from builtins import object +from typing import Set class TriggerRule(object): @@ -28,9 +29,12 @@ class TriggerRule(object): ALL_DONE = 'all_done' ONE_SUCCESS = 'one_success' ONE_FAILED = 'one_failed' + NONE_FAILED = 'none_failed' + NONE_SKIPPED = 'none_skipped' DUMMY = 'dummy' - _ALL_TRIGGER_RULES = {} + _ALL_TRIGGER_RULES = set() # type: Set[str] + @classmethod def is_valid(cls, trigger_rule): return trigger_rule in cls.all_triggers() diff --git a/airflow/utils/weight_rule.py b/airflow/utils/weight_rule.py index 745cc76970cea..f7f85c0734ac6 100644 --- a/airflow/utils/weight_rule.py +++ b/airflow/utils/weight_rule.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
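The `timeout` context manager touched above now includes the PID in its error message, which helps when several task processes time out at once. A usage sketch, assuming this patched Airflow is importable; it relies on `SIGALRM`, so it only works on Unix and in the main thread.

```python
import time

from airflow.exceptions import AirflowTaskTimeout
from airflow.utils.timeout import timeout

try:
    with timeout(seconds=1):
        time.sleep(5)            # interrupted by SIGALRM after roughly 1 second
except AirflowTaskTimeout as err:
    print(err)                   # e.g. "Timeout, PID: 12345"
```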
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -20,6 +20,7 @@ from __future__ import unicode_literals from builtins import object +from typing import Set class WeightRule(object): @@ -27,7 +28,8 @@ class WeightRule(object): UPSTREAM = 'upstream' ABSOLUTE = 'absolute' - _ALL_WEIGHT_RULES = {} + _ALL_WEIGHT_RULES = set() # type: Set[str] + @classmethod def is_valid(cls, weight_rule): return weight_rule in cls.all_weight_rules() diff --git a/airflow/version.py b/airflow/version.py index d11d76608dfdf..b7367c22a47e5 100644 --- a/airflow/version.py +++ b/airflow/version.py @@ -18,4 +18,4 @@ # under the License. # -version = '2.0.0dev0+incubating' +version = '1.10.3' diff --git a/airflow/www/__init__.py b/airflow/www/__init__.py index f0f8b68337da6..114d189da14ab 100644 --- a/airflow/www/__init__.py +++ b/airflow/www/__init__.py @@ -7,13 +7,12 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - diff --git a/airflow/www/api/__init__.py b/airflow/www/api/__init__.py index db5ba598d7c23..b7f8352944d3f 100644 --- a/airflow/www/api/__init__.py +++ b/airflow/www/api/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www/api/experimental/__init__.py b/airflow/www/api/experimental/__init__.py index db5ba598d7c23..b7f8352944d3f 100644 --- a/airflow/www/api/experimental/__init__.py +++ b/airflow/www/api/experimental/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www/api/experimental/endpoints.py b/airflow/www/api/experimental/endpoints.py index f0bc319eb6665..8ba47fb114a62 100644 --- a/airflow/www/api/experimental/endpoints.py +++ b/airflow/www/api/experimental/endpoints.py @@ -24,12 +24,17 @@ from airflow.api.common.experimental import delete_dag as delete from airflow.api.common.experimental import pool as pool_api from airflow.api.common.experimental import trigger_dag as trigger +from airflow.api.common.experimental.get_dag_runs import get_dag_runs from airflow.api.common.experimental.get_task import get_task from airflow.api.common.experimental.get_task_instance import get_task_instance +from airflow.api.common.experimental.get_code import get_code +from airflow.api.common.experimental.get_dag_run_state import get_dag_run_state from airflow.exceptions import AirflowException from airflow.utils import timezone from airflow.utils.log.logging_mixin import LoggingMixin from airflow.www.app import csrf +from airflow import models +from airflow.utils.db import create_session _log = LoggingMixin().log @@ -83,7 +88,7 @@ def trigger_dag(dag_id): return response if getattr(g, 'user', None): - _log.info("User {} created {}".format(g.user, dr)) + _log.info("User %s created %s", g.user, dr) response = jsonify(message="Created {}".format(dr)) return response @@ -106,12 +111,47 @@ def delete_dag(dag_id): return jsonify(message="Removed {} record(s)".format(count), count=count) +@api_experimental.route('/dags/<string:dag_id>/dag_runs', methods=['GET']) +@requires_authentication +def dag_runs(dag_id): + """ + Returns a list of Dag Runs for a specific DAG ID. + :query param state: a query string parameter '?state=queued|running|success...' + :param dag_id: String identifier of a DAG + :return: List of DAG runs of a DAG with requested state, + or all runs if the state is not specified + """ + try: + state = request.args.get('state') + dagruns = get_dag_runs(dag_id, state, run_url_route='airflow.graph') + except AirflowException as err: + _log.info(err) + response = jsonify(error="{}".format(err)) + response.status_code = 400 + return response + + return jsonify(dagruns) + + @api_experimental.route('/test', methods=['GET']) @requires_authentication def test(): return jsonify(status='OK') +@api_experimental.route('/dags/<string:dag_id>/code', methods=['GET']) +@requires_authentication +def get_dag_code(dag_id): + """Return python code of a given dag_id.""" + try: + return get_code(dag_id) + except AirflowException as err: + _log.info(err) + response = jsonify(error="{}".format(err)) + response.status_code = err.status_code + return response + + @api_experimental.route('/dags/<string:dag_id>/tasks/<string:task_id>', methods=['GET']) @requires_authentication def task_info(dag_id, task_id): @@ -131,6 +171,28 @@ def task_info(dag_id, task_id): return jsonify(fields) +# ToDo: Shouldn't this be a PUT method?
+@api_experimental.route('/dags//paused/', methods=['GET']) +@requires_authentication +def dag_paused(dag_id, paused): + """(Un)pauses a dag""" + + DagModel = models.DagModel + with create_session() as session: + orm_dag = ( + session.query(DagModel) + .filter(DagModel.dag_id == dag_id).first() + ) + if paused == 'true': + orm_dag.is_paused = True + else: + orm_dag.is_paused = False + session.merge(orm_dag) + session.commit() + + return jsonify({'response': 'ok'}) + + @api_experimental.route( '/dags//dag_runs//tasks/', methods=['GET']) @@ -172,6 +234,43 @@ def task_instance_info(dag_id, execution_date, task_id): return jsonify(fields) +@api_experimental.route( + '/dags//dag_runs/', + methods=['GET']) +@requires_authentication +def dag_run_status(dag_id, execution_date): + """ + Returns a JSON with a dag_run's public instance variables. + The format for the exec_date is expected to be + "YYYY-mm-DDTHH:MM:SS", for example: "2016-11-16T11:34:15". This will + of course need to have been encoded for URL in the request. + """ + + # Convert string datetime into actual datetime + try: + execution_date = timezone.parse(execution_date) + except ValueError: + error_message = ( + 'Given execution date, {}, could not be identified ' + 'as a date. Example date format: 2015-11-16T14:34:15+00:00'.format( + execution_date)) + _log.info(error_message) + response = jsonify({'error': error_message}) + response.status_code = 400 + + return response + + try: + info = get_dag_run_state(dag_id, execution_date) + except AirflowException as err: + _log.info(err) + response = jsonify(error="{}".format(err)) + response.status_code = err.status_code + return response + + return jsonify(info) + + @api_experimental.route('/latest_runs', methods=['GET']) @requires_authentication def latest_dag_runs(): diff --git a/airflow/www/app.py b/airflow/www/app.py index e9b101db37689..1fce12bc4e7a9 100644 --- a/airflow/www/app.py +++ b/airflow/www/app.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,18 +17,21 @@ # specific language governing permissions and limitations # under the License. 
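
The new experimental endpoints above (`dag_runs`, `code`, `paused`, and the per-run state lookup) are plain Flask views served under the webserver's `/api/experimental` prefix, with the DAG id, paused flag, and execution date passed as URL path parameters matching the view signatures. A hedged client sketch using `requests`; the host, port, and DAG id are placeholders:

import requests

BASE = "http://localhost:8080/api/experimental"   # placeholder webserver address
DAG_ID = "example_dag"                             # placeholder DAG id

# List the DAG's runs, optionally filtered by state.
runs = requests.get("{}/dags/{}/dag_runs".format(BASE, DAG_ID),
                    params={"state": "running"})
print(runs.status_code, runs.json())

# Fetch the DAG file's source code.
code = requests.get("{}/dags/{}/code".format(BASE, DAG_ID))
print(code.text[:200])

# Pause the DAG (note the ToDo above: this is currently exposed as GET).
requests.get("{}/dags/{}/paused/{}".format(BASE, DAG_ID, "true"))

# Look up the state of a single run by its execution date.
run_state = requests.get("{}/dags/{}/dag_runs/{}".format(
    BASE, DAG_ID, "2016-11-16T11:34:15"))
print(run_state.json())
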
# -import six +from typing import Any +import six from flask import Flask from flask_admin import Admin, base from flask_caching import Cache from flask_wtf.csrf import CSRFProtect from six.moves.urllib.parse import urlparse from werkzeug.wsgi import DispatcherMiddleware +from werkzeug.contrib.fixers import ProxyFix import airflow from airflow import configuration as conf from airflow import models, LoggingMixin +from airflow.models.connection import Connection from airflow.settings import Session from airflow.www.blueprints import routes @@ -43,10 +46,19 @@ def create_app(config=None, testing=False): app = Flask(__name__) + if configuration.conf.getboolean('webserver', 'ENABLE_PROXY_FIX'): + app.wsgi_app = ProxyFix(app.wsgi_app) app.secret_key = configuration.conf.get('webserver', 'SECRET_KEY') app.config['LOGIN_DISABLED'] = not configuration.conf.getboolean( 'webserver', 'AUTHENTICATE') + app.config['SESSION_COOKIE_HTTPONLY'] = True + app.config['SESSION_COOKIE_SECURE'] = conf.getboolean('webserver', 'COOKIE_SECURE') + app.config['SESSION_COOKIE_SAMESITE'] = conf.get('webserver', 'COOKIE_SAMESITE') + + if config: + app.config.from_mapping(config) + csrf.init_app(app) app.config['TESTING'] = testing @@ -58,8 +70,8 @@ def create_app(config=None, testing=False): api.load_auth() api.api_auth.init_app(app) - cache = Cache( - app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'}) + # flake8: noqa: F841 + cache = Cache(app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'}) app.register_blueprint(routes) @@ -101,7 +113,7 @@ def create_app(config=None, testing=False): av(vs.UserModelView( models.User, Session, name="Users", category="Admin")) av(vs.ConnectionModelView( - models.Connection, Session, name="Connections", category="Admin")) + Connection, Session, name="Connections", category="Admin")) av(vs.VariableView( models.Variable, Session, name="Variables", category="Admin")) av(vs.XComView( @@ -109,11 +121,11 @@ def create_app(config=None, testing=False): admin.add_link(base.MenuLink( category='Docs', name='Documentation', - url='https://airflow.incubator.apache.org/')) + url='https://airflow.apache.org/')) admin.add_link( base.MenuLink(category='Docs', - name='Github', - url='https://github.com/apache/incubator-airflow')) + name='GitHub', + url='https://github.com/apache/airflow')) av(vs.VersionView(name='Version', category="About")) @@ -132,8 +144,8 @@ def integrate_plugins(): log.debug('Adding view %s', v.name) admin.add_view(v) for bp in flask_blueprints: - log.debug('Adding blueprint %s', bp.name) - app.register_blueprint(bp) + log.debug("Adding blueprint %s:%s", bp["name"], bp["blueprint"].import_name) + app.register_blueprint(bp["blueprint"]) for ml in sorted(menu_links, key=lambda x: x.name): log.debug('Adding menu link %s', ml.name) admin.add_link(ml) @@ -144,11 +156,7 @@ def integrate_plugins(): # required for testing purposes otherwise the module retains # a link to the default_auth if app.config['TESTING']: - if six.PY2: - reload(e) - else: - import importlib - importlib.reload(e) + six.moves.reload_module(e) app.register_blueprint(e.api_experimental, url_prefix='/api/experimental') @@ -166,11 +174,11 @@ def shutdown_session(exception=None): return app -app = None +app = None # type: Any def root_app(env, resp): - resp(b'404 Not Found', [(b'Content-Type', b'text/plain')]) + resp('404 Not Found', [('Content-Type', 'text/plain')]) return [b'Apache Airflow is not at this location'] diff --git a/airflow/www/blueprints.py b/airflow/www/blueprints.py index 
ec719b96d2f6a..e964081f8a1a2 100644 --- a/airflow/www/blueprints.py +++ b/airflow/www/blueprints.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,10 +17,16 @@ # specific language governing permissions and limitations # under the License. # +from datetime import timedelta from flask import ( - url_for, Markup, Blueprint, redirect, + url_for, Blueprint, redirect, ) -import markdown +from sqlalchemy import func + +from airflow import configuration as conf +from airflow import jobs, settings +from airflow.utils import timezone +from airflow.www import utils as wwwutils routes = Blueprint('routes', __name__) @@ -32,6 +38,35 @@ def index(): @routes.route('/health') def health(): - """ We can add an array of tests here to check the server's health """ - content = Markup(markdown.markdown("The server is healthy!")) - return content + """ + An endpoint helping check the health status of the Airflow instance, + including metadatabase and scheduler. + """ + session = settings.Session() + BJ = jobs.BaseJob + payload = {} + scheduler_health_check_threshold = timedelta(seconds=conf.getint('scheduler', + 'scheduler_health_check_threshold' + )) + + latest_scheduler_heartbeat = None + payload['metadatabase'] = {'status': 'healthy'} + try: + latest_scheduler_heartbeat = session.query(func.max(BJ.latest_heartbeat)). \ + filter(BJ.state == 'running', BJ.job_type == 'SchedulerJob'). \ + scalar() + except Exception: + payload['metadatabase']['status'] = 'unhealthy' + + if not latest_scheduler_heartbeat: + scheduler_status = 'unhealthy' + else: + if timezone.utcnow() - latest_scheduler_heartbeat <= scheduler_health_check_threshold: + scheduler_status = 'healthy' + else: + scheduler_status = 'unhealthy' + + payload['scheduler'] = {'status': scheduler_status, + 'latest_scheduler_heartbeat': str(latest_scheduler_heartbeat)} + + return wwwutils.json_response(payload) diff --git a/airflow/www/gunicorn_config.py b/airflow/www/gunicorn_config.py index 7cf12c56c778e..db76b6de0c5af 100644 --- a/airflow/www/gunicorn_config.py +++ b/airflow/www/gunicorn_config.py @@ -8,9 +8,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
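
The rewritten `/health` endpoint above reports the metadatabase and the scheduler separately, judging the scheduler by the age of its latest `SchedulerJob` heartbeat against `scheduler_health_check_threshold`. A small sketch of how a probe might consume it; the URL is a placeholder and the response shape is inferred from the view above:

import requests

resp = requests.get("http://localhost:8080/health")   # placeholder webserver address
payload = resp.json()

# Expected shape, inferred from the view above:
# {"metadatabase": {"status": "healthy"},
#  "scheduler": {"status": "healthy",
#                "latest_scheduler_heartbeat": "2019-01-01 00:00:05+00:00"}}

unhealthy = [name for name, component in payload.items()
             if component.get("status") != "healthy"]
if unhealthy:
    raise SystemExit("unhealthy components: {}".format(", ".join(unhealthy)))
print("metadatabase and scheduler look healthy")
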
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www/static/connection_form.js b/airflow/www/static/connection_form.js index 902cf747e52b2..8517ad2582bf4 100644 --- a/airflow/www/static/connection_form.js +++ b/airflow/www/static/connection_form.js @@ -54,6 +54,22 @@ 'login': 'Username', } }, + qubole: { + hidden_fields: ['login', 'schema', 'port', 'extra'], + relabeling: { + 'host': 'API Endpoint', + 'password': 'Auth Token', + }, + placeholders: { + 'host': 'https://.qubole.com/api' + } + }, + ssh: { + hidden_fields: ['schema'], + relabeling: { + 'login': 'Username', + } + }, } function connTypeChange(connectionType) { $("div.form-group").removeClass("hide"); @@ -68,6 +84,8 @@ $("label[orig_text]").each(function(){ $(this).text($(this).attr("orig_text")); }); + $(".form-control").each(function(){$(this).attr('placeholder', '')}); + if (config[connectionType] != undefined){ $.each(config[connectionType].hidden_fields, function(i, field){ $("#" + field).parent().parent().addClass('hide') @@ -77,6 +95,9 @@ lbl.attr("orig_text", lbl.text()); $("label[for='" + k + "']").text(v); }); + $.each(config[connectionType].placeholders, function(k, v){ + $("#" + k).attr('placeholder', v); + }); } } var connectionType=$("#conn_type").val(); diff --git a/airflow/www/static/gantt-chart-d3v2.js b/airflow/www/static/gantt-chart-d3v2.js index d21311a1c541d..245a0147e9f72 100644 --- a/airflow/www/static/gantt-chart-d3v2.js +++ b/airflow/www/static/gantt-chart-d3v2.js @@ -129,7 +129,7 @@ d3.gantt = function() { call_modal(d.taskName, d.executionDate); }) .attr("class", function(d){ - if(taskStatus[d.status] == null){ return "bar";} + if(taskStatus[d.status] == null){ return "null";} return taskStatus[d.status]; }) .attr("y", 0) diff --git a/airflow/www/static/graph.css b/airflow/www/static/graph.css index f1d34800612fa..a40abf3ed7930 100644 --- a/airflow/www/static/graph.css +++ b/airflow/www/static/graph.css @@ -31,11 +31,12 @@ g.node.success rect { g.node.up_for_retry rect { stroke: gold; } - g.node.queued rect { stroke: grey; } - +g.node.up_for_reschedule rect{ + stroke: turquoise; +} g.node.running rect{ stroke: lime; } diff --git a/airflow/www/static/main.css b/airflow/www/static/main.css index 57164b94e5ccf..08bfc798a01ed 100644 --- a/airflow/www/static/main.css +++ b/airflow/www/static/main.css @@ -44,10 +44,14 @@ td>span.glyphicon{ padding-left: 3px; padding-top: 3px; } -button.btn { +button.btn, label.btn { border: 1px solid black; } +.btn-group label.btn { + background-color: #f0f0f0; +} + div.rich_doc { padding: 5px 10px; border: 1px solid #dddddd; @@ -75,6 +79,9 @@ span.success{ span.up_for_retry{ background-color: gold; } +span.up_for_reschedule{ + background-color: turquoise; +} span.started{ background-color: lime; } @@ -262,3 +269,4 @@ div.square { .sc { color: #BA2121 } /* Literal.String.Char */ .sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */ .s2 { color: #BA2121 } /* Literal.String.Double */ +.s1 { color: #BA2121 } /* Literal.String.Single */ diff --git a/airflow/www/static/tree.css b/airflow/www/static/tree.css index 9304bb1c55f78..17ff748b59a94 100644 --- a/airflow/www/static/tree.css +++ b/airflow/www/static/tree.css @@ -62,6 +62,9 @@ rect.upstream_failed { rect.up_for_retry { fill: gold; } +rect.up_for_reschedule { + fill: 
turquoise; +} rect.skipped { fill: pink; } diff --git a/airflow/www/templates/admin/master.html b/airflow/www/templates/admin/master.html index 2939a099d4aa2..7837c93f73e07 100644 --- a/airflow/www/templates/admin/master.html +++ b/airflow/www/templates/admin/master.html @@ -37,14 +37,52 @@ alert('{{ hostname }}'); }); $('span').tooltip(); + var CSRF = {{ csrf_token() | tojson }}; $.ajaxSetup({ beforeSend: function(xhr, settings) { if (!/^(GET|HEAD|OPTIONS|TRACE)$/i.test(settings.type) && !this.crossDomain) { - xhr.setRequestHeader("X-CSRFToken", "{{ csrf_token() }}"); + xhr.setRequestHeader("X-CSRFToken", CSRF); } } }); + +var el = document.createElement("span"); + +function escapeHtml(text) { + el.textContent = text; + return el.innerHTML; +} + +function postAsForm(url, parameters) { + var form = $("
    "); + + form.attr("method", "POST"); + form.attr("action", url); + + $.each(parameters || {}, function(key, value) { + var field = $(''); + + field.attr("type", "hidden"); + field.attr("name", key); + field.attr("value", value); + + form.append(field); + }); + + var field = $(''); + + field.attr("type", "hidden"); + field.attr("name", "csrf_token"); + field.attr("value", CSRF); + + form.append(field); + + // The form needs to be a part of the document in order for us to be able + // to submit it. + $(document.body).append(form); + form.submit(); +} {% endblock %} @@ -89,7 +127,7 @@

    + - {% for dag_id in dag_ids_in_page %} - {% set dag = webserver_dags[dag_id] if dag_id in webserver_dags else None %} + {% for dag in dags %} @@ -118,14 +103,12 @@

    DAGs

    @@ -139,8 +122,8 @@

    DAGs

    {% if dag %} - + @@ -186,10 +169,16 @@

    DAGs

    {% endif %} - + + + + + +
    {% endfor %} @@ -241,8 +230,22 @@

    DAGs

    window.location = DAGS_INDEX + "?page_size=" + p_size; }); - function confirmTriggerDag(dag_id){ - return confirm("Are you sure you want to run '"+dag_id+"' now?"); + function confirmTriggerDag(link, dag_id){ + if (confirm("Are you sure you want to run '"+dag_id+"' now?")) { + postAsForm(link.href, {}); + } + // Never follow the link + return false; + } + + function confirmDeleteDag(link, dag_id){ + if (confirm("Are you sure you want to delete '"+dag_id+"' now?\n\ + This option will delete ALL metadata, DAG runs, etc.\n\ + EXCEPT Log.\n\ + This cannot be undone.")) { + postAsForm(link.href, {}); + } + return false; } all_dags = $("[id^=toggle]"); $.each(all_dags, function(i,v) { @@ -253,7 +256,7 @@

    DAGs

          } else {
            is_paused = 'false'
          }
-          url = 'airflow/paused?is_paused=' + is_paused + '&dag_id=' + dag_id;
+          url = {{ url_for('airflow.paused') | tojson }} + '?is_paused=' + is_paused + '&dag_id=' + encodeURIComponent(dag_id);
          $.post(url);
        });
      });
@@ -309,7 +312,7 @@

    DAGs

    d3.json("{{ url_for('airflow.dag_stats') }}", function(error, json) { for(var dag_id in json) { states = json[dag_id]; - g = d3.select('svg#dag-run-' + dag_id) + g = d3.select('svg#dag-run-' + dag_id.replace(/\./g, '__dot__')) .attr('height', diameter + (stroke_width_hover * 2)) .attr('width', '110px') .selectAll("g") @@ -388,9 +391,9 @@

    DAGs

    d3.json("{{ url_for('airflow.task_stats') }}", function(error, json) { for(var dag_id in json) { states = json[dag_id]; - g = d3.select('svg#task-run-' + dag_id) + g = d3.select('svg#task-run-' + dag_id.replace(/\./g, '__dot__')) .attr('height', diameter + (stroke_width_hover * 2)) - .attr('width', '240px') + .attr('width', '300px') .selectAll("g") .data(states) .enter() diff --git a/airflow/www/templates/airflow/gantt.html b/airflow/www/templates/airflow/gantt.html index 01d3cd8cdfcc5..b070409654e08 100644 --- a/airflow/www/templates/airflow/gantt.html +++ b/airflow/www/templates/airflow/gantt.html @@ -55,7 +55,7 @@ var dag_id = '{{ dag.dag_id }}'; var task_id = ''; var exection_date = ''; - data = {{ data |safe }}; + data = {{ data |tojson|safe }}; var gantt = d3.gantt() .taskTypes(data.taskNames) .taskStatus(data.taskStatus) diff --git a/airflow/www/templates/airflow/graph.html b/airflow/www/templates/airflow/graph.html index 33cdd9737b2fa..79fbb74394745 100644 --- a/airflow/www/templates/airflow/graph.html +++ b/airflow/www/templates/airflow/graph.html @@ -23,99 +23,105 @@ {% block head_css %} {{ super() }} - - + + {% endblock %} {% block body %} {{ super() }} - {% if dag.doc_md %} -
    {{ doc_md|safe }}
    - {% endif %} -
    -
    - {{ state_token }} - Base date: {{ form.base_date(class_="form-control") }} - Number of runs: {{ form.num_runs(class_="form-control") }} - Run: - {{ form.execution_date(class_="form-control") | safe }} - Layout: - {{ form.arrange(class_="form-control") | safe }} - - - - - -
    - -
    -
    -
    +{% if dag.doc_md %} +
    {{ dag.doc_md|safe }}
    +{% endif %} +
    +
    + {{ state_token }} + Base date: {{ form.base_date(class_="form-control") }} + Number of runs: {{ form.num_runs(class_="form-control") }} + Run: + {{ form.execution_date(class_="form-control") | safe }} + Layout: + {{ form.arrange(class_="form-control") | safe }} + + + + + +
    + +
    +
    +

    - {% for op in operators %} -
    - {{ op.__name__ }} -
    - {% endfor %} - -
    -
    no status
    + {% for op in operators %} +
    + {{ op.__name__ }} +
    + {% endfor %} + +
    +
    no_status
    queued
    -
    retry
    +
    up_for_retry
    +
    up_for_reschedule
    skipped
    failed
    running
    success
    -
    -
    +
    +

    - - - - - - - spinner + + + + + + + spinner

    {% endblock %} {% block tail %} - {{ super() }} +{{ super() }} - - - + + + {% endblock %} diff --git a/airflow/www/templates/airflow/login.html b/airflow/www/templates/airflow/login.html index cdae3178f01db..72cdff202cc11 100644 --- a/airflow/www/templates/airflow/login.html +++ b/airflow/www/templates/airflow/login.html @@ -1,13 +1,13 @@ -{# +{# Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - + http://www.apache.org/licenses/LICENSE-2.0 - + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -25,7 +25,7 @@

    Sign in to Airflow

    - -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/airflow/www/templates/airflow/task.html b/airflow/www/templates/airflow/task.html index e45b745fb879c..820a4a98149c7 100644 --- a/airflow/www/templates/airflow/task.html +++ b/airflow/www/templates/airflow/task.html @@ -37,7 +37,9 @@
    Dependencies Blocking Task From Getting Scheduled
    {% endfor %}
    max_active_runs - {{ dag.active_runs | length }} / {{ dag.max_active_runs }} + {{ active_runs | length }} / {{ dag.max_active_runs }}
    concurrency diff --git a/airflow/www/templates/airflow/dags.html b/airflow/www/templates/airflow/dags.html index 23978905780ea..1a0ebc01f44ba 100644 --- a/airflow/www/templates/airflow/dags.html +++ b/airflow/www/templates/airflow/dags.html @@ -28,7 +28,7 @@ {% block body %}

    DAGs

    -
    - {% if dag_id in orm_dags %} - + - {% endif %} - {% if dag_id in orm_dags %} - - {% endif %} + - {% if dag_id in webserver_dags %} - - {{ dag_id }} - - {% else %} - {{ dag_id }} - - {% endif %} - {% if dag_id not in orm_dags %} - - {% endif %} + + {{ dag.dag_id }} + - {% if dag_id in webserver_dags %} - {{ dag.schedule_interval }} + {{ dag.schedule_interval | string }} - {% endif %} - {{ dag.owner if dag else orm_dags[dag_id].owners }} + {{ dag.owners }} - {% if dag %} - {% set last_run = dag.get_last_dagrun() %} - {% if last_run and last_run.execution_date %} - - {{ last_run.execution_date.strftime("%Y-%m-%d %H:%M") }} - - - {% endif %} + {% set last_run = dag.get_last_dagrun(include_externally_triggered=True) %} + {% if last_run and last_run.execution_date %} + + {{ last_run.execution_date.strftime("%Y-%m-%d %H:%M") }} + + {% endif %}
    - {{ html_code|safe }} + {% if html_code is defined %} + {{ html_code|safe }} + {% endif %}

    {% for attr, value in special_attrs_rendered.items() %} @@ -70,6 +72,8 @@
    Task Attributes
    {% endfor %} - {{ html_code|safe }} + {% if html_code is defined %} + {{ html_code|safe }} + {% endif %}
    {% endblock %} diff --git a/airflow/www/templates/airflow/ti_code.html b/airflow/www/templates/airflow/ti_code.html index 44942ca7d7e5d..72008f04682bb 100644 --- a/airflow/www/templates/airflow/ti_code.html +++ b/airflow/www/templates/airflow/ti_code.html @@ -21,23 +21,8 @@ {% block body %} {{ super() }}

    {{ title }}

    - {% if html_code %} - {{ html_code|safe }} - {% endif %} - {% if code %} -
    {{ code }}
    - {% endif %} - - {% if code_dict %} - {% for k, v in code_dict.items() %} -
    {{ k }}
    -
    {{ v }}
    - {% endfor %} - {% endif %} - {% if html_dict %} - {% for k, v in html_dict.items() %} -
    {{ k }}
    - {{ v|safe }} - {% endfor %} - {% endif %} + {% for k, v in html_dict.items() %} +
    {{ k }}
    + {{ v|safe }} + {% endfor %} {% endblock %} diff --git a/airflow/www/templates/airflow/tree.html b/airflow/www/templates/airflow/tree.html index f20127ceb3178..f16743ce35967 100644 --- a/airflow/www/templates/airflow/tree.html +++ b/airflow/www/templates/airflow/tree.html @@ -20,67 +20,66 @@ {% block head_css %} {{ super() }} - + {% endblock %} {% block body %} {{ super() }}
    - - Base date: {{ form.base_date(class_="form-control") }} - Number of runs: {{ form.num_runs(class_="form-control") }} - - - - - +
    + Base date: {{ form.base_date(class_="form-control") }} + Number of runs: {{ form.num_runs(class_="form-control") }} + + + + +

    -
    no status
    -
    -
    queued
    -
    -
    retry
    -
    -
    skipped
    -
    -
    failed
    -
    -
    running
    -
    -
    success
    -
    - {% for op in operators %} -
    -
    -
    {{ op.__name__ }}
    - {% endfor %} -
    +
    no_status
    +
    +
    queued
    +
    +
    up_for_retry
    +
    +
    up_for_reschedule
    +
    +
    skipped
    +
    +
    failed
    +
    +
    running
    +
    +
    success
    +
    + {% for op in operators %} +
    +
    +
    {{ op.__name__ }}
    + {% endfor %} +

    - - - - - - + + + + + +
    {% endblock %} {% block tail %} - {{ super() }} - - + + + {% endblock %} diff --git a/airflow/www/templates/airflow/variables/README.md b/airflow/www/templates/airflow/variables/README.md index 3fd539f8b54fb..e3b30a1a64b93 100644 --- a/airflow/www/templates/airflow/variables/README.md +++ b/airflow/www/templates/airflow/variables/README.md @@ -1,5 +1,24 @@ -## Variable Editor ----- + + +# Variable Editor + This folder contains forms used to edit values in the "Variable" key-value store. This data can be edited under the "Admin" admin tab, but sometimes it is preferable to use a form that can perform checking and provide a nicer diff --git a/airflow/www/utils.py b/airflow/www/utils.py index 7d9c8a07501c7..b15c515882bfb 100644 --- a/airflow/www/utils.py +++ b/airflow/www/utils.py @@ -17,24 +17,30 @@ # specific language governing permissions and limitations # under the License. # +# flake8: noqa: E402 +import inspect from future import standard_library -standard_library.install_aliases() -from builtins import str -from builtins import object +standard_library.install_aliases() # noqa: E402 +from builtins import str, object from cgi import escape from io import BytesIO as IO import functools import gzip +import io import json +import os +import re import time +import wtforms +from wtforms.compat import text_type +import zipfile -from flask import after_this_request, request, Response -from flask_admin.contrib.sqla.filters import FilterConverter +from flask import after_this_request, request, Markup, Response from flask_admin.model import filters +import flask_admin.contrib.sqla.filters as sqlafilters from flask_login import current_user -import wtforms -from wtforms.compat import text_type +from six.moves.urllib.parse import urlencode from airflow import configuration, models, settings from airflow.utils.db import create_session @@ -55,16 +61,21 @@ def should_hide_value_for_key(key_name): - return any(s in key_name.lower() for s in DEFAULT_SENSITIVE_VARIABLE_FIELDS) \ - and configuration.conf.getboolean('admin', 'hide_sensitive_variable_fields') + # It is possible via importing variables from file that a key is empty. 
+ if key_name: + config_set = configuration.conf.getboolean('admin', + 'hide_sensitive_variable_fields') + field_comp = any(s in key_name.lower() for s in DEFAULT_SENSITIVE_VARIABLE_FIELDS) + return config_set and field_comp + return False class LoginMixin(object): def is_accessible(self): return ( not AUTHENTICATE or ( - not current_user.is_anonymous() and - current_user.is_authenticated() + not current_user.is_anonymous and + current_user.is_authenticated ) ) @@ -73,7 +84,7 @@ class SuperUserMixin(object): def is_accessible(self): return ( not AUTHENTICATE or - (not current_user.is_anonymous() and current_user.is_superuser()) + (not current_user.is_anonymous and current_user.is_superuser()) ) @@ -81,22 +92,16 @@ class DataProfilingMixin(object): def is_accessible(self): return ( not AUTHENTICATE or - (not current_user.is_anonymous() and current_user.data_profiling()) + (not current_user.is_anonymous and current_user.data_profiling()) ) def get_params(**kwargs): - params = [] - for k, v in kwargs.items(): - if k == 'showPaused': - # True is default or None - if v or v is None: - continue - params.append('{}={}'.format(k, v)) - elif v: - params.append('{}={}'.format(k, v)) - params = sorted(params, key=lambda x: x.split('=')[0]) - return '&'.join(params) + if 'showPaused' in kwargs: + v = kwargs['showPaused'] + if v or v is None: + kwargs.pop('showPaused') + return urlencode({d: v if v is not None else '' for d, v in kwargs.items()}) def generate_pages(current_page, num_of_pages, @@ -127,27 +132,27 @@ def generate_pages(current_page, num_of_pages, """ void_link = 'javascript:void(0)' - first_node = """
  • + first_node = Markup("""
  • « -
  • """ +""") - previous_node = """""" +""") - next_node = """""" +""") - last_node = """
  • + last_node = Markup("""
  • » -
  • """ +""") - page_node = """
  • + page_node = Markup("""
  • {page_num} -
  • """ +""") - output = ['
      '] + output = [Markup('
        ')] is_disabled = 'disabled' if current_page <= 0 else '' output.append(first_node.format(href_link="?{}" @@ -203,9 +208,9 @@ def is_current(current, page): showPaused=showPaused)), disabled=is_disabled)) - output.append('
      ') + output.append(Markup('
    ')) - return wtforms.widgets.core.HTMLString('\n'.join(output)) + return Markup('\n'.join(output)) def limit_sql(sql, limit, conn_type): @@ -246,8 +251,8 @@ def action_logging(f): """ @functools.wraps(f) def wrapper(*args, **kwargs): - # Only AnonymousUserMixin() does not have user attribute - if current_user and hasattr(current_user, 'user'): + # AnonymousUserMixin() has user attribute but its value is None. + if current_user and hasattr(current_user, 'user') and current_user.user: user = current_user.user.username else: user = 'anonymous' @@ -286,7 +291,7 @@ def wrapper(*args, **kwargs): dag = dagbag.get_dag(dag_id) task = dag.get_task(task_id) - if current_user and hasattr(current_user, 'username'): + if current_user and hasattr(current_user, 'user') and current_user.user: user = current_user.username else: user = 'anonymous' @@ -366,6 +371,22 @@ def zipper(response): return view_func +def open_maybe_zipped(f, mode='r'): + """ + Opens the given file. If the path contains a folder with a .zip suffix, then + the folder is treated as a zip archive, opening the file inside the archive. + + :return: a file object, as in `open`, or as in `ZipFile.open`. + """ + + _, archive, filename = re.search( + r'((.*\.zip){})?(.*)'.format(re.escape(os.sep)), f).groups() + if archive and zipfile.is_zipfile(archive): + return zipfile.ZipFile(archive, mode=mode).open(filename) + else: + return io.open(f, mode=mode) + + def make_cache_key(*args, **kwargs): """ Used by cache to get a unique key per URL @@ -375,6 +396,33 @@ def make_cache_key(*args, **kwargs): return (path + args).encode('ascii', 'ignore') +def get_python_source(x): + """ + Helper function to get Python source (or not), preventing exceptions + """ + source_code = None + + if isinstance(x, functools.partial): + source_code = inspect.getsource(x.func) + + if source_code is None: + try: + source_code = inspect.getsource(x) + except TypeError: + pass + + if source_code is None: + try: + source_code = inspect.getsource(x.__call__) + except (TypeError, AttributeError): + pass + + if source_code is None: + source_code = 'No source code available for {}'.format(type(x)) + + return source_code + + class AceEditorWidget(wtforms.widgets.TextArea): """ Renders an ACE code editor. 
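
Two helpers added to `airflow.www.utils` above are useful beyond the views: `open_maybe_zipped()` reads a file that may live inside a packaged-DAG `.zip`, and `get_python_source()` tries `functools.partial.func` and `__call__` before giving up. A usage sketch assuming this patch is applied; the paths and the `greet` function are placeholders:

import functools

from airflow.www.utils import get_python_source, open_maybe_zipped

# Reads my_dag.py from inside dags/bundle.zip when the path points into a
# zip archive, and falls back to a plain open() otherwise (placeholder paths).
with open_maybe_zipped("dags/bundle.zip/my_dag.py") as f:
    print(f.read()[:200])


def greet(name, greeting="hello"):
    return "{} {}".format(greeting, name)


# inspect.getsource() raises TypeError for a functools.partial;
# get_python_source() unwraps .func first and degrades gracefully.
print(get_python_source(functools.partial(greet, greeting="hi")))
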
@@ -395,7 +443,43 @@ def __call__(self, field, **kwargs): return wtforms.widgets.core.HTMLString(html) -class UtcFilterConverter(FilterConverter): +class UtcDateTimeFilterMixin(object): + def clean(self, value): + dt = super(UtcDateTimeFilterMixin, self).clean(value) + return timezone.make_aware(dt, timezone=timezone.utc) + + +class UtcDateTimeEqualFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeEqualFilter): + pass + + +class UtcDateTimeNotEqualFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeNotEqualFilter): + pass + + +class UtcDateTimeGreaterFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeGreaterFilter): + pass + + +class UtcDateTimeSmallerFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeSmallerFilter): + pass + + +class UtcDateTimeBetweenFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeBetweenFilter): + pass + + +class UtcDateTimeNotBetweenFilter(UtcDateTimeFilterMixin, sqlafilters.DateTimeNotBetweenFilter): + pass + + +class UtcFilterConverter(sqlafilters.FilterConverter): + + utcdatetime_filters = (UtcDateTimeEqualFilter, UtcDateTimeNotEqualFilter, + UtcDateTimeGreaterFilter, UtcDateTimeSmallerFilter, + UtcDateTimeBetweenFilter, UtcDateTimeNotBetweenFilter, + sqlafilters.FilterEmpty) + @filters.convert('utcdatetime') def conv_utcdatetime(self, column, name, **kwargs): - return self.conv_datetime(column, name, **kwargs) + return [f(column, name, **kwargs) for f in self.utcdatetime_filters] diff --git a/airflow/www/validators.py b/airflow/www/validators.py index 6eee76a32a8ad..3db8b2850e0ad 100644 --- a/airflow/www/validators.py +++ b/airflow/www/validators.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -45,8 +45,11 @@ def __call__(self, form, field): if field.data < other.data: d = { - 'other_label': hasattr(other, 'label') and other.label.text - or self.fieldname, + 'other_label': ( + hasattr(other, 'label') and + other.label.text or + self.fieldname + ), 'other_name': self.fieldname, } message = self.message diff --git a/airflow/www/views.py b/airflow/www/views.py index 8f6725ef59b44..49ef0eba8518e 100644 --- a/airflow/www/views.py +++ b/airflow/www/views.py @@ -18,76 +18,66 @@ # under the License. 
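
The `UtcDateTimeFilterMixin` above makes the naive datetime parsed by each flask-admin filter explicitly UTC before it is compared against timezone-aware columns. A stdlib-only sketch of the same mixin idea; the base filter here is a stand-in, not the real flask-admin API:

from datetime import datetime, timezone


class NaiveDateTimeFilter(object):
    """Stand-in for a flask-admin datetime filter: parses to a naive datetime."""
    def clean(self, value):
        return datetime.strptime(value, "%Y-%m-%d %H:%M:%S")


class UtcDateTimeFilterMixin(object):
    """Same idea as the mixin above: attach UTC to whatever the base parsed."""
    def clean(self, value):
        dt = super(UtcDateTimeFilterMixin, self).clean(value)
        return dt.replace(tzinfo=timezone.utc)


class UtcDateTimeEqualFilter(UtcDateTimeFilterMixin, NaiveDateTimeFilter):
    pass


print(UtcDateTimeEqualFilter().clean("2019-01-01 12:00:00"))
# -> 2019-01-01 12:00:00+00:00, comparable against timezone-aware columns
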
# -from past.builtins import basestring, unicode - import ast +import codecs +import copy import datetime as dt +from io import BytesIO +import itertools +import json import logging -import os -import pkg_resources -import socket -from functools import wraps -from datetime import timedelta -import copy import math -import json -import bleach -import pendulum -import codecs +import os +import traceback from collections import defaultdict - -import inspect +from datetime import timedelta +from functools import wraps from textwrap import dedent -import traceback +import markdown +import pendulum import sqlalchemy as sqla -from sqlalchemy import or_, desc, and_, union_all - from flask import ( abort, jsonify, redirect, url_for, request, Markup, Response, - current_app, render_template, make_response) + current_app, render_template, make_response, send_file) +from flask import flash from flask_admin import BaseView, expose, AdminIndexView -from flask_admin.contrib.sqla import ModelView from flask_admin.actions import action from flask_admin.babel import lazy_gettext +from flask_admin.contrib.sqla import ModelView +from flask_admin.form.fields import DateTimeField from flask_admin.tools import iterdecode -from flask import flash -from flask._compat import PY2 - -from jinja2.sandbox import ImmutableSandboxedEnvironment from jinja2 import escape - -import markdown -import nvd3 - +from jinja2.sandbox import ImmutableSandboxedEnvironment +from past.builtins import basestring +from pygments import highlight, lexers +from pygments.formatters import HtmlFormatter +from sqlalchemy import or_, desc, and_, union_all from wtforms import ( Form, SelectField, TextAreaField, PasswordField, StringField, validators) -from flask_admin.form.fields import DateTimeField - -from pygments import highlight, lexers -from pygments.formatters import HtmlFormatter import airflow from airflow import configuration as conf from airflow import models from airflow import settings -from airflow.api.common.experimental.mark_tasks import set_dag_run_state +from airflow.api.common.experimental.mark_tasks import (set_dag_run_state_to_running, + set_dag_run_state_to_success, + set_dag_run_state_to_failed) from airflow.exceptions import AirflowException +from airflow.models import BaseOperator, errors from airflow.models import XCom, DagRun -from airflow.ti_deps.dep_context import DepContext, QUEUE_DEPS, SCHEDULER_DEPS - -from airflow.models import BaseOperator +from airflow.models.connection import Connection from airflow.operators.subdag_operator import SubDagOperator - +from airflow.ti_deps.dep_context import DepContext, QUEUE_DEPS, SCHEDULER_DEPS from airflow.utils import timezone -from airflow.utils.json import json_ser -from airflow.utils.state import State -from airflow.utils.db import create_session, provide_session -from airflow.utils.helpers import alchemy_to_dict from airflow.utils.dates import infer_time_unit, scale_time_units, parse_execution_date -from airflow.utils.timezone import datetime +from airflow.utils.db import create_session, provide_session +from airflow.utils.helpers import alchemy_to_dict, render_log_filename from airflow.utils.net import get_hostname +from airflow.utils.state import State +from airflow.utils.timezone import datetime +from airflow._vendor import nvd3 from airflow.www import utils as wwwutils from airflow.www.forms import (DateTimeForm, DateTimeWithNumRunsForm, DateTimeWithNumRunsWithDagRunsForm) @@ -117,13 +107,12 @@ def dag_link(v, c, m, p): if m.dag_id is None: return Markup() - dag_id = 
bleach.clean(m.dag_id) url = url_for( 'airflow.graph', - dag_id=dag_id, + dag_id=m.dag_id, execution_date=m.execution_date) return Markup( - '{}'.format(url, dag_id)) + '{}').format(url, m.dag_id) def log_url_formatter(v, c, m, p): @@ -134,45 +123,43 @@ def log_url_formatter(v, c, m, p): def dag_run_link(v, c, m, p): - dag_id = bleach.clean(m.dag_id) url = url_for( 'airflow.graph', dag_id=m.dag_id, run_id=m.run_id, execution_date=m.execution_date) - return Markup('{m.run_id}'.format(**locals())) + title = m.run_id + return Markup('{title}').format(**locals()) def task_instance_link(v, c, m, p): - dag_id = bleach.clean(m.dag_id) - task_id = bleach.clean(m.task_id) url = url_for( 'airflow.task', - dag_id=dag_id, - task_id=task_id, + dag_id=m.dag_id, + task_id=m.task_id, execution_date=m.execution_date.isoformat()) url_root = url_for( 'airflow.graph', - dag_id=dag_id, - root=task_id, + dag_id=m.dag_id, + root=m.task_id, execution_date=m.execution_date.isoformat()) return Markup( """ - {task_id} + {m.task_id} - """.format(**locals())) + """).format(**locals()) def state_token(state): color = State.color(state) return Markup( '' - '{state}'.format(**locals())) + '{state}').format(**locals()) def parse_datetime_f(value): @@ -196,27 +183,30 @@ def datetime_f(v, c, m, p): dttm = attr.isoformat() if attr else '' if timezone.utcnow().isoformat()[:4] == dttm[:4]: dttm = dttm[5:] - return Markup("{}".format(dttm)) + return Markup("{}").format(dttm) def nobr_f(v, c, m, p): - return Markup("{}".format(getattr(m, p))) + return Markup("{}").format(getattr(m, p)) def label_link(v, c, m, p): try: default_params = ast.literal_eval(m.default_params) - except: + except Exception: default_params = {} url = url_for( 'airflow.chart', chart_id=m.id, iteration_no=m.iteration_no, **default_params) - return Markup("{m.label}".format(**locals())) + title = m.label + return Markup("{title}").format(**locals()) def pool_link(v, c, m, p): - url = '/admin/taskinstance/?flt1_pool_equals=' + m.pool - return Markup("{m.pool}".format(**locals())) + title = m.pool + + url = url_for('airflow.task', flt1_pool_equals=m.pool) + return Markup("{title}").format(**locals()) def pygment_html_render(s, lexer=lexers.TextLexer): @@ -256,7 +246,9 @@ def wrapped_markdown(s): 'doc_yaml': lambda x: render(x, lexers.YamlLexer), 'doc_md': wrapped_markdown, 'python_callable': lambda x: render( - inspect.getsource(x), lexers.PythonLexer), + wwwutils.get_python_source(x), + lexers.PythonLexer, + ), } @@ -266,7 +258,7 @@ def data_profiling_required(f): def decorated_function(*args, **kwargs): if ( current_app.config['LOGIN_DISABLED'] or - (not current_user.is_anonymous() and current_user.data_profiling()) + (not current_user.is_anonymous and current_user.data_profiling()) ): return f(*args, **kwargs) else: @@ -277,19 +269,23 @@ def decorated_function(*args, **kwargs): def fused_slots(v, c, m, p): - url = ( - '/admin/taskinstance/' + - '?flt1_pool_equals=' + m.pool + - '&flt2_state_equals=running') - return Markup("{1}".format(url, m.used_slots())) + url = url_for( + 'taskinstance.index_view', + flt1_pool_equals=m.pool, + flt2_state_equals='running', + ) + return Markup("{1}").format(url, m.used_slots()) def fqueued_slots(v, c, m, p): - url = ( - '/admin/taskinstance/' + - '?flt1_pool_equals=' + m.pool + - '&flt2_state_equals=queued&sort=10&desc=1') - return Markup("{1}".format(url, m.queued_slots())) + url = url_for( + 'taskinstance.index_view', + flt1_pool_equals=m.pool, + flt2_state_equals='queued', + sort='1', + desc='1' + ) + return 
Markup("{1}").format(url, m.queued_slots()) def recurse_tasks(tasks, task_ids, dag_ids, task_id_to_dag): @@ -370,6 +366,7 @@ def get_date_time_num_runs_dag_runs_form_data(request, session, dag): 'dr_state': dr_state, } + class Airflow(BaseView): def is_visible(self): return False @@ -382,7 +379,6 @@ def index(self): @expose('/chart_data') @data_profiling_required @wwwutils.gzipped - # @cache.cached(timeout=3600, key_prefix=wwwutils.make_cache_key) def chart_data(self): from airflow import macros import pandas as pd @@ -394,7 +390,7 @@ def chart_data(self): csv = request.args.get('csv') == "true" chart = session.query(models.Chart).filter_by(id=chart_id).first() db = session.query( - models.Connection).filter_by(conn_id=chart.conn_id).first() + Connection).filter_by(conn_id=chart.conn_id).first() payload = { "state": "ERROR", @@ -404,9 +400,9 @@ def chart_data(self): # Processing templated fields try: args = ast.literal_eval(chart.default_params) - if type(args) is not type(dict()): + if not isinstance(args, dict): raise AirflowException('Not a dict') - except: + except Exception: args = {} payload['error'] += ( "Default params is not valid, string has to evaluate as " @@ -448,15 +444,15 @@ def chart_data(self): if not payload['error'] and len(df) == 0: payload['error'] += "Empty result set. " elif ( - not payload['error'] and - chart.sql_layout == 'series' and - chart.chart_type != "datatable" and - len(df.columns) < 3): + not payload['error'] and + chart.sql_layout == 'series' and + chart.chart_type != "datatable" and + len(df.columns) < 3): payload['error'] += "SQL needs to return at least 3 columns. " elif ( - not payload['error'] and - chart.sql_layout == 'columns' and - len(df.columns) < 2): + not payload['error'] and + chart.sql_layout == 'columns' and + len(df.columns) < 2): payload['error'] += "SQL needs to return at least 2 columns. " elif not payload['error']: import numpy as np @@ -477,7 +473,7 @@ def chart_data(self): df[df.columns[x_col]]) df[df.columns[x_col]] = df[df.columns[x_col]].apply( lambda x: int(x.strftime("%s")) * 1000) - except Exception as e: + except Exception: payload['error'] = "Time conversion failed" if chart_type == 'datatable': @@ -486,8 +482,6 @@ def chart_data(self): else: if chart.sql_layout == 'series': # User provides columns (series, x, y) - xaxis_label = df.columns[1] - yaxis_label = df.columns[2] df[df.columns[2]] = df[df.columns[2]].astype(np.float) df = df.pivot_table( index=df.columns[1], @@ -495,10 +489,8 @@ def chart_data(self): values=df.columns[2], aggfunc=np.sum) else: # User provides columns (x, y, metric1, metric2, ...) 
- xaxis_label = df.columns[0] - yaxis_label = 'y' df.index = df[df.columns[0]] - df = df.sort(df.columns[0]) + df = df.sort_values(by=df.columns[0]) del df[df.columns[0]] for col in df.columns: df[col] = df[col].astype(np.float) @@ -559,37 +551,33 @@ def chart(self): @login_required @provide_session def dag_stats(self, session=None): - ds = models.DagStat + dr = models.DagRun + dm = models.DagModel + dag_ids = session.query(dm.dag_id) - ds.update( - dag_ids=[dag.dag_id for dag in dagbag.dags.values() if not dag.is_subdag] - ) - - qry = ( - session.query(ds.dag_id, ds.state, ds.count) + dag_state_stats = ( + session.query(dr.dag_id, dr.state, sqla.func.count(dr.state)).group_by(dr.dag_id, dr.state) ) data = {} - for dag_id, state, count in qry: + for (dag_id, ) in dag_ids: + data[dag_id] = {} + for dag_id, state, count in dag_state_stats: if dag_id not in data: data[dag_id] = {} data[dag_id][state] = count payload = {} - for dag in dagbag.dags.values(): - payload[dag.safe_dag_id] = [] + for dag_id, d in data.items(): + payload[dag_id] = [] for state in State.dag_states: - try: - count = data[dag.dag_id][state] - except Exception: - count = 0 - d = { + count = d.get(state, 0) + payload[dag_id].append({ 'state': state, 'count': count, - 'dag_id': dag.dag_id, + 'dag_id': dag_id, 'color': State.color(state) - } - payload[dag.safe_dag_id].append(d) + }) return wwwutils.json_response(payload) @expose('/task_stats') @@ -600,12 +588,14 @@ def task_stats(self, session=None): DagRun = models.DagRun Dag = models.DagModel + dag_ids = session.query(Dag.dag_id) + LastDagRun = ( session.query(DagRun.dag_id, sqla.func.max(DagRun.execution_date).label('execution_date')) .join(Dag, Dag.dag_id == DagRun.dag_id) .filter(DagRun.state != State.RUNNING) - .filter(Dag.is_active == True) - .filter(Dag.is_subdag == False) + .filter(Dag.is_active == True) # noqa: E712 + .filter(Dag.is_subdag == False) # noqa: E712 .group_by(DagRun.dag_id) .subquery('last_dag_run') ) @@ -613,8 +603,8 @@ def task_stats(self, session=None): session.query(DagRun.dag_id, DagRun.execution_date) .join(Dag, Dag.dag_id == DagRun.dag_id) .filter(DagRun.state == State.RUNNING) - .filter(Dag.is_active == True) - .filter(Dag.is_subdag == False) + .filter(Dag.is_active == True) # noqa: E712 + .filter(Dag.is_subdag == False) # noqa: E712 .subquery('running_dag_run') ) @@ -622,13 +612,13 @@ def task_stats(self, session=None): # If no dag_run is active, return task instances from most recent dag_run. 
LastTI = ( session.query(TI.dag_id.label('dag_id'), TI.state.label('state')) - .join(LastDagRun, and_( + .join(LastDagRun, and_( LastDagRun.c.dag_id == TI.dag_id, LastDagRun.c.execution_date == TI.execution_date)) ) RunningTI = ( session.query(TI.dag_id.label('dag_id'), TI.state.label('state')) - .join(RunningDagRun, and_( + .join(RunningDagRun, and_( RunningDagRun.c.dag_id == TI.dag_id, RunningDagRun.c.execution_date == TI.execution_date)) ) @@ -636,7 +626,7 @@ def task_stats(self, session=None): UnionTI = union_all(LastTI, RunningTI).alias('union_ti') qry = ( session.query(UnionTI.c.dag_id, UnionTI.c.state, sqla.func.count()) - .group_by(UnionTI.c.dag_id, UnionTI.c.state) + .group_by(UnionTI.c.dag_id, UnionTI.c.state) ) data = {} @@ -647,30 +637,27 @@ def task_stats(self, session=None): session.commit() payload = {} - for dag in dagbag.dags.values(): - payload[dag.safe_dag_id] = [] + for (dag_id, ) in dag_ids: + payload[dag_id] = [] for state in State.task_states: - try: - count = data[dag.dag_id][state] - except: - count = 0 - d = { + count = data.get(dag_id, {}).get(state, 0) + payload[dag_id].append({ 'state': state, 'count': count, - 'dag_id': dag.dag_id, + 'dag_id': dag_id, 'color': State.color(state) - } - payload[dag.safe_dag_id].append(d) + }) return wwwutils.json_response(payload) @expose('/code') @login_required - def code(self): + @provide_session + def code(self, session=None): dag_id = request.args.get('dag_id') - dag = dagbag.get_dag(dag_id) - title = dag_id + dm = models.DagModel + dag = session.query(dm).filter(dm.dag_id == dag_id).first() try: - with open(dag.fileloc, 'r') as f: + with wwwutils.open_maybe_zipped(dag.fileloc, 'r') as f: code = f.read() html_code = highlight( code, lexers.PythonLexer(), HtmlFormatter(linenos=True)) @@ -678,7 +665,7 @@ def code(self): html_code = str(e) return self.render( - 'airflow/dag_code.html', html_code=html_code, dag=dag, title=title, + 'airflow/dag_code.html', html_code=html_code, dag=dag, title=dag_id, root=request.args.get('root'), demo_mode=conf.getboolean('webserver', 'demo_mode')) @@ -689,17 +676,26 @@ def dag_details(self, session=None): dag_id = request.args.get('dag_id') dag = dagbag.get_dag(dag_id) title = "DAG details" + root = request.args.get('root', '') TI = models.TaskInstance - states = ( - session.query(TI.state, sqla.func.count(TI.dag_id)) - .filter(TI.dag_id == dag_id) - .group_by(TI.state) - .all() + states = session\ + .query(TI.state, sqla.func.count(TI.dag_id))\ + .filter(TI.dag_id == dag_id)\ + .group_by(TI.state)\ + .all() + + active_runs = models.DagRun.find( + dag_id=dag.dag_id, + state=State.RUNNING, + external_trigger=False, + session=session ) + return self.render( 'airflow/dag_details.html', - dag=dag, title=title, states=states, State=State) + dag=dag, title=title, root=root, states=states, State=State, + active_runs=active_runs) @current_app.errorhandler(404) def circles(self): @@ -749,6 +745,7 @@ def rendered(self): execution_date = request.args.get('execution_date') dttm = pendulum.parse(execution_date) form = DateTimeForm(data={'execution_date': dttm}) + root = request.args.get('root', '') dag = dagbag.get_dag(dag_id) task = copy.copy(dag.get_task(task_id)) ti = models.TaskInstance(task=task, execution_date=dttm) @@ -773,7 +770,8 @@ def rendered(self): task_id=task_id, execution_date=execution_date, form=form, - title=title, ) + root=root, + title=title) @expose('/get_logs_with_metadata') @login_required @@ -784,7 +782,12 @@ def get_logs_with_metadata(self, session=None): task_id = 
request.args.get('task_id') execution_date = request.args.get('execution_date') dttm = pendulum.parse(execution_date) - try_number = int(request.args.get('try_number')) + if request.args.get('try_number') is not None: + try_number = int(request.args.get('try_number')) + else: + try_number = None + response_format = request.args.get('format', 'json') + metadata = request.args.get('metadata') metadata = json.loads(metadata) @@ -823,11 +826,21 @@ def get_logs_with_metadata(self, session=None): ti.task = dag.get_task(ti.task_id) logs, metadatas = handler.read(ti, try_number, metadata=metadata) metadata = metadatas[0] - for i, log in enumerate(logs): - if PY2 and not isinstance(log, unicode): - logs[i] = log.decode('utf-8') - message = logs[0] - return jsonify(message=message, metadata=metadata) + + if response_format == 'json': + message = logs[0] if try_number is not None else logs + return jsonify(message=message, metadata=metadata) + + file_obj = BytesIO(b'\n'.join( + log.encode('utf-8') for log in logs + )) + filename_template = conf.get('core', 'LOG_FILENAME_TEMPLATE') + attachment_filename = render_log_filename( + ti=ti, + try_number="all" if try_number is None else try_number, + filename_template=filename_template) + return send_file(file_obj, as_attachment=True, + attachment_filename=attachment_filename) except AttributeError as e: error_message = ["Task log handler {} does not support read logs.\n{}\n" .format(task_log_reader, str(e))] @@ -851,12 +864,20 @@ def log(self, session=None): models.TaskInstance.task_id == task_id, models.TaskInstance.execution_date == dttm).first() - logs = [''] * (ti.next_try_number - 1 if ti is not None else 0) + num_logs = 0 + if ti is not None: + num_logs = ti.next_try_number - 1 + if ti.state == State.UP_FOR_RESCHEDULE: + # Tasks in reschedule state decremented the try number + num_logs += 1 + logs = [''] * num_logs + root = request.args.get('root', '') return self.render( 'airflow/ti_log.html', logs=logs, dag=dag, title="Log by attempts", dag_id=dag.dag_id, task_id=task_id, - execution_date=execution_date, form=form) + execution_date=execution_date, form=form, + root=root) @expose('/task') @login_required @@ -871,6 +892,7 @@ def task(self): execution_date = request.args.get('execution_date') dttm = pendulum.parse(execution_date) form = DateTimeForm(data={'execution_date': dttm}) + root = request.args.get('root', '') dag = dagbag.get_dag(dag_id) if not dag or task_id not in dag.task_ids: @@ -888,7 +910,7 @@ def task(self): for attr_name in dir(ti): if not attr_name.startswith('_'): attr = getattr(ti, attr_name) - if type(attr) != type(self.task): + if type(attr) != type(self.task): # noqa: E721 ti_attrs.append((attr_name, str(attr))) task_attrs = [] @@ -896,7 +918,7 @@ def task(self): if not attr_name.startswith('_'): attr = getattr(task, attr_name) if type(attr) != type(self.task) and \ - attr_name not in attr_renderer: + attr_name not in attr_renderer: # noqa: E721 task_attrs.append((attr_name, str(attr))) # Color coding the special attributes that are code @@ -944,6 +966,7 @@ def task(self): execution_date=execution_date, special_attrs_rendered=special_attrs_rendered, form=form, + root=root, dag=dag, title=title) @expose('/xcom') @@ -958,8 +981,12 @@ def xcom(self, session=None): execution_date = request.args.get('execution_date') dttm = pendulum.parse(execution_date) form = DateTimeForm(data={'execution_date': dttm}) - dag = dagbag.get_dag(dag_id) - if not dag or task_id not in dag.task_ids: + root = request.args.get('root', '') + dm_db = 
models.DagModel + ti_db = models.TaskInstance + dag = session.query(dm_db).filter(dm_db.dag_id == dag_id).first() + ti = session.query(ti_db).filter(ti_db.dag_id == dag_id and ti_db.task_id == task_id).first() + if not ti: flash( "Task [{}.{}] doesn't seem to exist" " at the moment".format(dag_id, task_id), @@ -982,35 +1009,46 @@ def xcom(self, session=None): task_id=task_id, execution_date=execution_date, form=form, + root=root, dag=dag, title=title) - @expose('/run') + @expose('/run', methods=['POST']) @login_required @wwwutils.action_logging @wwwutils.notify_owner def run(self): - dag_id = request.args.get('dag_id') - task_id = request.args.get('task_id') - origin = request.args.get('origin') + dag_id = request.form.get('dag_id') + task_id = request.form.get('task_id') + origin = request.form.get('origin') + dag = dagbag.get_dag(dag_id) task = dag.get_task(task_id) - execution_date = request.args.get('execution_date') + execution_date = request.form.get('execution_date') execution_date = pendulum.parse(execution_date) - ignore_all_deps = request.args.get('ignore_all_deps') == "true" - ignore_task_deps = request.args.get('ignore_task_deps') == "true" - ignore_ti_state = request.args.get('ignore_ti_state') == "true" + ignore_all_deps = request.form.get('ignore_all_deps') == "true" + ignore_task_deps = request.form.get('ignore_task_deps') == "true" + ignore_ti_state = request.form.get('ignore_ti_state') == "true" + + from airflow.executors import GetDefaultExecutor + executor = GetDefaultExecutor() + valid_celery_config = False + valid_kubernetes_config = False try: - from airflow.executors import GetDefaultExecutor from airflow.executors.celery_executor import CeleryExecutor - executor = GetDefaultExecutor() - if not isinstance(executor, CeleryExecutor): - flash("Only works with the CeleryExecutor, sorry", "error") - return redirect(origin) + valid_celery_config = isinstance(executor, CeleryExecutor) + except ImportError: + pass + + try: + from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor + valid_kubernetes_config = isinstance(executor, KubernetesExecutor) except ImportError: - # in case CeleryExecutor cannot be imported it is not active either - flash("Only works with the CeleryExecutor, sorry", "error") + pass + + if not valid_celery_config and not valid_kubernetes_config: + flash("Only works with the Celery or Kubernetes executors, sorry", "error") return redirect(origin) ti = models.TaskInstance(task=task, execution_date=execution_date) @@ -1043,15 +1081,41 @@ def run(self): "it should start any moment now.".format(ti)) return redirect(origin) - @expose('/trigger') + @expose('/delete', methods=['POST']) @login_required @wwwutils.action_logging @wwwutils.notify_owner - def trigger(self): - dag_id = request.args.get('dag_id') - origin = request.args.get('origin') or "/admin/" - dag = dagbag.get_dag(dag_id) + def delete(self): + from airflow.api.common.experimental import delete_dag + from airflow.exceptions import DagNotFound, DagFileExists + + dag_id = request.values.get('dag_id') + origin = request.values.get('origin') or "/admin/" + + try: + delete_dag.delete_dag(dag_id) + except DagNotFound: + flash("DAG with id {} not found. Cannot delete".format(dag_id)) + return redirect(request.referrer) + except DagFileExists: + flash("Dag id {} is still in DagBag. " + "Remove the DAG file first.".format(dag_id)) + return redirect(request.referrer) + + flash("Deleting DAG with id {}. 
May take a couple minutes to fully" + " disappear.".format(dag_id)) + # Upon successful delete return to origin + return redirect(origin) + @expose('/trigger', methods=['POST']) + @login_required + @wwwutils.action_logging + @wwwutils.notify_owner + @provide_session + def trigger(self, session=None): + dag_id = request.values.get('dag_id') + origin = request.values.get('origin') or "/admin/" + dag = session.query(models.DagModel).filter(models.DagModel.dag_id == dag_id).first() if not dag: flash("Cannot find dag {}".format(dag_id)) return redirect(origin) @@ -1085,7 +1149,9 @@ def _clear_dag_tis(self, dag, start_date, end_date, origin, count = dag.clear( start_date=start_date, end_date=end_date, - include_subdags=recursive) + include_subdags=recursive, + include_parentdag=recursive, + ) flash("{0} task instances have been cleared".format(count)) return redirect(origin) @@ -1094,7 +1160,9 @@ def _clear_dag_tis(self, dag, start_date, end_date, origin, start_date=start_date, end_date=end_date, include_subdags=recursive, - dry_run=True) + dry_run=True, + include_parentdag=recursive, + ) if not tis: flash("No task instances to clear", 'error') response = redirect(origin) @@ -1109,24 +1177,24 @@ def _clear_dag_tis(self, dag, start_date, end_date, origin, return response - @expose('/clear') + @expose('/clear', methods=['POST']) @login_required @wwwutils.action_logging @wwwutils.notify_owner def clear(self): - dag_id = request.args.get('dag_id') - task_id = request.args.get('task_id') - origin = request.args.get('origin') + dag_id = request.form.get('dag_id') + task_id = request.form.get('task_id') + origin = request.form.get('origin') dag = dagbag.get_dag(dag_id) - execution_date = request.args.get('execution_date') + execution_date = request.form.get('execution_date') execution_date = pendulum.parse(execution_date) - confirmed = request.args.get('confirmed') == "true" - upstream = request.args.get('upstream') == "true" - downstream = request.args.get('downstream') == "true" - future = request.args.get('future') == "true" - past = request.args.get('past') == "true" - recursive = request.args.get('recursive') == "true" + confirmed = request.form.get('confirmed') == "true" + upstream = request.form.get('upstream') == "true" + downstream = request.form.get('downstream') == "true" + future = request.form.get('future') == "true" + past = request.form.get('past') == "true" + recursive = request.form.get('recursive') == "true" dag = dag.sub_dag( task_regex=r"^{0}$".format(task_id), @@ -1139,16 +1207,15 @@ def clear(self): return self._clear_dag_tis(dag, start_date, end_date, origin, recursive=recursive, confirmed=confirmed) - @expose('/dagrun_clear') + @expose('/dagrun_clear', methods=['POST']) @login_required @wwwutils.action_logging @wwwutils.notify_owner def dagrun_clear(self): - dag_id = request.args.get('dag_id') - task_id = request.args.get('task_id') - origin = request.args.get('origin') - execution_date = request.args.get('execution_date') - confirmed = request.args.get('confirmed') == "true" + dag_id = request.form.get('dag_id') + origin = request.form.get('origin') + execution_date = request.form.get('execution_date') + confirmed = request.form.get('confirmed') == "true" dag = dagbag.get_dag(dag_id) execution_date = pendulum.parse(execution_date) @@ -1163,12 +1230,12 @@ def dagrun_clear(self): @provide_session def blocked(self, session=None): DR = models.DagRun - dags = ( - session.query(DR.dag_id, sqla.func.count(DR.id)) - .filter(DR.state == State.RUNNING) - .group_by(DR.dag_id) - .all() - ) 
+ dags = session\ + .query(DR.dag_id, sqla.func.count(DR.id))\ + .filter(DR.state == State.RUNNING)\ + .group_by(DR.dag_id)\ + .all() + payload = [] for dag_id, active_dag_runs in dags: max_active_runs = 0 @@ -1181,16 +1248,35 @@ def blocked(self, session=None): }) return wwwutils.json_response(payload) - @expose('/dagrun_success') - @login_required - @wwwutils.action_logging - @wwwutils.notify_owner - def dagrun_success(self): - dag_id = request.args.get('dag_id') - execution_date = request.args.get('execution_date') - confirmed = request.args.get('confirmed') == 'true' - origin = request.args.get('origin') + def _mark_dagrun_state_as_failed(self, dag_id, execution_date, confirmed, origin): + if not execution_date: + flash('Invalid execution date', 'error') + return redirect(origin) + + execution_date = pendulum.parse(execution_date) + dag = dagbag.get_dag(dag_id) + + if not dag: + flash('Cannot find DAG: {}'.format(dag_id), 'error') + return redirect(origin) + + new_dag_state = set_dag_run_state_to_failed(dag, execution_date, commit=confirmed) + + if confirmed: + flash('Marked failed on {} task instances'.format(len(new_dag_state))) + return redirect(origin) + + else: + details = '\n'.join([str(t) for t in new_dag_state]) + + response = self.render('airflow/confirm.html', + message=("Here's the list of task instances you are " + "about to mark as failed"), + details=details) + + return response + def _mark_dagrun_state_as_success(self, dag_id, execution_date, confirmed, origin): if not execution_date: flash('Invalid execution date', 'error') return redirect(origin) @@ -1202,8 +1288,8 @@ def dagrun_success(self): flash('Cannot find DAG: {}'.format(dag_id), 'error') return redirect(origin) - new_dag_state = set_dag_run_state(dag, execution_date, state=State.SUCCESS, - commit=confirmed) + new_dag_state = set_dag_run_state_to_success(dag, execution_date, + commit=confirmed) if confirmed: flash('Marked success on {} task instances'.format(len(new_dag_state))) @@ -1214,30 +1300,43 @@ def dagrun_success(self): response = self.render('airflow/confirm.html', message=("Here's the list of task instances you are " - "about to mark as successful:"), + "about to mark as success"), details=details) return response - @expose('/success') + @expose('/dagrun_failed', methods=['POST']) @login_required @wwwutils.action_logging @wwwutils.notify_owner - def success(self): - dag_id = request.args.get('dag_id') - task_id = request.args.get('task_id') - origin = request.args.get('origin') + def dagrun_failed(self): + dag_id = request.form.get('dag_id') + execution_date = request.form.get('execution_date') + confirmed = request.form.get('confirmed') == 'true' + origin = request.form.get('origin') + return self._mark_dagrun_state_as_failed(dag_id, execution_date, + confirmed, origin) + + @expose('/dagrun_success', methods=['POST']) + @login_required + @wwwutils.action_logging + @wwwutils.notify_owner + def dagrun_success(self): + dag_id = request.form.get('dag_id') + execution_date = request.form.get('execution_date') + confirmed = request.form.get('confirmed') == 'true' + origin = request.form.get('origin') + return self._mark_dagrun_state_as_success(dag_id, execution_date, + confirmed, origin) + + def _mark_task_instance_state(self, dag_id, task_id, origin, execution_date, + confirmed, upstream, downstream, + future, past, state): dag = dagbag.get_dag(dag_id) task = dag.get_task(task_id) task.dag = dag - execution_date = request.args.get('execution_date') execution_date = pendulum.parse(execution_date) - 
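The _mark_dagrun_state_as_failed / _mark_dagrun_state_as_success helpers above delegate to the experimental mark_tasks API. A short sketch of driving the same helpers directly, assuming "example_dag" is loadable from the configured DAGs folder:

import pendulum
from airflow.api.common.experimental.mark_tasks import (
    set_dag_run_state_to_failed,
    set_dag_run_state_to_success,
)
from airflow.models import DagBag

dag = DagBag().get_dag("example_dag")
execution_date = pendulum.parse("2019-01-01T00:00:00+00:00")

# commit=False only reports which task instances would change (the data behind
# the confirmation page); commit=True applies the new state.
would_change = set_dag_run_state_to_failed(dag, execution_date, commit=False)
print("{} task instances would be marked failed".format(len(would_change)))

altered = set_dag_run_state_to_success(dag, execution_date, commit=True)
print("Marked success on {} task instances".format(len(altered)))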
confirmed = request.args.get('confirmed') == "true" - upstream = request.args.get('upstream') == "true" - downstream = request.args.get('downstream') == "true" - future = request.args.get('future') == "true" - past = request.args.get('past') == "true" if not dag: flash("Cannot find DAG: {}".format(dag_id)) @@ -1252,26 +1351,66 @@ def success(self): if confirmed: altered = set_state(task=task, execution_date=execution_date, upstream=upstream, downstream=downstream, - future=future, past=past, state=State.SUCCESS, + future=future, past=past, state=state, commit=True) - flash("Marked success on {} task instances".format(len(altered))) + flash("Marked {} on {} task instances".format(state, len(altered))) return redirect(origin) to_be_altered = set_state(task=task, execution_date=execution_date, upstream=upstream, downstream=downstream, - future=future, past=past, state=State.SUCCESS, + future=future, past=past, state=state, commit=False) details = "\n".join([str(t) for t in to_be_altered]) response = self.render("airflow/confirm.html", message=("Here's the list of task instances you are " - "about to mark as successful:"), + "about to mark as {}:".format(state)), details=details) return response + @expose('/failed', methods=['POST']) + @login_required + @wwwutils.action_logging + @wwwutils.notify_owner + def failed(self): + dag_id = request.form.get('dag_id') + task_id = request.form.get('task_id') + origin = request.form.get('origin') + execution_date = request.form.get('execution_date') + + confirmed = request.form.get('confirmed') == "true" + upstream = request.form.get('upstream') == "true" + downstream = request.form.get('downstream') == "true" + future = request.form.get('future') == "true" + past = request.form.get('past') == "true" + + return self._mark_task_instance_state(dag_id, task_id, origin, execution_date, + confirmed, upstream, downstream, + future, past, State.FAILED) + + @expose('/success', methods=['POST']) + @login_required + @wwwutils.action_logging + @wwwutils.notify_owner + def success(self): + dag_id = request.form.get('dag_id') + task_id = request.form.get('task_id') + origin = request.form.get('origin') + execution_date = request.form.get('execution_date') + + confirmed = request.form.get('confirmed') == "true" + upstream = request.form.get('upstream') == "true" + downstream = request.form.get('downstream') == "true" + future = request.form.get('future') == "true" + past = request.form.get('past') == "true" + + return self._mark_task_instance_state(dag_id, task_id, origin, execution_date, + confirmed, upstream, downstream, + future, past, State.SUCCESS) + @expose('/tree') @login_required @wwwutils.gzipped @@ -1282,6 +1421,10 @@ def tree(self, session=None): dag_id = request.args.get('dag_id') blur = conf.getboolean('webserver', 'demo_mode') dag = dagbag.get_dag(dag_id) + if dag_id not in dagbag.dags: + flash('DAG "{0}" seems to be missing.'.format(dag_id), "error") + return redirect('/admin/') + root = request.args.get('root') if root: dag = dag.sub_dag( @@ -1349,8 +1492,8 @@ def recurse_nodes(task, visited): children_key = "_children" def set_duration(tid): - if (isinstance(tid, dict) and tid.get("state") == State.RUNNING and - tid["start_date"] is not None): + if isinstance(tid, dict) and tid.get("state") == State.RUNNING \ + and tid["start_date"] is not None: d = timezone.utcnow() - pendulum.parse(tid["start_date"]) tid["duration"] = d.total_seconds() return tid @@ -1377,12 +1520,9 @@ def set_duration(tid): data = { 'name': '[DAG]', 'children': [recurse_nodes(t, 
set()) for t in dag.roots], - 'instances': [ - dag_runs.get(d) or {'execution_date': d.isoformat()} - for d in dates], + 'instances': [dag_runs.get(d) or {'execution_date': d.isoformat()} for d in dates], } - data = json.dumps(data, indent=4, default=json_ser) session.commit() form = DateTimeWithNumRunsForm(data={'base_date': max_date, @@ -1489,10 +1629,10 @@ class GraphForm(DateTimeWithNumRunsWithDagRunsForm): ), blur=blur, root=root or '', - task_instances=json.dumps(task_instances, indent=2), - tasks=json.dumps(tasks, indent=2), - nodes=json.dumps(nodes, indent=2), - edges=json.dumps(edges, indent=2), ) + task_instances=task_instances, + tasks=tasks, + nodes=nodes, + edges=edges) @expose('/duration') @login_required @@ -1506,6 +1646,10 @@ def duration(self, session=None): num_runs = request.args.get('num_runs') num_runs = int(num_runs) if num_runs else default_dag_run + if dag is None: + flash('DAG "{0}" seems to be missing.'.format(dag_id), "error") + return redirect('/admin/') + if base_date: base_date = pendulum.parse(base_date) else: @@ -1536,13 +1680,13 @@ def duration(self, session=None): TF = models.TaskFail ti_fails = ( session - .query(TF) - .filter( + .query(TF) + .filter( TF.dag_id == dag.dag_id, TF.execution_date >= min_date, TF.execution_date <= base_date, TF.task_id.in_([t.task_id for t in dag.tasks])) - .all() + .all() ) fails_totals = defaultdict(int) @@ -1750,10 +1894,10 @@ def landing_times(self, session=None): @provide_session def paused(self, session=None): DagModel = models.DagModel - dag_id = request.args.get('dag_id') + dag_id = request.values.get('dag_id') orm_dag = session.query( DagModel).filter(DagModel.dag_id == dag_id).first() - if request.args.get('is_paused') == 'false': + if request.values.get('is_paused') == 'false': orm_dag.is_paused = True else: orm_dag.is_paused = False @@ -1763,30 +1907,29 @@ def paused(self, session=None): dagbag.get_dag(dag_id) return "OK" - @expose('/refresh') + @expose('/refresh', methods=['POST']) @login_required @wwwutils.action_logging @provide_session def refresh(self, session=None): - DagModel = models.DagModel - dag_id = request.args.get('dag_id') - orm_dag = session.query( - DagModel).filter(DagModel.dag_id == dag_id).first() + # TODO: Is this method still needed after AIRFLOW-3561? + dm = models.DagModel + dag_id = request.values.get('dag_id') + orm_dag = session.query(dm).filter(dm.dag_id == dag_id).first() if orm_dag: orm_dag.last_expired = timezone.utcnow() session.merge(orm_dag) session.commit() - dagbag.get_dag(dag_id) flash("DAG [{}] is now fresh as a daisy".format(dag_id)) return redirect(request.referrer) - @expose('/refresh_all') + @expose('/refresh_all', methods=['POST']) @login_required @wwwutils.action_logging def refresh_all(self): - dagbag.collect_dags(only_if_updated=False) + # TODO: Is this method still needed after AIRFLOW-3561? 
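The /paused handler above switches from request.args to request.values, which merges the query string with any form body, so the toggle works whether it arrives as a GET-style parameter or as POST form data. A standalone Flask sketch of that behaviour (route and names are illustrative, not Airflow's):

from flask import Flask, request

app = Flask(__name__)


@app.route("/paused", methods=["GET", "POST"])
def paused():
    # request.values = query string merged with form body
    dag_id = request.values.get("dag_id")
    # same inverted flag as the view above: is_paused=false means "pause it now"
    pause_it = request.values.get("is_paused") == "false"
    return "OK: {} -> is_paused={}".format(dag_id, pause_it)


if __name__ == "__main__":
    # curl "http://localhost:5000/paused?dag_id=example&is_paused=false"
    # curl -X POST -d "dag_id=example&is_paused=false" http://localhost:5000/paused
    app.run()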
flash("All DAGs are now up to date") return redirect('/') @@ -1816,19 +1959,40 @@ def gantt(self, session=None): ti for ti in dag.get_task_instances(session, dttm, dttm) if ti.start_date] tis = sorted(tis, key=lambda ti: ti.start_date) + TF = models.TaskFail + ti_fails = list(itertools.chain(*[( + session + .query(TF) + .filter(TF.dag_id == ti.dag_id, + TF.task_id == ti.task_id, + TF.execution_date == ti.execution_date) + .all() + ) for ti in tis])) - tasks = [] + # determine bars to show in the gantt chart + gantt_bar_items = [] for ti in tis: - end_date = ti.end_date if ti.end_date else timezone.utcnow() + end_date = ti.end_date or timezone.utcnow() + gantt_bar_items.append((ti.task_id, ti.start_date, end_date, ti.state)) + for tf in ti_fails: + end_date = tf.end_date or timezone.utcnow() + gantt_bar_items.append((tf.task_id, tf.start_date, end_date, State.FAILED)) + + tasks = [] + for gantt_bar_item in gantt_bar_items: + task_id = gantt_bar_item[0] + start_date = gantt_bar_item[1] + end_date = gantt_bar_item[2] + state = gantt_bar_item[3] tasks.append({ - 'startDate': wwwutils.epoch(ti.start_date), + 'startDate': wwwutils.epoch(start_date), 'endDate': wwwutils.epoch(end_date), - 'isoStart': ti.start_date.isoformat()[:-4], + 'isoStart': start_date.isoformat()[:-4], 'isoEnd': end_date.isoformat()[:-4], - 'taskName': ti.task_id, - 'duration': "{}".format(end_date - ti.start_date)[:-4], - 'status': ti.state, - 'executionDate': ti.execution_date.isoformat(), + 'taskName': task_id, + 'duration': "{}".format(end_date - start_date)[:-4], + 'status': state, + 'executionDate': dttm.isoformat(), }) states = {ti.state: ti.state for ti in tis} data = { @@ -1845,7 +2009,7 @@ def gantt(self, session=None): dag=dag, execution_date=dttm.isoformat(), form=form, - data=json.dumps(data, indent=2), + data=data, base_date='', demo_mode=demo_mode, root=root, @@ -1863,7 +2027,7 @@ def task_instances(self, session=None): if dttm: dttm = pendulum.parse(dttm) else: - return ("Error: Invalid execution_date") + return "Error: Invalid execution_date" task_instances = { ti.task_id: alchemy_to_dict(ti) @@ -1888,13 +2052,13 @@ def variables(self, form): return self.render( 'airflow/variables/{}.html'.format(form) ) - except: + except Exception: # prevent XSS form = escape(form) return ("Error: form airflow/variables/{}.html " "not found.").format(form), 404 - @expose('/varimport', methods=["GET", "POST"]) + @expose('/varimport', methods=['POST']) @login_required @wwwutils.action_logging def varimport(self): @@ -1903,9 +2067,20 @@ def varimport(self): except Exception as e: flash("Missing file or syntax error: {}.".format(e)) else: + suc_count = fail_count = 0 for k, v in d.items(): - models.Variable.set(k, v, serialize_json=isinstance(v, dict)) - flash("{} variable(s) successfully updated.".format(len(d))) + try: + models.Variable.set(k, v, serialize_json=isinstance(v, dict)) + except Exception as e: + logging.info('Variable import failed: {}'.format(repr(e))) + fail_count += 1 + else: + suc_count += 1 + flash("{} variable(s) successfully updated.".format(suc_count), 'info') + if fail_count: + flash( + "{} variables(s) failed to be updated.".format(fail_count), 'error') + return redirect('/admin/variable') @@ -1944,108 +2119,63 @@ def get_int_arg(value, default=0): hide_paused = hide_paused_dags_by_default # read orm_dags from the db - sql_query = session.query(DM) + query = session.query(DM) if do_filter and owner_mode == 'ldapgroup': - sql_query = sql_query.filter( + query = query.filter( ~DM.is_subdag, DM.is_active, 
DM.owners.in_(current_user.ldap_groups) ) elif do_filter and owner_mode == 'user': - sql_query = sql_query.filter( + query = query.filter( ~DM.is_subdag, DM.is_active, DM.owners == current_user.user.username ) else: - sql_query = sql_query.filter( + query = query.filter( ~DM.is_subdag, DM.is_active ) # optionally filter out "paused" dags if hide_paused: - sql_query = sql_query.filter(~DM.is_paused) + query = query.filter(~DM.is_paused) + + if arg_search_query: + query = query.filter(sqla.func.lower(DM.dag_id) == arg_search_query.lower()) + + query = query.order_by(DM.dag_id) + + start = current_page * dags_per_page + end = start + dags_per_page - orm_dags = {dag.dag_id: dag for dag - in sql_query - .all()} + dags = query.offset(start).limit(dags_per_page).all() - import_errors = session.query(models.ImportError).all() + import_errors = session.query(errors.ImportError).all() for ie in import_errors: flash( "Broken DAG: [{ie.filename}] {ie.stacktrace}".format(ie=ie), "error") - # get a list of all non-subdag dags visible to everyone - # optionally filter out "paused" dags - if hide_paused: - unfiltered_webserver_dags = [dag for dag in dagbag.dags.values() if - not dag.parent_dag and not dag.is_paused] - - else: - unfiltered_webserver_dags = [dag for dag in dagbag.dags.values() if - not dag.parent_dag] - - # optionally filter to get only dags that the user should see - if do_filter and owner_mode == 'ldapgroup': - # only show dags owned by someone in @current_user.ldap_groups - webserver_dags = { - dag.dag_id: dag - for dag in unfiltered_webserver_dags - if dag.owner in current_user.ldap_groups - } - elif do_filter and owner_mode == 'user': - # only show dags owned by @current_user.user.username - webserver_dags = { - dag.dag_id: dag - for dag in unfiltered_webserver_dags - if dag.owner == current_user.user.username - } - else: - webserver_dags = { - dag.dag_id: dag - for dag in unfiltered_webserver_dags - } - - if arg_search_query: - lower_search_query = arg_search_query.lower() - # filter by dag_id - webserver_dags_filtered = { - dag_id: dag - for dag_id, dag in webserver_dags.items() - if (lower_search_query in dag_id.lower() or - lower_search_query in dag.owner.lower()) - } - - all_dag_ids = (set([dag.dag_id for dag in orm_dags.values() - if lower_search_query in dag.dag_id.lower() or - lower_search_query in dag.owners.lower()]) | - set(webserver_dags_filtered.keys())) - - sorted_dag_ids = sorted(all_dag_ids) - else: - webserver_dags_filtered = webserver_dags - sorted_dag_ids = sorted(set(orm_dags.keys()) | set(webserver_dags.keys())) - - start = current_page * dags_per_page - end = start + dags_per_page + from airflow.plugins_manager import import_errors as plugin_import_errors + for filename, stacktrace in plugin_import_errors.items(): + flash( + "Broken plugin: [{filename}] {stacktrace}".format( + stacktrace=stacktrace, + filename=filename), + "error") - num_of_all_dags = len(sorted_dag_ids) - page_dag_ids = sorted_dag_ids[start:end] + num_of_all_dags = query.count() num_of_pages = int(math.ceil(num_of_all_dags / float(dags_per_page))) auto_complete_data = set() - for dag in webserver_dags_filtered.values(): - auto_complete_data.add(dag.dag_id) - auto_complete_data.add(dag.owner) - for dag in orm_dags.values(): - auto_complete_data.add(dag.dag_id) - auto_complete_data.add(dag.owners) + for row in query.with_entities(DM.dag_id, DM.owners): + auto_complete_data.add(row.dag_id) + auto_complete_data.add(row.owners) return self.render( 'airflow/dags.html', - 
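The DAG listing above now searches, orders and paginates in the database via offset/limit instead of materialising every DAG in Python first. A self-contained SQLAlchemy sketch of the same pattern against a throwaway in-memory table (the model and data here are illustrative):

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class DagModel(Base):
    __tablename__ = "dag"
    dag_id = Column(String(250), primary_key=True)
    owners = Column(String(250))
    is_paused = Column(Integer, default=0)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([DagModel(dag_id="dag_%02d" % i, owners="airflow") for i in range(25)])
session.commit()

dags_per_page, current_page = 10, 0          # current_page is zero-indexed, as above
query = session.query(DagModel).order_by(DagModel.dag_id)
num_of_all_dags = query.count()              # total before slicing, used for the pager
page = query.offset(current_page * dags_per_page).limit(dags_per_page).all()
print(num_of_all_dags, [d.dag_id for d in page])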
webserver_dags=webserver_dags_filtered, - orm_dags=orm_dags, + dags=dags, hide_paused=hide_paused, current_page=current_page, search_query=arg_search_query if arg_search_query else '', @@ -2057,7 +2187,6 @@ def get_int_arg(value, default=0): paging=wwwutils.generate_pages(current_page, num_of_pages, search=arg_search_query, showPaused=not hide_paused), - dag_ids_in_page=page_dag_ids, auto_complete_data=auto_complete_data) @@ -2066,8 +2195,7 @@ class QueryView(wwwutils.DataProfilingMixin, BaseView): @wwwutils.gzipped @provide_session def query(self, session=None): - dbs = session.query(models.Connection).order_by( - models.Connection.conn_id).all() + dbs = session.query(Connection).order_by(Connection.conn_id).all() session.expunge_all() db_choices = list( ((db.conn_id, db.conn_id) for db in dbs if db.get_hook())) @@ -2086,7 +2214,7 @@ class QueryForm(Form): results = None has_data = False error = False - if conn_id_str: + if conn_id_str and request.method == 'POST': db = [db for db in dbs if db.conn_id == conn_id_str][0] hook = db.get_hook() try: @@ -2183,13 +2311,10 @@ class SlaMissModelView(wwwutils.SuperUserMixin, ModelViewOnly): @provide_session def _connection_ids(session=None): - return [ - (c.conn_id, c.conn_id) - for c in ( - session.query(models.Connection.conn_id) - .group_by(models.Connection.conn_id) - ) - ] + return [(c.conn_id, c.conn_id) for c in ( + session + .query(Connection.conn_id) + .group_by(Connection.conn_id))] class ChartModelView(wwwutils.DataProfilingMixin, AirflowModelView): @@ -2296,7 +2421,7 @@ def on_model_change(self, form, model, is_created=True): model.last_modified = timezone.utcnow() -chart_mapping = ( +chart_mapping = dict(( ('line', 'lineChart'), ('spline', 'lineChart'), ('bar', 'multiBarChart'), @@ -2305,8 +2430,7 @@ def on_model_change(self, form, model, is_created=True): ('stacked_area', 'stackedAreaChart'), ('percent_area', 'stackedAreaChart'), ('datatable', 'datatable'), -) -chart_mapping = dict(chart_mapping) +)) class KnownEventView(wwwutils.DataProfilingMixin, AirflowModelView): @@ -2411,7 +2535,7 @@ def hidden_field_formatter(view, context, model, name): ) column_list = ('key', 'val', 'is_encrypted',) column_filters = ('key', 'val') - column_searchable_list = ('key', 'val') + column_searchable_list = ('key', 'val', 'is_encrypted',) column_default_sort = ('key', False) form_widget_args = { 'is_encrypted': {'disabled': True}, @@ -2448,7 +2572,7 @@ def action_varexport(self, ids, session=None): val = None try: val = d.decode(var.val) - except: + except Exception: val = var.val var_dict[var.key] = val @@ -2522,9 +2646,18 @@ class DagRunModelView(ModelViewOnly): ('failed', 'failed'), ], } - form_args = dict( - dag_id=dict(validators=[validators.DataRequired()]) - ) + form_args = { + 'dag_id': { + 'validators': [ + validators.DataRequired(), + ] + }, + 'execution_date': { + 'filters': [ + parse_datetime_f, + ] + } + } column_list = ( 'state', 'dag_id', 'execution_date', 'run_id', 'external_trigger') column_filters = column_list @@ -2537,6 +2670,7 @@ class DagRunModelView(ModelViewOnly): dag_id=dag_link, run_id=dag_run_link ) + form_overrides = dict(execution_date=DateTimeField) @action('new_delete', "Delete", "Are you sure you want to delete selected records?") @provide_session @@ -2551,43 +2685,106 @@ def action_new_delete(self, ids, session=None): dirty_ids = [] for row in deleted: dirty_ids.append(row.dag_id) - models.DagStat.update(dirty_ids, dirty_only=False, session=session) @action('set_running', "Set state to 'running'", None) - def 
action_set_running(self, ids): - self.set_dagrun_state(ids, State.RUNNING) - - @action('set_failed', "Set state to 'failed'", None) - def action_set_failed(self, ids): - self.set_dagrun_state(ids, State.FAILED) + @provide_session + def action_set_running(self, ids, session=None): + try: + DR = models.DagRun + count = 0 + dirty_ids = [] + for dr in session.query(DR).filter(DR.id.in_(ids)).all(): + dirty_ids.append(dr.dag_id) + count += 1 + dr.state = State.RUNNING + dr.start_date = timezone.utcnow() + flash( + "{count} dag runs were set to running".format(**locals())) + except Exception as ex: + if not self.handle_view_exception(ex): + raise Exception("Ooops") + flash('Failed to set state', 'error') - @action('set_success', "Set state to 'success'", None) - def action_set_success(self, ids): - self.set_dagrun_state(ids, State.SUCCESS) + @action('set_failed', "Set state to 'failed'", + "All running task instances would also be marked as failed, are you sure?") + @provide_session + def action_set_failed(self, ids, session=None): + try: + DR = models.DagRun + count = 0 + dirty_ids = [] + altered_tis = [] + for dr in session.query(DR).filter(DR.id.in_(ids)).all(): + dirty_ids.append(dr.dag_id) + count += 1 + altered_tis += \ + set_dag_run_state_to_failed(dagbag.get_dag(dr.dag_id), + dr.execution_date, + commit=True, + session=session) + altered_ti_count = len(altered_tis) + flash( + "{count} dag runs and {altered_ti_count} task instances " + "were set to failed".format(**locals())) + except Exception as ex: + if not self.handle_view_exception(ex): + raise Exception("Ooops") + flash('Failed to set state', 'error') + @action('set_success', "Set state to 'success'", + "All task instances would also be marked as success, are you sure?") @provide_session - def set_dagrun_state(self, ids, target_state, session=None): + def action_set_success(self, ids, session=None): try: DR = models.DagRun count = 0 dirty_ids = [] + altered_tis = [] for dr in session.query(DR).filter(DR.id.in_(ids)).all(): dirty_ids.append(dr.dag_id) count += 1 - dr.state = target_state - if target_state == State.RUNNING: - dr.start_date = timezone.utcnow() - else: - dr.end_date = timezone.utcnow() - session.commit() - models.DagStat.update(dirty_ids, session=session) + altered_tis += \ + set_dag_run_state_to_success(dagbag.get_dag(dr.dag_id), + dr.execution_date, + commit=True, + session=session) + altered_ti_count = len(altered_tis) flash( - "{count} dag runs were set to '{target_state}'".format(**locals())) + "{count} dag runs and {altered_ti_count} task instances " + "were set to success".format(**locals())) except Exception as ex: if not self.handle_view_exception(ex): raise Exception("Ooops") flash('Failed to set state', 'error') + # Called after editing DagRun model in the UI. 
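The set_failed / set_success actions above follow one pattern: load the selected DagRun rows in a single session, then let the mark_tasks helpers cascade the state change down to task instances. A condensed sketch of that pattern; selected_ids stands in for the ids Flask-Admin passes to the action:

from airflow import models
from airflow.api.common.experimental.mark_tasks import set_dag_run_state_to_failed
from airflow.models import DagBag
from airflow.utils.db import create_session

selected_ids = [1, 2, 3]   # illustrative primary keys of the selected dag_run rows
dagbag = DagBag()

with create_session() as session:
    DR = models.DagRun
    altered_tis = []
    for dr in session.query(DR).filter(DR.id.in_(selected_ids)).all():
        altered_tis += set_dag_run_state_to_failed(dagbag.get_dag(dr.dag_id),
                                                   dr.execution_date,
                                                   commit=True,
                                                   session=session)
    print("{} task instances were set to failed".format(len(altered_tis)))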
+ @provide_session + def after_model_change(self, form, dagrun, is_created, session=None): + altered_tis = [] + if dagrun.state == State.SUCCESS: + altered_tis = set_dag_run_state_to_success( + dagbag.get_dag(dagrun.dag_id), + dagrun.execution_date, + commit=True, + session=session) + elif dagrun.state == State.FAILED: + altered_tis = set_dag_run_state_to_failed( + dagbag.get_dag(dagrun.dag_id), + dagrun.execution_date, + commit=True, + session=session) + elif dagrun.state == State.RUNNING: + altered_tis = set_dag_run_state_to_running( + dagbag.get_dag(dagrun.dag_id), + dagrun.execution_date, + commit=True, + session=session) + + altered_ti_count = len(altered_tis) + flash( + "1 dag run and {altered_ti_count} task instances " + "were set to '{dagrun.state}'".format(**locals())) + class LogModelView(ModelViewOnly): verbose_name_plural = "logs" @@ -2771,7 +2968,7 @@ class ConnectionModelView(wwwutils.SuperUserMixin, AirflowModelView): 'extra__google_cloud_platform__scope': StringField('Scopes (comma separated)'), } form_choices = { - 'conn_type': models.Connection._types + 'conn_type': Connection._types } def on_model_change(self, form, model, is_created): @@ -2787,7 +2984,7 @@ def alert_fernet_key(cls): fk = None try: fk = conf.get('core', 'fernet_key') - except: + except Exception: pass return fk is None @@ -2799,10 +2996,10 @@ def is_secure(cls): """ is_secure = False try: - import cryptography + import cryptography # noqa F401 conf.get('core', 'fernet_key') is_secure = True - except: + except Exception: pass return is_secure @@ -2830,7 +3027,7 @@ class VersionView(wwwutils.SuperUserMixin, BaseView): def version(self): # Look at the version from setup.py try: - airflow_version = pkg_resources.require("apache-airflow")[0].version + airflow_version = airflow.__version__ except Exception as e: airflow_version = None logging.error(e) @@ -2921,20 +3118,16 @@ def get_query(self): """ Default filters for model """ - return ( - super(DagModelView, self) - .get_query() - .filter(or_(models.DagModel.is_active, models.DagModel.is_paused)) - .filter(~models.DagModel.is_subdag) - ) + return super(DagModelView, self)\ + .get_query()\ + .filter(or_(models.DagModel.is_active, models.DagModel.is_paused))\ + .filter(~models.DagModel.is_subdag) def get_count_query(self): """ Default filters for model """ - return ( - super(DagModelView, self) - .get_count_query() - .filter(models.DagModel.is_active) - .filter(~models.DagModel.is_subdag) - ) + return super(DagModelView, self)\ + .get_count_query()\ + .filter(models.DagModel.is_active)\ + .filter(~models.DagModel.is_subdag) diff --git a/airflow/www_rbac/.eslintignore b/airflow/www_rbac/.eslintignore new file mode 100644 index 0000000000000..924c9fc72b57d --- /dev/null +++ b/airflow/www_rbac/.eslintignore @@ -0,0 +1,8 @@ +**/*{.,-}min.js +**/*.sh +**/*.py +gantt-chart-d3v2.js +jqClock.min.js +coverage/** +static/dist/* +static/docs/* diff --git a/airflow/www_rbac/.eslintrc b/airflow/www_rbac/.eslintrc new file mode 100644 index 0000000000000..e566b6e0fb971 --- /dev/null +++ b/airflow/www_rbac/.eslintrc @@ -0,0 +1,5 @@ +{ + "extends": "airbnb-base", + "parser": "babel-eslint", + "plugins": [ "html" ] +} diff --git a/airflow/www_rbac/__init__.py b/airflow/www_rbac/__init__.py index 4067cc78ee9a2..114d189da14ab 100644 --- a/airflow/www_rbac/__init__.py +++ b/airflow/www_rbac/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www_rbac/api/__init__.py b/airflow/www_rbac/api/__init__.py index db5ba598d7c23..b7f8352944d3f 100644 --- a/airflow/www_rbac/api/__init__.py +++ b/airflow/www_rbac/api/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www_rbac/api/experimental/__init__.py b/airflow/www_rbac/api/experimental/__init__.py index db5ba598d7c23..b7f8352944d3f 100644 --- a/airflow/www_rbac/api/experimental/__init__.py +++ b/airflow/www_rbac/api/experimental/__init__.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/airflow/www_rbac/api/experimental/endpoints.py b/airflow/www_rbac/api/experimental/endpoints.py index 8663bad55d631..637d3ef53d55a 100644 --- a/airflow/www_rbac/api/experimental/endpoints.py +++ b/airflow/www_rbac/api/experimental/endpoints.py @@ -23,6 +23,7 @@ from airflow.api.common.experimental.get_dag_runs import get_dag_runs from airflow.api.common.experimental.get_task import get_task from airflow.api.common.experimental.get_task_instance import get_task_instance +from airflow.api.common.experimental.get_code import get_code from airflow.api.common.experimental.get_dag_run_state import get_dag_run_state from airflow.exceptions import AirflowException from airflow.utils.log.logging_mixin import LoggingMixin @@ -119,6 +120,19 @@ def test(): return jsonify(status='OK') +@api_experimental.route('/dags//code', methods=['GET']) +@requires_authentication +def get_dag_code(dag_id): + """Return python code of a given dag_id.""" + try: + return get_code(dag_id) + except AirflowException as err: + _log.info(err) + response = jsonify(error="{}".format(err)) + response.status_code = err.status_code + return response + + @api_experimental.route('/dags//tasks/', methods=['GET']) @requires_authentication def task_info(dag_id, task_id): @@ -237,6 +251,7 @@ def dag_run_status(dag_id, execution_date): return jsonify(info) + @api_experimental.route('/latest_runs', methods=['GET']) @requires_authentication def latest_dag_runs(): diff --git a/airflow/www_rbac/app.py b/airflow/www_rbac/app.py index 85125c6eb06b8..2af868682920d 100644 --- a/airflow/www_rbac/app.py +++ b/airflow/www_rbac/app.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
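The new experimental endpoint above returns the Python source of a DAG. A client sketch, assuming a webserver on localhost:8080 and an experimental-API auth backend that accepts the request (the default backend in this era does):

import requests

resp = requests.get("http://localhost:8080/api/experimental/dags/example_dag/code")
if resp.status_code == 200:
    print(resp.text)                    # the DAG file's Python source
else:
    print(resp.status_code, resp.text)  # e.g. a JSON error body for an unknown dag_id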
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,37 +17,49 @@ # specific language governing permissions and limitations # under the License. # +import logging import socket -import six +from typing import Any +import six from flask import Flask from flask_appbuilder import AppBuilder, SQLA from flask_caching import Cache from flask_wtf.csrf import CSRFProtect from six.moves.urllib.parse import urlparse from werkzeug.wsgi import DispatcherMiddleware +from werkzeug.contrib.fixers import ProxyFix from airflow import settings from airflow import configuration as conf from airflow.logging_config import configure_logging +from airflow.www_rbac.static_config import configure_manifest_files - -app = None +app = None # type: Any appbuilder = None csrf = CSRFProtect() +log = logging.getLogger(__name__) -def create_app(config=None, testing=False, app_name="Airflow"): +def create_app(config=None, session=None, testing=False, app_name="Airflow"): global app, appbuilder app = Flask(__name__) + if conf.getboolean('webserver', 'ENABLE_PROXY_FIX'): + app.wsgi_app = ProxyFix(app.wsgi_app) app.secret_key = conf.get('webserver', 'SECRET_KEY') - airflow_home_path = conf.get('core', 'AIRFLOW_HOME') - webserver_config_path = airflow_home_path + '/webserver_config.py' - app.config.from_pyfile(webserver_config_path, silent=True) + app.config.from_pyfile(settings.WEBSERVER_CONFIG, silent=True) + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False app.config['APP_NAME'] = app_name app.config['TESTING'] = testing + app.config['SESSION_COOKIE_HTTPONLY'] = True + app.config['SESSION_COOKIE_SECURE'] = conf.getboolean('webserver', 'COOKIE_SECURE') + app.config['SESSION_COOKIE_SAMESITE'] = conf.get('webserver', 'COOKIE_SAMESITE') + + if config: + app.config.from_mapping(config) + csrf.init_app(app) db = SQLA(app) @@ -56,18 +68,30 @@ def create_app(config=None, testing=False, app_name="Airflow"): api.load_auth() api.api_auth.init_app(app) - cache = Cache(app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'}) # noqa + # flake8: noqa: F841 + cache = Cache(app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'}) from airflow.www_rbac.blueprints import routes app.register_blueprint(routes) configure_logging() + configure_manifest_files(app) with app.app_context(): + + from airflow.www_rbac.security import AirflowSecurityManager + security_manager_class = app.config.get('SECURITY_MANAGER_CLASS') or \ + AirflowSecurityManager + + if not issubclass(security_manager_class, AirflowSecurityManager): + raise Exception( + """Your CUSTOM_SECURITY_MANAGER must now extend AirflowSecurityManager, + not FAB's security manager.""") + appbuilder = AppBuilder( app, - db.session, - security_manager_class=app.config.get('SECURITY_MANAGER_CLASS'), + db.session if not session else session, + security_manager_class=security_manager_class, base_template='appbuilder/baselayout.html') def init_views(appbuilder): @@ -112,24 +136,50 @@ def init_views(appbuilder): href='https://airflow.apache.org/', category="Docs", category_icon="fa-cube") - appbuilder.add_link("Github", - href='https://github.com/apache/incubator-airflow', + appbuilder.add_link("GitHub", + href='https://github.com/apache/airflow', category="Docs") appbuilder.add_link('Version', href='/version', category='About', 
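create_app above now insists that any SECURITY_MANAGER_CLASS configured in webserver_config.py extends AirflowSecurityManager rather than FAB's SecurityManager. A minimal webserver_config.py sketch that satisfies the check (the subclass name and overrides are illustrative):

from airflow.www_rbac.security import AirflowSecurityManager


class CustomSecurityManager(AirflowSecurityManager):
    """Extend Airflow's manager, not FAB's SecurityManager directly."""
    # override authentication / role-mapping hooks here as needed


SECURITY_MANAGER_CLASS = CustomSecurityManager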
category_icon='fa-th') + def integrate_plugins(): + """Integrate plugins to the context""" + from airflow.plugins_manager import ( + flask_appbuilder_views, flask_appbuilder_menu_links + ) + + for v in flask_appbuilder_views: + log.debug("Adding view %s", v["name"]) + appbuilder.add_view(v["view"], + v["name"], + category=v["category"]) + for ml in sorted(flask_appbuilder_menu_links, key=lambda x: x["name"]): + log.debug("Adding menu link %s", ml["name"]) + appbuilder.add_link(ml["name"], + href=ml["href"], + category=ml["category"], + category_icon=ml["category_icon"]) + + integrate_plugins() # Garbage collect old permissions/views after they have been modified. # Otherwise, when the name of a view or menu is changed, the framework # will add the new Views and Menus names to the backend, but will not # delete the old ones. - appbuilder.security_cleanup() + + def init_plugin_blueprints(app): + from airflow.plugins_manager import flask_blueprints + + for bp in flask_blueprints: + log.debug("Adding blueprint %s:%s", bp["name"], bp["blueprint"].import_name) + app.register_blueprint(bp["blueprint"]) init_views(appbuilder) + init_plugin_blueprints(app) - from airflow.www_rbac.security import init_roles - init_roles(appbuilder) + security_manager = appbuilder.sm + security_manager.sync_roles() from airflow.www_rbac.api.experimental import endpoints as e # required for testing purposes otherwise the module retains @@ -158,23 +208,23 @@ def shutdown_session(exception=None): def root_app(env, resp): - resp(b'404 Not Found', [(b'Content-Type', b'text/plain')]) + resp('404 Not Found', [('Content-Type', 'text/plain')]) return [b'Apache Airflow is not at this location'] -def cached_app(config=None, testing=False): +def cached_app(config=None, session=None, testing=False): global app, appbuilder if not app or not appbuilder: base_url = urlparse(conf.get('webserver', 'base_url'))[2] if not base_url or base_url == '/': base_url = "" - app, _ = create_app(config, testing) + app, _ = create_app(config, session, testing) app = DispatcherMiddleware(root_app, {base_url: app}) return app def cached_appbuilder(config=None, testing=False): global appbuilder - cached_app(config, testing) + cached_app(config=config, testing=testing) return appbuilder diff --git a/airflow/www_rbac/blueprints.py b/airflow/www_rbac/blueprints.py index 35a5badb6ad1e..36c365b185ad6 100644 --- a/airflow/www_rbac/blueprints.py +++ b/airflow/www_rbac/blueprints.py @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -17,19 +17,11 @@ # specific language governing permissions and limitations # under the License. 
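integrate_plugins above pulls appbuilder views and menu links from airflow.plugins_manager, so an RBAC-UI plugin only needs to expose the documented dict shapes. A minimal plugin sketch, assuming it is dropped into the plugins folder (class and entry names are illustrative):

from flask_appbuilder import BaseView, expose

from airflow.plugins_manager import AirflowPlugin


class HelloView(BaseView):
    default_view = "hello"

    @expose("/")
    def hello(self):
        return "Hello from a plugin view"


class HelloPlugin(AirflowPlugin):
    name = "hello_plugin"
    # consumed by integrate_plugins(): v["view"], v["name"], v["category"]
    appbuilder_views = [{"name": "Hello", "category": "Plugins", "view": HelloView()}]
    # consumed as ml["name"], ml["href"], ml["category"], ml["category_icon"]
    appbuilder_menu_items = [{
        "name": "Airflow Website",
        "href": "https://airflow.apache.org/",
        "category": "Plugins",
        "category_icon": "fa-cube",
    }]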
# -from flask import Markup, Blueprint, redirect -import markdown +from flask import Blueprint, redirect, url_for routes = Blueprint('routes', __name__) @routes.route('/') def index(): - return redirect('/home') - - -@routes.route('/health') -def health(): - """ We can add an array of tests here to check the server's health """ - content = Markup(markdown.markdown("The server is healthy!")) - return content + return redirect(url_for('Airflow.index')) diff --git a/airflow/www_rbac/compile_assets.sh b/airflow/www_rbac/compile_assets.sh new file mode 100755 index 0000000000000..a3a4ea2349692 --- /dev/null +++ b/airflow/www_rbac/compile_assets.sh @@ -0,0 +1,28 @@ +#!/bin/sh +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +# first bump up package.json manually, commit and tag +if [ -d airflow/www_rbac/static/dist ]; then + rm -f airflow/www_rbac/static/dist/* +fi + +cd airflow/www_rbac/ +npm install +npm run build +cd ../.. diff --git a/airflow/www_rbac/decorators.py b/airflow/www_rbac/decorators.py index 2dd1af45df09d..1db2dfeb363b6 100644 --- a/airflow/www_rbac/decorators.py +++ b/airflow/www_rbac/decorators.py @@ -21,8 +21,9 @@ import functools import pendulum from io import BytesIO as IO -from flask import after_this_request, request, g -from airflow import models, settings +from flask import after_this_request, flash, redirect, request, url_for, g +from airflow.models.log import Log +from airflow.utils.db import create_session def action_logging(f): @@ -31,26 +32,26 @@ def action_logging(f): """ @functools.wraps(f) def wrapper(*args, **kwargs): - session = settings.Session() - if g.user.is_anonymous(): - user = 'anonymous' - else: - user = g.user.username - - log = models.Log( - event=f.__name__, - task_instance=None, - owner=user, - extra=str(list(request.args.items())), - task_id=request.args.get('task_id'), - dag_id=request.args.get('dag_id')) - - if 'execution_date' in request.args: - log.execution_date = pendulum.parse( - request.args.get('execution_date')) - - session.add(log) - session.commit() + + with create_session() as session: + if g.user.is_anonymous: + user = 'anonymous' + else: + user = g.user.username + + log = Log( + event=f.__name__, + task_instance=None, + owner=user, + extra=str(list(request.args.items())), + task_id=request.args.get('task_id'), + dag_id=request.args.get('dag_id')) + + if 'execution_date' in request.args: + log.execution_date = pendulum.parse( + request.args.get('execution_date')) + + session.add(log) return f(*args, **kwargs) @@ -91,3 +92,36 @@ def zipper(response): return f(*args, **kwargs) return view_func + + +def has_dag_access(**dag_kwargs): + """ + Decorator to check whether the user has read / write permission on the dag. 
+ """ + def decorator(f): + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + has_access = self.appbuilder.sm.has_access + dag_id = request.args.get('dag_id') + # if it is false, we need to check whether user has write access on the dag + can_dag_edit = dag_kwargs.get('can_dag_edit', False) + + # 1. check whether the user has can_dag_edit permissions on all_dags + # 2. if 1 false, check whether the user + # has can_dag_edit permissions on the dag + # 3. if 2 false, check whether it is can_dag_read view, + # and whether user has the permissions + if ( + has_access('can_dag_edit', 'all_dags') or + has_access('can_dag_edit', dag_id) or (not can_dag_edit and + (has_access('can_dag_read', + 'all_dags') or + has_access('can_dag_read', + dag_id)))): + return f(self, *args, **kwargs) + else: + flash("Access is Denied", "danger") + return redirect(url_for(self.appbuilder.sm.auth_view. + __class__.__name__ + ".login")) + return wrapper + return decorator diff --git a/airflow/www_rbac/forms.py b/airflow/www_rbac/forms.py index da9d12c7adfd9..0a36a90ef3ced 100644 --- a/airflow/www_rbac/forms.py +++ b/airflow/www_rbac/forms.py @@ -22,7 +22,7 @@ from __future__ import print_function from __future__ import unicode_literals -from airflow import models +from airflow.models.connection import Connection from airflow.utils import timezone from flask_appbuilder.forms import DynamicForm @@ -93,7 +93,7 @@ class ConnectionForm(DynamicForm): widget=BS3TextFieldWidget()) conn_type = SelectField( lazy_gettext('Conn Type'), - choices=(models.Connection._types), + choices=Connection._types, widget=Select2Widget()) host = StringField( lazy_gettext('Host'), diff --git a/airflow/www_rbac/package-lock.json b/airflow/www_rbac/package-lock.json new file mode 100644 index 0000000000000..c42050e276be5 --- /dev/null +++ b/airflow/www_rbac/package-lock.json @@ -0,0 +1,11434 @@ +{ + "requires": true, + "lockfileVersion": 1, + "dependencies": { + "@babel/code-frame": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.0.0-beta.44.tgz", + "integrity": "sha512-cuAuTTIQ9RqcFRJ/Y8PvTh+paepNcaGxwQwjIDRWPXmzzyAeCO4KqS9ikMvq0MCbRk6GlYKwfzStrcP3/jSL8g==", + "dev": true, + "requires": { + "@babel/highlight": "7.0.0-beta.44" + } + }, + "@babel/generator": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.0.0-beta.44.tgz", + "integrity": "sha512-5xVb7hlhjGcdkKpMXgicAVgx8syK5VJz193k0i/0sLP6DzE6lRrU1K3B/rFefgdo9LPGMAOOOAWW4jycj07ShQ==", + "dev": true, + "requires": { + "@babel/types": "7.0.0-beta.44", + "jsesc": "^2.5.1", + "lodash": "^4.2.0", + "source-map": "^0.5.0", + "trim-right": "^1.0.1" + }, + "dependencies": { + "jsesc": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-2.5.1.tgz", + "integrity": "sha1-5CGiqOINawgZ3yiQj3glJrlt0f4=", + "dev": true + } + } + }, + "@babel/helper-function-name": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/helper-function-name/-/helper-function-name-7.0.0-beta.44.tgz", + "integrity": "sha512-MHRG2qZMKMFaBavX0LWpfZ2e+hLloT++N7rfM3DYOMUOGCD8cVjqZpwiL8a0bOX3IYcQev1ruciT0gdFFRTxzg==", + "dev": true, + "requires": { + "@babel/helper-get-function-arity": "7.0.0-beta.44", + "@babel/template": "7.0.0-beta.44", + "@babel/types": "7.0.0-beta.44" + } + }, + "@babel/helper-get-function-arity": { + "version": "7.0.0-beta.44", + "resolved": 
"https://registry.npmjs.org/@babel/helper-get-function-arity/-/helper-get-function-arity-7.0.0-beta.44.tgz", + "integrity": "sha512-w0YjWVwrM2HwP6/H3sEgrSQdkCaxppqFeJtAnB23pRiJB5E/O9Yp7JAAeWBl+gGEgmBFinnTyOv2RN7rcSmMiw==", + "dev": true, + "requires": { + "@babel/types": "7.0.0-beta.44" + } + }, + "@babel/helper-split-export-declaration": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.0.0-beta.44.tgz", + "integrity": "sha512-aQ7QowtkgKKzPGf0j6u77kBMdUFVBKNHw2p/3HX/POt5/oz8ec5cs0GwlgM8Hz7ui5EwJnzyfRmkNF1Nx1N7aA==", + "dev": true, + "requires": { + "@babel/types": "7.0.0-beta.44" + } + }, + "@babel/highlight": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/highlight/-/highlight-7.0.0-beta.44.tgz", + "integrity": "sha512-Il19yJvy7vMFm8AVAh6OZzaFoAd0hbkeMZiX3P5HGD+z7dyI7RzndHB0dg6Urh/VAFfHtpOIzDUSxmY6coyZWQ==", + "dev": true, + "requires": { + "chalk": "^2.0.0", + "esutils": "^2.0.2", + "js-tokens": "^3.0.0" + }, + "dependencies": { + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "@babel/template": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.0.0-beta.44.tgz", + "integrity": "sha512-w750Sloq0UNifLx1rUqwfbnC6uSUk0mfwwgGRfdLiaUzfAOiH0tHJE6ILQIUi3KYkjiCDTskoIsnfqZvWLBDng==", + "dev": true, + "requires": { + "@babel/code-frame": "7.0.0-beta.44", + "@babel/types": "7.0.0-beta.44", + "babylon": "7.0.0-beta.44", + "lodash": "^4.2.0" + }, + "dependencies": { + "babylon": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-7.0.0-beta.44.tgz", + "integrity": "sha512-5Hlm13BJVAioCHpImtFqNOF2H3ieTOHd0fmFGMxOJ9jgeFqeAwsv3u5P5cR7CSeFrkgHsT19DgFJkHV0/Mcd8g==", + "dev": true + } + } + }, + "@babel/traverse": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.0.0-beta.44.tgz", + "integrity": "sha512-UHuDz8ukQkJCDASKHf+oDt3FVUzFd+QYfuBIsiNu/4+/ix6pP/C+uQZJ6K1oEfbCMv/IKWbgDEh7fcsnIE5AtA==", + "dev": true, + "requires": { + "@babel/code-frame": "7.0.0-beta.44", + "@babel/generator": "7.0.0-beta.44", + "@babel/helper-function-name": "7.0.0-beta.44", + "@babel/helper-split-export-declaration": "7.0.0-beta.44", + "@babel/types": "7.0.0-beta.44", + "babylon": "7.0.0-beta.44", + "debug": "^3.1.0", + "globals": "^11.1.0", + "invariant": "^2.2.0", + 
"lodash": "^4.2.0" + }, + "dependencies": { + "babylon": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-7.0.0-beta.44.tgz", + "integrity": "sha512-5Hlm13BJVAioCHpImtFqNOF2H3ieTOHd0fmFGMxOJ9jgeFqeAwsv3u5P5cR7CSeFrkgHsT19DgFJkHV0/Mcd8g==", + "dev": true + }, + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "globals": { + "version": "11.7.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-11.7.0.tgz", + "integrity": "sha512-K8BNSPySfeShBQXsahYB/AbbWruVOTyVpgoIDnl8odPpeSfP2J5QO2oLFFdl2j7GfDCtZj2bMKar2T49itTPCg==", + "dev": true + } + } + }, + "@babel/types": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.0.0-beta.44.tgz", + "integrity": "sha512-5eTV4WRmqbaFM3v9gHAIljEQJU4Ssc6fxL61JN+Oe2ga/BwyjzjamwkCVVAQjHGuAX8i0BWo42dshL8eO5KfLQ==", + "dev": true, + "requires": { + "esutils": "^2.0.2", + "lodash": "^4.2.0", + "to-fast-properties": "^2.0.0" + }, + "dependencies": { + "to-fast-properties": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", + "integrity": "sha1-3F5pjL0HkmW8c+A3doGk5Og/YW4=", + "dev": true + } + } + }, + "@webassemblyjs/ast": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.5.13.tgz", + "integrity": "sha512-49nwvW/Hx9i+OYHg+mRhKZfAlqThr11Dqz8TsrvqGKMhdI2ijy3KBJOun2Z4770TPjrIJhR6KxChQIDaz8clDA==", + "dev": true, + "requires": { + "@webassemblyjs/helper-module-context": "1.5.13", + "@webassemblyjs/helper-wasm-bytecode": "1.5.13", + "@webassemblyjs/wast-parser": "1.5.13", + "debug": "^3.1.0", + "mamacro": "^0.0.3" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/floating-point-hex-parser": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/floating-point-hex-parser/-/floating-point-hex-parser-1.5.13.tgz", + "integrity": "sha512-vrvvB18Kh4uyghSKb0NTv+2WZx871WL2NzwMj61jcq2bXkyhRC+8Q0oD7JGVf0+5i/fKQYQSBCNMMsDMRVAMqA==", + "dev": true + }, + "@webassemblyjs/helper-api-error": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-api-error/-/helper-api-error-1.5.13.tgz", + "integrity": "sha512-dBh2CWYqjaDlvMmRP/kudxpdh30uXjIbpkLj9HQe+qtYlwvYjPRjdQXrq1cTAAOUSMTtzqbXIxEdEZmyKfcwsg==", + "dev": true + }, + "@webassemblyjs/helper-buffer": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-buffer/-/helper-buffer-1.5.13.tgz", + "integrity": "sha512-v7igWf1mHcpJNbn4m7e77XOAWXCDT76Xe7Is1VQFXc4K5jRcFrl9D0NrqM4XifQ0bXiuTSkTKMYqDxu5MhNljA==", + "dev": true, + "requires": { + "debug": "^3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/helper-code-frame": { + "version": "1.5.13", + "resolved": 
"https://registry.npmjs.org/@webassemblyjs/helper-code-frame/-/helper-code-frame-1.5.13.tgz", + "integrity": "sha512-yN6ScQQDFCiAXnVctdVO/J5NQRbwyTbQzsGzEgXsAnrxhjp0xihh+nNHQTMrq5UhOqTb5LykpJAvEv9AT0jnAQ==", + "dev": true, + "requires": { + "@webassemblyjs/wast-printer": "1.5.13" + } + }, + "@webassemblyjs/helper-fsm": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-fsm/-/helper-fsm-1.5.13.tgz", + "integrity": "sha512-hSIKzbXjVMRvy3Jzhgu+vDd/aswJ+UMEnLRCkZDdknZO3Z9e6rp1DAs0tdLItjCFqkz9+0BeOPK/mk3eYvVzZg==", + "dev": true + }, + "@webassemblyjs/helper-module-context": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-module-context/-/helper-module-context-1.5.13.tgz", + "integrity": "sha512-zxJXULGPLB7r+k+wIlvGlXpT4CYppRz8fLUM/xobGHc9Z3T6qlmJD9ySJ2jknuktuuiR9AjnNpKYDECyaiX+QQ==", + "dev": true, + "requires": { + "debug": "^3.1.0", + "mamacro": "^0.0.3" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/helper-wasm-bytecode": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-bytecode/-/helper-wasm-bytecode-1.5.13.tgz", + "integrity": "sha512-0n3SoNGLvbJIZPhtMFq0XmmnA/YmQBXaZKQZcW8maGKwLpVcgjNrxpFZHEOLKjXJYVN5Il8vSfG7nRX50Zn+aw==", + "dev": true + }, + "@webassemblyjs/helper-wasm-section": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/helper-wasm-section/-/helper-wasm-section-1.5.13.tgz", + "integrity": "sha512-IJ/goicOZ5TT1axZFSnlAtz4m8KEjYr12BNOANAwGFPKXM4byEDaMNXYowHMG0yKV9a397eU/NlibFaLwr1fbw==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-buffer": "1.5.13", + "@webassemblyjs/helper-wasm-bytecode": "1.5.13", + "@webassemblyjs/wasm-gen": "1.5.13", + "debug": "^3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/ieee754": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/ieee754/-/ieee754-1.5.13.tgz", + "integrity": "sha512-TseswvXEPpG5TCBKoLx9tT7+/GMACjC1ruo09j46ULRZWYm8XHpDWaosOjTnI7kr4SRJFzA6MWoUkAB+YCGKKg==", + "dev": true, + "requires": { + "ieee754": "^1.1.11" + } + }, + "@webassemblyjs/leb128": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/leb128/-/leb128-1.5.13.tgz", + "integrity": "sha512-0NRMxrL+GG3eISGZBmLBLAVjphbN8Si15s7jzThaw1UE9e5BY1oH49/+MA1xBzxpf1OW5sf9OrPDOclk9wj2yg==", + "dev": true, + "requires": { + "long": "4.0.0" + }, + "dependencies": { + "long": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/long/-/long-4.0.0.tgz", + "integrity": "sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==", + "dev": true + } + } + }, + "@webassemblyjs/utf8": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/utf8/-/utf8-1.5.13.tgz", + "integrity": "sha512-Ve1ilU2N48Ew0lVGB8FqY7V7hXjaC4+PeZM+vDYxEd+R2iQ0q+Wb3Rw8v0Ri0+rxhoz6gVGsnQNb4FjRiEH/Ng==", + "dev": true + }, + 
"@webassemblyjs/wasm-edit": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-edit/-/wasm-edit-1.5.13.tgz", + "integrity": "sha512-X7ZNW4+Hga4f2NmqENnHke2V/mGYK/xnybJSIXImt1ulxbCOEs/A+ZK/Km2jgihjyVxp/0z0hwIcxC6PrkWtgw==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-buffer": "1.5.13", + "@webassemblyjs/helper-wasm-bytecode": "1.5.13", + "@webassemblyjs/helper-wasm-section": "1.5.13", + "@webassemblyjs/wasm-gen": "1.5.13", + "@webassemblyjs/wasm-opt": "1.5.13", + "@webassemblyjs/wasm-parser": "1.5.13", + "@webassemblyjs/wast-printer": "1.5.13", + "debug": "^3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/wasm-gen": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-gen/-/wasm-gen-1.5.13.tgz", + "integrity": "sha512-yfv94Se8R73zmr8GAYzezFHc3lDwE/lBXQddSiIZEKZFuqy7yWtm3KMwA1uGbv5G1WphimJxboXHR80IgX1hQA==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-wasm-bytecode": "1.5.13", + "@webassemblyjs/ieee754": "1.5.13", + "@webassemblyjs/leb128": "1.5.13", + "@webassemblyjs/utf8": "1.5.13" + } + }, + "@webassemblyjs/wasm-opt": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-opt/-/wasm-opt-1.5.13.tgz", + "integrity": "sha512-IkXSkgzVhQ0QYAdIayuCWMmXSYx0dHGU8Ah/AxJf1gBvstMWVnzJnBwLsXLyD87VSBIcsqkmZ28dVb0mOC3oBg==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-buffer": "1.5.13", + "@webassemblyjs/wasm-gen": "1.5.13", + "@webassemblyjs/wasm-parser": "1.5.13", + "debug": "^3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + } + } + }, + "@webassemblyjs/wasm-parser": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wasm-parser/-/wasm-parser-1.5.13.tgz", + "integrity": "sha512-XnYoIcu2iqq8/LrtmdnN3T+bRjqYFjRHqWbqK3osD/0r/Fcv4d9ecRzjVtC29ENEuNTK4mQ9yyxCBCbK8S/cpg==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-api-error": "1.5.13", + "@webassemblyjs/helper-wasm-bytecode": "1.5.13", + "@webassemblyjs/ieee754": "1.5.13", + "@webassemblyjs/leb128": "1.5.13", + "@webassemblyjs/utf8": "1.5.13" + } + }, + "@webassemblyjs/wast-parser": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-parser/-/wast-parser-1.5.13.tgz", + "integrity": "sha512-Lbz65T0LQ1LgzKiUytl34CwuhMNhaCLgrh0JW4rJBN6INnBB8NMwUfQM+FxTnLY9qJ+lHJL/gCM5xYhB9oWi4A==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/floating-point-hex-parser": "1.5.13", + "@webassemblyjs/helper-api-error": "1.5.13", + "@webassemblyjs/helper-code-frame": "1.5.13", + "@webassemblyjs/helper-fsm": "1.5.13", + "long": "^3.2.0", + "mamacro": "^0.0.3" + } + }, + "@webassemblyjs/wast-printer": { + "version": "1.5.13", + "resolved": "https://registry.npmjs.org/@webassemblyjs/wast-printer/-/wast-printer-1.5.13.tgz", + "integrity": 
"sha512-QcwogrdqcBh8Z+eUF8SG+ag5iwQSXxQJELBEHmLkk790wgQgnIMmntT2sMAMw53GiFNckArf5X0bsCA44j3lWQ==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/wast-parser": "1.5.13", + "long": "^3.2.0" + } + }, + "@webpack-contrib/schema-utils": { + "version": "1.0.0-beta.0", + "resolved": "https://registry.npmjs.org/@webpack-contrib/schema-utils/-/schema-utils-1.0.0-beta.0.tgz", + "integrity": "sha512-LonryJP+FxQQHsjGBi6W786TQB1Oym+agTpY0c+Kj8alnIw+DLUJb6SI8Y1GHGhLCH1yPRrucjObUmxNICQ1pg==", + "dev": true, + "requires": { + "ajv": "^6.1.0", + "ajv-keywords": "^3.1.0", + "chalk": "^2.3.2", + "strip-ansi": "^4.0.0", + "text-table": "^0.2.0", + "webpack-log": "^1.1.2" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", + "dev": true + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "abbrev": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.0.9.tgz", + "integrity": "sha1-kbR5JYinc4wl813W9jdSovh3YTU=", + "dev": true + }, + "acorn": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-5.7.1.tgz", + "integrity": "sha512-d+nbxBUGKg7Arpsvbnlq61mc12ek3EY8EQldM3GPAhWJ1UVxC6TDGbIvUMNU6obBX3i1+ptCIzV4vq0gFPEGVQ==", + "dev": true + }, + "acorn-dynamic-import": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/acorn-dynamic-import/-/acorn-dynamic-import-3.0.0.tgz", + "integrity": "sha512-zVWV8Z8lislJoOKKqdNMOB+s6+XV5WERty8MnKBeFgwA+19XJjJHs2RP5dzM57FftIs+jQnRToLiWazKr6sSWg==", + "dev": true, + "requires": { + "acorn": "^5.0.0" + } + }, + "acorn-jsx": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-3.0.1.tgz", + "integrity": "sha1-r9+UiPsezvyDSPb7IvRk4ypYs2s=", + "dev": true, + "requires": { + "acorn": "^3.0.4" + }, + "dependencies": { + "acorn": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-3.3.0.tgz", + "integrity": "sha1-ReN/s56No/JbruP/U2niu18iAXo=", + "dev": true + } + } + }, + "ajv": { + "version": "6.5.2", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.5.2.tgz", + 
"integrity": "sha512-hOs7GfvI6tUI1LfZddH82ky6mOMyTuY0mk7kE2pWpmhhUSkumzaTO5vbVwij39MdwPQWCV4Zv57Eo06NtL/GVA==", + "dev": true, + "requires": { + "fast-deep-equal": "^2.0.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.1" + } + }, + "ajv-keywords": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-3.2.0.tgz", + "integrity": "sha1-6GuBnGAs+IIa1jdBNpjx3sAhhHo=", + "dev": true + }, + "align-text": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/align-text/-/align-text-0.1.4.tgz", + "integrity": "sha1-DNkKVhCT810KmSVsIrcGlDP60Rc=", + "dev": true, + "optional": true, + "requires": { + "kind-of": "^3.0.2", + "longest": "^1.0.1", + "repeat-string": "^1.5.2" + } + }, + "alphanum-sort": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/alphanum-sort/-/alphanum-sort-1.0.2.tgz", + "integrity": "sha1-l6ERlkmyEa0zaR2fn0hqjsn74KM=", + "dev": true + }, + "amdefine": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/amdefine/-/amdefine-1.0.1.tgz", + "integrity": "sha1-SlKCrBZHKek2Gbz9OtFR+BfOkfU=", + "dev": true + }, + "ansi-escapes": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-3.1.0.tgz", + "integrity": "sha512-UgAb8H9D41AQnu/PbWlCofQVcnV4Gs2bBJi9eZPxfU/hgglFh3SMDMENRIqdr7H6XFnXdoknctFByVsCOotTVw==", + "dev": true + }, + "ansi-regex": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", + "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=", + "dev": true + }, + "ansi-styles": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", + "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", + "dev": true + }, + "anymatch": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-2.0.0.tgz", + "integrity": "sha512-5teOsQWABXHHBFP9y3skS5P3d/WfWXpv3FUpy+LorMrNYaT9pI4oLMQX7jzQ2KklNpGpWHzdCXTDT2Y3XGlZBw==", + "dev": true, + "requires": { + "micromatch": "^3.1.4", + "normalize-path": "^2.1.1" + } + }, + "aproba": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", + "integrity": "sha512-Y9J6ZjXtoYh8RnXVCMOU/ttDmk1aBjunq9vO0ta5x85WDQiQfUF9sIPBITdbiiIVcBo03Hi3jMxigBtsddlXRw==", + "dev": true + }, + "argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "dev": true, + "requires": { + "sprintf-js": "~1.0.2" + } + }, + "arr-diff": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/arr-diff/-/arr-diff-4.0.0.tgz", + "integrity": "sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=", + "dev": true + }, + "arr-flatten": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/arr-flatten/-/arr-flatten-1.1.0.tgz", + "integrity": "sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==", + "dev": true + }, + "arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha1-45sJrqne+Gao8gbiiK9jkZuuOcQ=", + "dev": true + }, + "array-union": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-1.0.2.tgz", + "integrity": "sha1-mjRBDk9OPaI96jdb5b5w8kd47Dk=", + "dev": true, + "requires": { + "array-uniq": "^1.0.1" + } + }, + "array-uniq": { + "version": "1.0.3", 
+ "resolved": "https://registry.npmjs.org/array-uniq/-/array-uniq-1.0.3.tgz", + "integrity": "sha1-r2rId6Jcx/dOBYiUdThY39sk/bY=", + "dev": true + }, + "array-unique": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", + "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=", + "dev": true + }, + "arrify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/arrify/-/arrify-1.0.1.tgz", + "integrity": "sha1-iYUI2iIm84DfkEcoRWhJwVAaSw0=", + "dev": true + }, + "asn1.js": { + "version": "4.10.1", + "resolved": "https://registry.npmjs.org/asn1.js/-/asn1.js-4.10.1.tgz", + "integrity": "sha512-p32cOF5q0Zqs9uBiONKYLm6BClCoBCM5O9JfeUSlnQLBTxYdTK+pW+nXflm8UkKd2UYlEbYz5qEi0JuZR9ckSw==", + "dev": true, + "requires": { + "bn.js": "^4.0.0", + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "assert": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/assert/-/assert-1.4.1.tgz", + "integrity": "sha1-mZEtWRg2tab1s0XA8H7vwI/GXZE=", + "dev": true, + "requires": { + "util": "0.10.3" + }, + "dependencies": { + "inherits": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.1.tgz", + "integrity": "sha1-sX0I0ya0Qj5Wjv9xn5GwscvfafE=", + "dev": true + }, + "util": { + "version": "0.10.3", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.3.tgz", + "integrity": "sha1-evsa/lCAUkZInj23/g7TeTNqwPk=", + "dev": true, + "requires": { + "inherits": "2.0.1" + } + } + } + }, + "assign-symbols": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assign-symbols/-/assign-symbols-1.0.0.tgz", + "integrity": "sha1-WWZ/QfrdTyDMvCu5a41Pf3jsA2c=", + "dev": true + }, + "async": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/async/-/async-1.5.2.tgz", + "integrity": "sha1-7GphrlZIDAw8skHJVhjiCJL5Zyo=", + "dev": true + }, + "async-each": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/async-each/-/async-each-1.0.1.tgz", + "integrity": "sha1-GdOGodntxufByF04iu28xW0zYC0=", + "dev": true + }, + "atob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/atob/-/atob-2.1.1.tgz", + "integrity": "sha1-ri1acpR38onWDdf5amMUoi3Wwio=", + "dev": true + }, + "autoprefixer": { + "version": "6.7.7", + "resolved": "https://registry.npmjs.org/autoprefixer/-/autoprefixer-6.7.7.tgz", + "integrity": "sha1-Hb0cg1ZY41zj+ZhAmdsAWFx4IBQ=", + "dev": true, + "requires": { + "browserslist": "^1.7.6", + "caniuse-db": "^1.0.30000634", + "normalize-range": "^0.1.2", + "num2fraction": "^1.2.2", + "postcss": "^5.2.16", + "postcss-value-parser": "^3.2.3" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "babel": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel/-/babel-6.23.0.tgz", + "integrity": "sha1-0NHn2APpdHZb7qMjLU4VPA77kPQ=", + "dev": true + }, + "babel-code-frame": { + "version": "6.26.0", + "resolved": 
"https://registry.npmjs.org/babel-code-frame/-/babel-code-frame-6.26.0.tgz", + "integrity": "sha1-Y/1D99weO7fONZR9uP42mj9Yx0s=", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "esutils": "^2.0.2", + "js-tokens": "^3.0.2" + } + }, + "babel-core": { + "version": "6.26.3", + "resolved": "https://registry.npmjs.org/babel-core/-/babel-core-6.26.3.tgz", + "integrity": "sha512-6jyFLuDmeidKmUEb3NM+/yawG0M2bDZ9Z1qbZP59cyHLz8kYGKYwpJP0UwUKKUiTRNvxfLesJnTedqczP7cTDA==", + "dev": true, + "requires": { + "babel-code-frame": "^6.26.0", + "babel-generator": "^6.26.0", + "babel-helpers": "^6.24.1", + "babel-messages": "^6.23.0", + "babel-register": "^6.26.0", + "babel-runtime": "^6.26.0", + "babel-template": "^6.26.0", + "babel-traverse": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "convert-source-map": "^1.5.1", + "debug": "^2.6.9", + "json5": "^0.5.1", + "lodash": "^4.17.4", + "minimatch": "^3.0.4", + "path-is-absolute": "^1.0.1", + "private": "^0.1.8", + "slash": "^1.0.0", + "source-map": "^0.5.7" + } + }, + "babel-eslint": { + "version": "8.2.6", + "resolved": "https://registry.npmjs.org/babel-eslint/-/babel-eslint-8.2.6.tgz", + "integrity": "sha512-aCdHjhzcILdP8c9lej7hvXKvQieyRt20SF102SIGyY4cUIiw6UaAtK4j2o3dXX74jEmy0TJ0CEhv4fTIM3SzcA==", + "dev": true, + "requires": { + "@babel/code-frame": "7.0.0-beta.44", + "@babel/traverse": "7.0.0-beta.44", + "@babel/types": "7.0.0-beta.44", + "babylon": "7.0.0-beta.44", + "eslint-scope": "3.7.1", + "eslint-visitor-keys": "^1.0.0" + }, + "dependencies": { + "babylon": { + "version": "7.0.0-beta.44", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-7.0.0-beta.44.tgz", + "integrity": "sha512-5Hlm13BJVAioCHpImtFqNOF2H3ieTOHd0fmFGMxOJ9jgeFqeAwsv3u5P5cR7CSeFrkgHsT19DgFJkHV0/Mcd8g==", + "dev": true + } + } + }, + "babel-generator": { + "version": "6.26.1", + "resolved": "https://registry.npmjs.org/babel-generator/-/babel-generator-6.26.1.tgz", + "integrity": "sha512-HyfwY6ApZj7BYTcJURpM5tznulaBvyio7/0d4zFOeMPUmfxkCjHocCuoLa2SAGzBI8AREcH3eP3758F672DppA==", + "dev": true, + "requires": { + "babel-messages": "^6.23.0", + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "detect-indent": "^4.0.0", + "jsesc": "^1.3.0", + "lodash": "^4.17.4", + "source-map": "^0.5.7", + "trim-right": "^1.0.1" + } + }, + "babel-helpers": { + "version": "6.24.1", + "resolved": "https://registry.npmjs.org/babel-helpers/-/babel-helpers-6.24.1.tgz", + "integrity": "sha1-NHHenK7DiOXIUOWX5Yom3fN2ArI=", + "dev": true, + "requires": { + "babel-runtime": "^6.22.0", + "babel-template": "^6.24.1" + } + }, + "babel-istanbul": { + "version": "0.12.2", + "resolved": "https://registry.npmjs.org/babel-istanbul/-/babel-istanbul-0.12.2.tgz", + "integrity": "sha1-5yPwfJokMtiAVVILwi519cI5Fhw=", + "dev": true, + "requires": { + "abbrev": "1.0.x", + "async": "1.x", + "escodegen": "1.8.x", + "esprima": "2.7.x", + "handlebars": "^4.0.1", + "js-yaml": "3.x", + "mkdirp": "0.5.x", + "multi-glob": "^1.0.1", + "nopt": "3.x", + "object-assign": "^4.0.1", + "once": "1.x", + "resolve": "^1.1.0", + "source-map": "0.4.x", + "supports-color": "3.1.x", + "which": "1.2.x", + "wordwrap": "1.0.x" + }, + "dependencies": { + "source-map": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.4.4.tgz", + "integrity": "sha1-66T12pwNyZneaAMti092FzZSA2s=", + "dev": true, + "requires": { + "amdefine": ">=0.0.4" + } + }, + "supports-color": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.1.2.tgz", + 
"integrity": "sha1-cqJiiU2dQIuVbKBf83su2KbiotU=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "babel-loader": { + "version": "7.1.5", + "resolved": "https://registry.npmjs.org/babel-loader/-/babel-loader-7.1.5.tgz", + "integrity": "sha512-iCHfbieL5d1LfOQeeVJEUyD9rTwBcP/fcEbRCfempxTDuqrKpu0AZjLAQHEQa3Yqyj9ORKe2iHfoj4rHLf7xpw==", + "dev": true, + "requires": { + "find-cache-dir": "^1.0.0", + "loader-utils": "^1.0.2", + "mkdirp": "^0.5.1" + } + }, + "babel-messages": { + "version": "6.23.0", + "resolved": "https://registry.npmjs.org/babel-messages/-/babel-messages-6.23.0.tgz", + "integrity": "sha1-8830cDhYA1sqKVHG7F7fbGLyYw4=", + "dev": true, + "requires": { + "babel-runtime": "^6.22.0" + } + }, + "babel-plugin-css-modules-transform": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/babel-plugin-css-modules-transform/-/babel-plugin-css-modules-transform-1.6.1.tgz", + "integrity": "sha512-Iv98dKRgQnhuHFcn2heHH1OpHo9LGyoKjlsAkj6/Q3wkwpVyHrNfVua/WHnrwe2f7EHy1KANnOSg+q4AJ6ZzaQ==", + "dev": true, + "requires": { + "css-modules-require-hook": "^4.0.6", + "mkdirp": "^0.5.1" + } + }, + "babel-polyfill": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-polyfill/-/babel-polyfill-6.26.0.tgz", + "integrity": "sha1-N5k3q8Z9eJWXCtxiHyhM2WbPIVM=", + "dev": true, + "requires": { + "babel-runtime": "^6.26.0", + "core-js": "^2.5.0", + "regenerator-runtime": "^0.10.5" + }, + "dependencies": { + "regenerator-runtime": { + "version": "0.10.5", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.10.5.tgz", + "integrity": "sha1-M2w+/BIgrc7dosn6tntaeVWjNlg=", + "dev": true + } + } + }, + "babel-register": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-register/-/babel-register-6.26.0.tgz", + "integrity": "sha1-btAhFz4vy0htestFxgCahW9kcHE=", + "dev": true, + "requires": { + "babel-core": "^6.26.0", + "babel-runtime": "^6.26.0", + "core-js": "^2.5.0", + "home-or-tmp": "^2.0.0", + "lodash": "^4.17.4", + "mkdirp": "^0.5.1", + "source-map-support": "^0.4.15" + } + }, + "babel-runtime": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-runtime/-/babel-runtime-6.26.0.tgz", + "integrity": "sha1-llxwWGaOgrVde/4E/yM3vItWR/4=", + "dev": true, + "requires": { + "core-js": "^2.4.0", + "regenerator-runtime": "^0.11.0" + } + }, + "babel-template": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-template/-/babel-template-6.26.0.tgz", + "integrity": "sha1-3gPi0WOWsGn0bdn/+FIfsaDjXgI=", + "dev": true, + "requires": { + "babel-runtime": "^6.26.0", + "babel-traverse": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "lodash": "^4.17.4" + } + }, + "babel-traverse": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-traverse/-/babel-traverse-6.26.0.tgz", + "integrity": "sha1-RqnL1+3MYsjlwGTi0tjQ9ANXZu4=", + "dev": true, + "requires": { + "babel-code-frame": "^6.26.0", + "babel-messages": "^6.23.0", + "babel-runtime": "^6.26.0", + "babel-types": "^6.26.0", + "babylon": "^6.18.0", + "debug": "^2.6.8", + "globals": "^9.18.0", + "invariant": "^2.2.2", + "lodash": "^4.17.4" + } + }, + "babel-types": { + "version": "6.26.0", + "resolved": "https://registry.npmjs.org/babel-types/-/babel-types-6.26.0.tgz", + "integrity": "sha1-o7Bz+Uq0nrb6Vc1lInozQ4BjJJc=", + "dev": true, + "requires": { + "babel-runtime": "^6.26.0", + "esutils": "^2.0.2", + "lodash": "^4.17.4", + "to-fast-properties": "^1.0.3" + } + }, + "babylon": { + "version": 
"6.18.0", + "resolved": "https://registry.npmjs.org/babylon/-/babylon-6.18.0.tgz", + "integrity": "sha512-q/UEjfGJ2Cm3oKV71DJz9d25TPnq5rhBVL2Q4fA5wcC3jcrdn7+SssEybFIxwAvvP+YCsCYNKughoF33GxgycQ==", + "dev": true + }, + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "dev": true + }, + "base": { + "version": "0.11.2", + "resolved": "https://registry.npmjs.org/base/-/base-0.11.2.tgz", + "integrity": "sha512-5T6P4xPgpp0YDFvSWwEZ4NoE3aM4QBQXDzmVbraCkFj8zHM+mba8SyqB5DbZWyR7mYHo6Y7BdQo3MoA4m0TeQg==", + "dev": true, + "requires": { + "cache-base": "^1.0.1", + "class-utils": "^0.3.5", + "component-emitter": "^1.2.1", + "define-property": "^1.0.0", + "isobject": "^3.0.1", + "mixin-deep": "^1.2.0", + "pascalcase": "^0.1.1" + }, + "dependencies": { + "define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", + "dev": true, + "requires": { + "is-descriptor": "^1.0.0" + } + }, + "is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "requires": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + } + }, + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "base64-js": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.3.0.tgz", + "integrity": "sha512-ccav/yGvoa80BQDljCxsmmQ3Xvx60/UpBIij5QN21W3wBi/hhIC9OoO+KLpu9IJTS9j4DRVJ3aDDF9cMSoa2lw==", + "dev": true + }, + "big.js": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/big.js/-/big.js-3.2.0.tgz", + "integrity": "sha512-+hN/Zh2D08Mx65pZ/4g5bsmNiZUuChDiQfTUQ7qJr4/kuopCr88xZsAXv6mBoZEsUI4OuGHlX59qE94K2mMW8Q==", + "dev": true + }, + "binary-extensions": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-1.11.0.tgz", + "integrity": "sha1-RqoXUftqL5PuXmibsQh9SxTGwgU=", + "dev": true + }, + "bluebird": { + "version": "3.5.1", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.1.tgz", + "integrity": "sha512-MKiLiV+I1AA596t9w1sQJ8jkiSr5+ZKi0WKrYGUn6d1Fx+Ij4tIj+m2WMQSGczs5jZVxV339chE8iwk6F64wjA==", + "dev": true + }, + "bn.js": { + "version": "4.11.8", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.11.8.tgz", + "integrity": 
"sha512-ItfYfPLkWHUjckQCk8xC+LwxgK8NYcXywGigJgSwOP8Y2iyWT4f2vsZnoOXTTbo+o5yXmIUJ4gn5538SO5S3gA==", + "dev": true + }, + "bootstrap-3-typeahead": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/bootstrap-3-typeahead/-/bootstrap-3-typeahead-4.0.2.tgz", + "integrity": "sha1-yxyWkESFaGIJb8jHHMIbOsu1BBI=" + }, + "bootstrap-toggle": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/bootstrap-toggle/-/bootstrap-toggle-2.2.2.tgz", + "integrity": "sha1-K4hTT8G5mGdPh3+Yug2LW3Q+lv4=" + }, + "brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "braces": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-2.3.2.tgz", + "integrity": "sha512-aNdbnj9P8PjdXU4ybaWLK2IF3jc/EoDYbC7AazW6to3TRsfXxscC9UXOB5iDiEQrkyIbWp2SLQda4+QAa7nc3w==", + "dev": true, + "requires": { + "arr-flatten": "^1.1.0", + "array-unique": "^0.3.2", + "extend-shallow": "^2.0.1", + "fill-range": "^4.0.0", + "isobject": "^3.0.1", + "repeat-element": "^1.1.2", + "snapdragon": "^0.8.1", + "snapdragon-node": "^2.0.1", + "split-string": "^3.0.2", + "to-regex": "^3.0.1" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } + } + }, + "brorand": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/brorand/-/brorand-1.1.0.tgz", + "integrity": "sha1-EsJe/kCkXjwyPrhnWgoM5XsiNx8=", + "dev": true + }, + "browserify-aes": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", + "integrity": "sha512-+7CHXqGuspUn/Sl5aO7Ea0xWGAtETPXNSAjHo48JfLdPWcMng33Xe4znFvQweqc/uzk5zSOI3H52CYnjCfb5hA==", + "dev": true, + "requires": { + "buffer-xor": "^1.0.3", + "cipher-base": "^1.0.0", + "create-hash": "^1.1.0", + "evp_bytestokey": "^1.0.3", + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "browserify-cipher": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/browserify-cipher/-/browserify-cipher-1.0.1.tgz", + "integrity": "sha512-sPhkz0ARKbf4rRQt2hTpAHqn47X3llLkUGn+xEJzLjwY8LRs2p0v7ljvI5EyoRO/mexrNunNECisZs+gw2zz1w==", + "dev": true, + "requires": { + "browserify-aes": "^1.0.4", + "browserify-des": "^1.0.0", + "evp_bytestokey": "^1.0.0" + } + }, + "browserify-des": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/browserify-des/-/browserify-des-1.0.2.tgz", + "integrity": "sha512-BioO1xf3hFwz4kc6iBhI3ieDFompMhrMlnDFC4/0/vd5MokpuAc3R+LYbwTA9A5Yc9pq9UYPqffKpW2ObuwX5A==", + "dev": true, + "requires": { + "cipher-base": "^1.0.1", + "des.js": "^1.0.0", + "inherits": "^2.0.1", + "safe-buffer": "^5.1.2" + } + }, + "browserify-rsa": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.0.1.tgz", + "integrity": "sha1-IeCr+vbyApzy+vsTNWenAdQTVSQ=", + "dev": true, + "requires": { + "bn.js": "^4.1.0", + "randombytes": "^2.0.1" + } + }, + "browserify-sign": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/browserify-sign/-/browserify-sign-4.0.4.tgz", + "integrity": "sha1-qk62jl17ZYuqa/alfmMMvXqT0pg=", + "dev": true, + "requires": { + "bn.js": 
"^4.1.1", + "browserify-rsa": "^4.0.0", + "create-hash": "^1.1.0", + "create-hmac": "^1.1.2", + "elliptic": "^6.0.0", + "inherits": "^2.0.1", + "parse-asn1": "^5.0.0" + } + }, + "browserify-zlib": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/browserify-zlib/-/browserify-zlib-0.2.0.tgz", + "integrity": "sha512-Z942RysHXmJrhqk88FmKBVq/v5tqmSkDz7p54G/MGyjMnCFFnC79XWNbg+Vta8W6Wb2qtSZTSxIGkJrRpCFEiA==", + "dev": true, + "requires": { + "pako": "~1.0.5" + } + }, + "browserslist": { + "version": "1.7.7", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-1.7.7.tgz", + "integrity": "sha1-C9dnBCWL6CmyOYu1Dkti0aFmsLk=", + "dev": true, + "requires": { + "caniuse-db": "^1.0.30000639", + "electron-to-chromium": "^1.2.7" + } + }, + "buffer": { + "version": "4.9.1", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-4.9.1.tgz", + "integrity": "sha1-bRu2AbB6TvztlwlBMgkwJ8lbwpg=", + "dev": true, + "requires": { + "base64-js": "^1.0.2", + "ieee754": "^1.1.4", + "isarray": "^1.0.0" + } + }, + "buffer-from": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.0.tgz", + "integrity": "sha512-c5mRlguI/Pe2dSZmpER62rSCu0ryKmWddzRYsuXc50U2/g8jMOulc31VZMa4mYx31U5xsmSOpDCgH88Vl9cDGQ==", + "dev": true + }, + "buffer-xor": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/buffer-xor/-/buffer-xor-1.0.3.tgz", + "integrity": "sha1-JuYe0UIvtw3ULm42cp7VHYVf6Nk=", + "dev": true + }, + "builtin-modules": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-1.1.1.tgz", + "integrity": "sha1-Jw8HbFpywC9bZaR9+Uxf46J4iS8=", + "dev": true + }, + "builtin-status-codes": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/builtin-status-codes/-/builtin-status-codes-3.0.0.tgz", + "integrity": "sha1-hZgoeOIbmOHGZCXgPQF0eI9Wnug=", + "dev": true + }, + "cacache": { + "version": "10.0.4", + "resolved": "https://registry.npmjs.org/cacache/-/cacache-10.0.4.tgz", + "integrity": "sha512-Dph0MzuH+rTQzGPNT9fAnrPmMmjKfST6trxJeK7NQuHRaVw24VzPRWTmg9MpcwOVQZO0E1FBICUlFeNaKPIfHA==", + "dev": true, + "requires": { + "bluebird": "^3.5.1", + "chownr": "^1.0.1", + "glob": "^7.1.2", + "graceful-fs": "^4.1.11", + "lru-cache": "^4.1.1", + "mississippi": "^2.0.0", + "mkdirp": "^0.5.1", + "move-concurrently": "^1.0.1", + "promise-inflight": "^1.0.1", + "rimraf": "^2.6.2", + "ssri": "^5.2.4", + "unique-filename": "^1.1.0", + "y18n": "^4.0.0" + }, + "dependencies": { + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + } + } + }, + "cache-base": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/cache-base/-/cache-base-1.0.1.tgz", + "integrity": "sha512-AKcdTnFSWATd5/GCPRxr2ChwIJ85CeyrEyjRHlKxQ56d4XJMGym0uAiKn0xbLOGOl3+yRpOTi484dVCEc5AUzQ==", + "dev": true, + "requires": { + "collection-visit": "^1.0.0", + "component-emitter": "^1.2.1", + "get-value": "^2.0.6", + "has-value": "^1.0.0", + "isobject": "^3.0.1", + "set-value": "^2.0.0", + "to-object-path": "^0.3.0", + "union-value": "^1.0.0", + "unset-value": "^1.0.0" + } + }, + "caller-path": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/caller-path/-/caller-path-0.1.0.tgz", 
+ "integrity": "sha1-lAhe9jWB7NPaqSREqP6U6CV3dR8=", + "dev": true, + "requires": { + "callsites": "^0.2.0" + } + }, + "callsites": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-0.2.0.tgz", + "integrity": "sha1-r6uWJikQp/M8GaV3WCXGnzTjUMo=", + "dev": true + }, + "camelcase": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-1.2.1.tgz", + "integrity": "sha1-m7UwTS4LVmmLLHWLCKPqqdqlijk=", + "dev": true, + "optional": true + }, + "caniuse-api": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/caniuse-api/-/caniuse-api-1.6.1.tgz", + "integrity": "sha1-tTTnxzTE+B7F++isoq0kNUuWLGw=", + "dev": true, + "requires": { + "browserslist": "^1.3.6", + "caniuse-db": "^1.0.30000529", + "lodash.memoize": "^4.1.2", + "lodash.uniq": "^4.5.0" + } + }, + "caniuse-db": { + "version": "1.0.30000865", + "resolved": "https://registry.npmjs.org/caniuse-db/-/caniuse-db-1.0.30000865.tgz", + "integrity": "sha1-gv+2TUD3VnYgqsAtOmMgeWiavGs=", + "dev": true + }, + "center-align": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/center-align/-/center-align-0.1.3.tgz", + "integrity": "sha1-qg0yYptu6XIgBBHL1EYckHvCt60=", + "dev": true, + "optional": true, + "requires": { + "align-text": "^0.1.3", + "lazy-cache": "^1.0.3" + } + }, + "chalk": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", + "dev": true, + "requires": { + "ansi-styles": "^2.2.1", + "escape-string-regexp": "^1.0.2", + "has-ansi": "^2.0.0", + "strip-ansi": "^3.0.0", + "supports-color": "^2.0.0" + } + }, + "chardet": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.5.0.tgz", + "integrity": "sha512-9ZTaoBaePSCFvNlNGrsyI8ZVACP2svUtq0DkM7t4K2ClAa96sqOIRjAzDTc8zXzFt1cZR46rRzLTiHFSJ+Qw0g==", + "dev": true + }, + "chokidar": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-2.0.4.tgz", + "integrity": "sha512-z9n7yt9rOvIJrMhvDtDictKrkFHeihkNl6uWMmZlmL6tJtX9Cs+87oK+teBx+JIgzvbX3yZHT3eF8vpbDxHJXQ==", + "dev": true, + "requires": { + "anymatch": "^2.0.0", + "async-each": "^1.0.0", + "braces": "^2.3.0", + "fsevents": "^1.2.2", + "glob-parent": "^3.1.0", + "inherits": "^2.0.1", + "is-binary-path": "^1.0.0", + "is-glob": "^4.0.0", + "lodash.debounce": "^4.0.8", + "normalize-path": "^2.1.1", + "path-is-absolute": "^1.0.0", + "readdirp": "^2.0.0", + "upath": "^1.0.5" + } + }, + "chownr": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.0.1.tgz", + "integrity": "sha1-4qdQQqlVGQi+vSW4Uj1fl2nXkYE=", + "dev": true + }, + "chrome-trace-event": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/chrome-trace-event/-/chrome-trace-event-1.0.0.tgz", + "integrity": "sha512-xDbVgyfDTT2piup/h8dK/y4QZfJRSa73bw1WZ8b4XM1o7fsFubUVGYcE+1ANtOzJJELGpYoG2961z0Z6OAld9A==", + "dev": true, + "requires": { + "tslib": "^1.9.0" + } + }, + "cipher-base": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/cipher-base/-/cipher-base-1.0.4.tgz", + "integrity": "sha512-Kkht5ye6ZGmwv40uUDZztayT2ThLQGfnj/T71N/XzeZeo3nf8foyW7zGTsPYkEya3m5f3cAypH+qe7YOrM1U2Q==", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "circular-json": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/circular-json/-/circular-json-0.3.3.tgz", + "integrity": 
"sha512-UZK3NBx2Mca+b5LsG7bY183pHWt5Y1xts4P3Pz7ENTwGVnJOUWbRb3ocjvX7hx9tq/yTAdclXm9sZ38gNuem4A==", + "dev": true + }, + "clap": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/clap/-/clap-1.2.3.tgz", + "integrity": "sha512-4CoL/A3hf90V3VIEjeuhSvlGFEHKzOz+Wfc2IVZc+FaUgU0ZQafJTP49fvnULipOPcAfqhyI2duwQyns6xqjYA==", + "dev": true, + "requires": { + "chalk": "^1.1.3" + } + }, + "class-utils": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/class-utils/-/class-utils-0.3.6.tgz", + "integrity": "sha512-qOhPa/Fj7s6TY8H8esGu5QNpMMQxz79h+urzrNYN6mn+9BnxlDGf5QZ+XeCDsxSjPqsSR56XOZOJmpeurnLMeg==", + "dev": true, + "requires": { + "arr-union": "^3.1.0", + "define-property": "^0.2.5", + "isobject": "^3.0.0", + "static-extend": "^0.1.1" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + } + } + }, + "clean-webpack-plugin": { + "version": "0.1.19", + "resolved": "https://registry.npmjs.org/clean-webpack-plugin/-/clean-webpack-plugin-0.1.19.tgz", + "integrity": "sha512-M1Li5yLHECcN2MahoreuODul5LkjohJGFxLPTjl3j1ttKrF5rgjZET1SJduuqxLAuT1gAPOdkhg03qcaaU1KeA==", + "dev": true, + "requires": { + "rimraf": "^2.6.1" + } + }, + "cli-cursor": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-2.1.0.tgz", + "integrity": "sha1-s12sN2R5+sw+lHR9QdDQ9SOP/LU=", + "dev": true, + "requires": { + "restore-cursor": "^2.0.0" + } + }, + "cli-width": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cli-width/-/cli-width-2.2.0.tgz", + "integrity": "sha1-/xnt6Kml5XkyQUewwR8PvLq+1jk=", + "dev": true + }, + "cliui": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-2.1.0.tgz", + "integrity": "sha1-S0dXYP+AJkx2LDoXGQMukcf+oNE=", + "dev": true, + "optional": true, + "requires": { + "center-align": "^0.1.1", + "right-align": "^0.1.1", + "wordwrap": "0.0.2" + }, + "dependencies": { + "wordwrap": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.2.tgz", + "integrity": "sha1-t5Zpu0LstAn4PVg8rVLKF+qhZD8=", + "dev": true, + "optional": true + } + } + }, + "clone": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/clone/-/clone-1.0.4.tgz", + "integrity": "sha1-2jCcwmPfFZlMaIypAheco8fNfH4=", + "dev": true + }, + "co": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", + "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ=", + "dev": true + }, + "coa": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/coa/-/coa-1.0.4.tgz", + "integrity": "sha1-qe8VNmDWqGqL3sAomlxoTSF0Mv0=", + "dev": true, + "requires": { + "q": "^1.1.2" + } + }, + "code-point-at": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", + "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", + "dev": true + }, + "collection-visit": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/collection-visit/-/collection-visit-1.0.0.tgz", + "integrity": "sha1-S8A3PBZLwykbTTaMgpzxqApZ3KA=", + "dev": true, + "requires": { + "map-visit": "^1.0.0", + "object-visit": "^1.0.0" + } + }, + "color": { + "version": "0.11.4", + "resolved": "https://registry.npmjs.org/color/-/color-0.11.4.tgz", + "integrity": "sha1-bXtcdPtl6EHNSHkq0e1eB7kE12Q=", + "dev": true, + "requires": { + "clone": "^1.0.2", + 
"color-convert": "^1.3.0", + "color-string": "^0.3.0" + } + }, + "color-convert": { + "version": "1.9.2", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-1.9.2.tgz", + "integrity": "sha512-3NUJZdhMhcdPn8vJ9v2UQJoH0qqoGUkYTgFEPZaPjEtwmmKUfNV46zZmgB2M5M4DCEQHMaCfWHCxiBflLm04Tg==", + "dev": true, + "requires": { + "color-name": "1.1.1" + } + }, + "color-name": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.1.tgz", + "integrity": "sha1-SxQVMEz1ACjqgWQ2Q72C6gWANok=", + "dev": true + }, + "color-string": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-0.3.0.tgz", + "integrity": "sha1-J9RvtnAlxcL6JZk7+/V55HhBuZE=", + "dev": true, + "requires": { + "color-name": "^1.0.0" + } + }, + "colormin": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/colormin/-/colormin-1.1.2.tgz", + "integrity": "sha1-6i90IKcrlogaOKrlnsEkpvcpgTM=", + "dev": true, + "requires": { + "color": "^0.11.0", + "css-color-names": "0.0.4", + "has": "^1.0.1" + } + }, + "colors": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/colors/-/colors-1.1.2.tgz", + "integrity": "sha1-FopHAXVran9RoSzgyXv6KMCE7WM=", + "dev": true + }, + "commander": { + "version": "2.13.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.13.0.tgz", + "integrity": "sha512-MVuS359B+YzaWqjCL/c+22gfryv+mCBPHAv3zyVI2GN8EY6IRP8VwtasXn8jyyhvvq84R4ImN1OKRtcbIasjYA==", + "dev": true + }, + "commondir": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/commondir/-/commondir-1.0.1.tgz", + "integrity": "sha1-3dgA2gxmEnOTzKWVDqloo6rxJTs=", + "dev": true + }, + "component-emitter": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/component-emitter/-/component-emitter-1.2.1.tgz", + "integrity": "sha1-E3kY1teCg/ffemt8WmPhQOaUJeY=", + "dev": true + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "dev": true + }, + "concat-stream": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz", + "integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==", + "dev": true, + "requires": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, + "console-browserify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/console-browserify/-/console-browserify-1.1.0.tgz", + "integrity": "sha1-8CQcRXMKn8YyOyBtvzjtx0HQuxA=", + "dev": true, + "requires": { + "date-now": "^0.1.4" + } + }, + "constants-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/constants-browserify/-/constants-browserify-1.0.0.tgz", + "integrity": "sha1-wguW2MYXdIqvHBYCF2DNJ/y4y3U=", + "dev": true + }, + "contains-path": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/contains-path/-/contains-path-0.1.0.tgz", + "integrity": "sha1-/ozxhP9mcLa67wGp1IYaXL7EEgo=", + "dev": true + }, + "convert-source-map": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-1.5.1.tgz", + "integrity": "sha1-uCeAl7m8IpNl3lxiz1/K7YtVmeU=", + "dev": true + }, + "copy-concurrently": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/copy-concurrently/-/copy-concurrently-1.0.5.tgz", + "integrity": 
"sha512-f2domd9fsVDFtaFcbaRZuYXwtdmnzqbADSwhSWYxYB/Q8zsdUUFMXVRwXGDMWmbEzAn1kdRrtI1T/KTFOL4X2A==", + "dev": true, + "requires": { + "aproba": "^1.1.1", + "fs-write-stream-atomic": "^1.0.8", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.0" + } + }, + "copy-descriptor": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/copy-descriptor/-/copy-descriptor-0.1.1.tgz", + "integrity": "sha1-Z29us8OZl8LuGsOpJP1hJHSPV40=", + "dev": true + }, + "copy-webpack-plugin": { + "version": "4.5.2", + "resolved": "https://registry.npmjs.org/copy-webpack-plugin/-/copy-webpack-plugin-4.5.2.tgz", + "integrity": "sha512-zmC33E8FFSq3AbflTvqvPvBo621H36Afsxlui91d+QyZxPIuXghfnTsa1CuqiAaCPgJoSUWfTFbKJnadZpKEbQ==", + "dev": true, + "requires": { + "cacache": "^10.0.4", + "find-cache-dir": "^1.0.0", + "globby": "^7.1.1", + "is-glob": "^4.0.0", + "loader-utils": "^1.1.0", + "minimatch": "^3.0.4", + "p-limit": "^1.0.0", + "serialize-javascript": "^1.4.0" + } + }, + "core-js": { + "version": "2.5.7", + "resolved": "https://registry.npmjs.org/core-js/-/core-js-2.5.7.tgz", + "integrity": "sha512-RszJCAxg/PP6uzXVXL6BsxSXx/B05oJAQ2vkJRjyjrEcNVycaqOmNb5OTxZPE3xa5gwZduqza6L9JOCenh/Ecw==", + "dev": true + }, + "core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", + "dev": true + }, + "create-ecdh": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/create-ecdh/-/create-ecdh-4.0.3.tgz", + "integrity": "sha512-GbEHQPMOswGpKXM9kCWVrremUcBmjteUaQ01T9rkKCPDXfUHX0IoP9LpHYo2NPFampa4e+/pFDc3jQdxrxQLaw==", + "dev": true, + "requires": { + "bn.js": "^4.1.0", + "elliptic": "^6.0.0" + } + }, + "create-hash": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", + "integrity": "sha512-z00bCGNHDG8mHAkP7CtT1qVu+bFQUPjYq/4Iv3C3kWjTFV10zIjfSoeqXo9Asws8gwSHDGj/hl2u4OGIjapeCg==", + "dev": true, + "requires": { + "cipher-base": "^1.0.1", + "inherits": "^2.0.1", + "md5.js": "^1.3.4", + "ripemd160": "^2.0.1", + "sha.js": "^2.4.0" + } + }, + "create-hmac": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", + "integrity": "sha512-MJG9liiZ+ogc4TzUwuvbER1JRdgvUFSB5+VR/g5h82fGaIRWMWddtKBHi7/sVhfjQZ6SehlyhvQYrcYkaUIpLg==", + "dev": true, + "requires": { + "cipher-base": "^1.0.3", + "create-hash": "^1.1.0", + "inherits": "^2.0.1", + "ripemd160": "^2.0.0", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + } + }, + "cross-spawn": { + "version": "6.0.5", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-6.0.5.tgz", + "integrity": "sha512-eTVLrBSt7fjbDygz805pMnstIs2VTBNkRm0qxZd+M7A5XDdxVRWO5MxGBXZhjY4cqLYLdtrGqRf8mBPmzwSpWQ==", + "dev": true, + "requires": { + "nice-try": "^1.0.4", + "path-key": "^2.0.1", + "semver": "^5.5.0", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + } + }, + "crypto-browserify": { + "version": "3.12.0", + "resolved": "https://registry.npmjs.org/crypto-browserify/-/crypto-browserify-3.12.0.tgz", + "integrity": "sha512-fz4spIh+znjO2VjL+IdhEpRJ3YN6sMzITSBijk6FK2UvTqruSQW+/cCZTSNsMiZNvUeq0CqurF+dAbyiGOY6Wg==", + "dev": true, + "requires": { + "browserify-cipher": "^1.0.0", + "browserify-sign": "^4.0.0", + "create-ecdh": "^4.0.0", + "create-hash": "^1.1.0", + "create-hmac": "^1.1.0", + "diffie-hellman": "^5.0.0", + "inherits": "^2.0.1", + "pbkdf2": "^3.0.3", + "public-encrypt": "^4.0.0", + "randombytes": "^2.0.0", + "randomfill": 
"^1.0.3" + } + }, + "css-color-names": { + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/css-color-names/-/css-color-names-0.0.4.tgz", + "integrity": "sha1-gIrcLnnPhHOAabZGyyDsJ762KeA=", + "dev": true + }, + "css-loader": { + "version": "0.28.11", + "resolved": "https://registry.npmjs.org/css-loader/-/css-loader-0.28.11.tgz", + "integrity": "sha512-wovHgjAx8ZIMGSL8pTys7edA1ClmzxHeY6n/d97gg5odgsxEgKjULPR0viqyC+FWMCL9sfqoC/QCUBo62tLvPg==", + "dev": true, + "requires": { + "babel-code-frame": "^6.26.0", + "css-selector-tokenizer": "^0.7.0", + "cssnano": "^3.10.0", + "icss-utils": "^2.1.0", + "loader-utils": "^1.0.2", + "lodash.camelcase": "^4.3.0", + "object-assign": "^4.1.1", + "postcss": "^5.0.6", + "postcss-modules-extract-imports": "^1.2.0", + "postcss-modules-local-by-default": "^1.2.0", + "postcss-modules-scope": "^1.1.0", + "postcss-modules-values": "^1.3.0", + "postcss-value-parser": "^3.3.0", + "source-list-map": "^2.0.0" + }, + "dependencies": { + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "icss-utils": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/icss-utils/-/icss-utils-2.1.0.tgz", + "integrity": "sha1-g/Cg7DeL8yRheLbCrZE28TWxyWI=", + "dev": true, + "requires": { + "postcss": "^6.0.1" + }, + "dependencies": { + "postcss": { + "version": "6.0.23", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-6.0.23.tgz", + "integrity": "sha512-soOk1h6J3VMTZtVeVpv15/Hpdl2cBLX3CAw4TAbkpTJiNPk9YP/zWcD1ND+xEtvyuuvKzbxliTOIyvkSeSJ6ag==", + "dev": true, + "requires": { + "chalk": "^2.4.1", + "source-map": "^0.6.1", + "supports-color": "^5.4.0" + } + } + } + }, + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + }, + "dependencies": { + "ansi-styles": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-2.2.1.tgz", + "integrity": "sha1-tDLdM1i2NM914eRmQ2gkBTPB3b4=", + "dev": true + }, + "chalk": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-1.1.3.tgz", + "integrity": "sha1-qBFcVeSnAv5NFQq9OHKCKn4J/Jg=", + "dev": true, + "requires": { + "ansi-styles": "^2.2.1", + "escape-string-regexp": "^1.0.2", + "has-ansi": "^2.0.0", + "strip-ansi": "^3.0.0", + "supports-color": "^2.0.0" + }, + "dependencies": { + "supports-color": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", + "dev": true + } + } + }, + 
"has-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-1.0.0.tgz", + "integrity": "sha1-nZ55MWXOAXoA8AQYxD+UKnsdEfo=", + "dev": true + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-modules-extract-imports": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postcss-modules-extract-imports/-/postcss-modules-extract-imports-1.2.0.tgz", + "integrity": "sha1-ZhQOzs447wa/DT41XWm/WdFB6oU=", + "dev": true, + "requires": { + "postcss": "^6.0.1" + }, + "dependencies": { + "postcss": { + "version": "6.0.23", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-6.0.23.tgz", + "integrity": "sha512-soOk1h6J3VMTZtVeVpv15/Hpdl2cBLX3CAw4TAbkpTJiNPk9YP/zWcD1ND+xEtvyuuvKzbxliTOIyvkSeSJ6ag==", + "dev": true, + "requires": { + "chalk": "^2.4.1", + "source-map": "^0.6.1", + "supports-color": "^5.4.0" + } + } + } + }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "css-modules-require-hook": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/css-modules-require-hook/-/css-modules-require-hook-4.2.3.tgz", + "integrity": "sha1-Z5LKQSsV4j5vm+agfc739Xf/kE0=", + "dev": true, + "requires": { + "debug": "^2.2.0", + "generic-names": "^1.0.1", + "glob-to-regexp": "^0.3.0", + "icss-replace-symbols": "^1.0.2", + "lodash": "^4.3.0", + "postcss": "^6.0.1", + "postcss-modules-extract-imports": "^1.0.0", + "postcss-modules-local-by-default": "^1.0.1", + "postcss-modules-resolve-imports": "^1.3.0", + "postcss-modules-scope": "^1.0.0", + "postcss-modules-values": "^1.1.1", + "seekout": "^1.0.1" + } + }, + "css-selector-tokenizer": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.7.0.tgz", + "integrity": "sha1-5piEdK6MlTR3v15+/s/OzNnPTIY=", + "dev": true, + "requires": { + "cssesc": "^0.1.0", + "fastparse": "^1.1.1", + "regexpu-core": "^1.0.0" + } + }, + "cssesc": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-0.1.0.tgz", + "integrity": "sha1-yBSQPkViM3GgR3tAEJqq++6t27Q=", + "dev": true + }, + "cssnano": { + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/cssnano/-/cssnano-3.10.0.tgz", + "integrity": "sha1-Tzj2zqK5sX+gFJDyPx3GjqZcHDg=", + "dev": true, + "requires": { + "autoprefixer": "^6.3.1", + "decamelize": "^1.1.2", + "defined": "^1.0.0", + "has": "^1.0.1", + "object-assign": "^4.0.1", + "postcss": "^5.0.14", + "postcss-calc": "^5.2.0", + "postcss-colormin": "^2.1.8", + "postcss-convert-values": "^2.3.4", + "postcss-discard-comments": "^2.0.4", + "postcss-discard-duplicates": "^2.0.1", + "postcss-discard-empty": 
"^2.0.1", + "postcss-discard-overridden": "^0.1.1", + "postcss-discard-unused": "^2.2.1", + "postcss-filter-plugins": "^2.0.0", + "postcss-merge-idents": "^2.1.5", + "postcss-merge-longhand": "^2.0.1", + "postcss-merge-rules": "^2.0.3", + "postcss-minify-font-values": "^1.0.2", + "postcss-minify-gradients": "^1.0.1", + "postcss-minify-params": "^1.0.4", + "postcss-minify-selectors": "^2.0.4", + "postcss-normalize-charset": "^1.1.0", + "postcss-normalize-url": "^3.0.7", + "postcss-ordered-values": "^2.1.0", + "postcss-reduce-idents": "^2.2.2", + "postcss-reduce-initial": "^1.0.0", + "postcss-reduce-transforms": "^1.0.3", + "postcss-svgo": "^2.1.1", + "postcss-unique-selectors": "^2.0.2", + "postcss-value-parser": "^3.2.3", + "postcss-zindex": "^2.0.1" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "csso": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/csso/-/csso-2.3.2.tgz", + "integrity": "sha1-3dUsWHAz9J6Utx/FVWnyUuj/X4U=", + "dev": true, + "requires": { + "clap": "^1.0.9", + "source-map": "^0.5.3" + } + }, + "cyclist": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/cyclist/-/cyclist-0.2.2.tgz", + "integrity": "sha1-GzN5LhHpFKL9bW7WRHRkRE5fpkA=", + "dev": true + }, + "d": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/d/-/d-1.0.0.tgz", + "integrity": "sha1-dUu1v+VUUdpppYuU1F9MWwRi1Y8=", + "dev": true, + "requires": { + "es5-ext": "^0.10.9" + } + }, + "d3": { + "version": "3.5.17", + "resolved": "https://registry.npmjs.org/d3/-/d3-3.5.17.tgz", + "integrity": "sha1-vEZ0gAQ3iyGjYMn8fPUjF5B2L7g=" + }, + "d3-collection": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/d3-collection/-/d3-collection-1.0.4.tgz", + "integrity": "sha1-NC39EoN8kJdPM/HMCnha6lcNzcI=" + }, + "d3-selection": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-1.3.0.tgz", + "integrity": "sha512-qgpUOg9tl5CirdqESUAu0t9MU/t3O9klYfGfyKsXEmhyxyzLpzpeh08gaxBUTQw1uXIOkr/30Ut2YRjSSxlmHA==" + }, + "d3-tip": { + "version": "0.9.1", + "resolved": "https://registry.npmjs.org/d3-tip/-/d3-tip-0.9.1.tgz", + "integrity": "sha512-EVBfG9d+HnjIoyVXfhpytWxlF59JaobwizqMX9EBXtsFmJytjwHeYiUs74ldHQjE7S9vzfKTx2LCtvUrIbuFYg==", + "requires": { + "d3-collection": "^1.0.4", + "d3-selection": "^1.3.0" + } + }, + "dagre": { + "version": "0.7.4", + "resolved": "https://registry.npmjs.org/dagre/-/dagre-0.7.4.tgz", + "integrity": "sha1-3nLw50pVDOEc5jjwoTb+1xI5gCI=", + "requires": { + "graphlib": "^1.0.5", + "lodash": "^3.10.0" + }, + "dependencies": { + "lodash": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz", + "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y=" + } + } + }, + "dagre-d3": { + "version": "0.4.18", + "resolved": "https://registry.npmjs.org/dagre-d3/-/dagre-d3-0.4.18.tgz", + "integrity": "sha512-7tRltaOfNTIkNEZYMCL8N3Q8bCre99x/mAJL2RbuUfPu5d+4f0KOHglZm+AzOG2Z/+S2HBDYciE6iDcDtki6Tg==", + 
"requires": { + "d3": "^3.3.8", + "dagre": "^0.7.3", + "graphlib": "^1.0.5", + "lodash": "^3.10.0" + }, + "dependencies": { + "lodash": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz", + "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y=" + } + } + }, + "datatables.net": { + "version": "1.10.19", + "resolved": "https://registry.npmjs.org/datatables.net/-/datatables.net-1.10.19.tgz", + "integrity": "sha512-+ljXcI6Pj3PTGy5pesp3E5Dr3x3AV45EZe0o1r0gKENN2gafBKXodVnk2ypKwl2tTmivjxbkiqoWnipTefyBTA==", + "requires": { + "jquery": ">=1.7" + } + }, + "datatables.net-bs": { + "version": "1.10.19", + "resolved": "https://registry.npmjs.org/datatables.net-bs/-/datatables.net-bs-1.10.19.tgz", + "integrity": "sha512-5gxoI2n+duZP06+4xVC2TtH6zcY369/TRKTZ1DdSgDcDUl4OYQsrXCuaLJmbVzna/5Y5lrMmK7CxgvYgIynICA==", + "requires": { + "datatables.net": "1.10.19", + "jquery": ">=1.7" + } + }, + "date-now": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/date-now/-/date-now-0.1.4.tgz", + "integrity": "sha1-6vQ5/U1ISK105cx9vvIAZyueNFs=", + "dev": true + }, + "debug": { + "version": "2.6.9", + "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", + "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "decamelize": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-1.2.0.tgz", + "integrity": "sha1-9lNNFRSCabIDUue+4m9QH5oZEpA=", + "dev": true + }, + "decode-uri-component": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/decode-uri-component/-/decode-uri-component-0.2.0.tgz", + "integrity": "sha1-6zkTMzRYd1y4TNGh+uBiEGu4dUU=", + "dev": true + }, + "deep-is": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.3.tgz", + "integrity": "sha1-s2nW+128E+7PUk+RsHD+7cNXzzQ=", + "dev": true + }, + "define-properties": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.1.2.tgz", + "integrity": "sha1-g6c/L+pWmJj7c3GTyPhzyvbUXJQ=", + "dev": true, + "requires": { + "foreach": "^2.0.5", + "object-keys": "^1.0.8" + } + }, + "define-property": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-2.0.2.tgz", + "integrity": "sha512-jwK2UV4cnPpbcG7+VRARKTZPUWowwXA8bzH5NP6ud0oeAxyYPuGZUAC7hMugpCdz4BeSZl2Dl9k66CHJ/46ZYQ==", + "dev": true, + "requires": { + "is-descriptor": "^1.0.2", + "isobject": "^3.0.1" + }, + "dependencies": { + "is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "requires": { + "is-accessor-descriptor": 
"^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + } + }, + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "defined": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/defined/-/defined-1.0.0.tgz", + "integrity": "sha1-yY2bzvdWdBiOEQlpFRGZ45sfppM=", + "dev": true + }, + "del": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/del/-/del-2.2.2.tgz", + "integrity": "sha1-wSyYHQZ4RshLyvhiz/kw2Qf/0ag=", + "dev": true, + "requires": { + "globby": "^5.0.0", + "is-path-cwd": "^1.0.0", + "is-path-in-cwd": "^1.0.0", + "object-assign": "^4.0.1", + "pify": "^2.0.0", + "pinkie-promise": "^2.0.0", + "rimraf": "^2.2.8" + }, + "dependencies": { + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "globby": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-5.0.0.tgz", + "integrity": "sha1-69hGZ8oNuzMLmbz8aOrCvFQ3Dg0=", + "dev": true, + "requires": { + "array-union": "^1.0.1", + "arrify": "^1.0.0", + "glob": "^7.0.3", + "object-assign": "^4.0.1", + "pify": "^2.0.0", + "pinkie-promise": "^2.0.0" + } + }, + "pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", + "dev": true + } + } + }, + "des.js": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/des.js/-/des.js-1.0.0.tgz", + "integrity": "sha1-wHTS4qpqipoH29YfmhXCzYPsjsw=", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "detect-indent": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/detect-indent/-/detect-indent-4.0.0.tgz", + "integrity": "sha1-920GQ1LN9Docts5hnE7jqUdd4gg=", + "dev": true, + "requires": { + "repeating": "^2.0.0" + } + }, + "diffie-hellman": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", + "integrity": "sha512-kqag/Nl+f3GwyK25fhUMYj81BUOrZ9IuJsjIcDE5icNM9FJHAVm3VcUDxdLPoQtTuUylWm6ZIknYJwwaPxsUzg==", + "dev": true, + "requires": { + "bn.js": "^4.1.0", + "miller-rabin": "^4.0.0", + "randombytes": "^2.0.0" + } + }, + "dir-glob": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-2.0.0.tgz", + "integrity": "sha512-37qirFDz8cA5fimp9feo43fSuRo2gHwaIn6dXL8Ber1dGwUosDrGZeCCXq57WnIqE4aQ+u3eQZzsk1yOzhdwag==", + "dev": true, + "requires": { + "arrify": "^1.0.1", + "path-type": "^3.0.0" + } + }, + "doctrine": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-2.1.0.tgz", + "integrity": "sha512-35mSku4ZXK0vfCuHEDAwt55dg2jNajHZ1odvF+8SSr82EsZY4QmXfuWso8oEd8zRhVObSN18aM0CjSdoBX7zIw==", + "dev": true, + "requires": { + "esutils": "^2.0.2" + } + }, + "dom-serializer": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", + "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=", + "dev": true, + "requires": { + "domelementtype": "~1.1.1", + "entities": "~1.1.1" + }, + 
"dependencies": { + "domelementtype": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz", + "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs=", + "dev": true + } + } + }, + "domain-browser": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/domain-browser/-/domain-browser-1.2.0.tgz", + "integrity": "sha512-jnjyiM6eRyZl2H+W8Q/zLMA481hzi0eszAaBUzIVnmYVDBbnLxVNnfu1HgEBvCbL+71FrxMl3E6lpKH7Ge3OXA==", + "dev": true + }, + "domelementtype": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.3.0.tgz", + "integrity": "sha1-sXrtguirWeUt2cGbF1bg/BhyBMI=", + "dev": true + }, + "domhandler": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.4.2.tgz", + "integrity": "sha512-JiK04h0Ht5u/80fdLMCEmV4zkNh2BcoMFBmZ/91WtYZ8qVXSKjiw7fXMgFPnHcSZgOo3XdinHvmnDUeMf5R4wA==", + "dev": true, + "requires": { + "domelementtype": "1" + } + }, + "domutils": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.7.0.tgz", + "integrity": "sha512-Lgd2XcJ/NjEw+7tFvfKxOzCYKZsdct5lczQ2ZaQY8Djz7pfAD3Gbp8ySJWtreII/vDlMVmxwa6pHmdxIYgttDg==", + "dev": true, + "requires": { + "dom-serializer": "0", + "domelementtype": "1" + } + }, + "duplexify": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/duplexify/-/duplexify-3.6.0.tgz", + "integrity": "sha512-fO3Di4tBKJpYTFHAxTU00BcfWMY9w24r/x21a6rZRbsD/ToUgGxsMbiGRmB7uVAXeGKXD9MwiLZa5E97EVgIRQ==", + "dev": true, + "requires": { + "end-of-stream": "^1.0.0", + "inherits": "^2.0.1", + "readable-stream": "^2.0.0", + "stream-shift": "^1.0.0" + } + }, + "electron-to-chromium": { + "version": "1.3.51", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.3.51.tgz", + "integrity": "sha1-akK0nar38ipbN7mR2vlJ8029ubU=", + "dev": true + }, + "elliptic": { + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/elliptic/-/elliptic-6.4.0.tgz", + "integrity": "sha1-ysmvh2LIWDYYcAPI3+GT5eLq5d8=", + "dev": true, + "requires": { + "bn.js": "^4.4.0", + "brorand": "^1.0.1", + "hash.js": "^1.0.0", + "hmac-drbg": "^1.0.0", + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0", + "minimalistic-crypto-utils": "^1.0.0" + } + }, + "emojis-list": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/emojis-list/-/emojis-list-2.1.0.tgz", + "integrity": "sha1-TapNnbAPmBmIDHn6RXrlsJof04k=", + "dev": true + }, + "end-of-stream": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.1.tgz", + "integrity": "sha512-1MkrZNvWTKCaigbn+W15elq2BB/L22nqrSY5DKlo3X6+vclJm8Bb5djXJBmEX6fS3+zCh/F4VBK5Z2KxJt4s2Q==", + "dev": true, + "requires": { + "once": "^1.4.0" + } + }, + "enhanced-resolve": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-4.1.0.tgz", + "integrity": "sha512-F/7vkyTtyc/llOIn8oWclcB25KdRaiPBpZYDgJHgh/UHtpgT2p2eldQgtQnLtUvfMKPKxbRaQM/hHkvLHt1Vng==", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "memory-fs": "^0.4.0", + "tapable": "^1.0.0" + } + }, + "entities": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.1.tgz", + "integrity": "sha1-blwtClYhtdra7O+AuQ7ftc13cvA=", + "dev": true + }, + "errno": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/errno/-/errno-0.1.7.tgz", + "integrity": "sha512-MfrRBDWzIWifgq6tJj60gkAwtLNb6sQPlcFrSOflcP1aFmmruKQ2wRnze/8V6kgyz7H3FF8Npzv78mZ7XLLflg==", 
+ "dev": true, + "requires": { + "prr": "~1.0.1" + } + }, + "error-ex": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", + "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", + "dev": true, + "requires": { + "is-arrayish": "^0.2.1" + } + }, + "es-abstract": { + "version": "1.12.0", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.12.0.tgz", + "integrity": "sha512-C8Fx/0jFmV5IPoMOFPA9P9G5NtqW+4cOPit3MIuvR2t7Ag2K15EJTpxnHAYTzL+aYQJIESYeXZmDBfOBE1HcpA==", + "dev": true, + "requires": { + "es-to-primitive": "^1.1.1", + "function-bind": "^1.1.1", + "has": "^1.0.1", + "is-callable": "^1.1.3", + "is-regex": "^1.0.4" + } + }, + "es-to-primitive": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.1.1.tgz", + "integrity": "sha1-RTVSSKiJeQNLZ5Lhm7gfK3l13Q0=", + "dev": true, + "requires": { + "is-callable": "^1.1.1", + "is-date-object": "^1.0.1", + "is-symbol": "^1.0.1" + } + }, + "es5-ext": { + "version": "0.10.45", + "resolved": "https://registry.npmjs.org/es5-ext/-/es5-ext-0.10.45.tgz", + "integrity": "sha512-FkfM6Vxxfmztilbxxz5UKSD4ICMf5tSpRFtDNtkAhOxZ0EKtX6qwmXNyH/sFyIbX2P/nU5AMiA9jilWsUGJzCQ==", + "dev": true, + "requires": { + "es6-iterator": "~2.0.3", + "es6-symbol": "~3.1.1", + "next-tick": "1" + } + }, + "es6-iterator": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/es6-iterator/-/es6-iterator-2.0.3.tgz", + "integrity": "sha1-p96IkUGgWpSwhUQDstCg+/qY87c=", + "dev": true, + "requires": { + "d": "1", + "es5-ext": "^0.10.35", + "es6-symbol": "^3.1.1" + } + }, + "es6-symbol": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/es6-symbol/-/es6-symbol-3.1.1.tgz", + "integrity": "sha1-vwDvT9q2uhtG7Le2KbTH7VcVzHc=", + "dev": true, + "requires": { + "d": "1", + "es5-ext": "~0.10.14" + } + }, + "escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", + "dev": true + }, + "escodegen": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-1.8.1.tgz", + "integrity": "sha1-WltTr0aTEQvrsIZ6o0MN07cKEBg=", + "dev": true, + "requires": { + "esprima": "^2.7.1", + "estraverse": "^1.9.1", + "esutils": "^2.0.2", + "optionator": "^0.8.1", + "source-map": "~0.2.0" + }, + "dependencies": { + "source-map": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.2.0.tgz", + "integrity": "sha1-2rc/vPwrqBm03gO9b26qSBZLP50=", + "dev": true, + "optional": true, + "requires": { + "amdefine": ">=0.0.4" + } + } + } + }, + "eslint": { + "version": "4.19.1", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-4.19.1.tgz", + "integrity": "sha512-bT3/1x1EbZB7phzYu7vCr1v3ONuzDtX8WjuM9c0iYxe+cq+pwcKEoQjl7zd3RpC6YOLgnSy3cTN58M2jcoPDIQ==", + "dev": true, + "requires": { + "ajv": "^5.3.0", + "babel-code-frame": "^6.22.0", + "chalk": "^2.1.0", + "concat-stream": "^1.6.0", + "cross-spawn": "^5.1.0", + "debug": "^3.1.0", + "doctrine": "^2.1.0", + "eslint-scope": "^3.7.1", + "eslint-visitor-keys": "^1.0.0", + "espree": "^3.5.4", + "esquery": "^1.0.0", + "esutils": "^2.0.2", + "file-entry-cache": "^2.0.0", + "functional-red-black-tree": "^1.0.1", + "glob": "^7.1.2", + "globals": "^11.0.1", + "ignore": "^3.3.3", + "imurmurhash": "^0.1.4", + "inquirer": "^3.0.6", + "is-resolvable": "^1.0.0", + "js-yaml": "^3.9.1", + 
"json-stable-stringify-without-jsonify": "^1.0.1", + "levn": "^0.3.0", + "lodash": "^4.17.4", + "minimatch": "^3.0.2", + "mkdirp": "^0.5.1", + "natural-compare": "^1.4.0", + "optionator": "^0.8.2", + "path-is-inside": "^1.0.2", + "pluralize": "^7.0.0", + "progress": "^2.0.0", + "regexpp": "^1.0.1", + "require-uncached": "^1.0.3", + "semver": "^5.3.0", + "strip-ansi": "^4.0.0", + "strip-json-comments": "~2.0.1", + "table": "4.0.2", + "text-table": "~0.2.0" + }, + "dependencies": { + "ajv": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz", + "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", + "dev": true, + "requires": { + "co": "^4.6.0", + "fast-deep-equal": "^1.0.0", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.3.0" + } + }, + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", + "dev": true + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "chardet": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-0.4.2.tgz", + "integrity": "sha1-tUc7M9yXxCTl2Y3IfVXU2KKci/I=", + "dev": true + }, + "cross-spawn": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-5.1.0.tgz", + "integrity": "sha1-6L0O/uWPz/b4+UUQoKVUu/ojVEk=", + "dev": true, + "requires": { + "lru-cache": "^4.0.1", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + } + }, + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "external-editor": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/external-editor/-/external-editor-2.2.0.tgz", + "integrity": "sha512-bSn6gvGxKt+b7+6TKEv1ZycHleA7aHhRHyAqJyp5pbUFuYYNIzpZnQDk7AsYckyWdEnTeAnay0aCy2aV6iTk9A==", + "dev": true, + "requires": { + "chardet": "^0.4.0", + "iconv-lite": "^0.4.17", + "tmp": "^0.0.33" + } + }, + "fast-deep-equal": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", + "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=", + "dev": true + }, + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "globals": { + "version": "11.7.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-11.7.0.tgz", + "integrity": 
"sha512-K8BNSPySfeShBQXsahYB/AbbWruVOTyVpgoIDnl8odPpeSfP2J5QO2oLFFdl2j7GfDCtZj2bMKar2T49itTPCg==", + "dev": true + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "inquirer": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-3.3.0.tgz", + "integrity": "sha512-h+xtnyk4EwKvFWHrUYsWErEVR+igKtLdchu+o0Z1RL7VU/jVMFbYir2bp6bAj8efFNxWqHX0dIss6fJQ+/+qeQ==", + "dev": true, + "requires": { + "ansi-escapes": "^3.0.0", + "chalk": "^2.0.0", + "cli-cursor": "^2.1.0", + "cli-width": "^2.0.0", + "external-editor": "^2.0.4", + "figures": "^2.0.0", + "lodash": "^4.3.0", + "mute-stream": "0.0.7", + "run-async": "^2.2.0", + "rx-lite": "^4.0.8", + "rx-lite-aggregates": "^4.0.8", + "string-width": "^2.1.0", + "strip-ansi": "^4.0.0", + "through": "^2.3.6" + } + }, + "json-schema-traverse": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz", + "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=", + "dev": true + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "eslint-config-airbnb-base": { + "version": "13.0.0", + "resolved": "https://registry.npmjs.org/eslint-config-airbnb-base/-/eslint-config-airbnb-base-13.0.0.tgz", + "integrity": "sha512-hUFXRlE6AY84z0qYh4wKdtSF4EqDnyT8sxrvTpcXCV4ENSLF8li5yNA1yDM26iinH8Ierbpc4lv8Rp62uX6VSQ==", + "dev": true, + "requires": { + "eslint-restricted-globals": "^0.1.1", + "object.assign": "^4.1.0", + "object.entries": "^1.0.4" + } + }, + "eslint-import-resolver-node": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/eslint-import-resolver-node/-/eslint-import-resolver-node-0.3.2.tgz", + "integrity": "sha512-sfmTqJfPSizWu4aymbPr4Iidp5yKm8yDkHp+Ir3YiTHiiDfxh69mOUsmiqW6RZ9zRXFaF64GtYmN7e+8GHBv6Q==", + "dev": true, + "requires": { + "debug": "^2.6.9", + "resolve": "^1.5.0" + } + }, + "eslint-module-utils": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/eslint-module-utils/-/eslint-module-utils-2.2.0.tgz", + "integrity": "sha1-snA2LNiLGkitMIl2zn+lTphBF0Y=", + "dev": true, + "requires": { + "debug": "^2.6.8", + "pkg-dir": "^1.0.0" + }, + "dependencies": { + "find-up": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-1.1.2.tgz", + "integrity": "sha1-ay6YIrGizgpgq2TWEOzK1TyyTQ8=", + "dev": true, + "requires": { + "path-exists": "^2.0.0", + "pinkie-promise": "^2.0.0" + } + }, + "path-exists": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-2.1.0.tgz", + "integrity": "sha1-D+tsZPD8UY2adU3V77YscCJ2H0s=", + "dev": true, + "requires": { + "pinkie-promise": "^2.0.0" + } + }, + "pkg-dir": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-1.0.0.tgz", + "integrity": "sha1-ektQio1bstYp1EcFb/TpyTFM89Q=", + "dev": true, + "requires": { + "find-up": "^1.0.0" + } + } + } + }, + "eslint-plugin-es": { + "version": "1.3.1", + 
"resolved": "https://registry.npmjs.org/eslint-plugin-es/-/eslint-plugin-es-1.3.1.tgz", + "integrity": "sha512-9XcVyZiQRVeFjqHw8qHNDAZcQLqaHlOGGpeYqzYh8S4JYCWTCO3yzyen8yVmA5PratfzTRWDwCOFphtDEG+w/w==", + "dev": true, + "requires": { + "eslint-utils": "^1.3.0", + "regexpp": "^2.0.0" + }, + "dependencies": { + "regexpp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-2.0.0.tgz", + "integrity": "sha512-g2FAVtR8Uh8GO1Nv5wpxW7VFVwHcCEr4wyA8/MHiRkO8uHoR5ntAA8Uq3P1vvMTX/BeQiRVSpDGLd+Wn5HNOTA==", + "dev": true + } + } + }, + "eslint-plugin-html": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/eslint-plugin-html/-/eslint-plugin-html-4.0.5.tgz", + "integrity": "sha512-yULqYldzhYXTwZEaJXM30HhfgJdtTzuVH3LeoANybESHZ5+2ztLD72BsB2wR124/kk/PvQqZofDFSdNIk+kykw==", + "dev": true, + "requires": { + "htmlparser2": "^3.8.2" + } + }, + "eslint-plugin-import": { + "version": "2.13.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.13.0.tgz", + "integrity": "sha512-t6hGKQDMIt9N8R7vLepsYXgDfeuhp6ZJSgtrLEDxonpSubyxUZHjhm6LsAaZX8q6GYVxkbT3kTsV9G5mBCFR6A==", + "dev": true, + "requires": { + "contains-path": "^0.1.0", + "debug": "^2.6.8", + "doctrine": "1.5.0", + "eslint-import-resolver-node": "^0.3.1", + "eslint-module-utils": "^2.2.0", + "has": "^1.0.1", + "lodash": "^4.17.4", + "minimatch": "^3.0.3", + "read-pkg-up": "^2.0.0", + "resolve": "^1.6.0" + }, + "dependencies": { + "doctrine": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/doctrine/-/doctrine-1.5.0.tgz", + "integrity": "sha1-N53Ocw9hZvds76TmcHoVmwLFpvo=", + "dev": true, + "requires": { + "esutils": "^2.0.2", + "isarray": "^1.0.0" + } + } + } + }, + "eslint-plugin-node": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-node/-/eslint-plugin-node-7.0.1.tgz", + "integrity": "sha512-lfVw3TEqThwq0j2Ba/Ckn2ABdwmL5dkOgAux1rvOk6CO7A6yGyPI2+zIxN6FyNkp1X1X/BSvKOceD6mBWSj4Yw==", + "dev": true, + "requires": { + "eslint-plugin-es": "^1.3.1", + "eslint-utils": "^1.3.1", + "ignore": "^4.0.2", + "minimatch": "^3.0.4", + "resolve": "^1.8.1", + "semver": "^5.5.0" + }, + "dependencies": { + "ignore": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-4.0.2.tgz", + "integrity": "sha512-uoxnT7PYpyEnsja+yX+7v49B7LXxmzDJ2JALqHH3oEGzpM2U1IGcbfnOr8Dt57z3B/UWs7/iAgPFbmye8m4I0g==", + "dev": true + } + } + }, + "eslint-plugin-promise": { + "version": "3.8.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-promise/-/eslint-plugin-promise-3.8.0.tgz", + "integrity": "sha512-JiFL9UFR15NKpHyGii1ZcvmtIqa3UTwiDAGb8atSffe43qJ3+1czVGN6UtkklpcJ2DVnqvTMzEKRaJdBkAL2aQ==", + "dev": true + }, + "eslint-plugin-standard": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-standard/-/eslint-plugin-standard-3.1.0.tgz", + "integrity": "sha512-fVcdyuKRr0EZ4fjWl3c+gp1BANFJD1+RaWa2UPYfMZ6jCtp5RG00kSaXnK/dE5sYzt4kaWJ9qdxqUfc0d9kX0w==", + "dev": true + }, + "eslint-restricted-globals": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/eslint-restricted-globals/-/eslint-restricted-globals-0.1.1.tgz", + "integrity": "sha1-NfDVy8ZMLj7WLpO0saevBbp+1Nc=", + "dev": true + }, + "eslint-scope": { + "version": "3.7.1", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-3.7.1.tgz", + "integrity": "sha1-PWPD7f2gLgbgGkUq2IyqzHzctug=", + "dev": true, + "requires": { + "esrecurse": "^4.1.0", + "estraverse": "^4.1.1" + }, + "dependencies": { + "estraverse": { + "version": 
"4.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.2.0.tgz", + "integrity": "sha1-De4/7TH81GlhjOc0IJn8GvoL2xM=", + "dev": true + } + } + }, + "eslint-utils": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/eslint-utils/-/eslint-utils-1.3.1.tgz", + "integrity": "sha512-Z7YjnIldX+2XMcjr7ZkgEsOj/bREONV60qYeB/bjMAqqqZ4zxKyWX+BOUkdmRmA9riiIPVvo5x86m5elviOk0Q==", + "dev": true + }, + "eslint-visitor-keys": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-1.0.0.tgz", + "integrity": "sha512-qzm/XxIbxm/FHyH341ZrbnMUpe+5Bocte9xkmFMzPMjRaZMcXww+MpBptFvtU+79L362nqiLhekCxCxDPaUMBQ==", + "dev": true + }, + "espree": { + "version": "3.5.4", + "resolved": "https://registry.npmjs.org/espree/-/espree-3.5.4.tgz", + "integrity": "sha512-yAcIQxtmMiB/jL32dzEp2enBeidsB7xWPLNiw3IIkpVds1P+h7qF9YwJq1yUNzp2OKXgAprs4F61ih66UsoD1A==", + "dev": true, + "requires": { + "acorn": "^5.5.0", + "acorn-jsx": "^3.0.0" + } + }, + "esprima": { + "version": "2.7.3", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-2.7.3.tgz", + "integrity": "sha1-luO3DVd59q1JzQMmc9HDEnZ7pYE=", + "dev": true + }, + "esquery": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.0.1.tgz", + "integrity": "sha512-SmiyZ5zIWH9VM+SRUReLS5Q8a7GxtRdxEBVZpm98rJM7Sb+A9DVCndXfkeFUd3byderg+EbDkfnevfCwynWaNA==", + "dev": true, + "requires": { + "estraverse": "^4.0.0" + }, + "dependencies": { + "estraverse": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.2.0.tgz", + "integrity": "sha1-De4/7TH81GlhjOc0IJn8GvoL2xM=", + "dev": true + } + } + }, + "esrecurse": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.2.1.tgz", + "integrity": "sha512-64RBB++fIOAXPw3P9cy89qfMlvZEXZkqqJkjqqXIvzP5ezRZjW+lPWjw35UX/3EhUPFYbg5ER4JYgDw4007/DQ==", + "dev": true, + "requires": { + "estraverse": "^4.1.0" + }, + "dependencies": { + "estraverse": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.2.0.tgz", + "integrity": "sha1-De4/7TH81GlhjOc0IJn8GvoL2xM=", + "dev": true + } + } + }, + "estraverse": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-1.9.3.tgz", + "integrity": "sha1-r2fy3JIlgkFZUJJgkaQAXSnJu0Q=", + "dev": true + }, + "esutils": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.2.tgz", + "integrity": "sha1-Cr9PHKpbyx96nYrMbepPqqBLrJs=", + "dev": true + }, + "events": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/events/-/events-1.1.1.tgz", + "integrity": "sha1-nr23Y1rQmccNzEwqH1AEKI6L2SQ=", + "dev": true + }, + "evp_bytestokey": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/evp_bytestokey/-/evp_bytestokey-1.0.3.tgz", + "integrity": "sha512-/f2Go4TognH/KvCISP7OUsHn85hT9nUkxxA9BEWxFn+Oj9o8ZNLm/40hdlgSLyuOimsrTKLUMEorQexp/aPQeA==", + "dev": true, + "requires": { + "md5.js": "^1.3.4", + "safe-buffer": "^5.1.1" + } + }, + "execa": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/execa/-/execa-0.7.0.tgz", + "integrity": "sha1-lEvs00zEHuMqY6n68nrVpl/Fl3c=", + "dev": true, + "requires": { + "cross-spawn": "^5.0.1", + "get-stream": "^3.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + }, + "dependencies": { + "cross-spawn": { + "version": "5.1.0", + "resolved": 
"https://registry.npmjs.org/cross-spawn/-/cross-spawn-5.1.0.tgz", + "integrity": "sha1-6L0O/uWPz/b4+UUQoKVUu/ojVEk=", + "dev": true, + "requires": { + "lru-cache": "^4.0.1", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + } + } + } + }, + "expand-brackets": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/expand-brackets/-/expand-brackets-2.1.4.tgz", + "integrity": "sha1-t3c14xXOMPa27/D4OwQVGiJEliI=", + "dev": true, + "requires": { + "debug": "^2.3.3", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "posix-character-classes": "^0.1.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + }, + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } + } + }, + "extend-shallow": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-3.0.2.tgz", + "integrity": "sha1-Jqcarwc7OfshJxcnRhMcJwQCjbg=", + "dev": true, + "requires": { + "assign-symbols": "^1.0.0", + "is-extendable": "^1.0.1" + }, + "dependencies": { + "is-extendable": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", + "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", + "dev": true, + "requires": { + "is-plain-object": "^2.0.4" + } + } + } + }, + "external-editor": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/external-editor/-/external-editor-3.0.0.tgz", + "integrity": "sha512-mpkfj0FEdxrIhOC04zk85X7StNtr0yXnG7zCb+8ikO8OJi2jsHh5YGoknNTyXgsbHOf1WOOcVU3kPFWT2WgCkQ==", + "dev": true, + "requires": { + "chardet": "^0.5.0", + "iconv-lite": "^0.4.22", + "tmp": "^0.0.33" + } + }, + "extglob": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/extglob/-/extglob-2.0.4.tgz", + "integrity": "sha512-Nmb6QXkELsuBr24CJSkilo6UHHgbekK5UiZgfE6UHD3Eb27YC6oD+bhcT+tJ6cl8dmsgdQxnWlcry8ksBIBLpw==", + "dev": true, + "requires": { + "array-unique": "^0.3.2", + "define-property": "^1.0.0", + "expand-brackets": "^2.1.4", + "extend-shallow": "^2.0.1", + "fragment-cache": "^0.2.1", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "dependencies": { + "define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", + "dev": true, + "requires": { + "is-descriptor": "^1.0.0" + } + }, + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + }, + "is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-data-descriptor": { + "version": "1.0.0", + 
"resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": "sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "requires": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + } + }, + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "fast-deep-equal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-2.0.1.tgz", + "integrity": "sha1-ewUhjd+WZ79/Nwv3/bLLFf3Qqkk=", + "dev": true + }, + "fast-json-stable-stringify": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz", + "integrity": "sha1-1RQsDK7msRifh9OnYREGT4bIu/I=", + "dev": true + }, + "fast-levenshtein": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "integrity": "sha1-PYpcZog6FqMMqGQ+hR8Zuqd5eRc=", + "dev": true + }, + "fastparse": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/fastparse/-/fastparse-1.1.1.tgz", + "integrity": "sha1-0eJkOzipTXWDtHkGDmxK/8lAcfg=", + "dev": true + }, + "figures": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/figures/-/figures-2.0.0.tgz", + "integrity": "sha1-OrGi0qYsi/tDGgyUy3l6L84nyWI=", + "dev": true, + "requires": { + "escape-string-regexp": "^1.0.5" + } + }, + "file-entry-cache": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-2.0.0.tgz", + "integrity": "sha1-w5KZDD5oR4PYOLjISkXYoEhFg2E=", + "dev": true, + "requires": { + "flat-cache": "^1.2.1", + "object-assign": "^4.0.1" + } + }, + "file-loader": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/file-loader/-/file-loader-1.1.11.tgz", + "integrity": "sha512-TGR4HU7HUsGg6GCOPJnFk06RhWgEWFLAGWiT6rcD+GRC2keU3s9RGJ+b3Z6/U73jwwNb2gKLJ7YCrp+jvU4ALg==", + "dev": true, + "requires": { + "loader-utils": "^1.0.2", + "schema-utils": "^0.4.5" + } + }, + "fill-range": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-4.0.0.tgz", + "integrity": "sha1-1USBHUKPmOsGpj3EAtJAPDKMOPc=", + "dev": true, + "requires": { + "extend-shallow": "^2.0.1", + "is-number": "^3.0.0", + "repeat-string": "^1.6.1", + "to-regex-range": "^2.1.0" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } + } + }, + "find-cache-dir": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/find-cache-dir/-/find-cache-dir-1.0.0.tgz", + "integrity": "sha1-kojj6ePMN0hxfTnq3hfPcfww7m8=", + "dev": true, + "requires": { + "commondir": "^1.0.1", + "make-dir": "^1.0.0", + "pkg-dir": "^2.0.0" + } + }, + "find-up": { + "version": "2.1.0", + "resolved": 
"https://registry.npmjs.org/find-up/-/find-up-2.1.0.tgz", + "integrity": "sha1-RdG35QbHF93UgndaK3eSCjwMV6c=", + "dev": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "flat-cache": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-1.3.0.tgz", + "integrity": "sha1-0wMLMrOBVPTjt+nHCfSQ9++XxIE=", + "dev": true, + "requires": { + "circular-json": "^0.3.1", + "del": "^2.0.2", + "graceful-fs": "^4.1.2", + "write": "^0.2.1" + } + }, + "flatten": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/flatten/-/flatten-1.0.2.tgz", + "integrity": "sha1-2uRqnXj74lKSJYzB54CkHZXAN4I=", + "dev": true + }, + "flush-write-stream": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/flush-write-stream/-/flush-write-stream-1.0.3.tgz", + "integrity": "sha512-calZMC10u0FMUqoiunI2AiGIIUtUIvifNwkHhNupZH4cbNnW1Itkoh/Nf5HFYmDrwWPjrUxpkZT0KhuCq0jmGw==", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.4" + } + }, + "for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha1-gQaNKVqBQuwKxybG4iAMMPttXoA=", + "dev": true + }, + "foreach": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/foreach/-/foreach-2.0.5.tgz", + "integrity": "sha1-C+4AUBiusmDQo6865ljdATbsG5k=", + "dev": true + }, + "fragment-cache": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/fragment-cache/-/fragment-cache-0.2.1.tgz", + "integrity": "sha1-QpD60n8T6Jvn8zeZxrxaCr//DRk=", + "dev": true, + "requires": { + "map-cache": "^0.2.2" + } + }, + "from2": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/from2/-/from2-2.3.0.tgz", + "integrity": "sha1-i/tVAr3kpNNs/e6gB/zKIdfjgq8=", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.0" + } + }, + "fs-extra": { + "version": "0.30.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-0.30.0.tgz", + "integrity": "sha1-8jP/zAjU2n1DLapEl3aYnbHfk/A=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "jsonfile": "^2.1.0", + "klaw": "^1.0.0", + "path-is-absolute": "^1.0.0", + "rimraf": "^2.2.8" + } + }, + "fs-write-stream-atomic": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/fs-write-stream-atomic/-/fs-write-stream-atomic-1.0.10.tgz", + "integrity": "sha1-tH31NJPvkR33VzHnCp3tAYnbQMk=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "iferr": "^0.1.5", + "imurmurhash": "^0.1.4", + "readable-stream": "1 || 2" + } + }, + "fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "dev": true + }, + "fsevents": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-1.2.4.tgz", + "integrity": "sha512-z8H8/diyk76B7q5wg+Ud0+CqzcAF3mBBI/bA5ne5zrRUUIvNkJY//D3BqyH571KuAC4Nr7Rw7CjWX4r0y9DvNg==", + "dev": true, + "optional": true, + "requires": { + "nan": "^2.9.2", + "node-pre-gyp": "^0.10.0" + }, + "dependencies": { + "abbrev": { + "version": "1.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "ansi-regex": { + "version": "2.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "aproba": { + "version": "1.2.0", + "bundled": true, + "dev": true, + "optional": true + }, + "are-we-there-yet": { + "version": "1.1.4", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "delegates": "^1.0.0", + "readable-stream": "^2.0.6" + } + }, + 
"balanced-match": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "brace-expansion": { + "version": "1.1.11", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "chownr": { + "version": "1.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "code-point-at": { + "version": "1.1.0", + "bundled": true, + "dev": true, + "optional": true + }, + "concat-map": { + "version": "0.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "console-control-strings": { + "version": "1.1.0", + "bundled": true, + "dev": true, + "optional": true + }, + "core-util-is": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "debug": { + "version": "2.6.9", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "ms": "2.0.0" + } + }, + "deep-extend": { + "version": "0.5.1", + "bundled": true, + "dev": true, + "optional": true + }, + "delegates": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "detect-libc": { + "version": "1.0.3", + "bundled": true, + "dev": true, + "optional": true + }, + "fs-minipass": { + "version": "1.2.5", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "minipass": "^2.2.1" + } + }, + "fs.realpath": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "gauge": { + "version": "2.7.4", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "aproba": "^1.0.3", + "console-control-strings": "^1.0.0", + "has-unicode": "^2.0.0", + "object-assign": "^4.1.0", + "signal-exit": "^3.0.0", + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1", + "wide-align": "^1.1.0" + } + }, + "glob": { + "version": "7.1.2", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "has-unicode": { + "version": "2.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "iconv-lite": { + "version": "0.4.21", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "safer-buffer": "^2.1.0" + } + }, + "ignore-walk": { + "version": "3.0.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "minimatch": "^3.0.4" + } + }, + "inflight": { + "version": "1.0.6", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.3", + "bundled": true, + "dev": true, + "optional": true + }, + "ini": { + "version": "1.3.5", + "bundled": true, + "dev": true, + "optional": true + }, + "is-fullwidth-code-point": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "number-is-nan": "^1.0.0" + } + }, + "isarray": { + "version": "1.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "minimatch": { + "version": "3.0.4", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "bundled": true, + "dev": true, + "optional": true + }, + "minipass": { + "version": "2.2.4", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "safe-buffer": "^5.1.1", + "yallist": "^3.0.0" + } + }, + "minizlib": { + "version": "1.1.0", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + 
"minipass": "^2.2.1" + } + }, + "mkdirp": { + "version": "0.5.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "minimist": "0.0.8" + } + }, + "ms": { + "version": "2.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "needle": { + "version": "2.2.0", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "debug": "^2.1.2", + "iconv-lite": "^0.4.4", + "sax": "^1.2.4" + } + }, + "node-pre-gyp": { + "version": "0.10.0", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "detect-libc": "^1.0.2", + "mkdirp": "^0.5.1", + "needle": "^2.2.0", + "nopt": "^4.0.1", + "npm-packlist": "^1.1.6", + "npmlog": "^4.0.2", + "rc": "^1.1.7", + "rimraf": "^2.6.1", + "semver": "^5.3.0", + "tar": "^4" + } + }, + "nopt": { + "version": "4.0.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "abbrev": "1", + "osenv": "^0.1.4" + } + }, + "npm-bundled": { + "version": "1.0.3", + "bundled": true, + "dev": true, + "optional": true + }, + "npm-packlist": { + "version": "1.1.10", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "ignore-walk": "^3.0.1", + "npm-bundled": "^1.0.1" + } + }, + "npmlog": { + "version": "4.1.2", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "are-we-there-yet": "~1.1.2", + "console-control-strings": "~1.1.0", + "gauge": "~2.7.3", + "set-blocking": "~2.0.0" + } + }, + "number-is-nan": { + "version": "1.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "object-assign": { + "version": "4.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "once": { + "version": "1.4.0", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "wrappy": "1" + } + }, + "os-homedir": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "os-tmpdir": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "osenv": { + "version": "0.1.5", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.0" + } + }, + "path-is-absolute": { + "version": "1.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "process-nextick-args": { + "version": "2.0.0", + "bundled": true, + "dev": true, + "optional": true + }, + "rc": { + "version": "1.2.7", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "deep-extend": "^0.5.1", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "dependencies": { + "minimist": { + "version": "1.2.0", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "readable-stream": { + "version": "2.3.6", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "rimraf": { + "version": "2.6.2", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "glob": "^7.0.5" + } + }, + "safe-buffer": { + "version": "5.1.1", + "bundled": true, + "dev": true, + "optional": true + }, + "safer-buffer": { + "version": "2.1.2", + "bundled": true, + "dev": true, + "optional": true + }, + "sax": { + "version": "1.2.4", + "bundled": true, + "dev": true, + "optional": true + }, + "semver": { + "version": "5.5.0", + "bundled": true, + "dev": true, + "optional": true + }, + "set-blocking": { + "version": "2.0.0", + 
"bundled": true, + "dev": true, + "optional": true + }, + "signal-exit": { + "version": "3.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "string-width": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + } + }, + "string_decoder": { + "version": "1.1.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "strip-ansi": { + "version": "3.0.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "strip-json-comments": { + "version": "2.0.1", + "bundled": true, + "dev": true, + "optional": true + }, + "tar": { + "version": "4.4.1", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "chownr": "^1.0.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.2.4", + "minizlib": "^1.1.0", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.1", + "yallist": "^3.0.2" + } + }, + "util-deprecate": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "wide-align": { + "version": "1.1.2", + "bundled": true, + "dev": true, + "optional": true, + "requires": { + "string-width": "^1.0.2" + } + }, + "wrappy": { + "version": "1.0.2", + "bundled": true, + "dev": true, + "optional": true + }, + "yallist": { + "version": "3.0.2", + "bundled": true, + "dev": true, + "optional": true + } + } + }, + "function-bind": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.1.tgz", + "integrity": "sha512-yIovAzMX49sF8Yl58fSCWJ5svSLuaibPxXQJFLmBObTuCr0Mf1KiPopGM9NiFjiYBCbfaa2Fh6breQ6ANVTI0A==", + "dev": true + }, + "functional-red-black-tree": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/functional-red-black-tree/-/functional-red-black-tree-1.0.1.tgz", + "integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=", + "dev": true + }, + "generic-names": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/generic-names/-/generic-names-1.0.3.tgz", + "integrity": "sha1-LXhqEhruUIh2eWk56OO/+DbCCRc=", + "dev": true, + "requires": { + "loader-utils": "^0.2.16" + }, + "dependencies": { + "loader-utils": { + "version": "0.2.17", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-0.2.17.tgz", + "integrity": "sha1-+G5jdNQyBabmxg6RlvF8Apm/s0g=", + "dev": true, + "requires": { + "big.js": "^3.1.3", + "emojis-list": "^2.0.0", + "json5": "^0.5.0", + "object-assign": "^4.0.1" + } + } + } + }, + "get-caller-file": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.3.tgz", + "integrity": "sha512-3t6rVToeoZfYSGd8YoLFR2DJkiQrIiUrGcjvFX2mDw3bn6k2OtwHN0TNCLbBO+w8qTvimhDkv+LSscbJY1vE6w==", + "dev": true + }, + "get-stream": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-3.0.0.tgz", + "integrity": "sha1-jpQ9E1jcN1VQVOy+LtsFqhdO3hQ=", + "dev": true + }, + "get-value": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/get-value/-/get-value-2.0.6.tgz", + "integrity": "sha1-3BXKHGcjh8p2vTesCjlbogQqLCg=", + "dev": true + }, + "glob": { + "version": "5.0.15", + "resolved": "https://registry.npmjs.org/glob/-/glob-5.0.15.tgz", + "integrity": "sha1-G8k2ueAvSmA/zCIuz3Yz0wuLk7E=", + "dev": true, + "requires": { + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "2 || 3", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "glob-parent": { + 
"version": "3.1.0", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-3.1.0.tgz", + "integrity": "sha1-nmr2KZ2NO9K9QEMIMr0RPfkGxa4=", + "dev": true, + "requires": { + "is-glob": "^3.1.0", + "path-dirname": "^1.0.0" + }, + "dependencies": { + "is-glob": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-3.1.0.tgz", + "integrity": "sha1-e6WuJCF4BKxwcHuWkiVnSGzD6Eo=", + "dev": true, + "requires": { + "is-extglob": "^2.1.0" + } + } + } + }, + "glob-to-regexp": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/glob-to-regexp/-/glob-to-regexp-0.3.0.tgz", + "integrity": "sha1-jFoUlNIGbFcMw7/kSWF1rMTVAqs=", + "dev": true + }, + "global-modules-path": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/global-modules-path/-/global-modules-path-2.3.0.tgz", + "integrity": "sha512-HchvMJNYh9dGSCy8pOQ2O8u/hoXaL+0XhnrwH0RyLiSXMMTl9W3N6KUU73+JFOg5PGjtzl6VZzUQsnrpm7Szag==", + "dev": true + }, + "globals": { + "version": "9.18.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-9.18.0.tgz", + "integrity": "sha512-S0nG3CLEQiY/ILxqtztTWH/3iRRdyBLw6KMDxnKMchrtbj2OFmehVh0WUCfW3DUrIgx/qFrJPICrq4Z4sTR9UQ==", + "dev": true + }, + "globby": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/globby/-/globby-7.1.1.tgz", + "integrity": "sha1-+yzP+UAfhgCUXfral0QMypcrhoA=", + "dev": true, + "requires": { + "array-union": "^1.0.1", + "dir-glob": "^2.0.0", + "glob": "^7.1.2", + "ignore": "^3.3.5", + "pify": "^3.0.0", + "slash": "^1.0.0" + }, + "dependencies": { + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + } + } + }, + "graceful-fs": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.1.11.tgz", + "integrity": "sha1-Dovf5NHduIVNZOBOp8AOKgJuVlg=", + "dev": true + }, + "graphlib": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/graphlib/-/graphlib-1.0.7.tgz", + "integrity": "sha1-DKst8P/mq+BwsmJb+h7bbslnuLE=", + "requires": { + "lodash": "^3.10.0" + }, + "dependencies": { + "lodash": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz", + "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y=" + } + } + }, + "handlebars": { + "version": "4.0.11", + "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.0.11.tgz", + "integrity": "sha1-Ywo13+ApS8KB7a5v/F0yn8eYLcw=", + "dev": true, + "requires": { + "async": "^1.4.0", + "optimist": "^0.6.1", + "source-map": "^0.4.4", + "uglify-js": "^2.6" + }, + "dependencies": { + "source-map": { + "version": "0.4.4", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.4.4.tgz", + "integrity": "sha1-66T12pwNyZneaAMti092FzZSA2s=", + "dev": true, + "requires": { + "amdefine": ">=0.0.4" + } + } + } + }, + "has": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", + "integrity": "sha512-f2dvO0VU6Oej7RkWJGrehjbzMAjFp5/VKPp5tTpWIV4JHHZK1/BxbFRtf/siA2SWTe09caDmVtYYzWEIbBS4zw==", + "dev": true, + "requires": { + "function-bind": "^1.1.1" + } + }, + "has-ansi": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/has-ansi/-/has-ansi-2.0.0.tgz", + "integrity": 
"sha1-NPUEnOHs3ysGSa8+8k5F7TVBbZE=", + "dev": true, + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "has-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-1.0.0.tgz", + "integrity": "sha1-nZ55MWXOAXoA8AQYxD+UKnsdEfo=", + "dev": true + }, + "has-symbols": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.0.tgz", + "integrity": "sha1-uhqPGvKg/DllD1yFA2dwQSIGO0Q=", + "dev": true + }, + "has-value": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-value/-/has-value-1.0.0.tgz", + "integrity": "sha1-GLKB2lhbHFxR3vJMkw7SmgvmsXc=", + "dev": true, + "requires": { + "get-value": "^2.0.6", + "has-values": "^1.0.0", + "isobject": "^3.0.0" + } + }, + "has-values": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/has-values/-/has-values-1.0.0.tgz", + "integrity": "sha1-lbC2P+whRmGab+V/51Yo1aOe/k8=", + "dev": true, + "requires": { + "is-number": "^3.0.0", + "kind-of": "^4.0.0" + }, + "dependencies": { + "kind-of": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-4.0.0.tgz", + "integrity": "sha1-IIE989cSkosgc3hpGkUGb65y3Vc=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + } + } + }, + "hash-base": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/hash-base/-/hash-base-3.0.4.tgz", + "integrity": "sha1-X8hoaEfs1zSZQDMZprCj8/auSRg=", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "hash.js": { + "version": "1.1.5", + "resolved": "https://registry.npmjs.org/hash.js/-/hash.js-1.1.5.tgz", + "integrity": "sha512-eWI5HG9Np+eHV1KQhisXWwM+4EPPYe5dFX1UZZH7k/E3JzDEazVH+VGlZi6R94ZqImq+A3D1mCEtrFIfg/E7sA==", + "dev": true, + "requires": { + "inherits": "^2.0.3", + "minimalistic-assert": "^1.0.1" + } + }, + "hmac-drbg": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/hmac-drbg/-/hmac-drbg-1.0.1.tgz", + "integrity": "sha1-0nRXAQJabHdabFRXk+1QL8DGSaE=", + "dev": true, + "requires": { + "hash.js": "^1.0.3", + "minimalistic-assert": "^1.0.0", + "minimalistic-crypto-utils": "^1.0.1" + } + }, + "home-or-tmp": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/home-or-tmp/-/home-or-tmp-2.0.0.tgz", + "integrity": "sha1-42w/LSyufXRqhX440Y1fMqeILbg=", + "dev": true, + "requires": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.1" + } + }, + "hosted-git-info": { + "version": "2.7.1", + "resolved": "https://registry.npmjs.org/hosted-git-info/-/hosted-git-info-2.7.1.tgz", + "integrity": "sha512-7T/BxH19zbcCTa8XkMlbK5lTo1WtgkFi3GvdWEyNuc4Vex7/9Dqbnpsf4JMydcfj9HCg4zUWFTL3Za6lapg5/w==", + "dev": true + }, + "html-comment-regex": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/html-comment-regex/-/html-comment-regex-1.1.1.tgz", + "integrity": "sha1-ZouTd26q5V696POtRkswekljYl4=", + "dev": true + }, + "htmlparser2": { + "version": "3.9.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-3.9.2.tgz", + "integrity": "sha1-G9+HrMoPP55T+k/M6w9LTLsAszg=", + "dev": true, + "requires": { + "domelementtype": "^1.3.0", + "domhandler": "^2.3.0", + "domutils": "^1.5.1", + "entities": "^1.1.1", + "inherits": "^2.0.1", + "readable-stream": "^2.0.2" + } + }, + "https-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/https-browserify/-/https-browserify-1.0.0.tgz", + "integrity": "sha1-7AbBDgo0wPL68Zn3/X/Hj//QPHM=", + "dev": true + }, + "iconv-lite": { + "version": "0.4.23", + "resolved": 
"https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.23.tgz", + "integrity": "sha512-neyTUVFtahjf0mB3dZT77u+8O0QB89jFdnBkd5P1JgYPbPaia3gXXOVL2fq8VyU2gMMD7SaN7QukTB/pmXYvDA==", + "dev": true, + "requires": { + "safer-buffer": ">= 2.1.2 < 3" + } + }, + "icss-replace-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/icss-replace-symbols/-/icss-replace-symbols-1.1.0.tgz", + "integrity": "sha1-Bupvg2ead0njhs/h/oEq5dsiPe0=", + "dev": true + }, + "icss-utils": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/icss-utils/-/icss-utils-3.0.1.tgz", + "integrity": "sha1-7nDTroysOMa+XtkehRsn7tNDrQ8=", + "dev": true, + "requires": { + "postcss": "^6.0.2" + } + }, + "ieee754": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.12.tgz", + "integrity": "sha512-GguP+DRY+pJ3soyIiGPTvdiVXjZ+DbXOxGpXn3eMvNW4x4irjqXm4wHKscC+TfxSJ0yw/S1F24tqdMNsMZTiLA==", + "dev": true + }, + "iferr": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/iferr/-/iferr-0.1.5.tgz", + "integrity": "sha1-xg7taebY/bazEEofy8ocGS3FtQE=", + "dev": true + }, + "ignore": { + "version": "3.3.10", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-3.3.10.tgz", + "integrity": "sha512-Pgs951kaMm5GXP7MOvxERINe3gsaVjUWFm+UZPSq9xYriQAksyhg0csnS0KXSNRD5NmNdapXEpjxG49+AKh/ug==", + "dev": true + }, + "import-local": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/import-local/-/import-local-1.0.0.tgz", + "integrity": "sha512-vAaZHieK9qjGo58agRBg+bhHX3hoTZU/Oa3GESWLz7t1U62fk63aHuDJJEteXoDeTCcPmUT+z38gkHPZkkmpmQ==", + "dev": true, + "requires": { + "pkg-dir": "^2.0.0", + "resolve-cwd": "^2.0.0" + } + }, + "imports-loader": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/imports-loader/-/imports-loader-0.8.0.tgz", + "integrity": "sha512-kXWL7Scp8KQ4552ZcdVTeaQCZSLW+e6nJfp3cwUMB673T7Hr98Xjx5JK+ql7ADlJUvj1JS5O01RLbKoutN5QDQ==", + "dev": true, + "requires": { + "loader-utils": "^1.0.2", + "source-map": "^0.6.1" + }, + "dependencies": { + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + } + } + }, + "imurmurhash": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "integrity": "sha1-khi5srkoojixPcT7a21XbyMUU+o=", + "dev": true + }, + "indexes-of": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/indexes-of/-/indexes-of-1.0.1.tgz", + "integrity": "sha1-8w9xbI4r00bHtn0985FVZqfAVgc=", + "dev": true + }, + "indexof": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/indexof/-/indexof-0.0.1.tgz", + "integrity": "sha1-gtwzbSMrkGIXnQWrMpOmYFn9Q10=", + "dev": true + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "dev": true, + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", + "dev": true + }, + "inquirer": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-6.0.0.tgz", + "integrity": "sha512-tISQWRwtcAgrz+SHPhTH7d3e73k31gsOy6i1csonLc0u1dVK/wYvuOnFeiWqC5OXFIYbmrIFInef31wbT8MEJg==", + "dev": true, + "requires": { + 
"ansi-escapes": "^3.0.0", + "chalk": "^2.0.0", + "cli-cursor": "^2.1.0", + "cli-width": "^2.0.0", + "external-editor": "^3.0.0", + "figures": "^2.0.0", + "lodash": "^4.3.0", + "mute-stream": "0.0.7", + "run-async": "^2.2.0", + "rxjs": "^6.1.0", + "string-width": "^2.1.0", + "strip-ansi": "^4.0.0", + "through": "^2.3.6" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", + "dev": true + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "interpret": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/interpret/-/interpret-1.1.0.tgz", + "integrity": "sha1-ftGxQQxqDg94z5XTuEQMY/eLhhQ=", + "dev": true + }, + "invariant": { + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/invariant/-/invariant-2.2.4.tgz", + "integrity": "sha512-phJfQVBuaJM5raOpJjSfkiD6BpbCE4Ns//LaXl6wGYtUBY83nWS6Rf9tXm2e8VaK60JEjYldbPif/A2B1C2gNA==", + "dev": true, + "requires": { + "loose-envify": "^1.0.0" + } + }, + "invert-kv": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/invert-kv/-/invert-kv-1.0.0.tgz", + "integrity": "sha1-EEqOSqym09jNFXqO+L+rLXo//bY=", + "dev": true + }, + "is-absolute-url": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-absolute-url/-/is-absolute-url-2.1.0.tgz", + "integrity": "sha1-UFMN+4T8yap9vnhS6Do3uTufKqY=", + "dev": true + }, + "is-accessor-descriptor": { + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-0.1.6.tgz", + "integrity": "sha1-qeEss66Nh2cn7u84Q/igiXtcmNY=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } + }, + "is-arrayish": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", + "integrity": "sha1-d8mYQFJ6qOyxqLppe4BkWnqSap0=", + "dev": true + }, + "is-binary-path": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-1.0.1.tgz", + "integrity": "sha1-dfFmQrSA8YenEcgUFh/TpKdlWJg=", + "dev": true, + "requires": { + "binary-extensions": "^1.0.0" + } + }, + "is-buffer": { + "version": "1.1.6", + 
"resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==", + "dev": true + }, + "is-builtin-module": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-builtin-module/-/is-builtin-module-1.0.0.tgz", + "integrity": "sha1-VAVy0096wxGfj3bDDLwbHgN6/74=", + "dev": true, + "requires": { + "builtin-modules": "^1.0.0" + } + }, + "is-callable": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.1.4.tgz", + "integrity": "sha512-r5p9sxJjYnArLjObpjA4xu5EKI3CuKHkJXMhT7kwbpUyIFD1n5PMAsoPvWnvtZiNz7LjkYDRZhd7FlI0eMijEA==", + "dev": true + }, + "is-data-descriptor": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-0.1.4.tgz", + "integrity": "sha1-C17mSDiOLIYCgueT8YVv7D8wG1Y=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } + }, + "is-date-object": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.1.tgz", + "integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=", + "dev": true + }, + "is-descriptor": { + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-0.1.6.tgz", + "integrity": "sha512-avDYr0SB3DwO9zsMov0gKCESFYqCnE4hq/4z3TdUlukEy5t9C0YRq7HLrsN52NAcqXKaepeCD0n+B0arnVG3Hg==", + "dev": true, + "requires": { + "is-accessor-descriptor": "^0.1.6", + "is-data-descriptor": "^0.1.4", + "kind-of": "^5.0.0" + }, + "dependencies": { + "kind-of": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-5.1.0.tgz", + "integrity": "sha512-NGEErnH6F2vUuXDh+OlbcKW7/wOcfdRHaZ7VWtqCztfHri/++YKmP51OdWeGPuqCOba6kk2OTe5d02VmTB80Pw==", + "dev": true + } + } + }, + "is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha1-YrEQ4omkcUGOPsNqYX1HLjAd/Ik=", + "dev": true + }, + "is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha1-qIwCU1eR8C7TfHahueqXc8gz+MI=", + "dev": true + }, + "is-finite": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-finite/-/is-finite-1.0.2.tgz", + "integrity": "sha1-zGZ3aVYCvlUO8R6LSqYwU0K20Ko=", + "dev": true, + "requires": { + "number-is-nan": "^1.0.0" + } + }, + "is-fullwidth-code-point": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", + "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=", + "dev": true + }, + "is-glob": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.0.tgz", + "integrity": "sha1-lSHHaEXMJhCoUgPd8ICpWML/q8A=", + "dev": true, + "requires": { + "is-extglob": "^2.1.1" + } + }, + "is-number": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-3.0.0.tgz", + "integrity": "sha1-JP1iAaR4LPUFYcgQJ2r8fRLXEZU=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } + }, + "is-path-cwd": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-path-cwd/-/is-path-cwd-1.0.0.tgz", + "integrity": "sha1-0iXsIxMuie3Tj9p2dHLmLmXxEG0=", + "dev": true + }, + "is-path-in-cwd": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-path-in-cwd/-/is-path-in-cwd-1.0.1.tgz", + "integrity": 
"sha512-FjV1RTW48E7CWM7eE/J2NJvAEEVektecDBVBE5Hh3nM1Jd0kvhHtX68Pr3xsDf857xt3Y4AkwVULK1Vku62aaQ==", + "dev": true, + "requires": { + "is-path-inside": "^1.0.0" + } + }, + "is-path-inside": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-path-inside/-/is-path-inside-1.0.1.tgz", + "integrity": "sha1-jvW33lBDej/cprToZe96pVy0gDY=", + "dev": true, + "requires": { + "path-is-inside": "^1.0.1" + } + }, + "is-plain-obj": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-1.1.0.tgz", + "integrity": "sha1-caUMhCnfync8kqOQpKA7OfzVHT4=", + "dev": true + }, + "is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dev": true, + "requires": { + "isobject": "^3.0.1" + } + }, + "is-promise": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-2.1.0.tgz", + "integrity": "sha1-eaKp7OfwlugPNtKy87wWwf9L8/o=", + "dev": true + }, + "is-regex": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz", + "integrity": "sha1-VRdIm1RwkbCTDglWVM7SXul+lJE=", + "dev": true, + "requires": { + "has": "^1.0.1" + } + }, + "is-resolvable": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-resolvable/-/is-resolvable-1.1.0.tgz", + "integrity": "sha512-qgDYXFSR5WvEfuS5dMj6oTMEbrrSaM0CrFk2Yiq/gXnBvD9pMa2jGXxyhGLfvhZpuMZe18CJpFxAt3CRs42NMg==", + "dev": true + }, + "is-stream": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-1.1.0.tgz", + "integrity": "sha1-EtSj3U5o4Lec6428hBc66A2RykQ=", + "dev": true + }, + "is-svg": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-svg/-/is-svg-2.1.0.tgz", + "integrity": "sha1-z2EJDaDZ77yrhyLeum8DIgjbsOk=", + "dev": true, + "requires": { + "html-comment-regex": "^1.1.0" + } + }, + "is-symbol": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.0.1.tgz", + "integrity": "sha1-PMWfAAJRlLarLjjbrmaJJWtmBXI=", + "dev": true + }, + "is-windows": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-windows/-/is-windows-1.0.2.tgz", + "integrity": "sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA==", + "dev": true + }, + "isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", + "dev": true + }, + "isexe": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", + "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=", + "dev": true + }, + "isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=", + "dev": true + }, + "jquery": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.3.1.tgz", + "integrity": "sha512-Ubldcmxp5np52/ENotGxlLe6aGMvmF4R8S6tZjsP6Knsaxd/xp3Zrh50cG93lR6nPXyUFwzN3ZSOQI0wRJNdGg==" + }, + "js-base64": { + "version": "2.4.5", + "resolved": "https://registry.npmjs.org/js-base64/-/js-base64-2.4.5.tgz", + "integrity": "sha512-aUnNwqMOXw3yvErjMPSQu6qIIzUmT1e5KcU1OZxRDU1g/am6mzBvcrmLAYwzmB59BHPrh5/tKaiF4OPhqRWESQ==", + "dev": true + }, + "js-tokens": { + "version": "3.0.2", + "resolved": 
"https://registry.npmjs.org/js-tokens/-/js-tokens-3.0.2.tgz", + "integrity": "sha1-mGbfOVECEw449/mWvOtlRDIJwls=", + "dev": true + }, + "js-yaml": { + "version": "3.12.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz", + "integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==", + "dev": true, + "requires": { + "argparse": "^1.0.7", + "esprima": "^4.0.0" + }, + "dependencies": { + "esprima": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz", + "integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==", + "dev": true + } + } + }, + "jsesc": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-1.3.0.tgz", + "integrity": "sha1-RsP+yMGJKxKwgz25vHYiF226s0s=", + "dev": true + }, + "json-parse-better-errors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz", + "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==", + "dev": true + }, + "json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "dev": true + }, + "json-stable-stringify-without-jsonify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify-without-jsonify/-/json-stable-stringify-without-jsonify-1.0.1.tgz", + "integrity": "sha1-nbe1lJatPzz+8wp1FC0tkwrXJlE=", + "dev": true + }, + "json5": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/json5/-/json5-0.5.1.tgz", + "integrity": "sha1-Hq3nrMASA0rYTiOWdn6tn6VJWCE=", + "dev": true + }, + "jsonfile": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-2.4.0.tgz", + "integrity": "sha1-NzaitCi4e72gzIO1P6PWM6NcKug=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.6" + } + }, + "kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha1-MeohpzS6ubuw8yRm2JOupR5KPGQ=", + "dev": true, + "requires": { + "is-buffer": "^1.1.5" + } + }, + "klaw": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/klaw/-/klaw-1.3.1.tgz", + "integrity": "sha1-QIhDO0azsbolnXh4XY6W9zugJDk=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.9" + } + }, + "lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha1-odePw6UEdMuAhF07O24dpJpEbo4=", + "dev": true, + "optional": true + }, + "lcid": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/lcid/-/lcid-1.0.0.tgz", + "integrity": "sha1-MIrMr6C8SDo4Z7S28rlQYlHRuDU=", + "dev": true, + "requires": { + "invert-kv": "^1.0.0" + } + }, + "levn": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/levn/-/levn-0.3.0.tgz", + "integrity": "sha1-OwmSTt+fCDwEkP3UwLxEIeBHZO4=", + "dev": true, + "requires": { + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2" + } + }, + "load-json-file": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-2.0.0.tgz", + "integrity": "sha1-eUfkIUmvgNaWy/eXvKq8/h/inKg=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "parse-json": "^2.2.0", + "pify": "^2.0.0", + "strip-bom": "^3.0.0" + 
}, + "dependencies": { + "pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", + "dev": true + } + } + }, + "loader-runner": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-2.3.0.tgz", + "integrity": "sha1-9IKuqC1UPgeSFwDVpG7yb9rGuKI=", + "dev": true + }, + "loader-utils": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.1.0.tgz", + "integrity": "sha1-yYrvSIvM7aL/teLeZG1qdUQp9c0=", + "dev": true, + "requires": { + "big.js": "^3.1.3", + "emojis-list": "^2.0.0", + "json5": "^0.5.0" + } + }, + "locate-path": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-2.0.0.tgz", + "integrity": "sha1-K1aLJl7slExtnA3pw9u7ygNUzY4=", + "dev": true, + "requires": { + "p-locate": "^2.0.0", + "path-exists": "^3.0.0" + } + }, + "lodash": { + "version": "4.17.11", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.11.tgz", + "integrity": "sha512-cQKh8igo5QUhZ7lg38DYWAxMvjSAKG0A8wGSVimP07SIUEK2UO+arSRKbRZWtelMtN5V0Hkwh5ryOto/SshYIg==" + }, + "lodash.camelcase": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/lodash.camelcase/-/lodash.camelcase-4.3.0.tgz", + "integrity": "sha1-soqmKIorn8ZRA1x3EfZathkDMaY=", + "dev": true + }, + "lodash.debounce": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", + "integrity": "sha1-gteb/zCmfEAF/9XiUVMArZyk168=", + "dev": true + }, + "lodash.memoize": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/lodash.memoize/-/lodash.memoize-4.1.2.tgz", + "integrity": "sha1-vMbEmkKihA7Zl/Mj6tpezRguC/4=", + "dev": true + }, + "lodash.uniq": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.uniq/-/lodash.uniq-4.5.0.tgz", + "integrity": "sha1-0CJTc662Uq3BvILklFM5qEJ1R3M=", + "dev": true + }, + "log-symbols": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-2.2.0.tgz", + "integrity": "sha512-VeIAFslyIerEJLXHziedo2basKbMKtTw3vfn5IzG0XTjhAVEJyNHnL2p7vc+wBDSdQuUpNw3M2u6xb9QsAY5Eg==", + "dev": true, + "requires": { + "chalk": "^2.0.1" + }, + "dependencies": { + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "loglevelnext": { + "version": "1.0.5", + "resolved": 
"https://registry.npmjs.org/loglevelnext/-/loglevelnext-1.0.5.tgz", + "integrity": "sha512-V/73qkPuJmx4BcBF19xPBr+0ZRVBhc4POxvZTZdMeXpJ4NItXSJ/MSwuFT0kQJlCbXvdlZoQQ/418bS1y9Jh6A==", + "dev": true, + "requires": { + "es6-symbol": "^3.1.1", + "object.assign": "^4.1.0" + } + }, + "long": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/long/-/long-3.2.0.tgz", + "integrity": "sha1-2CG3E4yhy1gcFymQ7xTbIAtcR0s=", + "dev": true + }, + "longest": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/longest/-/longest-1.0.1.tgz", + "integrity": "sha1-MKCy2jj3N3DoKUoNIuZiXtd9AJc=", + "dev": true, + "optional": true + }, + "loose-envify": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.3.1.tgz", + "integrity": "sha1-0aitM/qc4OcT1l/dCsi3SNR4yEg=", + "dev": true, + "requires": { + "js-tokens": "^3.0.0" + } + }, + "lru-cache": { + "version": "4.1.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.3.tgz", + "integrity": "sha512-fFEhvcgzuIoJVUF8fYr5KR0YqxD238zgObTps31YdADwPPAp82a4M8TrckkWyx7ekNlf9aBcVn81cFwwXngrJA==", + "dev": true, + "requires": { + "pseudomap": "^1.0.2", + "yallist": "^2.1.2" + } + }, + "make-dir": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/make-dir/-/make-dir-1.3.0.tgz", + "integrity": "sha512-2w31R7SJtieJJnQtGc7RVL2StM2vGYVfqUOvUDxH6bC6aJTxPxTF0GnIgCyu7tjockiUWAYQRbxa7vKn34s5sQ==", + "dev": true, + "requires": { + "pify": "^3.0.0" + } + }, + "mamacro": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/mamacro/-/mamacro-0.0.3.tgz", + "integrity": "sha512-qMEwh+UujcQ+kbz3T6V+wAmO2U8veoq2w+3wY8MquqwVA3jChfwY+Tk52GZKDfACEPjuZ7r2oJLejwpt8jtwTA==", + "dev": true + }, + "map-cache": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/map-cache/-/map-cache-0.2.2.tgz", + "integrity": "sha1-wyq9C9ZSXZsFFkW7TyasXcmKDb8=", + "dev": true + }, + "map-visit": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", + "integrity": "sha1-7Nyo8TFE5mDxtb1B8S80edmN+48=", + "dev": true, + "requires": { + "object-visit": "^1.0.0" + } + }, + "math-expression-evaluator": { + "version": "1.2.17", + "resolved": "https://registry.npmjs.org/math-expression-evaluator/-/math-expression-evaluator-1.2.17.tgz", + "integrity": "sha1-3oGf282E3M2PrlnGrreWFbnSZqw=", + "dev": true + }, + "md5.js": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/md5.js/-/md5.js-1.3.4.tgz", + "integrity": "sha1-6b296UogpawYsENA/Fdk1bCdkB0=", + "dev": true, + "requires": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1" + } + }, + "mem": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/mem/-/mem-1.1.0.tgz", + "integrity": "sha1-Xt1StIXKHZAP5kiVUFOZoN+kX3Y=", + "dev": true, + "requires": { + "mimic-fn": "^1.0.0" + } + }, + "memory-fs": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/memory-fs/-/memory-fs-0.4.1.tgz", + "integrity": "sha1-OpoguEYlI+RHz7x+i7gO1me/xVI=", + "dev": true, + "requires": { + "errno": "^0.1.3", + "readable-stream": "^2.0.1" + } + }, + "micromatch": { + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-3.1.10.tgz", + "integrity": "sha512-MWikgl9n9M3w+bpsY3He8L+w9eF9338xRl8IAO5viDizwSzziFEyUzo2xrrloB64ADbTf8uA8vRqqttDTOmccg==", + "dev": true, + "requires": { + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "braces": "^2.3.1", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "extglob": "^2.0.4", + "fragment-cache": "^0.2.1", 
+ "kind-of": "^6.0.2", + "nanomatch": "^1.2.9", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.2" + }, + "dependencies": { + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "miller-rabin": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/miller-rabin/-/miller-rabin-4.0.1.tgz", + "integrity": "sha512-115fLhvZVqWwHPbClyntxEVfVDfl9DLLTuJvq3g2O/Oxi8AiNouAHvDSzHS0viUJc+V5vm3eq91Xwqn9dp4jRA==", + "dev": true, + "requires": { + "bn.js": "^4.0.0", + "brorand": "^1.0.1" + } + }, + "mime": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/mime/-/mime-2.3.1.tgz", + "integrity": "sha512-OEUllcVoydBHGN1z84yfQDimn58pZNNNXgZlHXSboxMlFvgI6MXSWpWKpFRra7H1HxpVhHTkrghfRW49k6yjeg==", + "dev": true + }, + "mimic-fn": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/mimic-fn/-/mimic-fn-1.2.0.tgz", + "integrity": "sha512-jf84uxzwiuiIVKiOLpfYk7N46TSy8ubTonmneY9vrpHNAnp0QBt2BxWV9dO3/j+BoVAb+a5G6YDPW3M5HOdMWQ==", + "dev": true + }, + "mini-css-extract-plugin": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/mini-css-extract-plugin/-/mini-css-extract-plugin-0.4.1.tgz", + "integrity": "sha512-XWuB3G61Rtasq/gLe7cp5cuozehE6hN+E4sxCamRR/WDiHTg+f7ZIAS024r8UJQffY+e2gGELXQZgQoFDfNDCg==", + "dev": true, + "requires": { + "@webpack-contrib/schema-utils": "^1.0.0-beta.0", + "loader-utils": "^1.1.0", + "webpack-sources": "^1.1.0" + } + }, + "minimalistic-assert": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz", + "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==", + "dev": true + }, + "minimalistic-crypto-utils": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-crypto-utils/-/minimalistic-crypto-utils-1.0.1.tgz", + "integrity": "sha1-9sAMHAsIIkblxNmd+4x8CDsrWCo=", + "dev": true + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "dev": true, + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", + "dev": true + }, + "mississippi": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mississippi/-/mississippi-2.0.0.tgz", + "integrity": "sha512-zHo8v+otD1J10j/tC+VNoGK9keCuByhKovAvdn74dmxJl9+mWHnx6EMsDN4lgRoMI/eYo2nchAxniIbUPb5onw==", + "dev": true, + "requires": { + "concat-stream": "^1.5.0", + "duplexify": "^3.4.2", + "end-of-stream": "^1.1.0", + "flush-write-stream": "^1.0.0", + "from2": "^2.1.0", + "parallel-transform": "^1.1.0", + "pump": "^2.0.1", + "pumpify": "^1.3.3", + "stream-each": "^1.1.0", + "through2": "^2.0.0" + } + }, + "mixin-deep": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/mixin-deep/-/mixin-deep-1.3.1.tgz", + "integrity": "sha512-8ZItLHeEgaqEvd5lYBXfm4EZSFCX29Jb9K+lAHhDKzReKBQKj3R+7NOF6tjqYi9t4oI8VUfaWITJQm86wnXGNQ==", + "dev": true, + "requires": { + "for-in": "^1.0.2", + "is-extendable": "^1.0.1" + }, + "dependencies": { + "is-extendable": { + "version": 
"1.0.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", + "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", + "dev": true, + "requires": { + "is-plain-object": "^2.0.4" + } + } + } + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "dev": true, + "requires": { + "minimist": "0.0.8" + } + }, + "moment": { + "version": "2.22.2", + "resolved": "https://registry.npmjs.org/moment/-/moment-2.22.2.tgz", + "integrity": "sha1-PCV/mDn8DpP/UxSWMiOeuQeD/2Y=" + }, + "moment-timezone": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.21.tgz", + "integrity": "sha512-j96bAh4otsgj3lKydm3K7kdtA3iKf2m6MY2iSYCzCm5a1zmHo1g+aK3068dDEeocLZQIS9kU8bsdQHLqEvgW0A==", + "requires": { + "moment": ">= 2.9.0" + } + }, + "move-concurrently": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/move-concurrently/-/move-concurrently-1.0.1.tgz", + "integrity": "sha1-viwAX9oy4LKa8fBdfEszIUxwH5I=", + "dev": true, + "requires": { + "aproba": "^1.1.1", + "copy-concurrently": "^1.0.0", + "fs-write-stream-atomic": "^1.0.8", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.3" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "multi-glob": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/multi-glob/-/multi-glob-1.0.1.tgz", + "integrity": "sha1-5n0qtEKdJ2BubrTbNQlK/JF4h1A=", + "dev": true, + "requires": { + "async": "1.x", + "glob": "5.x", + "lodash": "3.x" + }, + "dependencies": { + "lodash": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz", + "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y=", + "dev": true + } + } + }, + "mute-stream": { + "version": "0.0.7", + "resolved": "https://registry.npmjs.org/mute-stream/-/mute-stream-0.0.7.tgz", + "integrity": "sha1-MHXOk7whuPq0PhvE2n6BFe0ee6s=", + "dev": true + }, + "nan": { + "version": "2.10.0", + "resolved": "https://registry.npmjs.org/nan/-/nan-2.10.0.tgz", + "integrity": "sha512-bAdJv7fBLhWC+/Bls0Oza+mvTaNQtP+1RyhhhvD95pgUJz6XM5IzgmxOkItJ9tkoCiplvAnXI1tNmmUD/eScyA==", + "dev": true, + "optional": true + }, + "nanomatch": { + "version": "1.2.13", + "resolved": "https://registry.npmjs.org/nanomatch/-/nanomatch-1.2.13.tgz", + "integrity": "sha512-fpoe2T0RbHwBTBUOftAfBPaDEi06ufaUai0mE6Yn1kacc3SnTErfb/h+X94VXzI64rKFHYImXSvdwGGCmwOqCA==", + "dev": true, + "requires": { + "arr-diff": "^4.0.0", + "array-unique": "^0.3.2", + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "fragment-cache": "^0.2.1", + "is-windows": "^1.0.2", + "kind-of": "^6.0.2", + "object.pick": "^1.3.0", + "regex-not": "^1.0.0", + "snapdragon": "^0.8.1", + "to-regex": "^3.0.1" + }, + "dependencies": { + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "natural-compare": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", + "integrity": "sha1-Sr6/7tdUHywnrPspvbvRXI1bpPc=", + "dev": true + }, + "neo-async": { + "version": "2.5.1", + "resolved": 
"https://registry.npmjs.org/neo-async/-/neo-async-2.5.1.tgz", + "integrity": "sha512-3KL3fvuRkZ7s4IFOMfztb7zJp3QaVWnBeGoJlgB38XnCRPj/0tLzzLG5IB8NYOHbJ8g8UGrgZv44GLDk6CxTxA==", + "dev": true + }, + "next-tick": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/next-tick/-/next-tick-1.0.0.tgz", + "integrity": "sha1-yobR/ogoFpsBICCOPchCS524NCw=", + "dev": true + }, + "nice-try": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/nice-try/-/nice-try-1.0.4.tgz", + "integrity": "sha512-2NpiFHqC87y/zFke0fC0spBXL3bBsoh/p5H1EFhshxjCR5+0g2d6BiXbUFz9v1sAcxsk2htp2eQnNIci2dIYcA==", + "dev": true + }, + "node-libs-browser": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/node-libs-browser/-/node-libs-browser-2.1.0.tgz", + "integrity": "sha512-5AzFzdoIMb89hBGMZglEegffzgRg+ZFoUmisQ8HI4j1KDdpx13J0taNp2y9xPbur6W61gepGDDotGBVQ7mfUCg==", + "dev": true, + "requires": { + "assert": "^1.1.1", + "browserify-zlib": "^0.2.0", + "buffer": "^4.3.0", + "console-browserify": "^1.1.0", + "constants-browserify": "^1.0.0", + "crypto-browserify": "^3.11.0", + "domain-browser": "^1.1.1", + "events": "^1.0.0", + "https-browserify": "^1.0.0", + "os-browserify": "^0.3.0", + "path-browserify": "0.0.0", + "process": "^0.11.10", + "punycode": "^1.2.4", + "querystring-es3": "^0.2.0", + "readable-stream": "^2.3.3", + "stream-browserify": "^2.0.1", + "stream-http": "^2.7.2", + "string_decoder": "^1.0.0", + "timers-browserify": "^2.0.4", + "tty-browserify": "0.0.0", + "url": "^0.11.0", + "util": "^0.10.3", + "vm-browserify": "0.0.4" + }, + "dependencies": { + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=", + "dev": true + } + } + }, + "nopt": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/nopt/-/nopt-3.0.6.tgz", + "integrity": "sha1-xkZdvwirzU2zWTF/eaxopkayj/k=", + "dev": true, + "requires": { + "abbrev": "1" + } + }, + "normalize-package-data": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/normalize-package-data/-/normalize-package-data-2.4.0.tgz", + "integrity": "sha512-9jjUFbTPfEy3R/ad/2oNbKtW9Hgovl5O1FvFWKkKblNXoN/Oou6+9+KKohPK13Yc3/TyunyWhJp6gvRNR/PPAw==", + "dev": true, + "requires": { + "hosted-git-info": "^2.1.4", + "is-builtin-module": "^1.0.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "normalize-path": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-2.1.1.tgz", + "integrity": "sha1-GrKLVW4Zg2Oowab35vogE3/mrtk=", + "dev": true, + "requires": { + "remove-trailing-separator": "^1.0.1" + } + }, + "normalize-range": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/normalize-range/-/normalize-range-0.1.2.tgz", + "integrity": "sha1-LRDAa9/TEuqXd2laTShDlFa3WUI=", + "dev": true + }, + "normalize-url": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-1.9.1.tgz", + "integrity": "sha1-LMDWazHqIwNkWENuNiDYWVTGbDw=", + "dev": true, + "requires": { + "object-assign": "^4.0.1", + "prepend-http": "^1.0.0", + "query-string": "^4.1.0", + "sort-keys": "^1.0.0" + } + }, + "npm": { + "version": "6.3.0", + "resolved": "https://registry.npmjs.org/npm/-/npm-6.3.0.tgz", + "integrity": "sha512-oDtLFo3wXue/xe3pU/oks9VHS5501OAWlYrZrApZkFv7l2LXk+9CfPMbjbfZWK7Jqlc1jbNcJMkB6KZC7K/vEA==", + "requires": { + "JSONStream": "^1.3.3", + "abbrev": "~1.1.1", + "ansicolors": "~0.3.2", + "ansistyles": "~0.1.3", 
+ "aproba": "~1.2.0", + "archy": "~1.0.0", + "bin-links": "^1.1.2", + "bluebird": "~3.5.1", + "byte-size": "^4.0.3", + "cacache": "^11.1.0", + "call-limit": "~1.1.0", + "chownr": "~1.0.1", + "cli-columns": "^3.1.2", + "cli-table3": "^0.5.0", + "cmd-shim": "~2.0.2", + "columnify": "~1.5.4", + "config-chain": "~1.1.11", + "debuglog": "*", + "detect-indent": "~5.0.0", + "detect-newline": "^2.1.0", + "dezalgo": "~1.0.3", + "editor": "~1.0.0", + "figgy-pudding": "^3.2.0", + "find-npm-prefix": "^1.0.2", + "fs-vacuum": "~1.2.10", + "fs-write-stream-atomic": "~1.0.10", + "gentle-fs": "^2.0.1", + "glob": "~7.1.2", + "graceful-fs": "~4.1.11", + "has-unicode": "~2.0.1", + "hosted-git-info": "^2.6.0", + "iferr": "^1.0.0", + "imurmurhash": "*", + "inflight": "~1.0.6", + "inherits": "~2.0.3", + "ini": "^1.3.5", + "init-package-json": "^1.10.3", + "is-cidr": "^2.0.6", + "json-parse-better-errors": "^1.0.2", + "lazy-property": "~1.0.0", + "libcipm": "^2.0.0", + "libnpmhook": "^4.0.1", + "libnpx": "^10.2.0", + "lock-verify": "^2.0.2", + "lockfile": "^1.0.4", + "lodash._baseindexof": "*", + "lodash._baseuniq": "~4.6.0", + "lodash._bindcallback": "*", + "lodash._cacheindexof": "*", + "lodash._createcache": "*", + "lodash._getnative": "*", + "lodash.clonedeep": "~4.5.0", + "lodash.restparam": "*", + "lodash.union": "~4.6.0", + "lodash.uniq": "~4.5.0", + "lodash.without": "~4.4.0", + "lru-cache": "^4.1.3", + "meant": "~1.0.1", + "mississippi": "^3.0.0", + "mkdirp": "~0.5.1", + "move-concurrently": "^1.0.1", + "node-gyp": "^3.7.0", + "nopt": "~4.0.1", + "normalize-package-data": "~2.4.0", + "npm-audit-report": "^1.3.1", + "npm-cache-filename": "~1.0.2", + "npm-install-checks": "~3.0.0", + "npm-lifecycle": "^2.0.3", + "npm-package-arg": "^6.1.0", + "npm-packlist": "~1.1.10", + "npm-pick-manifest": "^2.1.0", + "npm-profile": "^3.0.2", + "npm-registry-client": "^8.5.1", + "npm-registry-fetch": "^1.1.0", + "npm-user-validate": "~1.0.0", + "npmlog": "~4.1.2", + "once": "~1.4.0", + "opener": "~1.4.3", + "osenv": "^0.1.5", + "pacote": "^8.1.6", + "path-is-inside": "~1.0.2", + "promise-inflight": "~1.0.1", + "qrcode-terminal": "^0.12.0", + "query-string": "^6.1.0", + "qw": "~1.0.1", + "read": "~1.0.7", + "read-cmd-shim": "~1.0.1", + "read-installed": "~4.0.3", + "read-package-json": "^2.0.13", + "read-package-tree": "^5.2.1", + "readable-stream": "^2.3.6", + "readdir-scoped-modules": "*", + "request": "^2.81.0", + "retry": "^0.12.0", + "rimraf": "~2.6.2", + "safe-buffer": "^5.1.2", + "semver": "^5.5.0", + "sha": "~2.0.1", + "slide": "~1.1.6", + "sorted-object": "~2.0.1", + "sorted-union-stream": "~2.1.3", + "ssri": "^6.0.0", + "stringify-package": "^1.0.0", + "tar": "^4.4.4", + "text-table": "~0.2.0", + "tiny-relative-date": "^1.3.0", + "uid-number": "0.0.6", + "umask": "~1.1.0", + "unique-filename": "~1.1.0", + "unpipe": "~1.0.0", + "update-notifier": "^2.5.0", + "uuid": "^3.3.2", + "validate-npm-package-license": "^3.0.3", + "validate-npm-package-name": "~3.0.0", + "which": "^1.3.1", + "worker-farm": "^1.6.0", + "write-file-atomic": "^2.3.0" + }, + "dependencies": { + "JSONStream": { + "version": "1.3.3", + "bundled": true, + "requires": { + "jsonparse": "^1.2.0", + "through": ">=2.2.7 <3" + } + }, + "abbrev": { + "version": "1.1.1", + "bundled": true + }, + "agent-base": { + "version": "4.2.0", + "bundled": true, + "requires": { + "es6-promisify": "^5.0.0" + } + }, + "agentkeepalive": { + "version": "3.4.1", + "bundled": true, + "requires": { + "humanize-ms": "^1.2.1" + } + }, + "ansi-align": { + "version": "2.0.0", 
+ "bundled": true, + "requires": { + "string-width": "^2.0.0" + } + }, + "ansi-regex": { + "version": "2.1.1", + "bundled": true + }, + "ansi-styles": { + "version": "3.2.1", + "bundled": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "ansicolors": { + "version": "0.3.2", + "bundled": true + }, + "ansistyles": { + "version": "0.1.3", + "bundled": true + }, + "aproba": { + "version": "1.2.0", + "bundled": true + }, + "archy": { + "version": "1.0.0", + "bundled": true + }, + "are-we-there-yet": { + "version": "1.1.4", + "bundled": true, + "requires": { + "delegates": "^1.0.0", + "readable-stream": "^2.0.6" + } + }, + "asap": { + "version": "2.0.6", + "bundled": true + }, + "asn1": { + "version": "0.2.3", + "bundled": true + }, + "assert-plus": { + "version": "0.2.0", + "bundled": true + }, + "asynckit": { + "version": "0.4.0", + "bundled": true + }, + "aws-sign2": { + "version": "0.6.0", + "bundled": true + }, + "aws4": { + "version": "1.7.0", + "bundled": true + }, + "balanced-match": { + "version": "1.0.0", + "bundled": true + }, + "bcrypt-pbkdf": { + "version": "1.0.2", + "bundled": true, + "optional": true, + "requires": { + "tweetnacl": "^0.14.3" + } + }, + "bin-links": { + "version": "1.1.2", + "bundled": true, + "requires": { + "bluebird": "^3.5.0", + "cmd-shim": "^2.0.2", + "gentle-fs": "^2.0.0", + "graceful-fs": "^4.1.11", + "write-file-atomic": "^2.3.0" + } + }, + "block-stream": { + "version": "0.0.9", + "bundled": true, + "requires": { + "inherits": "~2.0.0" + } + }, + "bluebird": { + "version": "3.5.1", + "bundled": true + }, + "boom": { + "version": "2.10.1", + "bundled": true, + "requires": { + "hoek": "2.x.x" + } + }, + "boxen": { + "version": "1.3.0", + "bundled": true, + "requires": { + "ansi-align": "^2.0.0", + "camelcase": "^4.0.0", + "chalk": "^2.0.1", + "cli-boxes": "^1.0.0", + "string-width": "^2.0.0", + "term-size": "^1.2.0", + "widest-line": "^2.0.0" + } + }, + "brace-expansion": { + "version": "1.1.11", + "bundled": true, + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "buffer-from": { + "version": "1.0.0", + "bundled": true + }, + "builtin-modules": { + "version": "1.1.1", + "bundled": true + }, + "builtins": { + "version": "1.0.3", + "bundled": true + }, + "byline": { + "version": "5.0.0", + "bundled": true + }, + "byte-size": { + "version": "4.0.3", + "bundled": true + }, + "cacache": { + "version": "11.1.0", + "bundled": true, + "requires": { + "bluebird": "^3.5.1", + "chownr": "^1.0.1", + "figgy-pudding": "^3.1.0", + "glob": "^7.1.2", + "graceful-fs": "^4.1.11", + "lru-cache": "^4.1.3", + "mississippi": "^3.0.0", + "mkdirp": "^0.5.1", + "move-concurrently": "^1.0.1", + "promise-inflight": "^1.0.1", + "rimraf": "^2.6.2", + "ssri": "^6.0.0", + "unique-filename": "^1.1.0", + "y18n": "^4.0.0" + } + }, + "call-limit": { + "version": "1.1.0", + "bundled": true + }, + "camelcase": { + "version": "4.1.0", + "bundled": true + }, + "capture-stack-trace": { + "version": "1.0.0", + "bundled": true + }, + "caseless": { + "version": "0.12.0", + "bundled": true + }, + "chalk": { + "version": "2.4.1", + "bundled": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "chownr": { + "version": "1.0.1", + "bundled": true + }, + "ci-info": { + "version": "1.1.3", + "bundled": true + }, + "cidr-regex": { + "version": "2.0.9", + "bundled": true, + "requires": { + "ip-regex": "^2.1.0" + } + }, + "cli-boxes": { + "version": "1.0.0", + "bundled": true + }, + 
"cli-columns": { + "version": "3.1.2", + "bundled": true, + "requires": { + "string-width": "^2.0.0", + "strip-ansi": "^3.0.1" + } + }, + "cli-table3": { + "version": "0.5.0", + "bundled": true, + "requires": { + "colors": "^1.1.2", + "object-assign": "^4.1.0", + "string-width": "^2.1.1" + } + }, + "cliui": { + "version": "4.1.0", + "bundled": true, + "requires": { + "string-width": "^2.1.1", + "strip-ansi": "^4.0.0", + "wrap-ansi": "^2.0.0" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "bundled": true + }, + "strip-ansi": { + "version": "4.0.0", + "bundled": true, + "requires": { + "ansi-regex": "^3.0.0" + } + } + } + }, + "clone": { + "version": "1.0.4", + "bundled": true + }, + "cmd-shim": { + "version": "2.0.2", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2", + "mkdirp": "~0.5.0" + } + }, + "co": { + "version": "4.6.0", + "bundled": true + }, + "code-point-at": { + "version": "1.1.0", + "bundled": true + }, + "color-convert": { + "version": "1.9.1", + "bundled": true, + "requires": { + "color-name": "^1.1.1" + } + }, + "color-name": { + "version": "1.1.3", + "bundled": true + }, + "colors": { + "version": "1.3.0", + "bundled": true, + "optional": true + }, + "columnify": { + "version": "1.5.4", + "bundled": true, + "requires": { + "strip-ansi": "^3.0.0", + "wcwidth": "^1.0.0" + } + }, + "combined-stream": { + "version": "1.0.6", + "bundled": true, + "requires": { + "delayed-stream": "~1.0.0" + } + }, + "concat-map": { + "version": "0.0.1", + "bundled": true + }, + "concat-stream": { + "version": "1.6.2", + "bundled": true, + "requires": { + "buffer-from": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^2.2.2", + "typedarray": "^0.0.6" + } + }, + "config-chain": { + "version": "1.1.11", + "bundled": true, + "requires": { + "ini": "^1.3.4", + "proto-list": "~1.2.1" + } + }, + "configstore": { + "version": "3.1.2", + "bundled": true, + "requires": { + "dot-prop": "^4.1.0", + "graceful-fs": "^4.1.2", + "make-dir": "^1.0.0", + "unique-string": "^1.0.0", + "write-file-atomic": "^2.0.0", + "xdg-basedir": "^3.0.0" + } + }, + "console-control-strings": { + "version": "1.1.0", + "bundled": true + }, + "copy-concurrently": { + "version": "1.0.5", + "bundled": true, + "requires": { + "aproba": "^1.1.1", + "fs-write-stream-atomic": "^1.0.8", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.0" + }, + "dependencies": { + "iferr": { + "version": "0.1.5", + "bundled": true + } + } + }, + "core-util-is": { + "version": "1.0.2", + "bundled": true + }, + "create-error-class": { + "version": "3.0.2", + "bundled": true, + "requires": { + "capture-stack-trace": "^1.0.0" + } + }, + "cross-spawn": { + "version": "5.1.0", + "bundled": true, + "requires": { + "lru-cache": "^4.0.1", + "shebang-command": "^1.2.0", + "which": "^1.2.9" + } + }, + "cryptiles": { + "version": "2.0.5", + "bundled": true, + "requires": { + "boom": "2.x.x" + } + }, + "crypto-random-string": { + "version": "1.0.0", + "bundled": true + }, + "cyclist": { + "version": "0.2.2", + "bundled": true + }, + "dashdash": { + "version": "1.14.1", + "bundled": true, + "requires": { + "assert-plus": "^1.0.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true + } + } + }, + "debug": { + "version": "3.1.0", + "bundled": true, + "requires": { + "ms": "2.0.0" + }, + "dependencies": { + "ms": { + "version": "2.0.0", + "bundled": true + } + } + }, + "debuglog": { + "version": "1.0.1", + "bundled": true + }, + "decamelize": { + "version": "1.2.0", 
+ "bundled": true + }, + "decode-uri-component": { + "version": "0.2.0", + "bundled": true + }, + "deep-extend": { + "version": "0.5.1", + "bundled": true + }, + "defaults": { + "version": "1.0.3", + "bundled": true, + "requires": { + "clone": "^1.0.2" + } + }, + "delayed-stream": { + "version": "1.0.0", + "bundled": true + }, + "delegates": { + "version": "1.0.0", + "bundled": true + }, + "detect-indent": { + "version": "5.0.0", + "bundled": true + }, + "detect-newline": { + "version": "2.1.0", + "bundled": true + }, + "dezalgo": { + "version": "1.0.3", + "bundled": true, + "requires": { + "asap": "^2.0.0", + "wrappy": "1" + } + }, + "dot-prop": { + "version": "4.2.0", + "bundled": true, + "requires": { + "is-obj": "^1.0.0" + } + }, + "dotenv": { + "version": "5.0.1", + "bundled": true + }, + "duplexer3": { + "version": "0.1.4", + "bundled": true + }, + "duplexify": { + "version": "3.6.0", + "bundled": true, + "requires": { + "end-of-stream": "^1.0.0", + "inherits": "^2.0.1", + "readable-stream": "^2.0.0", + "stream-shift": "^1.0.0" + } + }, + "ecc-jsbn": { + "version": "0.1.1", + "bundled": true, + "optional": true, + "requires": { + "jsbn": "~0.1.0" + } + }, + "editor": { + "version": "1.0.0", + "bundled": true + }, + "encoding": { + "version": "0.1.12", + "bundled": true, + "requires": { + "iconv-lite": "~0.4.13" + } + }, + "end-of-stream": { + "version": "1.4.1", + "bundled": true, + "requires": { + "once": "^1.4.0" + } + }, + "err-code": { + "version": "1.1.2", + "bundled": true + }, + "errno": { + "version": "0.1.7", + "bundled": true, + "requires": { + "prr": "~1.0.1" + } + }, + "es6-promise": { + "version": "4.2.4", + "bundled": true + }, + "es6-promisify": { + "version": "5.0.0", + "bundled": true, + "requires": { + "es6-promise": "^4.0.3" + } + }, + "escape-string-regexp": { + "version": "1.0.5", + "bundled": true + }, + "execa": { + "version": "0.7.0", + "bundled": true, + "requires": { + "cross-spawn": "^5.0.1", + "get-stream": "^3.0.0", + "is-stream": "^1.1.0", + "npm-run-path": "^2.0.0", + "p-finally": "^1.0.0", + "signal-exit": "^3.0.0", + "strip-eof": "^1.0.0" + } + }, + "extend": { + "version": "3.0.1", + "bundled": true + }, + "extsprintf": { + "version": "1.3.0", + "bundled": true + }, + "figgy-pudding": { + "version": "3.2.0", + "bundled": true + }, + "find-npm-prefix": { + "version": "1.0.2", + "bundled": true + }, + "find-up": { + "version": "2.1.0", + "bundled": true, + "requires": { + "locate-path": "^2.0.0" + } + }, + "flush-write-stream": { + "version": "1.0.3", + "bundled": true, + "requires": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.4" + } + }, + "forever-agent": { + "version": "0.6.1", + "bundled": true + }, + "form-data": { + "version": "2.1.4", + "bundled": true, + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.5", + "mime-types": "^2.1.12" + } + }, + "from2": { + "version": "2.3.0", + "bundled": true, + "requires": { + "inherits": "^2.0.1", + "readable-stream": "^2.0.0" + } + }, + "fs-minipass": { + "version": "1.2.5", + "bundled": true, + "requires": { + "minipass": "^2.2.1" + } + }, + "fs-vacuum": { + "version": "1.2.10", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2", + "path-is-inside": "^1.0.1", + "rimraf": "^2.5.2" + } + }, + "fs-write-stream-atomic": { + "version": "1.0.10", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2", + "iferr": "^0.1.5", + "imurmurhash": "^0.1.4", + "readable-stream": "1 || 2" + }, + "dependencies": { + "iferr": { + "version": "0.1.5", + "bundled": true + } + } + }, + 
"fs.realpath": { + "version": "1.0.0", + "bundled": true + }, + "fstream": { + "version": "1.0.11", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2", + "inherits": "~2.0.0", + "mkdirp": ">=0.5 0", + "rimraf": "2" + } + }, + "gauge": { + "version": "2.7.4", + "bundled": true, + "requires": { + "aproba": "^1.0.3", + "console-control-strings": "^1.0.0", + "has-unicode": "^2.0.0", + "object-assign": "^4.1.0", + "signal-exit": "^3.0.0", + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1", + "wide-align": "^1.1.0" + }, + "dependencies": { + "string-width": { + "version": "1.0.2", + "bundled": true, + "requires": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + } + } + } + }, + "genfun": { + "version": "4.0.1", + "bundled": true + }, + "gentle-fs": { + "version": "2.0.1", + "bundled": true, + "requires": { + "aproba": "^1.1.2", + "fs-vacuum": "^1.2.10", + "graceful-fs": "^4.1.11", + "iferr": "^0.1.5", + "mkdirp": "^0.5.1", + "path-is-inside": "^1.0.2", + "read-cmd-shim": "^1.0.1", + "slide": "^1.1.6" + }, + "dependencies": { + "iferr": { + "version": "0.1.5", + "bundled": true + } + } + }, + "get-caller-file": { + "version": "1.0.2", + "bundled": true + }, + "get-stream": { + "version": "3.0.0", + "bundled": true + }, + "getpass": { + "version": "0.1.7", + "bundled": true, + "requires": { + "assert-plus": "^1.0.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true + } + } + }, + "glob": { + "version": "7.1.2", + "bundled": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "global-dirs": { + "version": "0.1.1", + "bundled": true, + "requires": { + "ini": "^1.3.4" + } + }, + "got": { + "version": "6.7.1", + "bundled": true, + "requires": { + "create-error-class": "^3.0.0", + "duplexer3": "^0.1.4", + "get-stream": "^3.0.0", + "is-redirect": "^1.0.0", + "is-retry-allowed": "^1.0.0", + "is-stream": "^1.0.0", + "lowercase-keys": "^1.0.0", + "safe-buffer": "^5.0.1", + "timed-out": "^4.0.0", + "unzip-response": "^2.0.1", + "url-parse-lax": "^1.0.0" + } + }, + "graceful-fs": { + "version": "4.1.11", + "bundled": true + }, + "har-schema": { + "version": "1.0.5", + "bundled": true + }, + "har-validator": { + "version": "4.2.1", + "bundled": true, + "requires": { + "ajv": "^4.9.1", + "har-schema": "^1.0.5" + }, + "dependencies": { + "ajv": { + "version": "4.11.8", + "bundled": true, + "requires": { + "co": "^4.6.0", + "json-stable-stringify": "^1.0.1" + } + } + } + }, + "has-flag": { + "version": "3.0.0", + "bundled": true + }, + "has-unicode": { + "version": "2.0.1", + "bundled": true + }, + "hawk": { + "version": "3.1.3", + "bundled": true, + "requires": { + "boom": "2.x.x", + "cryptiles": "2.x.x", + "hoek": "2.x.x", + "sntp": "1.x.x" + } + }, + "hoek": { + "version": "2.16.3", + "bundled": true + }, + "hosted-git-info": { + "version": "2.6.0", + "bundled": true + }, + "http-cache-semantics": { + "version": "3.8.1", + "bundled": true + }, + "http-proxy-agent": { + "version": "2.1.0", + "bundled": true, + "requires": { + "agent-base": "4", + "debug": "3.1.0" + } + }, + "http-signature": { + "version": "1.1.1", + "bundled": true, + "requires": { + "assert-plus": "^0.2.0", + "jsprim": "^1.2.2", + "sshpk": "^1.7.0" + } + }, + "https-proxy-agent": { + "version": "2.2.1", + "bundled": true, + "requires": { + "agent-base": "^4.1.0", + "debug": "^3.1.0" + } + }, + "humanize-ms": { + "version": 
"1.2.1", + "bundled": true, + "requires": { + "ms": "^2.0.0" + } + }, + "iconv-lite": { + "version": "0.4.23", + "bundled": true, + "requires": { + "safer-buffer": ">= 2.1.2 < 3" + } + }, + "iferr": { + "version": "1.0.0", + "bundled": true + }, + "ignore-walk": { + "version": "3.0.1", + "bundled": true, + "requires": { + "minimatch": "^3.0.4" + } + }, + "import-lazy": { + "version": "2.1.0", + "bundled": true + }, + "imurmurhash": { + "version": "0.1.4", + "bundled": true + }, + "inflight": { + "version": "1.0.6", + "bundled": true, + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.3", + "bundled": true + }, + "ini": { + "version": "1.3.5", + "bundled": true + }, + "init-package-json": { + "version": "1.10.3", + "bundled": true, + "requires": { + "glob": "^7.1.1", + "npm-package-arg": "^4.0.0 || ^5.0.0 || ^6.0.0", + "promzard": "^0.3.0", + "read": "~1.0.1", + "read-package-json": "1 || 2", + "semver": "2.x || 3.x || 4 || 5", + "validate-npm-package-license": "^3.0.1", + "validate-npm-package-name": "^3.0.0" + } + }, + "invert-kv": { + "version": "1.0.0", + "bundled": true + }, + "ip": { + "version": "1.1.5", + "bundled": true + }, + "ip-regex": { + "version": "2.1.0", + "bundled": true + }, + "is-builtin-module": { + "version": "1.0.0", + "bundled": true, + "requires": { + "builtin-modules": "^1.0.0" + } + }, + "is-ci": { + "version": "1.1.0", + "bundled": true, + "requires": { + "ci-info": "^1.0.0" + } + }, + "is-cidr": { + "version": "2.0.6", + "bundled": true, + "requires": { + "cidr-regex": "^2.0.8" + } + }, + "is-fullwidth-code-point": { + "version": "1.0.0", + "bundled": true, + "requires": { + "number-is-nan": "^1.0.0" + } + }, + "is-installed-globally": { + "version": "0.1.0", + "bundled": true, + "requires": { + "global-dirs": "^0.1.0", + "is-path-inside": "^1.0.0" + } + }, + "is-npm": { + "version": "1.0.0", + "bundled": true + }, + "is-obj": { + "version": "1.0.1", + "bundled": true + }, + "is-path-inside": { + "version": "1.0.1", + "bundled": true, + "requires": { + "path-is-inside": "^1.0.1" + } + }, + "is-redirect": { + "version": "1.0.0", + "bundled": true + }, + "is-retry-allowed": { + "version": "1.1.0", + "bundled": true + }, + "is-stream": { + "version": "1.1.0", + "bundled": true + }, + "is-typedarray": { + "version": "1.0.0", + "bundled": true + }, + "isarray": { + "version": "1.0.0", + "bundled": true + }, + "isexe": { + "version": "2.0.0", + "bundled": true + }, + "isstream": { + "version": "0.1.2", + "bundled": true + }, + "jsbn": { + "version": "0.1.1", + "bundled": true, + "optional": true + }, + "json-parse-better-errors": { + "version": "1.0.2", + "bundled": true + }, + "json-schema": { + "version": "0.2.3", + "bundled": true + }, + "json-stable-stringify": { + "version": "1.0.1", + "bundled": true, + "requires": { + "jsonify": "~0.0.0" + } + }, + "json-stringify-safe": { + "version": "5.0.1", + "bundled": true + }, + "jsonify": { + "version": "0.0.0", + "bundled": true + }, + "jsonparse": { + "version": "1.3.1", + "bundled": true + }, + "jsprim": { + "version": "1.4.1", + "bundled": true, + "requires": { + "assert-plus": "1.0.0", + "extsprintf": "1.3.0", + "json-schema": "0.2.3", + "verror": "1.10.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true + } + } + }, + "latest-version": { + "version": "3.1.0", + "bundled": true, + "requires": { + "package-json": "^4.0.0" + } + }, + "lazy-property": { + "version": "1.0.0", + "bundled": true + }, + "lcid": { + "version": "1.0.0", + 
"bundled": true, + "requires": { + "invert-kv": "^1.0.0" + } + }, + "libcipm": { + "version": "2.0.0", + "bundled": true, + "requires": { + "bin-links": "^1.1.2", + "bluebird": "^3.5.1", + "find-npm-prefix": "^1.0.2", + "graceful-fs": "^4.1.11", + "lock-verify": "^2.0.2", + "npm-lifecycle": "^2.0.3", + "npm-logical-tree": "^1.2.1", + "npm-package-arg": "^6.1.0", + "pacote": "^8.1.6", + "protoduck": "^5.0.0", + "read-package-json": "^2.0.13", + "rimraf": "^2.6.2", + "worker-farm": "^1.6.0" + } + }, + "libnpmhook": { + "version": "4.0.1", + "bundled": true, + "requires": { + "figgy-pudding": "^3.1.0", + "npm-registry-fetch": "^3.0.0" + }, + "dependencies": { + "npm-registry-fetch": { + "version": "3.1.1", + "bundled": true, + "requires": { + "bluebird": "^3.5.1", + "figgy-pudding": "^3.1.0", + "lru-cache": "^4.1.2", + "make-fetch-happen": "^4.0.0", + "npm-package-arg": "^6.0.0" + } + } + } + }, + "libnpx": { + "version": "10.2.0", + "bundled": true, + "requires": { + "dotenv": "^5.0.1", + "npm-package-arg": "^6.0.0", + "rimraf": "^2.6.2", + "safe-buffer": "^5.1.0", + "update-notifier": "^2.3.0", + "which": "^1.3.0", + "y18n": "^4.0.0", + "yargs": "^11.0.0" + } + }, + "locate-path": { + "version": "2.0.0", + "bundled": true, + "requires": { + "p-locate": "^2.0.0", + "path-exists": "^3.0.0" + } + }, + "lock-verify": { + "version": "2.0.2", + "bundled": true, + "requires": { + "npm-package-arg": "^5.1.2 || 6", + "semver": "^5.4.1" + } + }, + "lockfile": { + "version": "1.0.4", + "bundled": true, + "requires": { + "signal-exit": "^3.0.2" + } + }, + "lodash._baseindexof": { + "version": "3.1.0", + "bundled": true + }, + "lodash._baseuniq": { + "version": "4.6.0", + "bundled": true, + "requires": { + "lodash._createset": "~4.0.0", + "lodash._root": "~3.0.0" + } + }, + "lodash._bindcallback": { + "version": "3.0.1", + "bundled": true + }, + "lodash._cacheindexof": { + "version": "3.0.2", + "bundled": true + }, + "lodash._createcache": { + "version": "3.1.2", + "bundled": true, + "requires": { + "lodash._getnative": "^3.0.0" + } + }, + "lodash._createset": { + "version": "4.0.3", + "bundled": true + }, + "lodash._getnative": { + "version": "3.9.1", + "bundled": true + }, + "lodash._root": { + "version": "3.0.1", + "bundled": true + }, + "lodash.clonedeep": { + "version": "4.5.0", + "bundled": true + }, + "lodash.restparam": { + "version": "3.6.1", + "bundled": true + }, + "lodash.union": { + "version": "4.6.0", + "bundled": true + }, + "lodash.uniq": { + "version": "4.5.0", + "bundled": true + }, + "lodash.without": { + "version": "4.4.0", + "bundled": true + }, + "lowercase-keys": { + "version": "1.0.1", + "bundled": true + }, + "lru-cache": { + "version": "4.1.3", + "bundled": true, + "requires": { + "pseudomap": "^1.0.2", + "yallist": "^2.1.2" + } + }, + "make-dir": { + "version": "1.3.0", + "bundled": true, + "requires": { + "pify": "^3.0.0" + } + }, + "make-fetch-happen": { + "version": "4.0.1", + "bundled": true, + "requires": { + "agentkeepalive": "^3.4.1", + "cacache": "^11.0.1", + "http-cache-semantics": "^3.8.1", + "http-proxy-agent": "^2.1.0", + "https-proxy-agent": "^2.2.1", + "lru-cache": "^4.1.2", + "mississippi": "^3.0.0", + "node-fetch-npm": "^2.0.2", + "promise-retry": "^1.1.1", + "socks-proxy-agent": "^4.0.0", + "ssri": "^6.0.0" + } + }, + "meant": { + "version": "1.0.1", + "bundled": true + }, + "mem": { + "version": "1.1.0", + "bundled": true, + "requires": { + "mimic-fn": "^1.0.0" + } + }, + "mime-db": { + "version": "1.33.0", + "bundled": true + }, + "mime-types": { + 
"version": "2.1.18", + "bundled": true, + "requires": { + "mime-db": "~1.33.0" + } + }, + "mimic-fn": { + "version": "1.2.0", + "bundled": true + }, + "minimatch": { + "version": "3.0.4", + "bundled": true, + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "bundled": true + }, + "minipass": { + "version": "2.3.3", + "bundled": true, + "requires": { + "safe-buffer": "^5.1.2", + "yallist": "^3.0.0" + }, + "dependencies": { + "yallist": { + "version": "3.0.2", + "bundled": true + } + } + }, + "minizlib": { + "version": "1.1.0", + "bundled": true, + "requires": { + "minipass": "^2.2.1" + } + }, + "mississippi": { + "version": "3.0.0", + "bundled": true, + "requires": { + "concat-stream": "^1.5.0", + "duplexify": "^3.4.2", + "end-of-stream": "^1.1.0", + "flush-write-stream": "^1.0.0", + "from2": "^2.1.0", + "parallel-transform": "^1.1.0", + "pump": "^3.0.0", + "pumpify": "^1.3.3", + "stream-each": "^1.1.0", + "through2": "^2.0.0" + } + }, + "mkdirp": { + "version": "0.5.1", + "bundled": true, + "requires": { + "minimist": "0.0.8" + } + }, + "move-concurrently": { + "version": "1.0.1", + "bundled": true, + "requires": { + "aproba": "^1.1.1", + "copy-concurrently": "^1.0.0", + "fs-write-stream-atomic": "^1.0.8", + "mkdirp": "^0.5.1", + "rimraf": "^2.5.4", + "run-queue": "^1.0.3" + } + }, + "ms": { + "version": "2.1.1", + "bundled": true + }, + "mute-stream": { + "version": "0.0.7", + "bundled": true + }, + "node-fetch-npm": { + "version": "2.0.2", + "bundled": true, + "requires": { + "encoding": "^0.1.11", + "json-parse-better-errors": "^1.0.0", + "safe-buffer": "^5.1.1" + } + }, + "node-gyp": { + "version": "3.7.0", + "bundled": true, + "requires": { + "fstream": "^1.0.0", + "glob": "^7.0.3", + "graceful-fs": "^4.1.2", + "mkdirp": "^0.5.0", + "nopt": "2 || 3", + "npmlog": "0 || 1 || 2 || 3 || 4", + "osenv": "0", + "request": ">=2.9.0 <2.82.0", + "rimraf": "2", + "semver": "~5.3.0", + "tar": "^2.0.0", + "which": "1" + }, + "dependencies": { + "nopt": { + "version": "3.0.6", + "bundled": true, + "requires": { + "abbrev": "1" + } + }, + "semver": { + "version": "5.3.0", + "bundled": true + }, + "tar": { + "version": "2.2.1", + "bundled": true, + "requires": { + "block-stream": "*", + "fstream": "^1.0.2", + "inherits": "2" + } + } + } + }, + "nopt": { + "version": "4.0.1", + "bundled": true, + "requires": { + "abbrev": "1", + "osenv": "^0.1.4" + } + }, + "normalize-package-data": { + "version": "2.4.0", + "bundled": true, + "requires": { + "hosted-git-info": "^2.1.4", + "is-builtin-module": "^1.0.0", + "semver": "2 || 3 || 4 || 5", + "validate-npm-package-license": "^3.0.1" + } + }, + "npm-audit-report": { + "version": "1.3.1", + "bundled": true, + "requires": { + "cli-table3": "^0.5.0", + "console-control-strings": "^1.1.0" + } + }, + "npm-bundled": { + "version": "1.0.3", + "bundled": true + }, + "npm-cache-filename": { + "version": "1.0.2", + "bundled": true + }, + "npm-install-checks": { + "version": "3.0.0", + "bundled": true, + "requires": { + "semver": "^2.3.0 || 3.x || 4 || 5" + } + }, + "npm-lifecycle": { + "version": "2.0.3", + "bundled": true, + "requires": { + "byline": "^5.0.0", + "graceful-fs": "^4.1.11", + "node-gyp": "^3.6.2", + "resolve-from": "^4.0.0", + "slide": "^1.1.6", + "uid-number": "0.0.6", + "umask": "^1.1.0", + "which": "^1.3.0" + } + }, + "npm-logical-tree": { + "version": "1.2.1", + "bundled": true + }, + "npm-package-arg": { + "version": "6.1.0", + "bundled": true, + "requires": { + "hosted-git-info": "^2.6.0", + "osenv": 
"^0.1.5", + "semver": "^5.5.0", + "validate-npm-package-name": "^3.0.0" + } + }, + "npm-packlist": { + "version": "1.1.10", + "bundled": true, + "requires": { + "ignore-walk": "^3.0.1", + "npm-bundled": "^1.0.1" + } + }, + "npm-pick-manifest": { + "version": "2.1.0", + "bundled": true, + "requires": { + "npm-package-arg": "^6.0.0", + "semver": "^5.4.1" + } + }, + "npm-profile": { + "version": "3.0.2", + "bundled": true, + "requires": { + "aproba": "^1.1.2 || 2", + "make-fetch-happen": "^2.5.0 || 3 || 4" + } + }, + "npm-registry-client": { + "version": "8.5.1", + "bundled": true, + "requires": { + "concat-stream": "^1.5.2", + "graceful-fs": "^4.1.6", + "normalize-package-data": "~1.0.1 || ^2.0.0", + "npm-package-arg": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0", + "npmlog": "2 || ^3.1.0 || ^4.0.0", + "once": "^1.3.3", + "request": "^2.74.0", + "retry": "^0.10.0", + "safe-buffer": "^5.1.1", + "semver": "2 >=2.2.1 || 3.x || 4 || 5", + "slide": "^1.1.3", + "ssri": "^5.2.4" + }, + "dependencies": { + "retry": { + "version": "0.10.1", + "bundled": true + }, + "ssri": { + "version": "5.3.0", + "bundled": true, + "requires": { + "safe-buffer": "^5.1.1" + } + } + } + }, + "npm-registry-fetch": { + "version": "1.1.0", + "bundled": true, + "requires": { + "bluebird": "^3.5.1", + "figgy-pudding": "^2.0.1", + "lru-cache": "^4.1.2", + "make-fetch-happen": "^3.0.0", + "npm-package-arg": "^6.0.0", + "safe-buffer": "^5.1.1" + }, + "dependencies": { + "cacache": { + "version": "10.0.4", + "bundled": true, + "requires": { + "bluebird": "^3.5.1", + "chownr": "^1.0.1", + "glob": "^7.1.2", + "graceful-fs": "^4.1.11", + "lru-cache": "^4.1.1", + "mississippi": "^2.0.0", + "mkdirp": "^0.5.1", + "move-concurrently": "^1.0.1", + "promise-inflight": "^1.0.1", + "rimraf": "^2.6.2", + "ssri": "^5.2.4", + "unique-filename": "^1.1.0", + "y18n": "^4.0.0" + }, + "dependencies": { + "mississippi": { + "version": "2.0.0", + "bundled": true, + "requires": { + "concat-stream": "^1.5.0", + "duplexify": "^3.4.2", + "end-of-stream": "^1.1.0", + "flush-write-stream": "^1.0.0", + "from2": "^2.1.0", + "parallel-transform": "^1.1.0", + "pump": "^2.0.1", + "pumpify": "^1.3.3", + "stream-each": "^1.1.0", + "through2": "^2.0.0" + } + } + } + }, + "figgy-pudding": { + "version": "2.0.1", + "bundled": true + }, + "make-fetch-happen": { + "version": "3.0.0", + "bundled": true, + "requires": { + "agentkeepalive": "^3.4.1", + "cacache": "^10.0.4", + "http-cache-semantics": "^3.8.1", + "http-proxy-agent": "^2.1.0", + "https-proxy-agent": "^2.2.0", + "lru-cache": "^4.1.2", + "mississippi": "^3.0.0", + "node-fetch-npm": "^2.0.2", + "promise-retry": "^1.1.1", + "socks-proxy-agent": "^3.0.1", + "ssri": "^5.2.4" + } + }, + "pump": { + "version": "2.0.1", + "bundled": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "smart-buffer": { + "version": "1.1.15", + "bundled": true + }, + "socks": { + "version": "1.1.10", + "bundled": true, + "requires": { + "ip": "^1.1.4", + "smart-buffer": "^1.0.13" + } + }, + "socks-proxy-agent": { + "version": "3.0.1", + "bundled": true, + "requires": { + "agent-base": "^4.1.0", + "socks": "^1.1.10" + } + }, + "ssri": { + "version": "5.3.0", + "bundled": true, + "requires": { + "safe-buffer": "^5.1.1" + } + } + } + }, + "npm-run-path": { + "version": "2.0.2", + "bundled": true, + "requires": { + "path-key": "^2.0.0" + } + }, + "npm-user-validate": { + "version": "1.0.0", + "bundled": true + }, + "npmlog": { + "version": "4.1.2", + "bundled": true, + "requires": { + "are-we-there-yet": 
"~1.1.2", + "console-control-strings": "~1.1.0", + "gauge": "~2.7.3", + "set-blocking": "~2.0.0" + } + }, + "number-is-nan": { + "version": "1.0.1", + "bundled": true + }, + "oauth-sign": { + "version": "0.8.2", + "bundled": true + }, + "object-assign": { + "version": "4.1.1", + "bundled": true + }, + "once": { + "version": "1.4.0", + "bundled": true, + "requires": { + "wrappy": "1" + } + }, + "opener": { + "version": "1.4.3", + "bundled": true + }, + "os-homedir": { + "version": "1.0.2", + "bundled": true + }, + "os-locale": { + "version": "2.1.0", + "bundled": true, + "requires": { + "execa": "^0.7.0", + "lcid": "^1.0.0", + "mem": "^1.1.0" + } + }, + "os-tmpdir": { + "version": "1.0.2", + "bundled": true + }, + "osenv": { + "version": "0.1.5", + "bundled": true, + "requires": { + "os-homedir": "^1.0.0", + "os-tmpdir": "^1.0.0" + } + }, + "p-finally": { + "version": "1.0.0", + "bundled": true + }, + "p-limit": { + "version": "1.2.0", + "bundled": true, + "requires": { + "p-try": "^1.0.0" + } + }, + "p-locate": { + "version": "2.0.0", + "bundled": true, + "requires": { + "p-limit": "^1.1.0" + } + }, + "p-try": { + "version": "1.0.0", + "bundled": true + }, + "package-json": { + "version": "4.0.1", + "bundled": true, + "requires": { + "got": "^6.7.1", + "registry-auth-token": "^3.0.1", + "registry-url": "^3.0.3", + "semver": "^5.1.0" + } + }, + "pacote": { + "version": "8.1.6", + "bundled": true, + "requires": { + "bluebird": "^3.5.1", + "cacache": "^11.0.2", + "get-stream": "^3.0.0", + "glob": "^7.1.2", + "lru-cache": "^4.1.3", + "make-fetch-happen": "^4.0.1", + "minimatch": "^3.0.4", + "minipass": "^2.3.3", + "mississippi": "^3.0.0", + "mkdirp": "^0.5.1", + "normalize-package-data": "^2.4.0", + "npm-package-arg": "^6.1.0", + "npm-packlist": "^1.1.10", + "npm-pick-manifest": "^2.1.0", + "osenv": "^0.1.5", + "promise-inflight": "^1.0.1", + "promise-retry": "^1.1.1", + "protoduck": "^5.0.0", + "rimraf": "^2.6.2", + "safe-buffer": "^5.1.2", + "semver": "^5.5.0", + "ssri": "^6.0.0", + "tar": "^4.4.3", + "unique-filename": "^1.1.0", + "which": "^1.3.0" + } + }, + "parallel-transform": { + "version": "1.1.0", + "bundled": true, + "requires": { + "cyclist": "~0.2.2", + "inherits": "^2.0.3", + "readable-stream": "^2.1.5" + } + }, + "path-exists": { + "version": "3.0.0", + "bundled": true + }, + "path-is-absolute": { + "version": "1.0.1", + "bundled": true + }, + "path-is-inside": { + "version": "1.0.2", + "bundled": true + }, + "path-key": { + "version": "2.0.1", + "bundled": true + }, + "performance-now": { + "version": "0.2.0", + "bundled": true + }, + "pify": { + "version": "3.0.0", + "bundled": true + }, + "prepend-http": { + "version": "1.0.4", + "bundled": true + }, + "process-nextick-args": { + "version": "2.0.0", + "bundled": true + }, + "promise-inflight": { + "version": "1.0.1", + "bundled": true + }, + "promise-retry": { + "version": "1.1.1", + "bundled": true, + "requires": { + "err-code": "^1.0.0", + "retry": "^0.10.0" + }, + "dependencies": { + "retry": { + "version": "0.10.1", + "bundled": true + } + } + }, + "promzard": { + "version": "0.3.0", + "bundled": true, + "requires": { + "read": "1" + } + }, + "proto-list": { + "version": "1.2.4", + "bundled": true + }, + "protoduck": { + "version": "5.0.0", + "bundled": true, + "requires": { + "genfun": "^4.0.1" + } + }, + "prr": { + "version": "1.0.1", + "bundled": true + }, + "pseudomap": { + "version": "1.0.2", + "bundled": true + }, + "pump": { + "version": "3.0.0", + "bundled": true, + "requires": { + "end-of-stream": "^1.1.0", + 
"once": "^1.3.1" + } + }, + "pumpify": { + "version": "1.5.1", + "bundled": true, + "requires": { + "duplexify": "^3.6.0", + "inherits": "^2.0.3", + "pump": "^2.0.0" + }, + "dependencies": { + "pump": { + "version": "2.0.1", + "bundled": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + } + } + }, + "punycode": { + "version": "1.4.1", + "bundled": true + }, + "qrcode-terminal": { + "version": "0.12.0", + "bundled": true + }, + "qs": { + "version": "6.4.0", + "bundled": true + }, + "query-string": { + "version": "6.1.0", + "bundled": true, + "requires": { + "decode-uri-component": "^0.2.0", + "strict-uri-encode": "^2.0.0" + } + }, + "qw": { + "version": "1.0.1", + "bundled": true + }, + "rc": { + "version": "1.2.7", + "bundled": true, + "requires": { + "deep-extend": "^0.5.1", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "dependencies": { + "minimist": { + "version": "1.2.0", + "bundled": true + } + } + }, + "read": { + "version": "1.0.7", + "bundled": true, + "requires": { + "mute-stream": "~0.0.4" + } + }, + "read-cmd-shim": { + "version": "1.0.1", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2" + } + }, + "read-installed": { + "version": "4.0.3", + "bundled": true, + "requires": { + "debuglog": "^1.0.1", + "graceful-fs": "^4.1.2", + "read-package-json": "^2.0.0", + "readdir-scoped-modules": "^1.0.0", + "semver": "2 || 3 || 4 || 5", + "slide": "~1.1.3", + "util-extend": "^1.0.1" + } + }, + "read-package-json": { + "version": "2.0.13", + "bundled": true, + "requires": { + "glob": "^7.1.1", + "graceful-fs": "^4.1.2", + "json-parse-better-errors": "^1.0.1", + "normalize-package-data": "^2.0.0", + "slash": "^1.0.0" + } + }, + "read-package-tree": { + "version": "5.2.1", + "bundled": true, + "requires": { + "debuglog": "^1.0.1", + "dezalgo": "^1.0.0", + "once": "^1.3.0", + "read-package-json": "^2.0.0", + "readdir-scoped-modules": "^1.0.0" + } + }, + "readable-stream": { + "version": "2.3.6", + "bundled": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "readdir-scoped-modules": { + "version": "1.0.2", + "bundled": true, + "requires": { + "debuglog": "^1.0.1", + "dezalgo": "^1.0.0", + "graceful-fs": "^4.1.2", + "once": "^1.3.0" + } + }, + "registry-auth-token": { + "version": "3.3.2", + "bundled": true, + "requires": { + "rc": "^1.1.6", + "safe-buffer": "^5.0.1" + } + }, + "registry-url": { + "version": "3.1.0", + "bundled": true, + "requires": { + "rc": "^1.0.1" + } + }, + "request": { + "version": "2.81.0", + "bundled": true, + "requires": { + "aws-sign2": "~0.6.0", + "aws4": "^1.2.1", + "caseless": "~0.12.0", + "combined-stream": "~1.0.5", + "extend": "~3.0.0", + "forever-agent": "~0.6.1", + "form-data": "~2.1.1", + "har-validator": "~4.2.1", + "hawk": "~3.1.3", + "http-signature": "~1.1.0", + "is-typedarray": "~1.0.0", + "isstream": "~0.1.2", + "json-stringify-safe": "~5.0.1", + "mime-types": "~2.1.7", + "oauth-sign": "~0.8.1", + "performance-now": "^0.2.0", + "qs": "~6.4.0", + "safe-buffer": "^5.0.1", + "stringstream": "~0.0.4", + "tough-cookie": "~2.3.0", + "tunnel-agent": "^0.6.0", + "uuid": "^3.0.0" + } + }, + "require-directory": { + "version": "2.1.1", + "bundled": true + }, + "require-main-filename": { + "version": "1.0.1", + "bundled": true + }, + "resolve-from": { + "version": "4.0.0", + "bundled": true + }, + "retry": { + 
"version": "0.12.0", + "bundled": true + }, + "rimraf": { + "version": "2.6.2", + "bundled": true, + "requires": { + "glob": "^7.0.5" + } + }, + "run-queue": { + "version": "1.0.3", + "bundled": true, + "requires": { + "aproba": "^1.1.1" + } + }, + "safe-buffer": { + "version": "5.1.2", + "bundled": true + }, + "safer-buffer": { + "version": "2.1.2", + "bundled": true + }, + "semver": { + "version": "5.5.0", + "bundled": true + }, + "semver-diff": { + "version": "2.1.0", + "bundled": true, + "requires": { + "semver": "^5.0.3" + } + }, + "set-blocking": { + "version": "2.0.0", + "bundled": true + }, + "sha": { + "version": "2.0.1", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.2", + "readable-stream": "^2.0.2" + } + }, + "shebang-command": { + "version": "1.2.0", + "bundled": true, + "requires": { + "shebang-regex": "^1.0.0" + } + }, + "shebang-regex": { + "version": "1.0.0", + "bundled": true + }, + "signal-exit": { + "version": "3.0.2", + "bundled": true + }, + "slash": { + "version": "1.0.0", + "bundled": true + }, + "slide": { + "version": "1.1.6", + "bundled": true + }, + "smart-buffer": { + "version": "4.0.1", + "bundled": true + }, + "sntp": { + "version": "1.0.9", + "bundled": true, + "requires": { + "hoek": "2.x.x" + } + }, + "socks": { + "version": "2.2.0", + "bundled": true, + "requires": { + "ip": "^1.1.5", + "smart-buffer": "^4.0.1" + } + }, + "socks-proxy-agent": { + "version": "4.0.1", + "bundled": true, + "requires": { + "agent-base": "~4.2.0", + "socks": "~2.2.0" + } + }, + "sorted-object": { + "version": "2.0.1", + "bundled": true + }, + "sorted-union-stream": { + "version": "2.1.3", + "bundled": true, + "requires": { + "from2": "^1.3.0", + "stream-iterate": "^1.1.0" + }, + "dependencies": { + "from2": { + "version": "1.3.0", + "bundled": true, + "requires": { + "inherits": "~2.0.1", + "readable-stream": "~1.1.10" + } + }, + "isarray": { + "version": "0.0.1", + "bundled": true + }, + "readable-stream": { + "version": "1.1.14", + "bundled": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "0.0.1", + "string_decoder": "~0.10.x" + } + }, + "string_decoder": { + "version": "0.10.31", + "bundled": true + } + } + }, + "spdx-correct": { + "version": "3.0.0", + "bundled": true, + "requires": { + "spdx-expression-parse": "^3.0.0", + "spdx-license-ids": "^3.0.0" + } + }, + "spdx-exceptions": { + "version": "2.1.0", + "bundled": true + }, + "spdx-expression-parse": { + "version": "3.0.0", + "bundled": true, + "requires": { + "spdx-exceptions": "^2.1.0", + "spdx-license-ids": "^3.0.0" + } + }, + "spdx-license-ids": { + "version": "3.0.0", + "bundled": true + }, + "sshpk": { + "version": "1.14.2", + "bundled": true, + "requires": { + "asn1": "~0.2.3", + "assert-plus": "^1.0.0", + "bcrypt-pbkdf": "^1.0.0", + "dashdash": "^1.12.0", + "ecc-jsbn": "~0.1.1", + "getpass": "^0.1.1", + "jsbn": "~0.1.0", + "safer-buffer": "^2.0.2", + "tweetnacl": "~0.14.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true + } + } + }, + "ssri": { + "version": "6.0.0", + "bundled": true + }, + "stream-each": { + "version": "1.2.2", + "bundled": true, + "requires": { + "end-of-stream": "^1.1.0", + "stream-shift": "^1.0.0" + } + }, + "stream-iterate": { + "version": "1.2.0", + "bundled": true, + "requires": { + "readable-stream": "^2.1.5", + "stream-shift": "^1.0.0" + } + }, + "stream-shift": { + "version": "1.0.0", + "bundled": true + }, + "strict-uri-encode": { + "version": "2.0.0", + "bundled": true + }, + "string-width": { + 
"version": "2.1.1", + "bundled": true, + "requires": { + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^4.0.0" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "bundled": true + }, + "is-fullwidth-code-point": { + "version": "2.0.0", + "bundled": true + }, + "strip-ansi": { + "version": "4.0.0", + "bundled": true, + "requires": { + "ansi-regex": "^3.0.0" + } + } + } + }, + "string_decoder": { + "version": "1.1.1", + "bundled": true, + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "stringify-package": { + "version": "1.0.0", + "bundled": true + }, + "stringstream": { + "version": "0.0.6", + "bundled": true + }, + "strip-ansi": { + "version": "3.0.1", + "bundled": true, + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "strip-eof": { + "version": "1.0.0", + "bundled": true + }, + "strip-json-comments": { + "version": "2.0.1", + "bundled": true + }, + "supports-color": { + "version": "5.4.0", + "bundled": true, + "requires": { + "has-flag": "^3.0.0" + } + }, + "tar": { + "version": "4.4.4", + "bundled": true, + "requires": { + "chownr": "^1.0.1", + "fs-minipass": "^1.2.5", + "minipass": "^2.3.3", + "minizlib": "^1.1.0", + "mkdirp": "^0.5.0", + "safe-buffer": "^5.1.2", + "yallist": "^3.0.2" + }, + "dependencies": { + "yallist": { + "version": "3.0.2", + "bundled": true + } + } + }, + "term-size": { + "version": "1.2.0", + "bundled": true, + "requires": { + "execa": "^0.7.0" + } + }, + "text-table": { + "version": "0.2.0", + "bundled": true + }, + "through": { + "version": "2.3.8", + "bundled": true + }, + "through2": { + "version": "2.0.3", + "bundled": true, + "requires": { + "readable-stream": "^2.1.5", + "xtend": "~4.0.1" + } + }, + "timed-out": { + "version": "4.0.1", + "bundled": true + }, + "tiny-relative-date": { + "version": "1.3.0", + "bundled": true + }, + "tough-cookie": { + "version": "2.3.4", + "bundled": true, + "requires": { + "punycode": "^1.4.1" + } + }, + "tunnel-agent": { + "version": "0.6.0", + "bundled": true, + "requires": { + "safe-buffer": "^5.0.1" + } + }, + "tweetnacl": { + "version": "0.14.5", + "bundled": true, + "optional": true + }, + "typedarray": { + "version": "0.0.6", + "bundled": true + }, + "uid-number": { + "version": "0.0.6", + "bundled": true + }, + "umask": { + "version": "1.1.0", + "bundled": true + }, + "unique-filename": { + "version": "1.1.0", + "bundled": true, + "requires": { + "unique-slug": "^2.0.0" + } + }, + "unique-slug": { + "version": "2.0.0", + "bundled": true, + "requires": { + "imurmurhash": "^0.1.4" + } + }, + "unique-string": { + "version": "1.0.0", + "bundled": true, + "requires": { + "crypto-random-string": "^1.0.0" + } + }, + "unpipe": { + "version": "1.0.0", + "bundled": true + }, + "unzip-response": { + "version": "2.0.1", + "bundled": true + }, + "update-notifier": { + "version": "2.5.0", + "bundled": true, + "requires": { + "boxen": "^1.2.1", + "chalk": "^2.0.1", + "configstore": "^3.0.0", + "import-lazy": "^2.1.0", + "is-ci": "^1.0.10", + "is-installed-globally": "^0.1.0", + "is-npm": "^1.0.0", + "latest-version": "^3.0.0", + "semver-diff": "^2.0.0", + "xdg-basedir": "^3.0.0" + } + }, + "url-parse-lax": { + "version": "1.0.0", + "bundled": true, + "requires": { + "prepend-http": "^1.0.1" + } + }, + "util-deprecate": { + "version": "1.0.2", + "bundled": true + }, + "util-extend": { + "version": "1.0.3", + "bundled": true + }, + "uuid": { + "version": "3.3.2", + "bundled": true + }, + "validate-npm-package-license": { + "version": "3.0.3", + "bundled": true, + "requires": { + "spdx-correct": 
"^3.0.0", + "spdx-expression-parse": "^3.0.0" + } + }, + "validate-npm-package-name": { + "version": "3.0.0", + "bundled": true, + "requires": { + "builtins": "^1.0.3" + } + }, + "verror": { + "version": "1.10.0", + "bundled": true, + "requires": { + "assert-plus": "^1.0.0", + "core-util-is": "1.0.2", + "extsprintf": "^1.2.0" + }, + "dependencies": { + "assert-plus": { + "version": "1.0.0", + "bundled": true + } + } + }, + "wcwidth": { + "version": "1.0.1", + "bundled": true, + "requires": { + "defaults": "^1.0.3" + } + }, + "which": { + "version": "1.3.1", + "bundled": true, + "requires": { + "isexe": "^2.0.0" + } + }, + "which-module": { + "version": "2.0.0", + "bundled": true + }, + "wide-align": { + "version": "1.1.2", + "bundled": true, + "requires": { + "string-width": "^1.0.2" + }, + "dependencies": { + "string-width": { + "version": "1.0.2", + "bundled": true, + "requires": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + } + } + } + }, + "widest-line": { + "version": "2.0.0", + "bundled": true, + "requires": { + "string-width": "^2.1.1" + } + }, + "worker-farm": { + "version": "1.6.0", + "bundled": true, + "requires": { + "errno": "~0.1.7" + } + }, + "wrap-ansi": { + "version": "2.1.0", + "bundled": true, + "requires": { + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1" + }, + "dependencies": { + "string-width": { + "version": "1.0.2", + "bundled": true, + "requires": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + } + } + } + }, + "wrappy": { + "version": "1.0.2", + "bundled": true + }, + "write-file-atomic": { + "version": "2.3.0", + "bundled": true, + "requires": { + "graceful-fs": "^4.1.11", + "imurmurhash": "^0.1.4", + "signal-exit": "^3.0.2" + } + }, + "xdg-basedir": { + "version": "3.0.0", + "bundled": true + }, + "xtend": { + "version": "4.0.1", + "bundled": true + }, + "y18n": { + "version": "4.0.0", + "bundled": true + }, + "yallist": { + "version": "2.1.2", + "bundled": true + }, + "yargs": { + "version": "11.0.0", + "bundled": true, + "requires": { + "cliui": "^4.0.0", + "decamelize": "^1.1.1", + "find-up": "^2.1.0", + "get-caller-file": "^1.0.1", + "os-locale": "^2.0.0", + "require-directory": "^2.1.1", + "require-main-filename": "^1.0.1", + "set-blocking": "^2.0.0", + "string-width": "^2.0.0", + "which-module": "^2.0.0", + "y18n": "^3.2.1", + "yargs-parser": "^9.0.2" + }, + "dependencies": { + "y18n": { + "version": "3.2.1", + "bundled": true + } + } + }, + "yargs-parser": { + "version": "9.0.2", + "bundled": true, + "requires": { + "camelcase": "^4.1.0" + } + } + } + }, + "npm-run-path": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-2.0.2.tgz", + "integrity": "sha1-NakjLfo11wZ7TLLd8jV7GHFTbF8=", + "dev": true, + "requires": { + "path-key": "^2.0.0" + } + }, + "num2fraction": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/num2fraction/-/num2fraction-1.2.2.tgz", + "integrity": "sha1-b2gragJ6Tp3fpFZM0lidHU5mnt4=", + "dev": true + }, + "number-is-nan": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", + "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=", + "dev": true + }, + "nvd3": { + "version": "1.8.6", + "resolved": "https://registry.npmjs.org/nvd3/-/nvd3-1.8.6.tgz", + "integrity": "sha1-LT66dL8zNjtRAevx0JPFmlOuc8Q=" + }, + "object-assign": { + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", 
+ "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", + "dev": true + }, + "object-copy": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/object-copy/-/object-copy-0.1.0.tgz", + "integrity": "sha1-fn2Fi3gb18mRpBupde04EnVOmYw=", + "dev": true, + "requires": { + "copy-descriptor": "^0.1.0", + "define-property": "^0.2.5", + "kind-of": "^3.0.3" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + } + } + }, + "object-keys": { + "version": "1.0.12", + "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.0.12.tgz", + "integrity": "sha512-FTMyFUm2wBcGHnH2eXmz7tC6IwlqQZ6mVZ+6dm6vZ4IQIHjs6FdNsQBuKGPuUUUY6NfJw2PshC08Tn6LzLDOag==", + "dev": true + }, + "object-visit": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/object-visit/-/object-visit-1.0.1.tgz", + "integrity": "sha1-95xEk68MU3e1n+OdOV5BBC3QRbs=", + "dev": true, + "requires": { + "isobject": "^3.0.0" + } + }, + "object.assign": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.0.tgz", + "integrity": "sha512-exHJeq6kBKj58mqGyTQ9DFvrZC/eR6OwxzoM9YRoGBqrXYonaFyGiFMuc9VZrXf7DarreEwMpurG3dd+CNyW5w==", + "dev": true, + "requires": { + "define-properties": "^1.1.2", + "function-bind": "^1.1.1", + "has-symbols": "^1.0.0", + "object-keys": "^1.0.11" + } + }, + "object.entries": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/object.entries/-/object.entries-1.0.4.tgz", + "integrity": "sha1-G/mk3SKI9bM/Opk9JXZh8F0WGl8=", + "dev": true, + "requires": { + "define-properties": "^1.1.2", + "es-abstract": "^1.6.1", + "function-bind": "^1.1.0", + "has": "^1.0.1" + } + }, + "object.pick": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz", + "integrity": "sha1-h6EKxMFpS9Lhy/U1kaZhQftd10c=", + "dev": true, + "requires": { + "isobject": "^3.0.1" + } + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "dev": true, + "requires": { + "wrappy": "1" + } + }, + "onetime": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-2.0.1.tgz", + "integrity": "sha1-BnQoIw/WdEOyeUsiu6UotoZ5YtQ=", + "dev": true, + "requires": { + "mimic-fn": "^1.0.0" + } + }, + "optimist": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/optimist/-/optimist-0.6.1.tgz", + "integrity": "sha1-2j6nRob6IaGaERwybpDrFaAZZoY=", + "dev": true, + "requires": { + "minimist": "~0.0.1", + "wordwrap": "~0.0.2" + }, + "dependencies": { + "wordwrap": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-0.0.3.tgz", + "integrity": "sha1-o9XabNXAvAAI03I0u68b7WMFkQc=", + "dev": true + } + } + }, + "optionator": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.8.2.tgz", + "integrity": "sha1-NkxeQJ0/TWMB1sC0wFu6UBgK62Q=", + "dev": true, + "requires": { + "deep-is": "~0.1.3", + "fast-levenshtein": "~2.0.4", + "levn": "~0.3.0", + "prelude-ls": "~1.1.2", + "type-check": "~0.3.2", + "wordwrap": "~1.0.0" + } + }, + "os-browserify": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/os-browserify/-/os-browserify-0.3.0.tgz", + "integrity": "sha1-hUNzx/XCMVkU/Jv8a9gjj92h7Cc=", + "dev": true + }, + 
"os-homedir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", + "integrity": "sha1-/7xJiDNuDoM94MFox+8VISGqf7M=", + "dev": true + }, + "os-locale": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/os-locale/-/os-locale-2.1.0.tgz", + "integrity": "sha512-3sslG3zJbEYcaC4YVAvDorjGxc7tv6KVATnLPZONiljsUncvihe9BQoVCEs0RZ1kmf4Hk9OBqlZfJZWI4GanKA==", + "dev": true, + "requires": { + "execa": "^0.7.0", + "lcid": "^1.0.0", + "mem": "^1.1.0" + } + }, + "os-tmpdir": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", + "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", + "dev": true + }, + "p-finally": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-finally/-/p-finally-1.0.0.tgz", + "integrity": "sha1-P7z7FbiZpEEjs0ttzBi3JDNqLK4=", + "dev": true + }, + "p-limit": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-1.3.0.tgz", + "integrity": "sha512-vvcXsLAJ9Dr5rQOPk7toZQZJApBl2K4J6dANSsEuh6QI41JYcsS/qhTGa9ErIUUgK3WNQoJYvylxvjqmiqEA9Q==", + "dev": true, + "requires": { + "p-try": "^1.0.0" + } + }, + "p-locate": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-2.0.0.tgz", + "integrity": "sha1-IKAQOyIqcMj9OcwuWAaA893l7EM=", + "dev": true, + "requires": { + "p-limit": "^1.1.0" + } + }, + "p-try": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-1.0.0.tgz", + "integrity": "sha1-y8ec26+P1CKOE/Yh8rGiN8GyB7M=", + "dev": true + }, + "pako": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.6.tgz", + "integrity": "sha512-lQe48YPsMJAig+yngZ87Lus+NF+3mtu7DVOBu6b/gHO1YpKwIj5AWjZ/TOS7i46HD/UixzWb1zeWDZfGZ3iYcg==", + "dev": true + }, + "parallel-transform": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/parallel-transform/-/parallel-transform-1.1.0.tgz", + "integrity": "sha1-1BDwZbBdojCB/NEPKIVMKb2jOwY=", + "dev": true, + "requires": { + "cyclist": "~0.2.2", + "inherits": "^2.0.3", + "readable-stream": "^2.1.5" + } + }, + "parse-asn1": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/parse-asn1/-/parse-asn1-5.1.1.tgz", + "integrity": "sha512-KPx7flKXg775zZpnp9SxJlz00gTd4BmJ2yJufSc44gMCRrRQ7NSzAcSJQfifuOLgW6bEi+ftrALtsgALeB2Adw==", + "dev": true, + "requires": { + "asn1.js": "^4.0.0", + "browserify-aes": "^1.0.0", + "create-hash": "^1.1.0", + "evp_bytestokey": "^1.0.0", + "pbkdf2": "^3.0.3" + } + }, + "parse-json": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-2.2.0.tgz", + "integrity": "sha1-9ID0BDTvgHQfhGkJn43qGPVaTck=", + "dev": true, + "requires": { + "error-ex": "^1.2.0" + } + }, + "pascalcase": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/pascalcase/-/pascalcase-0.1.1.tgz", + "integrity": "sha1-s2PlXoAGym/iF4TS2yK9FdeRfxQ=", + "dev": true + }, + "path-browserify": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-0.0.0.tgz", + "integrity": "sha1-oLhwcpquIUAFt9UDLsLLuw+0RRo=", + "dev": true + }, + "path-dirname": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/path-dirname/-/path-dirname-1.0.2.tgz", + "integrity": "sha1-zDPSTVJeCZpTiMAzbG4yuRYGCeA=", + "dev": true + }, + "path-exists": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-3.0.0.tgz", + "integrity": "sha1-zg6+ql94yxiSXqfYENe1mwEP1RU=", + "dev": true + }, + "path-is-absolute": { 
+ "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "dev": true + }, + "path-is-inside": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/path-is-inside/-/path-is-inside-1.0.2.tgz", + "integrity": "sha1-NlQX3t5EQw0cEa9hAn+s8HS9/FM=", + "dev": true + }, + "path-key": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/path-key/-/path-key-2.0.1.tgz", + "integrity": "sha1-QRyttXTFoUDTpLGRDUDYDMn0C0A=", + "dev": true + }, + "path-parse": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.5.tgz", + "integrity": "sha1-PBrfhx6pzWyUMbbqK9dKD/BVxME=", + "dev": true + }, + "path-type": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", + "integrity": "sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", + "dev": true, + "requires": { + "pify": "^3.0.0" + } + }, + "pbkdf2": { + "version": "3.0.16", + "resolved": "https://registry.npmjs.org/pbkdf2/-/pbkdf2-3.0.16.tgz", + "integrity": "sha512-y4CXP3thSxqf7c0qmOF+9UeOTrifiVTIM+u7NWlq+PRsHbr7r7dpCmvzrZxa96JJUNi0Y5w9VqG5ZNeCVMoDcA==", + "dev": true, + "requires": { + "create-hash": "^1.1.2", + "create-hmac": "^1.1.4", + "ripemd160": "^2.0.1", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + } + }, + "pify": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-3.0.0.tgz", + "integrity": "sha1-5aSs0sEB/fPZpNB/DbxNtJ3SgXY=", + "dev": true + }, + "pinkie": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/pinkie/-/pinkie-2.0.4.tgz", + "integrity": "sha1-clVrgM+g1IqXToDnckjoDtT3+HA=", + "dev": true + }, + "pinkie-promise": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pinkie-promise/-/pinkie-promise-2.0.1.tgz", + "integrity": "sha1-ITXW36ejWMBprJsXh3YogihFD/o=", + "dev": true, + "requires": { + "pinkie": "^2.0.0" + } + }, + "pkg-dir": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-2.0.0.tgz", + "integrity": "sha1-9tXREJ4Z1j7fQo4L1X4Sd3YVM0s=", + "dev": true, + "requires": { + "find-up": "^2.1.0" + } + }, + "pluralize": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/pluralize/-/pluralize-7.0.0.tgz", + "integrity": "sha512-ARhBOdzS3e41FbkW/XWrTEtukqqLoK5+Z/4UeDaLuSW+39JPeFgs4gCGqsrJHVZX0fUrx//4OF0K1CUGwlIFow==", + "dev": true + }, + "posix-character-classes": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/posix-character-classes/-/posix-character-classes-0.1.1.tgz", + "integrity": "sha1-AerA/jta9xoqbAL+q7jB/vfgDqs=", + "dev": true + }, + "postcss": { + "version": "6.0.23", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-6.0.23.tgz", + "integrity": "sha512-soOk1h6J3VMTZtVeVpv15/Hpdl2cBLX3CAw4TAbkpTJiNPk9YP/zWcD1ND+xEtvyuuvKzbxliTOIyvkSeSJ6ag==", + "dev": true, + "requires": { + "chalk": "^2.4.1", + "source-map": "^0.6.1", + "supports-color": "^5.4.0" + }, + "dependencies": { + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": 
"sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "postcss-calc": { + "version": "5.3.1", + "resolved": "https://registry.npmjs.org/postcss-calc/-/postcss-calc-5.3.1.tgz", + "integrity": "sha1-d7rnypKK2FcW4v2kLyYb98HWW14=", + "dev": true, + "requires": { + "postcss": "^5.0.2", + "postcss-message-helpers": "^2.0.0", + "reduce-css-calc": "^1.2.6" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-colormin": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/postcss-colormin/-/postcss-colormin-2.2.2.tgz", + "integrity": "sha1-ZjFBfV8OkJo9fsJrJMio0eT5bks=", + "dev": true, + "requires": { + "colormin": "^1.0.5", + "postcss": "^5.0.13", + "postcss-value-parser": "^3.2.3" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-convert-values": { + "version": "2.6.1", + "resolved": "https://registry.npmjs.org/postcss-convert-values/-/postcss-convert-values-2.6.1.tgz", + "integrity": "sha1-u9hZPFwf0uPRwyK7kl3K6Nrk1i0=", + "dev": true, + "requires": { + "postcss": "^5.0.11", + "postcss-value-parser": "^3.1.2" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + 
}, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-discard-comments": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/postcss-discard-comments/-/postcss-discard-comments-2.0.4.tgz", + "integrity": "sha1-vv6J+v1bPazlzM5Rt2uBUUvgDj0=", + "dev": true, + "requires": { + "postcss": "^5.0.14" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-discard-duplicates": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postcss-discard-duplicates/-/postcss-discard-duplicates-2.1.0.tgz", + "integrity": "sha1-uavye4isGIFYpesSq8riAmO5GTI=", + "dev": true, + "requires": { + "postcss": "^5.0.4" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-discard-empty": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postcss-discard-empty/-/postcss-discard-empty-2.1.0.tgz", + "integrity": "sha1-0rS9nVztXr2Nyt52QMfXzX9PkrU=", + "dev": true, + "requires": { + "postcss": "^5.0.14" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-discard-overridden": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/postcss-discard-overridden/-/postcss-discard-overridden-0.1.1.tgz", + "integrity": "sha1-ix6vVU9ob7KIzYdMVWZ7CqNmjVg=", + "dev": true, + "requires": { + "postcss": "^5.0.16" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": 
"^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-discard-unused": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/postcss-discard-unused/-/postcss-discard-unused-2.2.3.tgz", + "integrity": "sha1-vOMLLMWR/8Y0Mitfs0ZLbZNPRDM=", + "dev": true, + "requires": { + "postcss": "^5.0.14", + "uniqs": "^2.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-filter-plugins": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/postcss-filter-plugins/-/postcss-filter-plugins-2.0.3.tgz", + "integrity": "sha512-T53GVFsdinJhgwm7rg1BzbeBRomOg9y5MBVhGcsV0CxurUdVj1UlPdKtn7aqYA/c/QVkzKMjq2bSV5dKG5+AwQ==", + "dev": true, + "requires": { + "postcss": "^5.0.4" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-merge-idents": { + "version": "2.1.7", + "resolved": "https://registry.npmjs.org/postcss-merge-idents/-/postcss-merge-idents-2.1.7.tgz", + "integrity": "sha1-TFUwMTwI4dWzu/PSu8dH4njuonA=", + "dev": true, + "requires": { + "has": "^1.0.1", + "postcss": "^5.0.10", + "postcss-value-parser": "^3.1.1" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-merge-longhand": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/postcss-merge-longhand/-/postcss-merge-longhand-2.0.2.tgz", + "integrity": "sha1-I9kM0Sewp3mUkVMyc5A0oaTz1lg=", + "dev": true, + "requires": { + "postcss": "^5.0.4" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": 
"https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-merge-rules": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/postcss-merge-rules/-/postcss-merge-rules-2.1.2.tgz", + "integrity": "sha1-0d9d+qexrMO+VT8OnhDofGG19yE=", + "dev": true, + "requires": { + "browserslist": "^1.5.2", + "caniuse-api": "^1.5.2", + "postcss": "^5.0.4", + "postcss-selector-parser": "^2.2.2", + "vendors": "^1.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-message-helpers": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/postcss-message-helpers/-/postcss-message-helpers-2.0.0.tgz", + "integrity": "sha1-pPL0+rbk/gAvCu0ABHjN9S+bpg4=", + "dev": true + }, + "postcss-minify-font-values": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/postcss-minify-font-values/-/postcss-minify-font-values-1.0.5.tgz", + "integrity": "sha1-S1jttWZB66fIR0qzUmyv17vey2k=", + "dev": true, + "requires": { + "object-assign": "^4.0.1", + "postcss": "^5.0.4", + "postcss-value-parser": "^3.0.2" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-minify-gradients": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/postcss-minify-gradients/-/postcss-minify-gradients-1.0.5.tgz", + "integrity": "sha1-Xb2hE3NwP4PPtKPqOIHY11/15uE=", + "dev": true, + "requires": { + "postcss": "^5.0.12", + "postcss-value-parser": "^3.3.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": 
"https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-minify-params": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/postcss-minify-params/-/postcss-minify-params-1.2.2.tgz", + "integrity": "sha1-rSzgcTc7lDs9kwo/pZo1jCjW8fM=", + "dev": true, + "requires": { + "alphanum-sort": "^1.0.1", + "postcss": "^5.0.2", + "postcss-value-parser": "^3.0.2", + "uniqs": "^2.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-minify-selectors": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/postcss-minify-selectors/-/postcss-minify-selectors-2.1.1.tgz", + "integrity": "sha1-ssapjAByz5G5MtGkllCBFDEXNb8=", + "dev": true, + "requires": { + "alphanum-sort": "^1.0.2", + "has": "^1.0.1", + "postcss": "^5.0.14", + "postcss-selector-parser": "^2.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-modules-extract-imports": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/postcss-modules-extract-imports/-/postcss-modules-extract-imports-1.1.0.tgz", + "integrity": "sha1-thTJcgvmgW6u41+zpfqh26agXds=", + "dev": true, + "requires": { + "postcss": "^6.0.1" + } + }, + "postcss-modules-local-by-default": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/postcss-modules-local-by-default/-/postcss-modules-local-by-default-1.2.0.tgz", + "integrity": "sha1-99gMOYxaOT+nlkRmvRlQCn1hwGk=", + "dev": true, + "requires": { + "css-selector-tokenizer": "^0.7.0", + "postcss": "^6.0.1" + } + }, + "postcss-modules-resolve-imports": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/postcss-modules-resolve-imports/-/postcss-modules-resolve-imports-1.3.0.tgz", + "integrity": "sha1-OY0wALla6WlCDN9M2D+oBn8cXq4=", + "dev": true, + "requires": { + "css-selector-tokenizer": "^0.7.0", + "icss-utils": "^3.0.1", + "minimist": "^1.2.0" + }, + "dependencies": { + "minimist": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", + "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=", + "dev": true + } + } + }, + "postcss-modules-scope": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/postcss-modules-scope/-/postcss-modules-scope-1.1.0.tgz", + "integrity": 
"sha1-1upkmUx5+XtipytCb75gVqGUu5A=", + "dev": true, + "requires": { + "css-selector-tokenizer": "^0.7.0", + "postcss": "^6.0.1" + } + }, + "postcss-modules-values": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/postcss-modules-values/-/postcss-modules-values-1.3.0.tgz", + "integrity": "sha1-7P+p1+GSUYOJ9CrQ6D9yrsRW6iA=", + "dev": true, + "requires": { + "icss-replace-symbols": "^1.1.0", + "postcss": "^6.0.1" + } + }, + "postcss-normalize-charset": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/postcss-normalize-charset/-/postcss-normalize-charset-1.1.1.tgz", + "integrity": "sha1-757nEhLX/nWceO0WL2HtYrXLk/E=", + "dev": true, + "requires": { + "postcss": "^5.0.5" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-normalize-url": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/postcss-normalize-url/-/postcss-normalize-url-3.0.8.tgz", + "integrity": "sha1-EI90s/L82viRov+j6kWSJ5/HgiI=", + "dev": true, + "requires": { + "is-absolute-url": "^2.0.0", + "normalize-url": "^1.4.0", + "postcss": "^5.0.14", + "postcss-value-parser": "^3.2.3" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-ordered-values": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/postcss-ordered-values/-/postcss-ordered-values-2.2.3.tgz", + "integrity": "sha1-7sbCpntsQSqNsgQud/6NpD+VwR0=", + "dev": true, + "requires": { + "postcss": "^5.0.4", + "postcss-value-parser": "^3.0.1" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-reduce-idents": { + "version": "2.4.0", + "resolved": "https://registry.npmjs.org/postcss-reduce-idents/-/postcss-reduce-idents-2.4.0.tgz", + "integrity": "sha1-wsbSDMlYKE9qv75j92Cb9AkFmtM=", + "dev": true, + "requires": { + "postcss": "^5.0.4", + 
"postcss-value-parser": "^3.0.2" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-reduce-initial": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/postcss-reduce-initial/-/postcss-reduce-initial-1.0.1.tgz", + "integrity": "sha1-aPgGlfBF0IJjqHmtJA343WT2ROo=", + "dev": true, + "requires": { + "postcss": "^5.0.4" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-reduce-transforms": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/postcss-reduce-transforms/-/postcss-reduce-transforms-1.0.4.tgz", + "integrity": "sha1-/3b02CEkN7McKYpC0uFEQCV3GuE=", + "dev": true, + "requires": { + "has": "^1.0.1", + "postcss": "^5.0.8", + "postcss-value-parser": "^3.0.1" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-selector-parser": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/postcss-selector-parser/-/postcss-selector-parser-2.2.3.tgz", + "integrity": "sha1-+UN3iGBsPJrO4W/+jYsWKX8nu5A=", + "dev": true, + "requires": { + "flatten": "^1.0.2", + "indexes-of": "^1.0.1", + "uniq": "^1.0.1" + } + }, + "postcss-svgo": { + "version": "2.1.6", + "resolved": "https://registry.npmjs.org/postcss-svgo/-/postcss-svgo-2.1.6.tgz", + "integrity": "sha1-tt8YqmE7Zm4TPwittSGcJoSsEI0=", + "dev": true, + "requires": { + "is-svg": "^2.0.0", + "postcss": "^5.0.14", + "postcss-value-parser": "^3.2.3", + "svgo": "^0.7.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" 
+ } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-unique-selectors": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/postcss-unique-selectors/-/postcss-unique-selectors-2.0.2.tgz", + "integrity": "sha1-mB1X0p3csz57Hf4f1DuGSfkzyh0=", + "dev": true, + "requires": { + "alphanum-sort": "^1.0.1", + "postcss": "^5.0.4", + "uniqs": "^2.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "postcss-value-parser": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-3.3.0.tgz", + "integrity": "sha1-h/OPnxj3dKSrTIojL1xc6IcqnRU=", + "dev": true + }, + "postcss-zindex": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/postcss-zindex/-/postcss-zindex-2.2.0.tgz", + "integrity": "sha1-0hCd3AVbka9n/EyzsCWUZjnSryI=", + "dev": true, + "requires": { + "has": "^1.0.1", + "postcss": "^5.0.4", + "uniqs": "^2.0.0" + }, + "dependencies": { + "postcss": { + "version": "5.2.18", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-5.2.18.tgz", + "integrity": "sha512-zrUjRRe1bpXKsX1qAJNJjqZViErVuyEkMTRrwu4ud4sbTtIBRmtaYDrHmcGgmrbsW3MHfmtIf+vJumgQn+PrXg==", + "dev": true, + "requires": { + "chalk": "^1.1.3", + "js-base64": "^2.1.9", + "source-map": "^0.5.6", + "supports-color": "^3.2.3" + } + }, + "supports-color": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-3.2.3.tgz", + "integrity": "sha1-ZawFBLOVQXHYpklGsq48u4pfVPY=", + "dev": true, + "requires": { + "has-flag": "^1.0.0" + } + } + } + }, + "prelude-ls": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.1.2.tgz", + "integrity": "sha1-IZMqVJ9eUv/ZqCf1cOBL5iqX2lQ=", + "dev": true + }, + "prepend-http": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/prepend-http/-/prepend-http-1.0.4.tgz", + "integrity": "sha1-1PRWKwzjaW5BrFLQ4ALlemNdxtw=", + "dev": true + }, + "private": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/private/-/private-0.1.8.tgz", + "integrity": "sha512-VvivMrbvd2nKkiG38qjULzlc+4Vx4wm/whI9pQD35YrARNnhxeiRktSOhSukRLFNlzg6Br/cJPet5J/u19r/mg==", + "dev": true + }, + "process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha1-czIwDoQBYb2j5podHZGn1LwW8YI=", + "dev": true + }, + "process-nextick-args": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", + "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "dev": true + }, + "progress": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/progress/-/progress-2.0.0.tgz", + "integrity": "sha1-ihvjZr+Pwj2yvSPxDG/pILQ4nR8=", + "dev": true + }, + "promise-inflight": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/promise-inflight/-/promise-inflight-1.0.1.tgz", + "integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=", + "dev": true + }, + "prr": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/prr/-/prr-1.0.1.tgz", + "integrity": "sha1-0/wRS6BplaRexok/SEzrHXj19HY=", + "dev": true + }, + "pseudomap": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", + "integrity": "sha1-8FKijacOYYkX7wqKw0wa5aaChrM=", + "dev": true + }, + "public-encrypt": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/public-encrypt/-/public-encrypt-4.0.2.tgz", + "integrity": "sha512-4kJ5Esocg8X3h8YgJsKAuoesBgB7mqH3eowiDzMUPKiRDDE7E/BqqZD1hnTByIaAFiwAw246YEltSq7tdrOH0Q==", + "dev": true, + "requires": { + "bn.js": "^4.1.0", + "browserify-rsa": "^4.0.0", + "create-hash": "^1.1.0", + "parse-asn1": "^5.0.0", + "randombytes": "^2.0.1" + } + }, + "pump": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/pump/-/pump-2.0.1.tgz", + "integrity": "sha512-ruPMNRkN3MHP1cWJc9OWr+T/xDP0jhXYCLfJcBuX54hhfIBnaQmAUMfDcG4DM5UMWByBbJY69QSphm3jtDKIkA==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "pumpify": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/pumpify/-/pumpify-1.5.1.tgz", + "integrity": "sha512-oClZI37HvuUJJxSKKrC17bZ9Cu0ZYhEAGPsPUy9KlMUmv9dKX2o77RUmq7f3XjIxbwyGwYzbzQ1L2Ks8sIradQ==", + "dev": true, + "requires": { + "duplexify": "^3.6.0", + "inherits": "^2.0.3", + "pump": "^2.0.0" + } + }, + "punycode": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz", + "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==", + "dev": true + }, + "q": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/q/-/q-1.5.1.tgz", + "integrity": "sha1-fjL3W0E4EpHQRhHxvxQQmsAGUdc=", + "dev": true + }, + "query-string": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/query-string/-/query-string-4.3.4.tgz", + "integrity": "sha1-u7aTucqRXCMlFbIosaArYJBD2+s=", + "dev": true, + "requires": { + "object-assign": "^4.1.0", + "strict-uri-encode": "^1.0.0" + } + }, + "querystring": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/querystring/-/querystring-0.2.0.tgz", + "integrity": "sha1-sgmEkgO7Jd+CDadW50cAWHhSFiA=", + "dev": true + }, + "querystring-es3": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/querystring-es3/-/querystring-es3-0.2.1.tgz", + "integrity": "sha1-nsYfeQSYdXB9aUFFlv2Qek1xHnM=", + "dev": true + }, + "randombytes": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.0.6.tgz", + "integrity": "sha512-CIQ5OFxf4Jou6uOKe9t1AOgqpeU5fd70A8NPdHSGeYXqXsPe6peOwI0cUl88RWZ6sP1vPMV3avd/R6cZ5/sP1A==", + "dev": true, + "requires": { + "safe-buffer": "^5.1.0" + } + }, + "randomfill": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/randomfill/-/randomfill-1.0.4.tgz", + "integrity": "sha512-87lcbR8+MhcWcUiQ+9e+Rwx8MyR2P7qnt15ynUlbm3TU/fjbgz4GsvfSUDTemtCCtVCqb4ZcEFlyPNTh9bBTLw==", + "dev": true, + "requires": { + "randombytes": "^2.0.5", + "safe-buffer": "^5.1.0" + } + }, + "read-pkg": { + "version": "2.0.0", + "resolved": 
"https://registry.npmjs.org/read-pkg/-/read-pkg-2.0.0.tgz", + "integrity": "sha1-jvHAYjxqbbDcZxPEv6xGMysjaPg=", + "dev": true, + "requires": { + "load-json-file": "^2.0.0", + "normalize-package-data": "^2.3.2", + "path-type": "^2.0.0" + }, + "dependencies": { + "path-type": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-2.0.0.tgz", + "integrity": "sha1-8BLMuEFbcJb8LaoQVMPXI4lZTHM=", + "dev": true, + "requires": { + "pify": "^2.0.0" + } + }, + "pify": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", + "dev": true + } + } + }, + "read-pkg-up": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-2.0.0.tgz", + "integrity": "sha1-a3KoBImE4MQeeVEP1en6mbO1Sb4=", + "dev": true, + "requires": { + "find-up": "^2.0.0", + "read-pkg": "^2.0.0" + } + }, + "readable-stream": { + "version": "2.3.6", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", + "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "dev": true, + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "readdirp": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-2.1.0.tgz", + "integrity": "sha1-TtCtBg3zBzMAxIRANz9y0cxkLXg=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "minimatch": "^3.0.2", + "readable-stream": "^2.0.2", + "set-immediate-shim": "^1.0.1" + } + }, + "reduce-css-calc": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/reduce-css-calc/-/reduce-css-calc-1.3.0.tgz", + "integrity": "sha1-dHyRTgSWFKTJz7umKYca0dKSdxY=", + "dev": true, + "requires": { + "balanced-match": "^0.4.2", + "math-expression-evaluator": "^1.2.14", + "reduce-function-call": "^1.0.1" + }, + "dependencies": { + "balanced-match": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-0.4.2.tgz", + "integrity": "sha1-yz8+PHMtwPAe5wtAPzAuYddwmDg=", + "dev": true + } + } + }, + "reduce-function-call": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/reduce-function-call/-/reduce-function-call-1.0.2.tgz", + "integrity": "sha1-WiAL+S4ON3UXUv5FsKszD9S2vpk=", + "dev": true, + "requires": { + "balanced-match": "^0.4.2" + }, + "dependencies": { + "balanced-match": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-0.4.2.tgz", + "integrity": "sha1-yz8+PHMtwPAe5wtAPzAuYddwmDg=", + "dev": true + } + } + }, + "regenerate": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.4.0.tgz", + "integrity": "sha512-1G6jJVDWrt0rK99kBjvEtziZNCICAuvIPkSiUFIQxVP06RCVpq3dmDo2oi6ABpYaDYaTRr67BEhL8r1wgEZZKg==", + "dev": true + }, + "regenerator-runtime": { + "version": "0.11.1", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.11.1.tgz", + "integrity": "sha512-MguG95oij0fC3QV3URf4V2SDYGJhJnJGqvIIgdECeODCT98wSWDAJ94SSuVpYQUoTcGUIL6L4yNB7j1DFFHSBg==", + "dev": true + }, + "regex-not": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/regex-not/-/regex-not-1.0.2.tgz", + "integrity": "sha512-J6SDjUgDxQj5NusnOtdFxDwN/+HWykR8GELwctJ7mdqhcyy1xEc4SRFHUXvxTp661YaVKAjfRLZ9cCqS6tn32A==", + "dev": true, 
+ "requires": { + "extend-shallow": "^3.0.2", + "safe-regex": "^1.1.0" + } + }, + "regexpp": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/regexpp/-/regexpp-1.1.0.tgz", + "integrity": "sha512-LOPw8FpgdQF9etWMaAfG/WRthIdXJGYp4mJ2Jgn/2lpkbod9jPn0t9UqN7AxBOKNfzRbYyVfgc7Vk4t/MpnXgw==", + "dev": true + }, + "regexpu-core": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-1.0.0.tgz", + "integrity": "sha1-hqdj9Y7k18L2sQLkdkBQ3n7ZDGs=", + "dev": true, + "requires": { + "regenerate": "^1.2.1", + "regjsgen": "^0.2.0", + "regjsparser": "^0.1.4" + } + }, + "regjsgen": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.2.0.tgz", + "integrity": "sha1-bAFq3qxVT3WCP+N6wFuS1aTtsfc=", + "dev": true + }, + "regjsparser": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.1.5.tgz", + "integrity": "sha1-fuj4Tcb6eS0/0K4ijSS9lJ6tIFw=", + "dev": true, + "requires": { + "jsesc": "~0.5.0" + }, + "dependencies": { + "jsesc": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-0.5.0.tgz", + "integrity": "sha1-597mbjXW/Bb3EP6R1c9p9w8IkR0=", + "dev": true + } + } + }, + "remove-trailing-separator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/remove-trailing-separator/-/remove-trailing-separator-1.1.0.tgz", + "integrity": "sha1-wkvOKig62tW8P1jg1IJJuSN52O8=", + "dev": true + }, + "repeat-element": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/repeat-element/-/repeat-element-1.1.2.tgz", + "integrity": "sha1-7wiaF40Ug7quTZPrmLT55OEdmQo=", + "dev": true + }, + "repeat-string": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/repeat-string/-/repeat-string-1.6.1.tgz", + "integrity": "sha1-jcrkcOHIirwtYA//Sndihtp15jc=", + "dev": true + }, + "repeating": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/repeating/-/repeating-2.0.1.tgz", + "integrity": "sha1-UhTFOpJtNVJwdSf7q0FdvAjQbdo=", + "dev": true, + "requires": { + "is-finite": "^1.0.0" + } + }, + "require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha1-jGStX9MNqxyXbiNE/+f3kqam30I=", + "dev": true + }, + "require-main-filename": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/require-main-filename/-/require-main-filename-1.0.1.tgz", + "integrity": "sha1-l/cXtp1IeE9fUmpsWqj/3aBVpNE=", + "dev": true + }, + "require-uncached": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/require-uncached/-/require-uncached-1.0.3.tgz", + "integrity": "sha1-Tg1W1slmL9MeQwEcS5WqSZVUIdM=", + "dev": true, + "requires": { + "caller-path": "^0.1.0", + "resolve-from": "^1.0.0" + }, + "dependencies": { + "resolve-from": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-1.0.1.tgz", + "integrity": "sha1-Jsv+k10a7uq7Kbw/5a6wHpPUQiY=", + "dev": true + } + } + }, + "resolve": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.8.1.tgz", + "integrity": "sha512-AicPrAC7Qu1JxPCZ9ZgCZlY35QgFnNqc+0LtbRNxnVw4TXvjQ72wnuL9JQcEBgXkI9JM8MsT9kaQoHcpCRJOYA==", + "dev": true, + "requires": { + "path-parse": "^1.0.5" + } + }, + "resolve-cwd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-2.0.0.tgz", + "integrity": "sha1-AKn3OHVW4nA46uIyyqNypqWbZlo=", + "dev": true, + "requires": { + "resolve-from": "^3.0.0" + } + }, + 
"resolve-from": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-3.0.0.tgz", + "integrity": "sha1-six699nWiBvItuZTM17rywoYh0g=", + "dev": true + }, + "resolve-url": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz", + "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=", + "dev": true + }, + "restore-cursor": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-2.0.0.tgz", + "integrity": "sha1-n37ih/gv0ybU/RYpI9YhKe7g368=", + "dev": true, + "requires": { + "onetime": "^2.0.0", + "signal-exit": "^3.0.2" + } + }, + "ret": { + "version": "0.1.15", + "resolved": "https://registry.npmjs.org/ret/-/ret-0.1.15.tgz", + "integrity": "sha512-TTlYpa+OL+vMMNG24xSlQGEJ3B/RzEfUlLct7b5G/ytav+wPrplCpVMFuwzXbkecJrb6IYo1iFb0S9v37754mg==", + "dev": true + }, + "right-align": { + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/right-align/-/right-align-0.1.3.tgz", + "integrity": "sha1-YTObci/mo1FWiSENJOFMlhSGE+8=", + "dev": true, + "optional": true, + "requires": { + "align-text": "^0.1.1" + } + }, + "rimraf": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", + "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", + "dev": true, + "requires": { + "glob": "^7.0.5" + }, + "dependencies": { + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.0.4", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + } + } + }, + "ripemd160": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/ripemd160/-/ripemd160-2.0.2.tgz", + "integrity": "sha512-ii4iagi25WusVoiC4B4lq7pbXfAp3D9v5CwfkY33vffw2+pkDjY1D8GaN7spsxvCSx8dkPqOZCEZyfxcmJG2IA==", + "dev": true, + "requires": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1" + } + }, + "run-async": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/run-async/-/run-async-2.3.0.tgz", + "integrity": "sha1-A3GrSuC91yDUFm19/aZP96RFpsA=", + "dev": true, + "requires": { + "is-promise": "^2.1.0" + } + }, + "run-queue": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/run-queue/-/run-queue-1.0.3.tgz", + "integrity": "sha1-6Eg5bwV9Ij8kOGkkYY4laUFh7Ec=", + "dev": true, + "requires": { + "aproba": "^1.1.1" + } + }, + "rx-lite": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/rx-lite/-/rx-lite-4.0.8.tgz", + "integrity": "sha1-Cx4Rr4vESDbwSmQH6S2kJGe3lEQ=", + "dev": true + }, + "rx-lite-aggregates": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/rx-lite-aggregates/-/rx-lite-aggregates-4.0.8.tgz", + "integrity": "sha1-dTuHqJoRyVRnxKwWJsTvxOBcZ74=", + "dev": true, + "requires": { + "rx-lite": "*" + } + }, + "rxjs": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-6.2.2.tgz", + "integrity": "sha512-0MI8+mkKAXZUF9vMrEoPnaoHkfzBPP4IGwUYRJhIRJF6/w3uByO1e91bEHn8zd43RdkTMKiooYKmwz7RH6zfOQ==", + "dev": true, + "requires": { + "tslib": "^1.9.0" + } + }, + "safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": 
"sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true + }, + "safe-regex": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", + "integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=", + "dev": true, + "requires": { + "ret": "~0.1.10" + } + }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "dev": true + }, + "sax": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", + "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==", + "dev": true + }, + "schema-utils": { + "version": "0.4.5", + "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-0.4.5.tgz", + "integrity": "sha512-yYrjb9TX2k/J1Y5UNy3KYdZq10xhYcF8nMpAW6o3hy6Q8WSIEf9lJHG/ePnOBfziPM3fvQwfOwa13U/Fh8qTfA==", + "dev": true, + "requires": { + "ajv": "^6.1.0", + "ajv-keywords": "^3.1.0" + } + }, + "seekout": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/seekout/-/seekout-1.0.2.tgz", + "integrity": "sha1-CbqfG9W0b7sTRxjrGaaDgsuxuck=", + "dev": true + }, + "semver": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-5.5.0.tgz", + "integrity": "sha512-4SJ3dm0WAwWy/NVeioZh5AntkdJoWKxHxcmyP622fOkgHa4z3R0TdBJICINyaSDE6uNwVc8gZr+ZinwZAH4xIA==", + "dev": true + }, + "serialize-javascript": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-1.5.0.tgz", + "integrity": "sha512-Ga8c8NjAAp46Br4+0oZ2WxJCwIzwP60Gq1YPgU+39PiTVxyed/iKE/zyZI6+UlVYH5Q4PaQdHhcegIFPZTUfoQ==", + "dev": true + }, + "set-blocking": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", + "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", + "dev": true + }, + "set-immediate-shim": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/set-immediate-shim/-/set-immediate-shim-1.0.1.tgz", + "integrity": "sha1-SysbJ+uAip+NzEgaWOXlb1mfP2E=", + "dev": true + }, + "set-value": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/set-value/-/set-value-2.0.0.tgz", + "integrity": "sha512-hw0yxk9GT/Hr5yJEYnHNKYXkIA8mVJgd9ditYZCe16ZczcaELYYcfvaXesNACk2O8O0nTiPQcQhGUQj8JLzeeg==", + "dev": true, + "requires": { + "extend-shallow": "^2.0.1", + "is-extendable": "^0.1.1", + "is-plain-object": "^2.0.3", + "split-string": "^3.0.1" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } + } + }, + "setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha1-KQy7Iy4waULX1+qbg3Mqt4VvgoU=", + "dev": true + }, + "sha.js": { + "version": "2.4.11", + "resolved": "https://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", + "integrity": "sha512-QMEp5B7cftE7APOjk5Y6xgrbWu+WkLVQwk8JNjZ8nKRciZaByEW6MubieAiToS7+dwvrjGhH8jRXz3MVd0AYqQ==", + "dev": true, + "requires": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "shebang-command": { + "version": "1.2.0", + "resolved": 
"https://registry.npmjs.org/shebang-command/-/shebang-command-1.2.0.tgz", + "integrity": "sha1-RKrGW2lbAzmJaMOfNj/uXer98eo=", + "dev": true, + "requires": { + "shebang-regex": "^1.0.0" + } + }, + "shebang-regex": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/shebang-regex/-/shebang-regex-1.0.0.tgz", + "integrity": "sha1-2kL0l0DAtC2yypcoVxyxkMmO/qM=", + "dev": true + }, + "signal-exit": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.2.tgz", + "integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=", + "dev": true + }, + "slash": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-1.0.0.tgz", + "integrity": "sha1-xB8vbDn8FtHNF61LXYlhFK5HDVU=", + "dev": true + }, + "slice-ansi": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/slice-ansi/-/slice-ansi-1.0.0.tgz", + "integrity": "sha512-POqxBK6Lb3q6s047D/XsDVNPnF9Dl8JSaqe9h9lURl0OdNqy/ujDrOiIHtsqXMGbWWTIomRzAMaTyawAU//Reg==", + "dev": true, + "requires": { + "is-fullwidth-code-point": "^2.0.0" + } + }, + "snapdragon": { + "version": "0.8.2", + "resolved": "https://registry.npmjs.org/snapdragon/-/snapdragon-0.8.2.tgz", + "integrity": "sha512-FtyOnWN/wCHTVXOMwvSv26d+ko5vWlIDD6zoUJ7LW8vh+ZBC8QdljveRP+crNrtBwioEUWy/4dMtbBjA4ioNlg==", + "dev": true, + "requires": { + "base": "^0.11.1", + "debug": "^2.2.0", + "define-property": "^0.2.5", + "extend-shallow": "^2.0.1", + "map-cache": "^0.2.2", + "source-map": "^0.5.6", + "source-map-resolve": "^0.5.0", + "use": "^3.1.0" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + }, + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + } + } + }, + "snapdragon-node": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/snapdragon-node/-/snapdragon-node-2.1.1.tgz", + "integrity": "sha512-O27l4xaMYt/RSQ5TR3vpWCAB5Kb/czIcqUFOM/C4fYcLnbZUc1PkjTAMjof2pBWaSTwOUd6qUHcFGVGj7aIwnw==", + "dev": true, + "requires": { + "define-property": "^1.0.0", + "isobject": "^3.0.0", + "snapdragon-util": "^3.0.1" + }, + "dependencies": { + "define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-1.0.0.tgz", + "integrity": "sha1-dp66rz9KY6rTr56NMEybvnm/sOY=", + "dev": true, + "requires": { + "is-descriptor": "^1.0.0" + } + }, + "is-accessor-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-accessor-descriptor/-/is-accessor-descriptor-1.0.0.tgz", + "integrity": "sha512-m5hnHTkcVsPfqx3AKlyttIPb7J+XykHvJP2B9bZDjlhLIoEq4XoK64Vg7boZlVWYK6LUY94dYPEE7Lh0ZkZKcQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-data-descriptor": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-data-descriptor/-/is-data-descriptor-1.0.0.tgz", + "integrity": "sha512-jbRXy1FmtAoCjQkVmIVYwuuqDFUbaOeDjmed1tOGPrsMhtJA4rD9tkgA0F1qJ3gRFRXcHYVkdeaP50Q5rE/jLQ==", + "dev": true, + "requires": { + "kind-of": "^6.0.0" + } + }, + "is-descriptor": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-descriptor/-/is-descriptor-1.0.2.tgz", + "integrity": 
"sha512-2eis5WqQGV7peooDyLmNEPUrps9+SXX5c9pL3xEB+4e9HnGuDa7mB7kHxHw4CbqS9k1T2hOH3miL8n8WtiYVtg==", + "dev": true, + "requires": { + "is-accessor-descriptor": "^1.0.0", + "is-data-descriptor": "^1.0.0", + "kind-of": "^6.0.2" + } + }, + "kind-of": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-6.0.2.tgz", + "integrity": "sha512-s5kLOcnH0XqDO+FvuaLX8DDjZ18CGFk7VygH40QoKPUQhW4e2rvM0rwUq0t8IQDOwYSeLK01U90OjzBTme2QqA==", + "dev": true + } + } + }, + "snapdragon-util": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/snapdragon-util/-/snapdragon-util-3.0.1.tgz", + "integrity": "sha512-mbKkMdQKsjX4BAL4bRYTj21edOf8cN7XHdYUJEe+Zn99hVEYcMvKPct1IqNe7+AZPirn8BCDOQBHQZknqmKlZQ==", + "dev": true, + "requires": { + "kind-of": "^3.2.0" + } + }, + "sort-keys": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/sort-keys/-/sort-keys-1.1.2.tgz", + "integrity": "sha1-RBttTTRnmPG05J6JIK37oOVD+a0=", + "dev": true, + "requires": { + "is-plain-obj": "^1.0.0" + } + }, + "source-list-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/source-list-map/-/source-list-map-2.0.0.tgz", + "integrity": "sha512-I2UmuJSRr/T8jisiROLU3A3ltr+swpniSmNPI4Ml3ZCX6tVnDsuZzK7F2hl5jTqbZBWCEKlj5HRQiPExXLgE8A==", + "dev": true + }, + "source-map": { + "version": "0.5.7", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.5.7.tgz", + "integrity": "sha1-igOdLRAh0i0eoUyA2OpGi6LvP8w=", + "dev": true + }, + "source-map-resolve": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/source-map-resolve/-/source-map-resolve-0.5.2.tgz", + "integrity": "sha512-MjqsvNwyz1s0k81Goz/9vRBe9SZdB09Bdw+/zYyO+3CuPk6fouTaxscHkgtE8jKvf01kVfl8riHzERQ/kefaSA==", + "dev": true, + "requires": { + "atob": "^2.1.1", + "decode-uri-component": "^0.2.0", + "resolve-url": "^0.2.1", + "source-map-url": "^0.4.0", + "urix": "^0.1.0" + } + }, + "source-map-support": { + "version": "0.4.18", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.4.18.tgz", + "integrity": "sha512-try0/JqxPLF9nOjvSta7tVondkP5dwgyLDjVoyMDlmjugT2lRZ1OfsrYTkCd2hkDnJTKRbO/Rl3orm8vlsUzbA==", + "dev": true, + "requires": { + "source-map": "^0.5.6" + } + }, + "source-map-url": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/source-map-url/-/source-map-url-0.4.0.tgz", + "integrity": "sha1-PpNdfd1zYxuXZZlW1VEo6HtQhKM=", + "dev": true + }, + "spdx-correct": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.0.0.tgz", + "integrity": "sha512-N19o9z5cEyc8yQQPukRCZ9EUmb4HUpnrmaL/fxS2pBo2jbfcFRVuFZ/oFC+vZz0MNNk0h80iMn5/S6qGZOL5+g==", + "dev": true, + "requires": { + "spdx-expression-parse": "^3.0.0", + "spdx-license-ids": "^3.0.0" + } + }, + "spdx-exceptions": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/spdx-exceptions/-/spdx-exceptions-2.1.0.tgz", + "integrity": "sha512-4K1NsmrlCU1JJgUrtgEeTVyfx8VaYea9J9LvARxhbHtVtohPs/gFGG5yy49beySjlIMhhXZ4QqujIZEfS4l6Cg==", + "dev": true + }, + "spdx-expression-parse": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/spdx-expression-parse/-/spdx-expression-parse-3.0.0.tgz", + "integrity": "sha512-Yg6D3XpRD4kkOmTpdgbUiEJFKghJH03fiC1OPll5h/0sO6neh2jqRDVHOQ4o/LMea0tgCkbMgea5ip/e+MkWyg==", + "dev": true, + "requires": { + "spdx-exceptions": "^2.1.0", + "spdx-license-ids": "^3.0.0" + } + }, + "spdx-license-ids": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.0.tgz", + 
"integrity": "sha512-2+EPwgbnmOIl8HjGBXXMd9NAu02vLjOO1nWw4kmeRDFyHn+M/ETfHxQUK0oXg8ctgVnl9t3rosNVsZ1jG61nDA==", + "dev": true + }, + "split-string": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/split-string/-/split-string-3.1.0.tgz", + "integrity": "sha512-NzNVhJDYpwceVVii8/Hu6DKfD2G+NrQHlS/V/qgv763EYudVwEcMQNxd2lh+0VrUByXN/oJkl5grOhYWvQUYiw==", + "dev": true, + "requires": { + "extend-shallow": "^3.0.0" + } + }, + "sprintf-js": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", + "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", + "dev": true + }, + "ssri": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/ssri/-/ssri-5.3.0.tgz", + "integrity": "sha512-XRSIPqLij52MtgoQavH/x/dU1qVKtWUAAZeOHsR9c2Ddi4XerFy3mc1alf+dLJKl9EUIm/Ht+EowFkTUOA6GAQ==", + "dev": true, + "requires": { + "safe-buffer": "^5.1.1" + } + }, + "static-extend": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/static-extend/-/static-extend-0.1.2.tgz", + "integrity": "sha1-YICcOcv/VTNyJv1eC1IPNB8ftcY=", + "dev": true, + "requires": { + "define-property": "^0.2.5", + "object-copy": "^0.1.0" + }, + "dependencies": { + "define-property": { + "version": "0.2.5", + "resolved": "https://registry.npmjs.org/define-property/-/define-property-0.2.5.tgz", + "integrity": "sha1-w1se+RjsPJkPmlvFe+BKrOxcgRY=", + "dev": true, + "requires": { + "is-descriptor": "^0.1.0" + } + } + } + }, + "stream-browserify": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-2.0.1.tgz", + "integrity": "sha1-ZiZu5fm9uZQKTkUUyvtDu3Hlyds=", + "dev": true, + "requires": { + "inherits": "~2.0.1", + "readable-stream": "^2.0.2" + } + }, + "stream-each": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/stream-each/-/stream-each-1.2.2.tgz", + "integrity": "sha512-mc1dbFhGBxvTM3bIWmAAINbqiuAk9TATcfIQC8P+/+HJefgaiTlMn2dHvkX8qlI12KeYKSQ1Ua9RrIqrn1VPoA==", + "dev": true, + "requires": { + "end-of-stream": "^1.1.0", + "stream-shift": "^1.0.0" + } + }, + "stream-http": { + "version": "2.8.3", + "resolved": "https://registry.npmjs.org/stream-http/-/stream-http-2.8.3.tgz", + "integrity": "sha512-+TSkfINHDo4J+ZobQLWiMouQYB+UVYFttRA94FpEzzJ7ZdqcL4uUUQ7WkdkI4DSozGmgBUE/a47L+38PenXhUw==", + "dev": true, + "requires": { + "builtin-status-codes": "^3.0.0", + "inherits": "^2.0.1", + "readable-stream": "^2.3.6", + "to-arraybuffer": "^1.0.0", + "xtend": "^4.0.0" + } + }, + "stream-shift": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/stream-shift/-/stream-shift-1.0.0.tgz", + "integrity": "sha1-1cdSgl5TZ+eG944Y5EXqIjoVWVI=", + "dev": true + }, + "strict-uri-encode": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/strict-uri-encode/-/strict-uri-encode-1.1.0.tgz", + "integrity": "sha1-J5siXfHVgrH1TmWt3UNS4Y+qBxM=", + "dev": true + }, + "string-width": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", + "integrity": "sha512-nOqH59deCq9SRHlxq1Aw85Jnt4w6KvLKqWVik6oA9ZklXLNIOlqg4F2yrT1MVaTjAqvVwdfeZ7w7aCvJD7ugkw==", + "dev": true, + "requires": { + "is-fullwidth-code-point": "^2.0.0", + "strip-ansi": "^4.0.0" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", + "dev": true + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": 
"https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + } + } + }, + "string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dev": true, + "requires": { + "safe-buffer": "~5.1.0" + } + }, + "strip-ansi": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", + "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", + "dev": true, + "requires": { + "ansi-regex": "^2.0.0" + } + }, + "strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", + "dev": true + }, + "strip-eof": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "integrity": "sha1-u0P/VZim6wXYm1n80SnJgzE2Br8=", + "dev": true + }, + "strip-json-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", + "integrity": "sha1-PFMZQukIwml8DsNEhYwobHygpgo=", + "dev": true + }, + "style-loader": { + "version": "0.21.0", + "resolved": "https://registry.npmjs.org/style-loader/-/style-loader-0.21.0.tgz", + "integrity": "sha512-T+UNsAcl3Yg+BsPKs1vd22Fr8sVT+CJMtzqc6LEw9bbJZb43lm9GoeIfUcDEefBSWC0BhYbcdupV1GtI4DGzxg==", + "dev": true, + "requires": { + "loader-utils": "^1.1.0", + "schema-utils": "^0.4.5" + } + }, + "supports-color": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-2.0.0.tgz", + "integrity": "sha1-U10EXOa2Nj+kARcIRimZXp3zJMc=", + "dev": true + }, + "svgo": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/svgo/-/svgo-0.7.2.tgz", + "integrity": "sha1-n1dyQTlSE1xv779Ar+ak+qiLS7U=", + "dev": true, + "requires": { + "coa": "~1.0.1", + "colors": "~1.1.2", + "csso": "~2.3.1", + "js-yaml": "~3.7.0", + "mkdirp": "~0.5.1", + "sax": "~1.2.1", + "whet.extend": "~0.9.9" + }, + "dependencies": { + "js-yaml": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.7.0.tgz", + "integrity": "sha1-XJZ93YN6m/3KXy3oQlOr6KHAO4A=", + "dev": true, + "requires": { + "argparse": "^1.0.7", + "esprima": "^2.6.0" + } + } + } + }, + "table": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/table/-/table-4.0.2.tgz", + "integrity": "sha512-UUkEAPdSGxtRpiV9ozJ5cMTtYiqz7Ni1OGqLXRCynrvzdtR1p+cfOWe2RJLwvUG8hNanaSRjecIqwOjqeatDsA==", + "dev": true, + "requires": { + "ajv": "^5.2.3", + "ajv-keywords": "^2.1.0", + "chalk": "^2.1.0", + "lodash": "^4.17.4", + "slice-ansi": "1.0.0", + "string-width": "^2.1.1" + }, + "dependencies": { + "ajv": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz", + "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", + "dev": true, + "requires": { + "co": "^4.6.0", + "fast-deep-equal": "^1.0.0", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.3.0" + } + }, + "ajv-keywords": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/ajv-keywords/-/ajv-keywords-2.1.1.tgz", + "integrity": "sha1-YXmX/F9gV2iUxDX5QNgZ4TW4B2I=", + "dev": true + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": 
"sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "fast-deep-equal": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", + "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=", + "dev": true + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "json-schema-traverse": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz", + "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "tapable": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-1.0.0.tgz", + "integrity": "sha512-dQRhbNQkRnaqauC7WqSJ21EEksgT0fYZX2lqXzGkpo8JNig9zGZTYoMGvyI2nWmXlE2VSVXVDu7wLVGu/mQEsg==", + "dev": true + }, + "text-table": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", + "integrity": "sha1-f17oI66AUgfACvLfSoTsP8+lcLQ=", + "dev": true + }, + "through": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", + "integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=", + "dev": true + }, + "through2": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/through2/-/through2-2.0.3.tgz", + "integrity": "sha1-AARWmzfHx0ujnEPzzteNGtlBQL4=", + "dev": true, + "requires": { + "readable-stream": "^2.1.5", + "xtend": "~4.0.1" + } + }, + "timers-browserify": { + "version": "2.0.10", + "resolved": "https://registry.npmjs.org/timers-browserify/-/timers-browserify-2.0.10.tgz", + "integrity": "sha512-YvC1SV1XdOUaL6gx5CoGroT3Gu49pK9+TZ38ErPldOWW4j49GI1HKs9DV+KGq/w6y+LZ72W1c8cKz2vzY+qpzg==", + "dev": true, + "requires": { + "setimmediate": "^1.0.4" + } + }, + "tmp": { + "version": "0.0.33", + "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", + "integrity": "sha512-jRCJlojKnZ3addtTOjdIqoRuPEKBvNXcGYqzO6zWZX8KfKEpnGY5jfggJQ3EjKuu8D4bJRr0y+cYJFmYbImXGw==", + "dev": true, + "requires": { + "os-tmpdir": "~1.0.2" + } + }, + "to-arraybuffer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/to-arraybuffer/-/to-arraybuffer-1.0.1.tgz", + "integrity": "sha1-fSKbH8xjfkZsoIEYCDanqr/4P0M=", + "dev": true + }, + "to-fast-properties": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-1.0.3.tgz", + "integrity": "sha1-uDVx+k2MJbguIxsG46MFXeTKGkc=", + "dev": true + }, + "to-object-path": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/to-object-path/-/to-object-path-0.3.0.tgz", + "integrity": "sha1-KXWIt7Dn4KwI4E5nL4XB9JmeF68=", + "dev": true, + "requires": { + "kind-of": "^3.0.2" + } 
+ }, + "to-regex": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/to-regex/-/to-regex-3.0.2.tgz", + "integrity": "sha512-FWtleNAtZ/Ki2qtqej2CXTOayOH9bHDQF+Q48VpWyDXjbYxA4Yz8iDB31zXOBUlOHHKidDbqGVrTUvQMPmBGBw==", + "dev": true, + "requires": { + "define-property": "^2.0.2", + "extend-shallow": "^3.0.2", + "regex-not": "^1.0.2", + "safe-regex": "^1.1.0" + } + }, + "to-regex-range": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-2.1.1.tgz", + "integrity": "sha1-fIDBe53+vlmeJzZ+DU3VWQFB2zg=", + "dev": true, + "requires": { + "is-number": "^3.0.0", + "repeat-string": "^1.6.1" + } + }, + "trim-right": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/trim-right/-/trim-right-1.0.1.tgz", + "integrity": "sha1-yy4SAwZ+DI3h9hQJS5/kVwTqYAM=", + "dev": true + }, + "tslib": { + "version": "1.9.3", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.9.3.tgz", + "integrity": "sha512-4krF8scpejhaOgqzBEcGM7yDIEfi0/8+8zDRZhNZZ2kjmHJ4hv3zCbQWxoJGz1iw5U0Jl0nma13xzHXcncMavQ==", + "dev": true + }, + "tty-browserify": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.0.tgz", + "integrity": "sha1-oVe6QC2iTpv5V/mqadUk7tQpAaY=", + "dev": true + }, + "type-check": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.3.2.tgz", + "integrity": "sha1-WITKtRLPHTVeP7eE8wgEsrUg23I=", + "dev": true, + "requires": { + "prelude-ls": "~1.1.2" + } + }, + "typedarray": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", + "integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=", + "dev": true + }, + "uglify-js": { + "version": "2.8.29", + "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-2.8.29.tgz", + "integrity": "sha1-KcVzMUgFe7Th913zW3qcty5qWd0=", + "dev": true, + "optional": true, + "requires": { + "source-map": "~0.5.1", + "uglify-to-browserify": "~1.0.0", + "yargs": "~3.10.0" + } + }, + "uglify-to-browserify": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/uglify-to-browserify/-/uglify-to-browserify-1.0.2.tgz", + "integrity": "sha1-bgkk1r2mta/jSeOabWMoUKD4grc=", + "dev": true, + "optional": true + }, + "uglifyjs-webpack-plugin": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/uglifyjs-webpack-plugin/-/uglifyjs-webpack-plugin-1.2.7.tgz", + "integrity": "sha512-1VicfKhCYHLS8m1DCApqBhoulnASsEoJ/BvpUpP4zoNAPpKzdH+ghk0olGJMmwX2/jprK2j3hAHdUbczBSy2FA==", + "dev": true, + "requires": { + "cacache": "^10.0.4", + "find-cache-dir": "^1.0.0", + "schema-utils": "^0.4.5", + "serialize-javascript": "^1.4.0", + "source-map": "^0.6.1", + "uglify-es": "^3.3.4", + "webpack-sources": "^1.1.0", + "worker-farm": "^1.5.2" + }, + "dependencies": { + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + }, + "uglify-es": { + "version": "3.3.9", + "resolved": "https://registry.npmjs.org/uglify-es/-/uglify-es-3.3.9.tgz", + "integrity": "sha512-r+MU0rfv4L/0eeW3xZrd16t4NZfK8Ld4SWVglYBb7ez5uXFWHuVRs6xCTrf1yirs9a4j4Y27nn7SRfO6v67XsQ==", + "dev": true, + "requires": { + "commander": "~2.13.0", + "source-map": "~0.6.1" + } + } + } + }, + "union-value": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/union-value/-/union-value-1.0.0.tgz", + "integrity": 
"sha1-XHHDTLW61dzr4+oM0IIHulqhrqQ=", + "dev": true, + "requires": { + "arr-union": "^3.1.0", + "get-value": "^2.0.6", + "is-extendable": "^0.1.1", + "set-value": "^0.4.3" + }, + "dependencies": { + "extend-shallow": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extend-shallow/-/extend-shallow-2.0.1.tgz", + "integrity": "sha1-Ua99YUrZqfYQ6huvu5idaxxWiQ8=", + "dev": true, + "requires": { + "is-extendable": "^0.1.0" + } + }, + "set-value": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/set-value/-/set-value-0.4.3.tgz", + "integrity": "sha1-fbCPnT0i3H945Trzw79GZuzfzPE=", + "dev": true, + "requires": { + "extend-shallow": "^2.0.1", + "is-extendable": "^0.1.1", + "is-plain-object": "^2.0.1", + "to-object-path": "^0.3.0" + } + } + } + }, + "uniq": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/uniq/-/uniq-1.0.1.tgz", + "integrity": "sha1-sxxa6CVIRKOoKBVBzisEuGWnNP8=", + "dev": true + }, + "uniqs": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/uniqs/-/uniqs-2.0.0.tgz", + "integrity": "sha1-/+3ks2slKQaW5uFl1KWe25mOawI=", + "dev": true + }, + "unique-filename": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/unique-filename/-/unique-filename-1.1.0.tgz", + "integrity": "sha1-0F8v5AMlYIcfMOk8vnNe6iAVFPM=", + "dev": true, + "requires": { + "unique-slug": "^2.0.0" + } + }, + "unique-slug": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/unique-slug/-/unique-slug-2.0.0.tgz", + "integrity": "sha1-22Z258fMBimHj/GWCXx4hVrp9Ks=", + "dev": true, + "requires": { + "imurmurhash": "^0.1.4" + } + }, + "unset-value": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unset-value/-/unset-value-1.0.0.tgz", + "integrity": "sha1-g3aHP30jNRef+x5vw6jtDfyKtVk=", + "dev": true, + "requires": { + "has-value": "^0.3.1", + "isobject": "^3.0.0" + }, + "dependencies": { + "has-value": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/has-value/-/has-value-0.3.1.tgz", + "integrity": "sha1-ex9YutpiyoJ+wKIHgCVlSEWZXh8=", + "dev": true, + "requires": { + "get-value": "^2.0.3", + "has-values": "^0.1.4", + "isobject": "^2.0.0" + }, + "dependencies": { + "isobject": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-2.1.0.tgz", + "integrity": "sha1-8GVWEJaj8dou9GJy+BXIQNh+DIk=", + "dev": true, + "requires": { + "isarray": "1.0.0" + } + } + } + }, + "has-values": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/has-values/-/has-values-0.1.4.tgz", + "integrity": "sha1-bWHeldkd/Km5oCCJrThL/49it3E=", + "dev": true + } + } + }, + "upath": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/upath/-/upath-1.1.0.tgz", + "integrity": "sha512-bzpH/oBhoS/QI/YtbkqCg6VEiPYjSZtrHQM6/QnJS6OL9pKUFLqb3aFh4Scvwm45+7iAgiMkLhSbaZxUqmrprw==", + "dev": true + }, + "uri-js": { + "version": "4.2.2", + "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.2.2.tgz", + "integrity": "sha512-KY9Frmirql91X2Qgjry0Wd4Y+YTdrdZheS8TFwvkbLWf/G5KNJDCh6pKL5OZctEW4+0Baa5idK2ZQuELRwPznQ==", + "dev": true, + "requires": { + "punycode": "^2.1.0" + } + }, + "urix": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/urix/-/urix-0.1.0.tgz", + "integrity": "sha1-2pN/emLiH+wf0Y1Js1wpNQZ6bHI=", + "dev": true + }, + "url": { + "version": "0.11.0", + "resolved": "https://registry.npmjs.org/url/-/url-0.11.0.tgz", + "integrity": "sha1-ODjpfPxgUh63PFJajlW/3Z4uKPE=", + "dev": true, + "requires": { + "punycode": "1.3.2", + "querystring": "0.2.0" + }, + 
"dependencies": { + "punycode": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.3.2.tgz", + "integrity": "sha1-llOgNvt8HuQjQvIyXM7v6jkmxI0=", + "dev": true + } + } + }, + "url-loader": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/url-loader/-/url-loader-1.0.1.tgz", + "integrity": "sha512-rAonpHy7231fmweBKUFe0bYnlGDty77E+fm53NZdij7j/YOpyGzc7ttqG1nAXl3aRs0k41o0PC3TvGXQiw2Zvw==", + "dev": true, + "requires": { + "loader-utils": "^1.1.0", + "mime": "^2.0.3", + "schema-utils": "^0.4.3" + } + }, + "use": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/use/-/use-3.1.1.tgz", + "integrity": "sha512-cwESVXlO3url9YWlFW/TA9cshCEhtu7IKJ/p5soJ/gGpj7vbvFrAY/eIioQ6Dw23KjZhYgiIo8HOs1nQ2vr/oQ==", + "dev": true + }, + "util": { + "version": "0.10.4", + "resolved": "https://registry.npmjs.org/util/-/util-0.10.4.tgz", + "integrity": "sha512-0Pm9hTQ3se5ll1XihRic3FDIku70C+iHUdT/W926rSgHV5QgXsYbKZN8MSC3tJtSkhuROzvsQjAaFENRXr+19A==", + "dev": true, + "requires": { + "inherits": "2.0.3" + } + }, + "util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=", + "dev": true + }, + "uuid": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.3.2.tgz", + "integrity": "sha512-yXJmeNaw3DnnKAOKJE51sL/ZaYfWJRl1pK9dr19YFCu0ObS231AB1/LbqTKRAQ5kw8A90rA6fr4riOUpTZvQZA==", + "dev": true + }, + "v8-compile-cache": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/v8-compile-cache/-/v8-compile-cache-2.0.0.tgz", + "integrity": "sha512-qNdTUMaCjPs4eEnM3W9H94R3sU70YCuT+/ST7nUf+id1bVOrdjrpUaeZLqPBPRph3hsgn4a4BvwpxhHZx+oSDg==", + "dev": true + }, + "validate-npm-package-license": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/validate-npm-package-license/-/validate-npm-package-license-3.0.3.tgz", + "integrity": "sha512-63ZOUnL4SIXj4L0NixR3L1lcjO38crAbgrTpl28t8jjrfuiOBL5Iygm+60qPs/KsZGzPNg6Smnc/oY16QTjF0g==", + "dev": true, + "requires": { + "spdx-correct": "^3.0.0", + "spdx-expression-parse": "^3.0.0" + } + }, + "vendors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/vendors/-/vendors-1.0.2.tgz", + "integrity": "sha512-w/hry/368nO21AN9QljsaIhb9ZiZtZARoVH5f3CsFbawdLdayCgKRPup7CggujvySMxx0I91NOyxdVENohprLQ==", + "dev": true + }, + "vm-browserify": { + "version": "0.0.4", + "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-0.0.4.tgz", + "integrity": "sha1-XX6kW7755Kb/ZflUOOCofDV9WnM=", + "dev": true, + "requires": { + "indexof": "0.0.1" + } + }, + "watchpack": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/watchpack/-/watchpack-1.6.0.tgz", + "integrity": "sha512-i6dHe3EyLjMmDlU1/bGQpEw25XSjkJULPuAVKCbNRefQVq48yXKUpwg538F7AZTf9kyr57zj++pQFltUa5H7yA==", + "dev": true, + "requires": { + "chokidar": "^2.0.2", + "graceful-fs": "^4.1.2", + "neo-async": "^2.5.0" + } + }, + "webpack": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/webpack/-/webpack-4.16.3.tgz", + "integrity": "sha512-3VcrVoFgzSz1IYgga71YpU3HO89Al5bSnDOj9RJQPsy+FNyI1sFsUyJITn3pktNuaRBlQT0usvKZE3GgkPGAIw==", + "dev": true, + "requires": { + "@webassemblyjs/ast": "1.5.13", + "@webassemblyjs/helper-module-context": "1.5.13", + "@webassemblyjs/wasm-edit": "1.5.13", + "@webassemblyjs/wasm-opt": "1.5.13", + "@webassemblyjs/wasm-parser": "1.5.13", + "acorn": "^5.6.2", + "acorn-dynamic-import": "^3.0.0", + "ajv": "^6.1.0", + "ajv-keywords": "^3.1.0", + 
"chrome-trace-event": "^1.0.0", + "enhanced-resolve": "^4.1.0", + "eslint-scope": "^4.0.0", + "json-parse-better-errors": "^1.0.2", + "loader-runner": "^2.3.0", + "loader-utils": "^1.1.0", + "memory-fs": "~0.4.1", + "micromatch": "^3.1.8", + "mkdirp": "~0.5.0", + "neo-async": "^2.5.0", + "node-libs-browser": "^2.0.0", + "schema-utils": "^0.4.4", + "tapable": "^1.0.0", + "uglifyjs-webpack-plugin": "^1.2.4", + "watchpack": "^1.5.0", + "webpack-sources": "^1.0.1" + }, + "dependencies": { + "eslint-scope": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-4.0.0.tgz", + "integrity": "sha512-1G6UTDi7Jc1ELFwnR58HV4fK9OQK4S6N985f166xqXxpjU6plxFISJa2Ba9KCQuFa8RCnj/lSFJbHo7UFDBnUA==", + "dev": true, + "requires": { + "esrecurse": "^4.1.0", + "estraverse": "^4.1.1" + } + }, + "estraverse": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-4.2.0.tgz", + "integrity": "sha1-De4/7TH81GlhjOc0IJn8GvoL2xM=", + "dev": true + } + } + }, + "webpack-cli": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/webpack-cli/-/webpack-cli-3.1.0.tgz", + "integrity": "sha512-p5NeKDtYwjZozUWq6kGNs9w+Gtw/CPvyuXjXn2HMdz8Tie+krjEg8oAtonvIyITZdvpF7XG9xDHwscLr2c+ugQ==", + "dev": true, + "requires": { + "chalk": "^2.4.1", + "cross-spawn": "^6.0.5", + "enhanced-resolve": "^4.0.0", + "global-modules-path": "^2.1.0", + "import-local": "^1.0.0", + "inquirer": "^6.0.0", + "interpret": "^1.1.0", + "loader-utils": "^1.1.0", + "supports-color": "^5.4.0", + "v8-compile-cache": "^2.0.0", + "yargs": "^12.0.1" + }, + "dependencies": { + "ansi-regex": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-3.0.0.tgz", + "integrity": "sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=", + "dev": true + }, + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "cliui": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-4.1.0.tgz", + "integrity": "sha512-4FG+RSG9DL7uEwRUZXZn3SS34DiDPfzP0VOiEwtUWlE+AR2EIg+hSyvrIgUUfhdgR/UkAeW2QHgeP+hWrXs7jQ==", + "dev": true, + "requires": { + "string-width": "^2.1.1", + "strip-ansi": "^4.0.0", + "wrap-ansi": "^2.0.0" + } + }, + "decamelize": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-2.0.0.tgz", + "integrity": "sha512-Ikpp5scV3MSYxY39ymh45ZLEecsTdv/Xj2CaQfI8RLMuwi7XvjX9H/fhraiSuU+C5w5NTDu4ZU72xNiZnurBPg==", + "dev": true, + "requires": { + "xregexp": "4.0.0" + } + }, + "find-up": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-3.0.0.tgz", + "integrity": "sha512-1yD6RmLI1XBfxugvORwlck6f75tYL+iR0jqwsOrOxMZyGYqUuDhJ0l4AXdO1iX/FTs9cBAMEk1gWSEx1kSbylg==", + "dev": true, + "requires": { + "locate-path": "^3.0.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + 
"locate-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-3.0.0.tgz", + "integrity": "sha512-7AO748wWnIhNqAuaty2ZWHkQHRSNfPVIsPIfwEOWO22AmaoVrWavlOcMR5nzTLNYvp36X220/maaRsrec1G65A==", + "dev": true, + "requires": { + "p-locate": "^3.0.0", + "path-exists": "^3.0.0" + } + }, + "p-limit": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-2.0.0.tgz", + "integrity": "sha512-fl5s52lI5ahKCernzzIyAP0QAZbGIovtVHGwpcu1Jr/EpzLVDI2myISHwGqK7m8uQFugVWSrbxH7XnhGtvEc+A==", + "dev": true, + "requires": { + "p-try": "^2.0.0" + } + }, + "p-locate": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-3.0.0.tgz", + "integrity": "sha512-x+12w/To+4GFfgJhBEpiDcLozRJGegY+Ei7/z0tSLkMmxGZNybVMSfWj9aJn8Z5Fc7dBUNJOOVgPv2H7IwulSQ==", + "dev": true, + "requires": { + "p-limit": "^2.0.0" + } + }, + "p-try": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/p-try/-/p-try-2.0.0.tgz", + "integrity": "sha512-hMp0onDKIajHfIkdRk3P4CdCmErkYAxxDtP3Wx/4nZ3aGlau2VKh3mZpcuFkH27WQkL/3WBCPOktzA9ZOAnMQQ==", + "dev": true + }, + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + }, + "supports-color": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + }, + "yargs": { + "version": "12.0.1", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-12.0.1.tgz", + "integrity": "sha512-B0vRAp1hRX4jgIOWFtjfNjd9OA9RWYZ6tqGA9/I/IrTMsxmKvtWy+ersM+jzpQqbC3YfLzeABPdeTgcJ9eu1qQ==", + "dev": true, + "requires": { + "cliui": "^4.0.0", + "decamelize": "^2.0.0", + "find-up": "^3.0.0", + "get-caller-file": "^1.0.1", + "os-locale": "^2.0.0", + "require-directory": "^2.1.1", + "require-main-filename": "^1.0.1", + "set-blocking": "^2.0.0", + "string-width": "^2.0.0", + "which-module": "^2.0.0", + "y18n": "^3.2.1 || ^4.0.0", + "yargs-parser": "^10.1.0" + } + } + } + }, + "webpack-log": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/webpack-log/-/webpack-log-1.2.0.tgz", + "integrity": "sha512-U9AnICnu50HXtiqiDxuli5gLB5PGBo7VvcHx36jRZHwK4vzOYLbImqT4lwWwoMHdQWwEKw736fCHEekokTEKHA==", + "dev": true, + "requires": { + "chalk": "^2.1.0", + "log-symbols": "^2.1.0", + "loglevelnext": "^1.0.1", + "uuid": "^3.1.0" + }, + "dependencies": { + "ansi-styles": { + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-3.2.1.tgz", + "integrity": "sha512-VT0ZI6kZRdTh8YyJw3SMbYm/u+NqfsAxEpWO0Pf9sq8/e94WxxOpPKx9FR1FlyCtOVDNOQ+8ntlqFxiRc+r5qA==", + "dev": true, + "requires": { + "color-convert": "^1.9.0" + } + }, + "chalk": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-2.4.1.tgz", + "integrity": "sha512-ObN6h1v2fTJSmUXoS3nMQ92LbDK9be4TV+6G+omQlGJFdcUX5heKi1LZ1YnRMIgwTLEj3E24bT6tYni50rlCfQ==", + "dev": true, + "requires": { + "ansi-styles": "^3.2.1", + "escape-string-regexp": "^1.0.5", + "supports-color": "^5.3.0" + } + }, + "has-flag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-3.0.0.tgz", + "integrity": "sha1-tdRU3CGZriJWmfNGfloH87lVuv0=", + "dev": true + }, + "supports-color": { + "version": "5.4.0", + 
"resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.4.0.tgz", + "integrity": "sha512-zjaXglF5nnWpsq470jSv6P9DwPvgLkuapYmfDm3JWOm0vkNTVF2tI4UrN2r6jH1qM/uc/WtxYY1hYoA2dOKj5w==", + "dev": true, + "requires": { + "has-flag": "^3.0.0" + } + } + } + }, + "webpack-manifest-plugin": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/webpack-manifest-plugin/-/webpack-manifest-plugin-2.0.3.tgz", + "integrity": "sha512-FZcnB3MMQ0CT0aU1+LItwywXWAixLTGUEAtN0fw15dScf2LudQwheLPUCj+QMhDlwZT+9ysfKqUFTcfUGc8bXg==", + "dev": true, + "requires": { + "fs-extra": "^0.30.0", + "lodash": ">=3.5 <5", + "tapable": "^1.0.0" + } + }, + "webpack-sources": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-1.1.0.tgz", + "integrity": "sha512-aqYp18kPphgoO5c/+NaUvEeACtZjMESmDChuD3NBciVpah3XpMEU9VAAtIaB1BsfJWWTSdv8Vv1m3T0aRk2dUw==", + "dev": true, + "requires": { + "source-list-map": "^2.0.0", + "source-map": "~0.6.1" + }, + "dependencies": { + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + } + } + }, + "whet.extend": { + "version": "0.9.9", + "resolved": "https://registry.npmjs.org/whet.extend/-/whet.extend-0.9.9.tgz", + "integrity": "sha1-+HfVv2SMl+WqVC+twW1qJZucEaE=", + "dev": true + }, + "which": { + "version": "1.2.14", + "resolved": "https://registry.npmjs.org/which/-/which-1.2.14.tgz", + "integrity": "sha1-mofEN48D6CfOyvGs31bHNsAcFOU=", + "dev": true, + "requires": { + "isexe": "^2.0.0" + } + }, + "which-module": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/which-module/-/which-module-2.0.0.tgz", + "integrity": "sha1-2e8H3Od7mQK4o6j6SzHD4/fm6Ho=", + "dev": true + }, + "window-size": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/window-size/-/window-size-0.1.0.tgz", + "integrity": "sha1-VDjNLqk7IC76Ohn+iIeu58lPnJ0=", + "dev": true, + "optional": true + }, + "wordwrap": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz", + "integrity": "sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=", + "dev": true + }, + "worker-farm": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/worker-farm/-/worker-farm-1.6.0.tgz", + "integrity": "sha512-6w+3tHbM87WnSWnENBUvA2pxJPLhQUg5LKwUQHq3r+XPhIM+Gh2R5ycbwPCyuGbNg+lPgdcnQUhuC02kJCvffQ==", + "dev": true, + "requires": { + "errno": "~0.1.7" + } + }, + "wrap-ansi": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-2.1.0.tgz", + "integrity": "sha1-2Pw9KE3QV5T+hJc8rs3Rz4JP3YU=", + "dev": true, + "requires": { + "string-width": "^1.0.1", + "strip-ansi": "^3.0.1" + }, + "dependencies": { + "is-fullwidth-code-point": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", + "integrity": "sha1-754xOG8DGn8NZDr4L95QxFfvAMs=", + "dev": true, + "requires": { + "number-is-nan": "^1.0.0" + } + }, + "string-width": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", + "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=", + "dev": true, + "requires": { + "code-point-at": "^1.0.0", + "is-fullwidth-code-point": "^1.0.0", + "strip-ansi": "^3.0.0" + } + } + } + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": 
"sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", + "dev": true + }, + "write": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/write/-/write-0.2.1.tgz", + "integrity": "sha1-X8A4KOJkzqP+kUVUdvejxWbLB1c=", + "dev": true, + "requires": { + "mkdirp": "^0.5.1" + } + }, + "xregexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz", + "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==", + "dev": true + }, + "xtend": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.1.tgz", + "integrity": "sha1-pcbVMr5lbiPbgg77lDofBJmNY68=", + "dev": true + }, + "y18n": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-4.0.0.tgz", + "integrity": "sha512-r9S/ZyXu/Xu9q1tYlpsLIsa3EeLXXk0VwlxqTcFRfg9EhMW+17kbt9G0NrgCmhGb5vT2hyhJZLfDGx+7+5Uj/w==", + "dev": true + }, + "yallist": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-2.1.2.tgz", + "integrity": "sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=", + "dev": true + }, + "yargs": { + "version": "3.10.0", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-3.10.0.tgz", + "integrity": "sha1-9+572FfdfB0tOMDnTvvWgdFDH9E=", + "dev": true, + "optional": true, + "requires": { + "camelcase": "^1.0.2", + "cliui": "^2.1.0", + "decamelize": "^1.0.0", + "window-size": "0.1.0" + } + }, + "yargs-parser": { + "version": "10.1.0", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-10.1.0.tgz", + "integrity": "sha512-VCIyR1wJoEBZUqk5PA+oOBF6ypbwh5aNB3I50guxAL/quggdfs4TtNHQrSazFA3fYZ+tEqfs0zIGlv0c/rgjbQ==", + "dev": true, + "requires": { + "camelcase": "^4.1.0" + }, + "dependencies": { + "camelcase": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", + "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "dev": true + } + } + } + } +} diff --git a/airflow/www_rbac/package.json b/airflow/www_rbac/package.json new file mode 100644 index 0000000000000..5c431895a7666 --- /dev/null +++ b/airflow/www_rbac/package.json @@ -0,0 +1,69 @@ +{ + "description": "Apache Airflow is a platform to programmatically author, schedule and monitor workflows.", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "dev": "NODE_ENV=dev webpack --watch --colors --progress --debug --output-pathinfo --devtool eval-cheap-source-map --mode development", + "prod": "NODE_ENV=production node --max_old_space_size=4096 ./node_modules/webpack/bin/webpack.js -p --colors --progress", + "build": "NODE_ENV=production webpack --colors --progress", + "lint": "eslint --ignore-path=.eslintignore --ext .js,.html .", + "lint:fix": "eslint --fix --ignore-path=.eslintignore --ext .js,.html ." 
+ }, + "author": "Apache", + "license": "Apache-2.0", + "repository": { + "type": "git", + "url": "git+https://github.com/apache/airflow.git" + }, + "homepage": "http://airflow.apache.org/", + "keywords": [ + "big", + "data", + "workflow", + "airflow", + "d3", + "nerds", + "database", + "flask" + ], + "devDependencies": { + "babel": "^6.23.0", + "babel-core": "^6.26.3", + "babel-eslint": "^8.2.6", + "babel-istanbul": "^0.12.2", + "babel-loader": "^7.1.4", + "babel-plugin-css-modules-transform": "^1.6.1", + "babel-polyfill": "^6.26.0", + "clean-webpack-plugin": "^0.1.19", + "copy-webpack-plugin": "^4.5.2", + "css-loader": "^0.28.11", + "eslint": "^4.19.1", + "eslint-config-airbnb-base": "^13.0.0", + "eslint-plugin-html": "^4.0.5", + "eslint-plugin-import": "^2.13.0", + "eslint-plugin-node": "^7.0.1", + "eslint-plugin-promise": "^3.8.0", + "eslint-plugin-standard": "^3.1.0", + "file-loader": "^1.1.11", + "imports-loader": "^0.8.0", + "lodash": "^4.17.11", + "mini-css-extract-plugin": "^0.4.1", + "style-loader": "^0.21.0", + "url-loader": "^1.0.1", + "webpack": "^4.16.3", + "webpack-cli": "^3.1.0", + "webpack-manifest-plugin": "^2.0.3" + }, + "dependencies": { + "bootstrap-3-typeahead": "^4.0.2", + "bootstrap-toggle": "^2.2.2", + "d3": "^3.4.4", + "d3-tip": "^0.9.1", + "dagre-d3": "^0.4.18", + "datatables.net": "^1.10.19", + "datatables.net-bs": "^1.10.19", + "lodash": "^4.17.11", + "moment-timezone": "^0.5.21", + "npm": "^6.3.0", + "nvd3": "^1.8.6" + } +} diff --git a/airflow/www_rbac/security.py b/airflow/www_rbac/security.py index d2271f822a47e..18a9c447b66a4 100644 --- a/airflow/www_rbac/security.py +++ b/airflow/www_rbac/security.py @@ -7,22 +7,31 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# +from flask import g from flask_appbuilder.security.sqla import models as sqla_models +from flask_appbuilder.security.sqla.manager import SecurityManager +from sqlalchemy import or_ + +from airflow import models +from airflow.www_rbac.app import appbuilder +from airflow.utils.db import provide_session +from airflow.utils.log.logging_mixin import LoggingMixin ########################################################################### # VIEW MENUS ########################################################################### -viewer_vms = [ +VIEWER_VMS = { 'Airflow', 'DagModelView', 'Browse', @@ -42,11 +51,11 @@ 'About', 'Version', 'VersionView', -] +} -user_vms = viewer_vms +USER_VMS = VIEWER_VMS -op_vms = [ +OP_VMS = { 'Admin', 'Configurations', 'ConfigurationView', @@ -58,13 +67,13 @@ 'VariableModelView', 'XComs', 'XComModelView', -] +} ########################################################################### # PERMISSIONS ########################################################################### -viewer_perms = [ +VIEWER_PERMS = { 'menu_access', 'can_index', 'can_list', @@ -75,6 +84,7 @@ 'can_task_stats', 'can_code', 'can_log', + 'can_get_logs_with_metadata', 'can_tries', 'can_graph', 'can_tree', @@ -88,9 +98,9 @@ 'can_rendered', 'can_pickle_info', 'can_version', -] +} -user_perms = [ +USER_PERMS = { 'can_dagrun_clear', 'can_run', 'can_trigger', @@ -105,12 +115,23 @@ 'set_running', 'set_success', 'clear', -] + 'can_clear', +} -op_perms = [ +OP_PERMS = { 'can_conf', 'can_varimport', -] +} + +# global view-menu for dag-level access +DAG_VMS = { + 'all_dags' +} + +DAG_PERMS = { + 'can_dag_read', + 'can_dag_edit', +} ########################################################################### # DEFAULT ROLE CONFIGURATIONS @@ -119,61 +140,344 @@ ROLE_CONFIGS = [ { 'role': 'Viewer', - 'perms': viewer_perms, - 'vms': viewer_vms, + 'perms': VIEWER_PERMS, + 'vms': VIEWER_VMS | DAG_VMS, }, { 'role': 'User', - 'perms': viewer_perms + user_perms, - 'vms': viewer_vms + user_vms, + 'perms': VIEWER_PERMS | USER_PERMS | DAG_PERMS, + 'vms': VIEWER_VMS | DAG_VMS | USER_VMS, }, { 'role': 'Op', - 'perms': viewer_perms + user_perms + op_perms, - 'vms': viewer_vms + user_vms + op_vms, + 'perms': VIEWER_PERMS | USER_PERMS | OP_PERMS | DAG_PERMS, + 'vms': VIEWER_VMS | DAG_VMS | USER_VMS | OP_VMS, }, ] +EXISTING_ROLES = { + 'Admin', + 'Viewer', + 'User', + 'Op', + 'Public', +} + + +class AirflowSecurityManager(SecurityManager, LoggingMixin): + + def init_role(self, role_name, role_vms, role_perms): + """ + Initialize the role with the permissions and related view-menus. + + :param role_name: + :param role_vms: + :param role_perms: + :return: + """ + pvms = self.get_session.query(sqla_models.PermissionView).all() + pvms = [p for p in pvms if p.permission and p.view_menu] + + role = self.find_role(role_name) + if not role: + role = self.add_role(role_name) + + if len(role.permissions) == 0: + self.log.info('Initializing permissions for role:%s in the database.', role_name) + role_pvms = set() + for pvm in pvms: + if pvm.view_menu.name in role_vms and pvm.permission.name in role_perms: + role_pvms.add(pvm) + role.permissions = list(role_pvms) + self.get_session.merge(role) + self.get_session.commit() + else: + self.log.debug('Existing permissions for the role:%s ' + 'within the database will persist.', role_name) + + def get_user_roles(self, user=None): + """ + Get all the roles associated with the user. + + :param user: the ab_user in FAB model. + :return: a list of roles associated with the user. 
+        """
+        if user is None:
+            user = g.user
+        if user.is_anonymous:
+            public_role = appbuilder.config.get('AUTH_ROLE_PUBLIC')
+            return [appbuilder.security_manager.find_role(public_role)] \
+                if public_role else []
+        return user.roles
+
+    def get_all_permissions_views(self):
+        """
+        Return a set of (permission name, view-menu name) tuples for the current user.
+        """
+        perms_views = set()
+        for role in self.get_user_roles():
+            perms_views.update({(perm_view.permission.name, perm_view.view_menu.name)
+                                for perm_view in role.permissions})
+        return perms_views
+
+    def get_accessible_dag_ids(self, username=None):
+        """
+        Return the set of dags that the user has access to (either read or write).
+
+        :param username: the user; defaults to the currently logged-in user.
+        :return: a set of dag ids that the user can access.
+        """
+        if not username:
+            username = g.user
+
+        if username.is_anonymous or 'Public' in username.roles:
+            # return an empty set if the role is public
+            return set()
+
+        roles = {role.name for role in username.roles}
+        if {'Admin', 'Viewer', 'User', 'Op'} & roles:
+            return DAG_VMS
+
+        user_perms_views = self.get_all_permissions_views()
+        # return the set of all dags that the user can access
+        return set([view for perm, view in user_perms_views if perm in DAG_PERMS])
+
+    def has_access(self, permission, view_name, user=None):
+        """
+        Verify whether a given user can perform a certain permission
+        (e.g. can_dag_read, can_dag_edit) on the given view (e.g. a dag id).
+
+        :param permission: the permission to check (e.g. can_dag_read, can_dag_edit).
+        :type permission: str
+        :param view_name: name of the view-menu (a dag id is a view-menu as well).
+        :type view_name: str
+        :param user: the user to check; defaults to the currently logged-in user.
+        :return: whether the user can perform the given permission on the view.
+        :rtype: bool
+        """
+        if not user:
+            user = g.user
+        if user.is_anonymous:
+            return self.is_item_public(permission, view_name)
+        return self._has_view_access(user, permission, view_name)
+
+    def _get_and_cache_perms(self):
+        """
+        Cache the permission-views of the current user.
+        """
+        self.perms = self.get_all_permissions_views()
+
+    def _has_role(self, role_name_or_list):
+        """
+        Whether the current user has one of the given roles.
+        """
+        if not isinstance(role_name_or_list, list):
+            role_name_or_list = [role_name_or_list]
+        return any(
+            [r.name in role_name_or_list for r in self.get_user_roles()])
+
+    def _has_perm(self, permission_name, view_menu_name):
+        """
+        Whether the current user has the given permission on the given view-menu.
+        """
+        if hasattr(self, 'perms'):
+            if (permission_name, view_menu_name) in self.perms:
+                return True
+        # rebuild the permissions set
+        self._get_and_cache_perms()
+        return (permission_name, view_menu_name) in self.perms
+
+    def has_all_dags_access(self):
+        """
+        Has access to all dags in any of these three cases:
+        1. The role is one of (Admin, Viewer, User, Op).
+        2. The role has the can_dag_read permission on the all_dags view-menu.
+        3. The role has the can_dag_edit permission on the all_dags view-menu.
+        """
+        return (
+            self._has_role(['Admin', 'Viewer', 'Op', 'User']) or
+            self._has_perm('can_dag_read', 'all_dags') or
+            self._has_perm('can_dag_edit', 'all_dags'))
+
+    def clean_perms(self):
+        """
+        FAB leaves faulty permission-views (with a missing permission or
+        view-menu) that need to be cleaned up.
+        """
+        self.log.info('Cleaning faulty perms')
+        sesh = self.get_session
+        pvms = (
+            sesh.query(sqla_models.PermissionView)
+            .filter(or_(
+                sqla_models.PermissionView.permission == None,  # NOQA
+                sqla_models.PermissionView.view_menu == None,  # NOQA
+            ))
+        )
+        deleted_count = pvms.delete()
+        sesh.commit()
+        if deleted_count:
+            self.log.info('Deleted %s faulty permissions', deleted_count)
+
+    def _merge_perm(self, permission_name, view_menu_name):
+        """
+        Add the (permission, view_menu) pair to ab_permission_view if it does
+        not exist yet. The related entries are added to the ab_permission and
+        ab_view_menu meta tables as well.
+
+        :param permission_name: Name of the permission.
+        :type permission_name: str
+        :param view_menu_name: Name of the view-menu.
+        :type view_menu_name: str
+        :return: None.
+        """
+        permission = self.find_permission(permission_name)
+        view_menu = self.find_view_menu(view_menu_name)
+        pv = None
+        if permission and view_menu:
+            pv = self.get_session.query(self.permissionview_model).filter_by(
+                permission=permission, view_menu=view_menu).first()
+        if not pv and permission_name and view_menu_name:
+            self.add_permission_view_menu(permission_name, view_menu_name)
+
+    @provide_session
+    def create_custom_dag_permission_view(self, session=None):
+        """
+        Workflow:
+        1. Fetch all the existing (permission, view-menu) pairs from the Airflow DB.
+        2. Fetch all the existing dag models that are either active or paused, excluding subdags.
+        3. Create both read and write permission view-menu relations for every dag from step 2.
+        4. Find all the dag-specific roles (excluding Public, Admin, Viewer, Op, User).
+        5. Get all the permission-view-menus owned by the User role.
+        6. Grant the dag-specific roles all of the User role's permission-view-menus,
+           except the all_dags view-menus.
+        7. Commit the updated permission-view-menu-role entries to the DB.
+
+        :return: None.
+        """
+        # todo(Tao): should we put this function here or in scheduler loop?
+        self.log.info('Fetching a set of all permission, view_menu from FAB meta-table')
+
+        def merge_pv(perm, view_menu):
+            """Create the permission view-menu pair only if it doesn't exist yet"""
+            if view_menu and perm and (perm, view_menu) not in all_pvs:
+                self._merge_perm(perm, view_menu)
+
+        all_pvs = set()
+        for pv in self.get_session.query(self.permissionview_model).all():
+            if pv.permission and pv.view_menu:
+                all_pvs.add((pv.permission.name, pv.view_menu.name))
+
+        # create the permissions for the global logical dag (all_dags)
+        for dag in DAG_VMS:
+            for perm in DAG_PERMS:
+                merge_pv(perm, dag)
+
+        # get all the active / paused dag models, excluding subdags
+        all_dags_models = session.query(models.DagModel)\
+            .filter(or_(models.DagModel.is_active, models.DagModel.is_paused))\
+            .filter(~models.DagModel.is_subdag).all()
+
+        # create can_dag_edit and can_dag_read permissions for every dag (view-menu)
+        for dag in all_dags_models:
+            for perm in DAG_PERMS:
+                merge_pv(perm, dag.dag_id)
+
+        # for every dag-specific role, grant it the User role's permission-views
+        # (except the all_dags view-menu) via ab_permission_view_role
+        all_roles = self.get_all_roles()
+        user_role = self.find_role('User')
+
+        dag_role = [role for role in all_roles if role.name not in EXISTING_ROLES]
+        update_perm_views = []
+
+        # the all_dags view-menu has to be excluded from the copied view-menus
+        dag_vm = self.find_view_menu('all_dags')
+        ab_perm_view_role = sqla_models.assoc_permissionview_role
+        perm_view = self.permissionview_model
+        view_menu = self.viewmenu_model
+
+        all_perm_view_by_user = session.query(ab_perm_view_role)\
+            .join(perm_view, perm_view.id == ab_perm_view_role
+                  .columns.permission_view_id)\
+            .filter(ab_perm_view_role.columns.role_id == user_role.id)\
+            .join(view_menu)\
+            .filter(perm_view.view_menu_id != dag_vm.id)
+        all_perm_views = set([role.permission_view_id for role in all_perm_view_by_user])
+
+        for role in dag_role:
+            # get all the permission-views already assigned to this role
+            existing_perm_view_by_user = self.get_session.query(ab_perm_view_role)\
+                .filter(ab_perm_view_role.columns.role_id == role.id)
+
+            existing_perms_views = set([pv.permission_view_id
+                                        for pv in existing_perm_view_by_user])
+            missing_perm_views = all_perm_views - existing_perms_views
+
+            for perm_view_id in missing_perm_views:
+                update_perm_views.append({'permission_view_id': perm_view_id,
+                                          'role_id': role.id})
+
+        if update_perm_views:
+            self.get_session.execute(ab_perm_view_role.insert(), update_perm_views)
+        self.get_session.commit()
+
+    def update_admin_perm_view(self):
+        """
+        The Admin role should have all the permission-views.
+        Add the missing ones to the table for Admin.
+
+        :return: None.
+        """
+        pvms = self.get_session.query(sqla_models.PermissionView).all()
+        pvms = [p for p in pvms if p.permission and p.view_menu]
+
+        admin = self.find_role('Admin')
+        admin.permissions = list(set(admin.permissions) | set(pvms))
+
+        self.get_session.commit()
+
+    def sync_roles(self):
+        """
+        1. Init the default roles (Admin, Viewer, User, Op, Public)
+           with their related permissions.
+        2. Init the custom (dag-level) roles with their related permissions.
+
+        :return: None.
+        """
+        self.log.info('Start syncing user roles.')
+        # create the global all_dags view-menu
+        self.create_perm_vm_for_all_dag()
+
+        # create the default roles (Viewer, User, Op) with their permissions
+        for config in ROLE_CONFIGS:
+            role = config['role']
+            vms = config['vms']
+            perms = config['perms']
+            self.init_role(role, vms, perms)
+        self.create_custom_dag_permission_view()
+
+        # init the existing roles; the remaining roles can be created through the UI
+        self.update_admin_perm_view()
+        self.clean_perms()
+
+    def sync_perm_for_dag(self, dag_id):
+        """
+        Sync permissions for the given dag id. The dag id is assumed to exist in
+        the dag bag, since only the refresh button / view calls this function.
+
+        :param dag_id: the dag id whose permissions should be synced.
+        :return: None.
+        """
+        for dag_perm in DAG_PERMS:
+            perm_on_dag = self.find_permission_view_menu(dag_perm, dag_id)
+            if perm_on_dag is None:
+                self.add_permission_view_menu(dag_perm, dag_id)
 
-def init_role(sm, role_name, role_vms, role_perms):
-    sm_session = sm.get_session
-    pvms = sm_session.query(sqla_models.PermissionView).all()
-    pvms = [p for p in pvms if p.permission and p.view_menu]
-
-    valid_perms = [p.permission.name for p in pvms]
-    valid_vms = [p.view_menu.name for p in pvms]
-    invalid_perms = [p for p in role_perms if p not in valid_perms]
-    if invalid_perms:
-        raise Exception('The following permissions are not valid: {}'
-                        .format(invalid_perms))
-    invalid_vms = [v for v in role_vms if v not in valid_vms]
-    if invalid_vms:
-        raise Exception('The following view menus are not valid: {}'
-                        .format(invalid_vms))
-
-    role = sm.add_role(role_name)
-    role_pvms = []
-    for pvm in pvms:
-        if pvm.view_menu.name in role_vms and pvm.permission.name in role_perms:
-            role_pvms.append(pvm)
-    role_pvms = list(set(role_pvms))
-    role.permissions = role_pvms
-    sm_session.merge(role)
-    sm_session.commit()
-
-
-def init_roles(appbuilder):
-    for config in ROLE_CONFIGS:
-        name = config['role']
-        vms = config['vms']
-        perms = config['perms']
-        init_role(appbuilder.sm, name, vms, perms)
-
-
-def is_view_only(user, appbuilder):
-    if user.is_anonymous():
-        anonymous_role = appbuilder.sm.auth_role_public
-        return anonymous_role == 'Viewer'
-
-    user_roles = user.roles
-    return len(user_roles) == 1 and user_roles[0].name == 'Viewer'
+    def create_perm_vm_for_all_dag(self):
+        """
+        Create the permission-view-menu pairs for the global all_dags view-menu,
+        if they do not already exist, and insert them into the FAB security model.
+        """
+        # create the permissions for the global logical dag (all_dags)
+        for dag_vm in DAG_VMS:
+            for perm in DAG_PERMS:
+                self._merge_perm(permission_name=perm,
+                                 view_menu_name=dag_vm)
diff --git a/airflow/www_rbac/static/bootstrap-toggle.min.css b/airflow/www_rbac/static/bootstrap-toggle.min.css
deleted file mode 100644
index 0d42ed09cdaa3..0000000000000
--- a/airflow/www_rbac/static/bootstrap-toggle.min.css
+++ /dev/null
@@ -1,28 +0,0 @@
-/*!
======================================================================== - * Bootstrap Toggle: bootstrap-toggle.css v2.2.0 - * http://www.bootstraptoggle.com - * ======================================================================== - * Copyright 2014 Min Hur, The New York Times Company - * Licensed under MIT - * ======================================================================== */ -.checkbox label .toggle,.checkbox-inline .toggle{margin-left:-20px;margin-right:5px} -.toggle{position:relative;overflow:hidden} -.toggle input[type=checkbox]{display:none} -.toggle-group{position:absolute;width:200%;top:0;bottom:0;left:0;transition:left .35s;-webkit-transition:left .35s;-moz-user-select:none;-webkit-user-select:none} -.toggle.off .toggle-group{left:-100%} -.toggle-on{position:absolute;top:0;bottom:0;left:0;right:50%;margin:0;border:0;border-radius:0} -.toggle-off{position:absolute;top:0;bottom:0;left:50%;right:0;margin:0;border:0;border-radius:0} -.toggle-handle{position:relative;margin:0 auto;padding-top:0;padding-bottom:0;height:100%;width:0;border-width:0 1px} -.toggle.btn{min-width:59px;min-height:34px} -.toggle-on.btn{padding-right:24px} -.toggle-off.btn{padding-left:24px} -.toggle.btn-lg{min-width:79px;min-height:45px} -.toggle-on.btn-lg{padding-right:31px} -.toggle-off.btn-lg{padding-left:31px} -.toggle-handle.btn-lg{width:40px} -.toggle.btn-sm{min-width:50px;min-height:30px} -.toggle-on.btn-sm{padding-right:20px} -.toggle-off.btn-sm{padding-left:20px} -.toggle.btn-xs{min-width:35px;min-height:22px} -.toggle-on.btn-xs{padding-right:12px} -.toggle-off.btn-xs{padding-left:12px} \ No newline at end of file diff --git a/airflow/www_rbac/static/bootstrap-toggle.min.js b/airflow/www_rbac/static/bootstrap-toggle.min.js deleted file mode 100644 index 37113200bff5c..0000000000000 --- a/airflow/www_rbac/static/bootstrap-toggle.min.js +++ /dev/null @@ -1,9 +0,0 @@ -/*! ======================================================================== - * Bootstrap Toggle: bootstrap-toggle.js v2.2.0 - * http://www.bootstraptoggle.com - * ======================================================================== - * Copyright 2014 Min Hur, The New York Times Company - * Licensed under MIT - * ======================================================================== */ -+function(a){"use strict";function b(b){return this.each(function(){var d=a(this),e=d.data("bs.toggle"),f="object"==typeof b&&b;e||d.data("bs.toggle",e=new c(this,f)),"string"==typeof b&&e[b]&&e[b]()})}var c=function(b,c){this.$element=a(b),this.options=a.extend({},this.defaults(),c),this.render()};c.VERSION="2.2.0",c.DEFAULTS={on:"On",off:"Off",onstyle:"primary",offstyle:"default",size:"normal",style:"",width:null,height:null},c.prototype.defaults=function(){return{on:this.$element.attr("data-on")||c.DEFAULTS.on,off:this.$element.attr("data-off")||c.DEFAULTS.off,onstyle:this.$element.attr("data-onstyle")||c.DEFAULTS.onstyle,offstyle:this.$element.attr("data-offstyle")||c.DEFAULTS.offstyle,size:this.$element.attr("data-size")||c.DEFAULTS.size,style:this.$element.attr("data-style")||c.DEFAULTS.style,width:this.$element.attr("data-width")||c.DEFAULTS.width,height:this.$element.attr("data-height")||c.DEFAULTS.height}},c.prototype.render=function(){this._onstyle="btn-"+this.options.onstyle,this._offstyle="btn-"+this.options.offstyle;var b="large"===this.options.size?"btn-lg":"small"===this.options.size?"btn-sm":"mini"===this.options.size?"btn-xs":"",c=a('