diff --git a/requirements.txt b/requirements.txt
index d2534489..62ab00e2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,15 +13,12 @@
 treelite==2.1.0
 treelite_runtime==2.1.0
 flaml==0.9.6
-# pipelines
-shrike[pipeline]==1.14.7
-azure-ml-component==0.9.4.post1 # for component dsl
-azureml-train-core==1.36.0 # for azureml.train.hyperdrive
-azureml-dataset-runtime==1.36.0 # to register dataset
-hydra-core~=1.0.3
-typing_extensions==4.0.1 # for hydra
-
 # unit testing
 pytest==6.2.4
 pytest-cov==2.12.1
 pytest-mock==3.6.1
+
+# pipelines
+hydra-core~=1.0.3
+azure-ml==0.0.61212840
+--extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-cli-v2
diff --git a/src/common/aml.py b/src/common/aml.py
index db3e1449..3835e221 100644
--- a/src/common/aml.py
+++ b/src/common/aml.py
@@ -7,7 +7,7 @@
 """
 import logging
 import re
-from azureml.core import Datastore, Dataset
+#from azureml.core import Datastore, Dataset
 
 
 def dataset_from_dstore_path(workspace, datastore, datastore_path, validate=True):
diff --git a/src/common/pipelines.py b/src/common/pipelines.py
index 7d3d035b..e7daeb35 100644
--- a/src/common/pipelines.py
+++ b/src/common/pipelines.py
@@ -17,9 +17,7 @@
 from hydra.core.config_store import ConfigStore
 from omegaconf import DictConfig, OmegaConf
 
-from azureml.core import Workspace
-from azureml.pipeline.core import Pipeline
-from shrike.pipeline.aml_connect import azureml_connect as shrike_azureml_connect
+from azure.ml import MLClient
 
 # when running this script directly, needed to import common
 from .paths import COMPONENTS_ROOT, CONFIG_PATH
@@ -134,18 +132,40 @@ def azureml_connect(config: DictConfig):
     Returns:
-        workspace (azure.ml.core.Workspace)
+        ml_client (azure.ml.MLClient)
     """
-    return shrike_azureml_connect(
-        aml_subscription_id=config.aml.subscription_id,
-        aml_resource_group=config.aml.resource_group,
-        aml_workspace_name=config.aml.workspace_name,
-        aml_auth=config.aml.auth,
-        aml_tenant=config.aml.tenant,
-        aml_force=config.aml.force
+    if config.aml.auth == "msi":
+        from azure.identity import ManagedIdentityCredential
+        credential = ManagedIdentityCredential()
+    elif config.aml.auth == "azurecli":
+        from azure.identity import AzureCliCredential
+        credential = AzureCliCredential()
+    elif config.aml.auth == "interactive":
+        from azure.identity import InteractiveBrowserCredential
+
+        credential = InteractiveBrowserCredential(
+            tenant_id=config.aml.tenant, force=config.aml.force
+        )
+    else:
+        # authentication package
+        from azure.identity import DefaultAzureCredential
+        try:
+            credential = DefaultAzureCredential()
+            # check that the given credential can get a token successfully
+            credential.get_token("https://management.azure.com/.default")
+        except Exception:
+            from azure.identity import InteractiveBrowserCredential
+            # fall back to InteractiveBrowserCredential when DefaultAzureCredential does not work
+            credential = InteractiveBrowserCredential()
+
+    return MLClient(
+        credential=credential,
+        subscription_id=config.aml.subscription_id,
+        resource_group_name=config.aml.resource_group,
+        workspace_name=config.aml.workspace_name
     )
 
-def pipeline_submit(workspace: Workspace,
+def pipeline_submit(ml_client: MLClient,
                     pipeline_config: DictConfig,
-                    pipeline_instance: Pipeline,
+                    pipeline_instance,
                     experiment_name: str=None,
                     experiment_description: str=None,
                     display_name: str=None,
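A note on the fallback branch above: DefaultAzureCredential is lazy, so the explicit get_token() call is what actually surfaces a broken credential chain before any job is submitted. Reduced to a standalone sketch, assuming only the azure-identity package and no repo-specific names:

from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

try:
    # tries environment variables, managed identity, Azure CLI login, etc.
    credential = DefaultAzureCredential()
    # force a token request now; DefaultAzureCredential only fails on first use
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # no ambient credential worked: fall back to an interactive browser login
    credential = InteractiveBrowserCredential()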
@@ -153,7 +173,7 @@
     """Standard helper function to submit a pipeline to AzureML.
 
     Args:
-        workspace (azure.ml.core.Workspace): AzureML workspace (see azureml_connect())
+        ml_client (azure.ml.MLClient): AzureML client (see azureml_connect())
         pipeline_config (DictConfig): class for hosting the config of pipeline_func
         pipeline_instance (Pipeline): pipeline object
         experiment_name (str): override config.experiment.name at runtime
@@ -164,30 +184,21 @@
     Returns:
-        pipeline (azure.ml.core.PipelineRun)
+        pipeline_run: the submitted pipeline job
     """
-    if pipeline_config.run.validate:
-        pipeline_instance.validate(workspace=workspace)
+    #if pipeline_config.run.validate:
+    #    pipeline_instance.validate(workspace=workspace)
 
     experiment_description = (experiment_description
                               or pipeline_config.experiment.description)
     if experiment_description and len(experiment_description) > 5000:
         experiment_description = experiment_description[:5000-50] + "\n<<>>"
 
     if pipeline_config.run.submit:
-        # convert dictconfig to dict format as required for pipeline_submit function.
-        if pipeline_config.experiment.tags:
-            tags_dict = OmegaConf.to_container(pipeline_config.experiment.tags)
-        else:
-            tags_dict = None
-        pipeline_run = pipeline_instance.submit(
-            workspace=workspace,
+        pipeline_run = ml_client.jobs.create_or_update(
+            pipeline_instance,
             experiment_name=(experiment_name or pipeline_config.experiment.name),
             description=experiment_description,
-            display_name=(display_name or pipeline_config.experiment.display_name),
-            tags=(tags or tags_dict),
-            default_compute_target=pipeline_config.compute.default_compute_target,
-            regenerate_outputs=pipeline_config.run.regenerate_outputs,
-            continue_on_step_failure=pipeline_config.run.continue_on_failure,
+            tags=(tags or pipeline_config.experiment.tags),
+            continue_run_on_step_failure=pipeline_config.run.continue_on_failure
         )
-
     logging.info(
         f"""
 #################################
@@ -196,7 +207,7 @@
 Follow link below to access your pipeline run directly:
 -------------------------------------------------------
 
-{pipeline_run.get_portal_url()}
+{pipeline_run.services['Studio'].endpoint}
 
 #################################
 #################################
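With both helpers on SDK v2, a caller goes from hydra config to a submitted pipeline job in three calls. A sketch of the intended sequence, using only calls that appear in this diff (build_pipeline is a hypothetical stand-in for a @dsl.pipeline factory such as the one in data_generation.py below):

# config is the hydra DictConfig used throughout this repo
ml_client = azureml_connect(config)         # returns an azure.ml.MLClient
pipeline_instance = build_pipeline(config)  # hypothetical @dsl.pipeline factory
pipeline_run = ml_client.jobs.create_or_update(
    pipeline_instance,
    experiment_name=config.experiment.name,
)
# the same AzureML Studio link that pipeline_submit() logs
print(pipeline_run.services["Studio"].endpoint)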
diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py
index 79b7cd99..0623a7f2 100644
--- a/src/pipelines/azureml/data_generation.py
+++ b/src/pipelines/azureml/data_generation.py
@@ -19,9 +19,9 @@
 from omegaconf import OmegaConf, MISSING
 from typing import Optional, List
 
-# AzureML
-from azure.ml.component import Component
-from azure.ml.component import dsl
+# AzureML SDK 2.0
+from azure.ml import dsl
+from azure.ml.entities import load_component
 
 # when running this script directly, needed to import common
 LIGHTGBM_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))
@@ -68,7 +68,7 @@ class data_generation_config: # pylint: disable=invalid-name
 
 # load those components from local yaml specifications
 # use COMPONENTS_ROOT as base folder
-generate_data_component = Component.from_yaml(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml"))
+generate_data_component = load_component(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml"))
 
 
 ### DATA GENERATION PIPELINE ###
@@ -78,10 +78,6 @@
 # but `pipeline_cli_main` will need one pipeline function
 # taking a single config argument, not a pipeline parameter.
-@dsl.pipeline(
-    name="generate_all_datasets", # pythonic name
-    non_pipeline_parameters=["config"] # required to use config object
-)
 def data_generation_main_pipeline_function(config):
     """Pipeline's main building function.
@@ -92,34 +88,38 @@
     Returns:
         None
     """
-    benchmark_custom_properties = json.dumps({
-        'benchmark_name' : config.data_generation_config.benchmark_name
-    })
-
-    # for each task provided in the general config
-    for generation_task in config.data_generation_config.tasks:
-
-        # run a generation step with the right parameters
-        generate_data_step = generate_data_component(
-            learning_task = generation_task.task,
-            train_samples = generation_task.train_samples,
-            train_partitions = generation_task.train_partitions,
-            test_samples = generation_task.test_samples,
-            test_partitions = generation_task.test_partitions,
-            inferencing_samples = generation_task.inferencing_samples,
-            inferencing_partitions = generation_task.inferencing_partitions,
-            n_features = generation_task.n_features,
-            n_informative = generation_task.n_informative,
-            n_label_classes = generation_task.n_label_classes,
-            docs_per_query = generation_task.docs_per_query,
-            delimiter = generation_task.delimiter,
-            header = generation_task.header,
-            random_state = 5,
-            verbose = False,
-            custom_properties = benchmark_custom_properties
-        )
-        # run it on the right compute target
-        generate_data_step.runsettings.configure(target=config.compute.linux_cpu)
+    @dsl.pipeline(
+        name="generate_all_datasets", # pythonic name
+    )
+    def _data_generation_main_pipeline_function():
+        benchmark_custom_properties = json.dumps({
+            'benchmark_name' : config.data_generation_config.benchmark_name
+        })
+
+        # for each task provided in the general config
+        for generation_task in config.data_generation_config.tasks:
+
+            # run a generation step with the right parameters
+            generate_data_step = generate_data_component(
+                learning_task = generation_task.task,
+                train_samples = generation_task.train_samples,
+                train_partitions = generation_task.train_partitions,
+                test_samples = generation_task.test_samples,
+                test_partitions = generation_task.test_partitions,
+                inferencing_samples = generation_task.inferencing_samples,
+                inferencing_partitions = generation_task.inferencing_partitions,
+                n_features = generation_task.n_features,
+                n_informative = generation_task.n_informative,
+                n_label_classes = generation_task.n_label_classes,
+                docs_per_query = generation_task.docs_per_query,
+                delimiter = generation_task.delimiter,
+                header = generation_task.header,
+                random_state = 5,
+                verbose = False,
+                custom_properties = benchmark_custom_properties
+            )
+            # run it on the right compute target
+            generate_data_step.compute = config.compute.linux_cpu
 
         # generate a readable run name
         generate_data_step.node_name = format_run_name("generate_{}_train{}test{}inf{}_feat{}".format(
@@ -138,45 +138,59 @@
             task=generation_task.task,
             cols=generation_task.n_features
         )
-
-        # register each output (train, test, inference)
-        generate_data_step.outputs.output_train.register_as(
-            name=f"{dataset_prefix}-{generation_task.train_samples}samples-train",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'train',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.train_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
-        generate_data_step.outputs.output_test.register_as(
-            name=f"{dataset_prefix}-{generation_task.test_samples}samples-test",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'test',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.test_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
-        generate_data_step.outputs.output_inference.register_as(
-            name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'inference',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.inferencing_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
-
+            # run it on the right compute target
+            generate_data_step.compute = config.compute.linux_cpu
+
+            # if config asks to register the outputs automatically...
+            if config.data_generation_config.register_outputs:
+                raise NotImplementedError("automated registering of outputs currently doesn't work in SDK v2")
+
+                # create a prefix for the dataset
+                dataset_prefix = "{prefix}-{task}-{cols}cols".format(
+                    prefix=config.data_generation_config.register_outputs_prefix,
+                    task=generation_task.task,
+                    cols=generation_task.n_features
+                )
+
+                # register each output (train, test, inference)
+                generate_data_step.outputs.output_train.register_as(
+                    name=f"{dataset_prefix}-{generation_task.train_samples}samples-train",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'train',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.train_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+                generate_data_step.outputs.output_test.register_as(
+                    name=f"{dataset_prefix}-{generation_task.test_samples}samples-test",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'test',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.test_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+                generate_data_step.outputs.output_inference.register_as(
+                    name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'inference',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.inferencing_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+
+    return _data_generation_main_pipeline_function()
 
 
 ### MAIN BLOCK ###
@@ -187,7 +201,7 @@ def main():
     config = parse_pipeline_config(data_generation_config)
 
-    # you'll need a workspace object to connect
-    workspace = azureml_connect(config)
+    # you'll need an ML client to connect
+    ml_client = azureml_connect(config)
 
     # run the pipeline function with the given arguments
     pipeline_instance = data_generation_main_pipeline_function(config)
@@ -203,7 +217,7 @@
 
     # validate/submit the pipeline (if run.submit=True)
     pipeline_submit(
-        workspace,
+        ml_client,
         config,
         pipeline_instance,
         experiment_description=experiment_description
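The restructuring above encodes one pattern worth keeping in mind for the other pipeline scripts: the SDK v2 preview dsl.pipeline decorator has no non_pipeline_parameters, so the decorated function moves inside a plain wrapper and config is captured by closure rather than passed as a pipeline parameter. A minimal sketch of the pattern (my_component and the config fields are hypothetical placeholders):

from azure.ml import dsl

def build_my_pipeline(config):
    # the inner function closes over `config`; it is not a pipeline parameter
    @dsl.pipeline(name="my_pipeline")
    def _inner():
        step = my_component(some_param=config.some_value)  # hypothetical component
        step.compute = config.compute.linux_cpu  # per-step compute, as in the diff
    # returns a pipeline job ready for ml_client.jobs.create_or_update()
    return _inner()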
diff --git a/src/scripts/data_processing/generate_data/conda_env.yaml b/src/scripts/data_processing/generate_data/conda_env.yaml
index 223c34f7..c4a6ee64 100644
--- a/src/scripts/data_processing/generate_data/conda_env.yaml
+++ b/src/scripts/data_processing/generate_data/conda_env.yaml
@@ -1,4 +1,4 @@
-name: treelite_conda_env
+name: generate_data_env
 channels:
   - defaults
 dependencies:
diff --git a/src/scripts/data_processing/generate_data/spec.additional_includes b/src/scripts/data_processing/generate_data/spec.additional_includes
deleted file mode 100644
index 0ad98a8f..00000000
--- a/src/scripts/data_processing/generate_data/spec.additional_includes
+++ /dev/null
@@ -1 +0,0 @@
-../../../common/
diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml
index 339a8a8c..1f739cc0 100644
--- a/src/scripts/data_processing/generate_data/spec.yaml
+++ b/src/scripts/data_processing/generate_data/spec.yaml
@@ -1,8 +1,8 @@
-$schema: http://azureml/sdk-2-0/CommandComponent.json
-name: generate_synthetic_data
-version: 1.0.5
+$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json
+name: lightgbm_benchmark_data_generate
+version: 2.0.0
 display_name: "Generate Synthetic Data"
-type: CommandComponent
+type: command
 
 description: "Generate data for classification or regression."
 is_deterministic: true
@@ -14,130 +14,130 @@ tags:
 
 inputs:
   learning_task:
-    type: Enum
+    type: string
     default: "regression"
     enum:
       - regression
       - classification
       - lambdarank
   train_samples:
-    type: Integer
+    type: integer
     description: Number of training samples to generate
     default: 1000
     optional: false
   train_partitions:
-    type: Integer
+    type: integer
     description: Number of partitions to generate for training data
     default: 1
     optional: false
   test_samples:
-    type: Integer
+    type: integer
     description: Number of testing samples to generate
     default: 100
     optional: false
   test_partitions:
-    type: Integer
+    type: integer
     description: Number of partitions to generate for testing data
     default: 1
     optional: false
   inferencing_samples:
-    type: Integer
+    type: integer
    description: Number of inferencing samples to generate
     default: 1000
     optional: false
   inferencing_partitions:
-    type: Integer
+    type: integer
     description: Number of partitions to generate for inferencing data
     default: 1
     optional: false
   n_features:
-    type: Integer
+    type: integer
     description: Number of features/columns
     default: 100
     optional: false
   n_informative:
-    type: Integer
+    type: integer
     description: Number of informative features
     default: 100
     optional: false
   n_redundant:
-    type: Integer
+    type: integer
     description: number of redundant features (for classification)
     optional: true
   random_state:
-    type: Integer
+    type: integer
     description: random seed
     optional: true
   docs_per_query:
-    type: Integer
+    type: integer
     description: docs per query, used for ranking data
     default: 20
     optional: true
   n_label_classes:
-    type: Integer
+    type: integer
     description: n_label_classes, used for ranking data
     default: 10
     optional: true
   delimiter:
-    type: Enum
+    type: string
     default: "comma"
     enum:
       - tab
       - comma
       - space
   header:
-    type: Boolean
+    type: boolean
     default: False
     description: "generate header for output files"
 
   # generic benchmark parameters
   verbose:
-    type: Boolean
+    type: boolean
     default: False
     description: "Show debug logs"
   custom_properties:
-    type: String
+    type: string
     description: "For benchmark analysis, provide as a json dictionary (ex: {\"foo\":\"bar\"}) anything that will be added as tags to the job"
     optional: true
 
 outputs:
   output_train:
-    type: AnyDirectory
+    type: path
   output_test:
-    type: AnyDirectory
+    type: path
   output_inference:
-    type: AnyDirectory
+    type: path
   external_header:
-    type: AnyDirectory
-
+    type: path
+
+code: "../../../"
+
 command: >-
-  python generate.py
-  --type {inputs.learning_task}
-  --train_samples {inputs.train_samples}
-  --train_partitions {inputs.train_partitions}
-  --test_samples {inputs.test_samples}
-  --test_partitions {inputs.test_partitions}
-  --inferencing_samples {inputs.inferencing_samples}
-  --inferencing_partitions {inputs.inferencing_partitions}
-  --n_features {inputs.n_features}
-  --n_informative {inputs.n_informative}
-  [--n_redundant {inputs.n_redundant}]
-  [--random_state {inputs.random_state}]
-  --delimiter {inputs.delimiter}
-  --generate_header {inputs.header}
-  --output_train {outputs.output_train}
-  --output_test {outputs.output_test}
-  --output_inference {outputs.output_inference}
-  --external_header {outputs.external_header}
-  --verbose {inputs.verbose}
-  [--custom_properties {inputs.custom_properties}]
-  [--docs_per_query {inputs.docs_per_query}]
-  [--n_label_classes {inputs.n_label_classes}]
+  python scripts/data_processing/generate_data/generate.py
+  --type ${{inputs.learning_task}}
+  --train_samples ${{inputs.train_samples}}
+  --train_partitions ${{inputs.train_partitions}}
+  --test_samples ${{inputs.test_samples}}
+  --test_partitions ${{inputs.test_partitions}}
+  --inferencing_samples ${{inputs.inferencing_samples}}
+  --inferencing_partitions ${{inputs.inferencing_partitions}}
+  --n_features ${{inputs.n_features}}
+  --n_informative ${{inputs.n_informative}}
+  [--n_redundant ${{inputs.n_redundant}}]
+  [--random_state ${{inputs.random_state}}]
+  --delimiter ${{inputs.delimiter}}
+  --generate_header ${{inputs.header}}
+  --output_train ${{outputs.output_train}}
+  --output_test ${{outputs.output_test}}
+  --output_inference ${{outputs.output_inference}}
+  --external_header ${{outputs.external_header}}
+  --verbose ${{inputs.verbose}}
+  [--custom_properties '${{inputs.custom_properties}}']
+  [--docs_per_query ${{inputs.docs_per_query}}]
+  [--n_label_classes ${{inputs.n_label_classes}}]
 
 environment:
-  conda:
-    # conda file path is resolved after additional includes
-    conda_dependencies_file: conda_env.yaml
-  os: Linux
+  image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
+  conda_file: conda_env.yaml
diff --git a/src/scripts/lightgbm_python/conda.yml b/src/scripts/lightgbm_python/conda.yml
new file mode 100644
index 00000000..9ea15cb5
--- /dev/null
+++ b/src/scripts/lightgbm_python/conda.yml
@@ -0,0 +1,10 @@
+name: lightgbm_python_env
+channels:
+  - conda-forge
+dependencies:
+  - python=3.8
+  - pip
+  - pip:
+    - lightgbm==3.2.1
+    - mlflow==1.19.0
+    - azureml-mlflow==1.33.0
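For reference, the reworked spec.yaml above is consumed through load_component, as in data_generation.py. A short sketch of loading and calling it (the path follows this repo's layout; inputs left out fall back to the defaults in spec.yaml, and inputs wrapped in [...] in the command are optional and may be omitted entirely):

import os
from azure.ml.entities import load_component

# COMPONENTS_ROOT in this repo resolves under src/scripts
spec_path = os.path.join("src", "scripts", "data_processing", "generate_data", "spec.yaml")
generate_data = load_component(yaml_file=spec_path)

# inside a @dsl.pipeline function, calling the component yields a pipeline step
step = generate_data(learning_task="regression", train_samples=1000, n_features=100)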