This repository was archived by the owner on Apr 8, 2024. It is now read-only.

Draft

Changes from all commits
Commits (35):

61be9df  work in progress (jfomhover, Aug 14, 2021)
60599ac  working module, failing pipeline (jfomhover, Aug 14, 2021)
5ec7a84  merge from wip branch (Aug 15, 2021)
386810d  working pipeline (Aug 15, 2021)
f985333  working with the right name (Aug 15, 2021)
159721f  descriptions in component (Aug 15, 2021)
90494fb  mlflow test implementation (Aug 15, 2021)
250111e  proper metric logging (Aug 15, 2021)
8fce693  azureml mlflow integration (Aug 16, 2021)
725a75f  merge and resolve (jfomhover, Aug 16, 2021)
3f22207  merge and resolve (jfomhover, Aug 16, 2021)
4c1ec9e  wip (jfomhover, Aug 16, 2021)
74d3fa6  merge main and resolve (jfomhover, Aug 24, 2021)
6439327  merge main and resolve (jfomhover, Aug 24, 2021)
72798f0  fix merge issues (jfomhover, Aug 24, 2021)
3fed9db  fix merge issues (jfomhover, Aug 24, 2021)
9fa23de  merge main and resolve (jfomhover, Aug 31, 2021)
67fa425  merge docs (jfomhover, Aug 31, 2021)
310be0f  rename (jfomhover, Aug 31, 2021)
a229aa6  resolve old code merge (jfomhover, Aug 31, 2021)
bc74d0c  merge and resolve (jfomhover, Jan 11, 2022)
eee56c9  remove reqs temporarily (jfomhover, Jan 11, 2022)
a4297ff  revise pipeline helper code (jfomhover, Jan 11, 2022)
01781d7  revise data generation pipeline (jfomhover, Jan 11, 2022)
a64d502  modify specs (jfomhover, Jan 11, 2022)
94dce49  remove deprecated instructions (jfomhover, Jan 11, 2022)
479aea2  remove old yaml files (jfomhover, Jan 11, 2022)
4f61e88  align specs (jfomhover, Jan 12, 2022)
459e45b  correct input format (jfomhover, Jan 12, 2022)
712c6fb  resolve and merge (jfomhover, Jan 12, 2022)
bca2071  remove portal url (jfomhover, Jan 13, 2022)
cade7fd  escape json in custom properties (jfomhover, Jan 13, 2022)
5ebd55a  Merge branch 'main' into jfomhover/sdk20dev (jfomhover, May 9, 2022)
476c606  working data generation pipeline (jfomhover, May 9, 2022)
ac3a089  working data generation pipeline (jfomhover, May 9, 2022)
requirements.txt: 13 changes (5 additions, 8 deletions)

@@ -13,15 +13,12 @@ treelite==2.1.0
 treelite_runtime==2.1.0
 flaml==0.9.6
 
-# pipelines
-shrike[pipeline]==1.14.7
-azure-ml-component==0.9.4.post1 # for component dsl
-azureml-train-core==1.36.0 # for azureml.train.hyperdrive
-azureml-dataset-runtime==1.36.0 # to register dataset
-hydra-core~=1.0.3
-typing_extensions==4.0.1 # for hydra
-
 # unit testing
 pytest==6.2.4
 pytest-cov==2.12.1
 pytest-mock==3.6.1
+
+# pipelines
+hydra-core~=1.0.3
+azure-ml==0.0.61212840
+--extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-cli-v2
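
Note that the pinned `azure-ml==0.0.61212840` private-preview package only resolves against the extra index URL above. As a quick sanity check after `pip install -r requirements.txt`, the imports this PR relies on can be exercised directly (a sketch; it only checks that the preview SDK v2 package is importable):

# sanity check: confirm the private-preview SDK v2 package resolves;
# these are exactly the imports used by src/common/pipelines.py and
# src/pipelines/azureml/data_generation.py in this PR
from azure.ml import MLClient, dsl
from azure.ml.entities import load_component

print("azure-ml preview SDK imports OK:", MLClient, dsl, load_component)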

src/common/aml.py: 2 changes (1 addition, 1 deletion)

@@ -7,7 +7,7 @@
 """
 import logging
 import re
-from azureml.core import Datastore, Dataset
+#from azureml.core import Datastore, Dataset
 
 
 def dataset_from_dstore_path(workspace, datastore, datastore_path, validate=True):
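
For context, the now-commented import backed the SDK v1 dataset helpers in this module, so `dataset_from_dstore_path` is effectively disabled by this change. A hypothetical reconstruction of a typical v1 body (the real body is collapsed in the diff; this is only a sketch of the azureml.core pattern it depends on):

# hypothetical v1-style body, for reference only; relies on the azureml.core
# import that this PR comments out
from azureml.core import Datastore, Dataset

def dataset_from_dstore_path(workspace, datastore, datastore_path, validate=True):
    """Creates a file dataset from a path on a datastore (SDK v1)."""
    dstore = Datastore.get(workspace, datastore)  # resolve the datastore by name
    return Dataset.File.from_files(path=[(dstore, datastore_path)], validate=validate)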

src/common/pipelines.py: 69 changes (40 additions, 29 deletions)

@@ -17,9 +17,7 @@
 from hydra.core.config_store import ConfigStore
 from omegaconf import DictConfig, OmegaConf
 
-from azureml.core import Workspace
-from azureml.pipeline.core import Pipeline
-from shrike.pipeline.aml_connect import azureml_connect as shrike_azureml_connect
+from azure.ml import MLClient
 
 # when running this script directly, needed to import common
 from .paths import COMPONENTS_ROOT, CONFIG_PATH

@@ -134,26 +132,48 @@ def azureml_connect(config: DictConfig):
     Returns:
         workspace (azure.ml.core.Workspace)
     """
-    return shrike_azureml_connect(
-        aml_subscription_id=config.aml.subscription_id,
-        aml_resource_group=config.aml.resource_group,
-        aml_workspace_name=config.aml.workspace_name,
-        aml_auth=config.aml.auth,
-        aml_tenant=config.aml.tenant,
-        aml_force=config.aml.force
-    )
+    if config.aml.auth == "msi":
+        from azure.identity import ManagedIdentityCredential
+        credential = ManagedIdentityCredential()
+    elif config.aml.auth == "azurecli":
+        from azure.identity import AzureCliCredential
+        credential = AzureCliCredential()
+    elif config.aml.auth == "interactive":
+        from azure.identity import InteractiveBrowserCredential
+        credential = InteractiveBrowserCredential(
+            tenant_id=config.aml.tenant, force=config.aml.force
+        )
+    else:
+        # authentication package
+        from azure.identity import DefaultAzureCredential
+        try:
+            credential = DefaultAzureCredential()
+            # check if the given credential can get a token successfully
+            credential.get_token("https://management.azure.com/.default")
+        except Exception:
+            # fall back to InteractiveBrowserCredential if DefaultAzureCredential does not work
+            from azure.identity import InteractiveBrowserCredential
+            credential = InteractiveBrowserCredential()
+
+    return MLClient(
+        credential=credential,
+        subscription_id=config.aml.subscription_id,
+        resource_group_name=config.aml.resource_group,
+        workspace_name=config.aml.workspace_name
+    )
 
-def pipeline_submit(workspace: Workspace,
+def pipeline_submit(ml_client: MLClient,
                     pipeline_config: DictConfig,
-                    pipeline_instance: Pipeline,
+                    pipeline_instance,
                     experiment_name: str=None,
                     experiment_description: str=None,
                     display_name: str=None,
                     tags: dict=None):
     """Standard helper function to submit a pipeline to AzureML.
 
     Args:
-        workspace (azure.ml.core.Workspace): AzureML workspace (see azureml_connect())
+        ml_client (azure.ml.MLClient): AzureML client (see azureml_connect())
         pipeline_config (DictConfig): class for hosting the config of pipeline_func
         pipeline_instance (Pipeline): pipeline object
         experiment_name (str): override config.experiment.name at runtime

@@ -164,30 +184,21 @@ def pipeline_submit(workspace: Workspace,
     Returns:
         pipeline (azure.ml.core.PipelineRun)
     """
-    if pipeline_config.run.validate:
-        pipeline_instance.validate(workspace=workspace)
+    #if pipeline_config.run.validate:
+    #    pipeline_instance.validate(workspace=workspace)
 
     experiment_description = (experiment_description or pipeline_config.experiment.description)
     if experiment_description and len(experiment_description) > 5000:
         experiment_description = experiment_description[:5000-50] + "\n<<<TRUNCATED DUE TO SIZE LIMIT>>>"
 
     if pipeline_config.run.submit:
-        # convert dictconfig to dict format as required for pipeline_submit function.
-        if pipeline_config.experiment.tags:
-            tags_dict = OmegaConf.to_container(pipeline_config.experiment.tags)
-        else:
-            tags_dict = None
-        pipeline_run = pipeline_instance.submit(
-            workspace=workspace,
+        pipeline_run = ml_client.jobs.create_or_update(
+            pipeline_instance,
             experiment_name=(experiment_name or pipeline_config.experiment.name),
             description=experiment_description,
             display_name=(display_name or pipeline_config.experiment.display_name),
-            tags=(tags or tags_dict),
-            default_compute_target=pipeline_config.compute.default_compute_target,
-            regenerate_outputs=pipeline_config.run.regenerate_outputs,
-            continue_on_step_failure=pipeline_config.run.continue_on_failure,
+            tags=(tags or pipeline_config.experiment.tags),
+            continue_run_on_step_failure=pipeline_config.run.continue_on_failure
        )
 
        logging.info(
            f"""
            #################################

@@ -196,7 +207,7 @@ def pipeline_submit(workspace: Workspace,
 
            Follow link below to access your pipeline run directly:
            -------------------------------------------------------
-            {pipeline_run.get_portal_url()}
+            {pipeline_run.services['Studio'].endpoint}
 
            #################################
            #################################
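
The revised `azureml_connect()` reads everything from `config.aml.*`. A minimal sketch of the Hydra/OmegaConf config it expects (key names are taken from the accesses in the code above; the values are placeholders):

from omegaconf import OmegaConf

# placeholder values; the "auth" key selects the azure.identity credential
# branch above: "msi", "azurecli", "interactive", or anything else for the
# DefaultAzureCredential fallback
config = OmegaConf.create({
    "aml": {
        "subscription_id": "<subscription-id>",
        "resource_group": "<resource-group>",
        "workspace_name": "<workspace-name>",
        "auth": "azurecli",
        "tenant": "<tenant-id>",   # only read for interactive auth
        "force": False,            # only read for interactive auth
    }
})

# assuming the helper above is importable from this module
ml_client = azureml_connect(config)  # returns an azure.ml.MLClient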

src/pipelines/azureml/data_generation.py: 168 changes (91 additions, 77 deletions)

@@ -19,9 +19,9 @@
 from omegaconf import OmegaConf, MISSING
 from typing import Optional, List
 
-# AzureML
-from azure.ml.component import Component
-from azure.ml.component import dsl
+# AzureML SDK 2.0
+from azure.ml import dsl
+from azure.ml.entities import load_component
 
 # when running this script directly, needed to import common
 LIGHTGBM_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..'))

@@ -68,7 +68,7 @@ class data_generation_config: # pylint: disable=invalid-name
 # load those components from local yaml specifications
 # use COMPONENTS_ROOT as base folder
 
-generate_data_component = Component.from_yaml(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml"))
+generate_data_component = load_component(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml"))
 
 ### DATA GENERATION PIPELINE ###

@@ -78,10 +78,6 @@ class data_generation_config: # pylint: disable=invalid-name
 # but `pipeline_cli_main` will need one pipeline function
 # taking a single config argument, not a pipeline parameter.
 
-@dsl.pipeline(
-    name="generate_all_datasets", # pythonic name
-    non_pipeline_parameters=["config"] # required to use config object
-)
 def data_generation_main_pipeline_function(config):
     """Pipeline's main building function.

@@ -92,34 +88,38 @@ def data_generation_main_pipeline_function(config):
     Returns:
         None
     """
-    benchmark_custom_properties = json.dumps({
-        'benchmark_name' : config.data_generation_config.benchmark_name
-    })
-
-    # for each task provided in the general config
-    for generation_task in config.data_generation_config.tasks:
-
-        # run a generation step with the right parameters
-        generate_data_step = generate_data_component(
-            learning_task = generation_task.task,
-            train_samples = generation_task.train_samples,
-            train_partitions = generation_task.train_partitions,
-            test_samples = generation_task.test_samples,
-            test_partitions = generation_task.test_partitions,
-            inferencing_samples = generation_task.inferencing_samples,
-            inferencing_partitions = generation_task.inferencing_partitions,
-            n_features = generation_task.n_features,
-            n_informative = generation_task.n_informative,
-            n_label_classes = generation_task.n_label_classes,
-            docs_per_query = generation_task.docs_per_query,
-            delimiter = generation_task.delimiter,
-            header = generation_task.header,
-            random_state = 5,
-            verbose = False,
-            custom_properties = benchmark_custom_properties
-        )
-        # run it on the right compute target
-        generate_data_step.runsettings.configure(target=config.compute.linux_cpu)
+    @dsl.pipeline(
+        name="generate_all_datasets", # pythonic name
+    )
+    def _data_generation_main_pipeline_function():
+        benchmark_custom_properties = json.dumps({
+            'benchmark_name' : config.data_generation_config.benchmark_name
+        })
+
+        # for each task provided in the general config
+        for generation_task in config.data_generation_config.tasks:
+
+            # run a generation step with the right parameters
+            generate_data_step = generate_data_component(
+                learning_task = generation_task.task,
+                train_samples = generation_task.train_samples,
+                train_partitions = generation_task.train_partitions,
+                test_samples = generation_task.test_samples,
+                test_partitions = generation_task.test_partitions,
+                inferencing_samples = generation_task.inferencing_samples,
+                inferencing_partitions = generation_task.inferencing_partitions,
+                n_features = generation_task.n_features,
+                n_informative = generation_task.n_informative,
+                n_label_classes = generation_task.n_label_classes,
+                docs_per_query = generation_task.docs_per_query,
+                delimiter = generation_task.delimiter,
+                header = generation_task.header,
+                random_state = 5,
+                verbose = False,
+                custom_properties = benchmark_custom_properties
+            )
+            # run it on the right compute target
+            generate_data_step.compute = config.compute.linux_cpu
 
             # generate a readable run name
             generate_data_step.node_name = format_run_name("generate_{}_train{}test{}inf{}_feat{}".format(

@@ -138,45 +138,59 @@ def data_generation_main_pipeline_function(config):
-            task=generation_task.task,
-            cols=generation_task.n_features
-        )
-
-        # register each output (train, test, inference)
-        generate_data_step.outputs.output_train.register_as(
-            name=f"{dataset_prefix}-{generation_task.train_samples}samples-train",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'train',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.train_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
-        generate_data_step.outputs.output_test.register_as(
-            name=f"{dataset_prefix}-{generation_task.test_samples}samples-test",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'test',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.test_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
-        generate_data_step.outputs.output_inference.register_as(
-            name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference",
-            create_new_version=True,
-            tags={ # add tags that will show up in AzureML
-                'type':'inference',
-                'task':generation_task.task,
-                'origin':'synthetic',
-                'samples':generation_task.inferencing_samples,
-                'features':generation_task.n_features,
-                'informative':generation_task.n_informative
-            }
-        )
+            # run it on the right compute target
+            generate_data_step.compute = config.compute.linux_cpu
+
+            # if config asks to register the outputs automatically...
+            if config.data_generation_config.register_outputs:
+                raise NotImplementedError("automated registering of outputs currently doesn't work in sdkv2")
+
+                # create a prefix for the dataset
+                dataset_prefix = "{prefix}-{task}-{cols}cols".format(
+                    prefix=config.data_generation_config.register_outputs_prefix,
+                    task=generation_task.task,
+                    cols=generation_task.n_features
+                )
+
+                # register each output (train, test, inference)
+                generate_data_step.outputs.output_train.register_as(
+                    name=f"{dataset_prefix}-{generation_task.train_samples}samples-train",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'train',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.train_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+                generate_data_step.outputs.output_test.register_as(
+                    name=f"{dataset_prefix}-{generation_task.test_samples}samples-test",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'test',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.test_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+                generate_data_step.outputs.output_inference.register_as(
+                    name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference",
+                    create_new_version=True,
+                    tags={ # add tags that will show up in AzureML
+                        'type':'inference',
+                        'task':generation_task.task,
+                        'origin':'synthetic',
+                        'samples':generation_task.inferencing_samples,
+                        'features':generation_task.n_features,
+                        'informative':generation_task.n_informative
+                    }
+                )
+
+    return _data_generation_main_pipeline_function()

@@ -187,7 +201,7 @@ def main():
     config = parse_pipeline_config(data_generation_config)
 
     # you'll need a workspace object to connect
-    workspace = azureml_connect(config)
+    ml_client = azureml_connect(config)
 
     # run the pipeline function with the given arguments
     pipeline_instance = data_generation_main_pipeline_function(config)

@@ -203,7 +217,7 @@ def main():
 
     # validate/submit the pipeline (if run.submit=True)
     pipeline_submit(
-        workspace,
+        ml_client,
        config,
        pipeline_instance,
        experiment_description=experiment_description
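
Putting the pieces together, the SDK v2 pattern this file now follows is: load a component from its yaml spec, build the graph inside a `dsl.pipeline`-decorated closure over the config, then submit through the MLClient. A condensed sketch (the component path, name, and inputs are placeholders; the calls mirror the ones in the diff above):

import os
from azure.ml import dsl
from azure.ml.entities import load_component

# load a component from a local yaml spec (placeholder path)
example_component = load_component(yaml_file=os.path.join("components", "example", "spec.yaml"))

def build_pipeline(config):
    # the closure captures config, so the pipeline function itself takes no parameters
    @dsl.pipeline(name="example_pipeline")
    def _example_pipeline():
        step = example_component(learning_task="regression")  # component inputs passed by name
        step.compute = config.compute.linux_cpu               # per-step compute target
    return _example_pipeline()

# submission then goes through pipeline_submit() / MLClient, as in main():
#   pipeline_job = build_pipeline(config)
#   run = ml_client.jobs.create_or_update(pipeline_job, experiment_name="example")
#   print(run.services["Studio"].endpoint)  # direct link to the run in Studio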

src/scripts/data_processing/generate_data/conda_env.yaml: 2 changes (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
-name: treelite_conda_env
+name: generate_data_env
 channels:
 - defaults
 dependencies:

This file was deleted.
