From 61be9df73d4e1d845dd867477c5333361cf5c858 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Sat, 14 Aug 2021 13:38:13 -0700
Subject: [PATCH 01/27] work in progress

---
 src/common/io.py                        | 24 +++++++++++++++
 src/pipelines/score.yml                 | 26 ++++++++++++++++
 src/scripts/lightgbm_cli/score.py       |  6 ++--
 src/scripts/lightgbm_python/conda.yml   |  8 +++++
 src/scripts/lightgbm_python/score.py    |  9 +++---
 .../lightgbm_python/score_component.yml | 30 +++++++++++++++++++
 src/scripts/lightgbm_python/train.py    |  5 ++--
 7 files changed, 99 insertions(+), 9 deletions(-)
 create mode 100644 src/common/io.py
 create mode 100644 src/pipelines/score.yml
 create mode 100644 src/scripts/lightgbm_python/conda.yml
 create mode 100644 src/scripts/lightgbm_python/score_component.yml

diff --git a/src/common/io.py b/src/common/io.py
new file mode 100644
index 00000000..91aabc22
--- /dev/null
+++ b/src/common/io.py
@@ -0,0 +1,24 @@
+import os
+import argparse
+
+def input_file_path(path):
+    """ Resolve input path from AzureML
+
+    Args:
+        path (str)
+
+    Returns:
+        str
+    """
+    if os.path.isfile(path):
+        print(f"Found INPUT file {path}")
+        return path
+    if os.path.isdir(path):
+        all_files = os.listdir(path)
+        if not all_files:
+            raise Exception(f"Could not find any file in specified input directory {path}")
+        if len(all_files) > 1:
+            raise Exception(f"Found multiple files in input file path {path}, use input_directory_path type instead.")
+        print(f"Found INPUT directory {path}, selecting unique file {all_files[0]}")
+        return os.path.join(path, all_files[0])
+    raise Exception(f"Provided INPUT path {path} is neither a directory nor a file.")
diff --git a/src/pipelines/score.yml b/src/pipelines/score.yml
new file mode 100644
index 00000000..abde3470
--- /dev/null
+++ b/src/pipelines/score.yml
@@ -0,0 +1,26 @@
+name: lightgbm_benchmark_pipeline
+type: pipeline_job
+
+#
+inputs:
+  benchmark_data: #using dataset, can use datastore + datapath also
+    data:
+      local_path: ../../data/synthetic-sample/inference.txt
+  benchmark_model: #using dataset, can use datastore + datapath also
+    data:
+      local_path: ../../data/models/synthetic-150.txt
+
+#
+defaults:
+  component_job:
+    datastore: azureml:workspaceblobstore
+    compute:
+      target: azureml:linux-d14v2
+
+jobs:
+  hello_python_world_job:
+    type: component_job
+    component: file:../../src/scripts/lightgbm_python/score_component.yml
+    inputs:
+      data: ${{inputs.benchmark_data}}
+      model: ${{inputs.benchmark_model}}
diff --git a/src/scripts/lightgbm_cli/score.py b/src/scripts/lightgbm_cli/score.py
index 0f4e228d..08f1d2d9 100644
--- a/src/scripts/lightgbm_cli/score.py
+++ b/src/scripts/lightgbm_cli/score.py
@@ -21,7 +21,7 @@
 
 # before doing local import
 from common.metrics import LogTimeBlock
-
+from common.io import input_file_path
 
 def get_arg_parser(parser=None):
     """Adds component/module arguments to a given argument parser. 
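
The `input_file_path` helper above is meant to be plugged into argparse as a `type=` converter, which is exactly what the hunks below do. A minimal sketch of the intended behavior — the mount path used here is illustrative, not from the repo:

```python
import argparse
from common.io import input_file_path  # helper introduced in this patch

parser = argparse.ArgumentParser()
# the helper doubles as an argparse type converter: AzureML frequently
# mounts an input as a directory holding a single file, and the converter
# resolves that directory to the file it contains
parser.add_argument("--data", required=True, type=input_file_path)

# e.g. for a mounted directory /mnt/inputs/data containing only inference.txt,
# parsing ["--data", "/mnt/inputs/data"] yields
# args.data == "/mnt/inputs/data/inference.txt" (paths are illustrative)
args = parser.parse_args()
print(args.data)
```
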
@@ -43,9 +43,9 @@ def get_arg_parser(parser=None): group_i.add_argument("--lightgbm_exec", required=True, type=str, help="Path to lightgbm.exe (file path)") group_i.add_argument("--data", - required=True, type=str, help="Inferencing data location (file path)") + required=True, type=input_file_path, help="Inferencing data location (file path)") group_i.add_argument("--model", - required=False, type=str, help="Exported model location") + required=False, type=input_file_path, help="Exported model location") group_i.add_argument("--output", required=False, default=None, type=str, help="Inferencing output location (file path)") diff --git a/src/scripts/lightgbm_python/conda.yml b/src/scripts/lightgbm_python/conda.yml new file mode 100644 index 00000000..6f97fda2 --- /dev/null +++ b/src/scripts/lightgbm_python/conda.yml @@ -0,0 +1,8 @@ +name: lightgbm_python_env +channels: + - conda-forge +dependencies: + - python=3.8 + - pip + - pip: + - lightgbm==3.2.1 \ No newline at end of file diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py index a599ea07..c98448cb 100644 --- a/src/scripts/lightgbm_python/score.py +++ b/src/scripts/lightgbm_python/score.py @@ -19,7 +19,7 @@ # before doing local import from common.metrics import LogTimeBlock - +from common.io import input_file_path def get_arg_parser(parser=None): """Adds component/module arguments to a given argument parser. @@ -39,9 +39,9 @@ def get_arg_parser(parser=None): group_i = parser.add_argument_group("Input Data") group_i.add_argument("--data", - required=True, type=str, help="Inferencing data location (file path)") + required=True, type=input_file_path, help="Inferencing data location (file path)") group_i.add_argument("--model", - required=False, type=str, help="Exported model location (file path)") + required=False, type=input_file_path, help="Exported model location (file path)") group_i.add_argument("--output", required=False, default=None, type=str, help="Inferencing output location (file path)") @@ -57,7 +57,8 @@ def run(args, other_args=[]): """ # create sub dir if args.output: - os.makedirs(os.path.dirname(args.output), exist_ok=True) + os.makedirs(args.output, exist_ok=True) + args.output = args.output + "/test.out" print(f"Loading model from {args.model}") booster = lightgbm.Booster(model_file=args.model) diff --git a/src/scripts/lightgbm_python/score_component.yml b/src/scripts/lightgbm_python/score_component.yml new file mode 100644 index 00000000..3e1ac433 --- /dev/null +++ b/src/scripts/lightgbm_python/score_component.yml @@ -0,0 +1,30 @@ +type: command_component + +name: lightgbm_python_score +display_name: "LightGBM Inferencing (python)" +version: 1 + +inputs: + data: + type: path + description: "TODO" + model: + type: path + description: "TODO" +outputs: + predictions: + type: path + +code: + local_path: ../../ + +environment: + conda_file: file:./conda.yml + docker: + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1 + +command: >- + python scripts/lightgbm_python/score.py + --data ${{inputs.data}} + --model ${{inputs.model}} + --output ${{outputs.predictions}} diff --git a/src/scripts/lightgbm_python/train.py b/src/scripts/lightgbm_python/train.py index fa91a5cd..5ab66c42 100644 --- a/src/scripts/lightgbm_python/train.py +++ b/src/scripts/lightgbm_python/train.py @@ -20,6 +20,7 @@ # before doing local import from common.metrics import LogTimeBlock +from common.io import input_file_path def get_arg_parser(parser=None): @@ -40,9 +41,9 @@ def get_arg_parser(parser=None): 
group_i = parser.add_argument_group("Input Data") group_i.add_argument("--train", - required=True, default="*", type=str, help="Training data location (file path)") + required=True, default="*", type=input_file_path, help="Training data location (file path)") group_i.add_argument("--test", - required=True, default="*", type=str, help="Testing data location (file path)") + required=True, default="*", type=input_file_path, help="Testing data location (file path)") group_i.add_argument("--header", required=False, default=False, type=strtobool) group_i.add_argument("--label_column", required=False, default="0", type=str) group_i.add_argument("--query_column", required=False, default=None, type=str) From 60599ac80357a94d7dbdb5a4f0c37dc870367a22 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sat, 14 Aug 2021 14:53:40 -0700 Subject: [PATCH 02/27] working module, failing pipeline --- docs/AzureML-CLI20-benchmark.md | 57 +++++++++++++++++++ .../components/lightgbm_python_score.yml | 10 ++-- pipelines/azureml_cli20/inference_data.yml | 6 ++ pipelines/azureml_cli20/score.yml | 37 ++++++++++++ src/pipelines/score.yml | 26 --------- 5 files changed, 105 insertions(+), 31 deletions(-) create mode 100644 docs/AzureML-CLI20-benchmark.md rename src/scripts/lightgbm_python/score_component.yml => pipelines/azureml_cli20/components/lightgbm_python_score.yml (75%) create mode 100644 pipelines/azureml_cli20/inference_data.yml create mode 100644 pipelines/azureml_cli20/score.yml delete mode 100644 src/pipelines/score.yml diff --git a/docs/AzureML-CLI20-benchmark.md b/docs/AzureML-CLI20-benchmark.md new file mode 100644 index 00000000..b569cc35 --- /dev/null +++ b/docs/AzureML-CLI20-benchmark.md @@ -0,0 +1,57 @@ +# Run benchmark in AzureML (with CLI 2.0) + +**Objectives** - By following this tutorial, you will be able to: + +- Run the LightGBM benchmark scripts on the AzureML platform (CLI 2.0 edition) + +**Requirements** - To enjoy this tutorial, you need to be able to: + +- Install Azure CLI, AzureML extension and enable private features (see below). +- Create or access an AzureML workspace (see [instructions](https://docs.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources). + +## Install requirements + +1. Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) (version >= 2.27.0) + +2. Install [Azure ML extension](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-cli) + +3. Activate the preview features by setting environment variable `AZURE_ML_CLI_PRIVATE_FEATURES_ENABLED=true` (see [instructions](https://github.com/Azure/azureml-previews/tree/main/previews/pipelines#how-to-get-started)). + +4. If you don't have an AzureML workspace yet, create one following [those instructions](https://docs.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources). + +## Prepare for running the benchmark + +### A. Set your default azure references + +Before you get started, we recommend you to set your Azure CLI on your specific subscription, resource gropu and workspace: + +``` +az account set --subscription [SUBSCRIPTION] +az configure --defaults group=[RESOURCE GROUP] workspace=[WORKSPACE] +``` + +### B. Publish the modules in your workspace + +``` +az ml component create --file ./pipelines/azureml_cli20/components/lightgbm_python_score.yml +``` + +### C. 
Create the datasets
+
+### Option 1: upload manually using AzureML UI
+
+### Option 2: upload manually using CLI
+
+```bash
+az ml data create --file ./pipelines/azureml_cli20/inference_data.yml
+```
+
+### Option 3: generate in AzureML
+
+(Work in progress)
+
+## Run the benchmark
+
+```
+az ml job create --file ./pipelines/azureml_cli20/score.yml --web
+```
diff --git a/src/scripts/lightgbm_python/score_component.yml b/pipelines/azureml_cli20/components/lightgbm_python_score.yml
similarity index 75%
rename from src/scripts/lightgbm_python/score_component.yml
rename to pipelines/azureml_cli20/components/lightgbm_python_score.yml
index 3e1ac433..b21ddde7 100644
--- a/src/scripts/lightgbm_python/score_component.yml
+++ b/pipelines/azureml_cli20/components/lightgbm_python_score.yml
@@ -14,12 +14,12 @@ inputs:
 outputs:
   predictions:
     type: path
-
+
 code:
-  local_path: ../../
+  local_path: ../../../src/
 
-environment:
-  conda_file: file:./conda.yml
+environment:
+  conda_file: file:../../../src/scripts/lightgbm_python/conda.yml
   docker:
     image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1
 
@@ -27,4 +27,4 @@ command: >-
   python scripts/lightgbm_python/score.py
   --data ${{inputs.data}}
   --model ${{inputs.model}}
-  --output ${{outputs.predictions}}
+  --output ${{inputs.predictions}}
diff --git a/pipelines/azureml_cli20/inference_data.yml b/pipelines/azureml_cli20/inference_data.yml
new file mode 100644
index 00000000..60ae1d6d
--- /dev/null
+++ b/pipelines/azureml_cli20/inference_data.yml
@@ -0,0 +1,6 @@
+name: synthetic_inference_4000col
+version: 1
+description: Synthetic dataset for inference, for regression task, 4000 cols
+datastore: azureml:workspaceblobstore
+local_path: ../../data/
+path: /synthetic/inference.txt
\ No newline at end of file
diff --git a/pipelines/azureml_cli20/score.yml b/pipelines/azureml_cli20/score.yml
new file mode 100644
index 00000000..d2a2ca4c
--- /dev/null
+++ b/pipelines/azureml_cli20/score.yml
@@ -0,0 +1,37 @@
+name: lightgbm_benchmark_pipeline
+type: pipeline_job
+
+#
+inputs:
+  benchmark_data:
+    # using reference to an existing AzureML dataset
+    data: azureml:synthetic_inference_4000col
+  benchmark_model:
+    # using reference to an existing AzureML dataset
+    data: azureml:model_4000col_100000train_synthetic
+outputs:
+  predictions:
+    # register the output as another dataset
+    data: azureml:benchmark_predictions
+
+# default settings that apply to all jobs
+defaults:
+  component_job:
+    datastore: azureml:workspaceblobstore
+    compute:
+      target: azureml:linux-d14v2
+
+#
+jobs:
+  benchmark_score_job:
+    type: component_job
+
+    # to use below, module must have been registered before
+    component: azureml:lightgbm_python_score
+
+    inputs:
+      data: ${{inputs.benchmark_data}}
+      model: ${{inputs.benchmark_model}}
+
+    outputs:
+      predictions: ${{outputs.predictions}}
\ No newline at end of file
diff --git a/src/pipelines/score.yml b/src/pipelines/score.yml
deleted file mode 100644
index abde3470..00000000
--- a/src/pipelines/score.yml
+++ /dev/null
@@ -1,26 +0,0 @@
-name: lightgbm_benchmark_pipeline
-type: pipeline_job
-
-#
-inputs:
-  benchmark_data: #using dataset, can use datastore + datapath also
-    data:
-      local_path: ../../data/synthetic-sample/inference.txt
-  benchmark_model: #using dataset, can use datastore + datapath also
-    data:
-      local_path: ../../data/models/synthetic-150.txt
-
-#
-defaults:
-  component_job:
-    datastore: azureml:workspaceblobstore
-    compute:
-      target: azureml:linux-d14v2
-
-jobs:
-  hello_python_world_job:
-    type: component_job
-    component: 
file:../../src/scripts/lightgbm_python/score_component.yml - inputs: - data: ${{inputs.benchmark_data}} - model: ${{inputs.benchmark_model}} From 386810de0bf2c45673387a84a52a3383e5665306 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sat, 14 Aug 2021 23:16:19 -0700 Subject: [PATCH 03/27] working pipeline --- .../azureml_cli20/components/lightgbm_python_score.yml | 4 ++-- pipelines/azureml_cli20/score.yml | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pipelines/azureml_cli20/components/lightgbm_python_score.yml b/pipelines/azureml_cli20/components/lightgbm_python_score.yml index b21ddde7..3f0d90e7 100644 --- a/pipelines/azureml_cli20/components/lightgbm_python_score.yml +++ b/pipelines/azureml_cli20/components/lightgbm_python_score.yml @@ -14,7 +14,7 @@ inputs: outputs: predictions: type: path - + code: local_path: ../../../src/ @@ -27,4 +27,4 @@ command: >- python scripts/lightgbm_python/score.py --data ${{inputs.data}} --model ${{inputs.model}} - --output ${{inputs.predictions}} + --output ${{outputs.predictions}} diff --git a/pipelines/azureml_cli20/score.yml b/pipelines/azureml_cli20/score.yml index d2a2ca4c..e244d952 100644 --- a/pipelines/azureml_cli20/score.yml +++ b/pipelines/azureml_cli20/score.yml @@ -1,4 +1,4 @@ -name: lightgbm_benchmark_pipeline +name: lightgbm_benchmark_pipeline_local type: pipeline_job # @@ -12,7 +12,8 @@ inputs: outputs: predictions: # register the output as another dataset - data: azureml:benchmark_predictions + data: + datastore: azureml:workspaceblobstore # default settings that apply to all jobs defaults: @@ -23,11 +24,11 @@ defaults: # jobs: - benchmark_score_job: + benchmark_job: type: component_job # to use below, module must have been registered before - component: azureml:lightgbm_python_score + component: file:./components/lightgbm_python_score.yml inputs: data: ${{inputs.benchmark_data}} From f985333511e8c762523701ce4bc441792752bc1f Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sat, 14 Aug 2021 23:21:55 -0700 Subject: [PATCH 04/27] working with the right name --- pipelines/azureml_cli20/score.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/azureml_cli20/score.yml b/pipelines/azureml_cli20/score.yml index e244d952..8be5fbc2 100644 --- a/pipelines/azureml_cli20/score.yml +++ b/pipelines/azureml_cli20/score.yml @@ -1,4 +1,4 @@ -name: lightgbm_benchmark_pipeline_local +name: lightgbm_python_benchmark_pipeline type: pipeline_job # From 159721fc4204883cac87b592d5513309f70854ec Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sat, 14 Aug 2021 23:28:14 -0700 Subject: [PATCH 05/27] descriptions in component --- pipelines/azureml_cli20/components/lightgbm_python_score.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/azureml_cli20/components/lightgbm_python_score.yml b/pipelines/azureml_cli20/components/lightgbm_python_score.yml index 3f0d90e7..d566a685 100644 --- a/pipelines/azureml_cli20/components/lightgbm_python_score.yml +++ b/pipelines/azureml_cli20/components/lightgbm_python_score.yml @@ -7,13 +7,14 @@ version: 1 inputs: data: type: path - description: "TODO" + description: "Inference data (compatible with LightGBM model)" model: type: path - description: "TODO" + description: "LightGBM model exported from training (.txt)" outputs: predictions: type: path + description: "Predictions from LightGBM model" code: local_path: ../../../src/ From 90494fb7f4e93268dbcf8e76147110003b3337a1 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sun, 15 Aug 2021 
14:23:29 -0700
Subject: [PATCH 06/27] mlflow test implementation

---
 .gitignore                            |  5 +++-
 requirements.txt                      |  1 +
 src/common/metrics.py                 | 42 ++++++++++++++++++++++++++-
 src/scripts/lightgbm_python/conda.yml |  4 ++-
 4 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3ef240f2..f1129ecb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,4 +129,7 @@ dmypy.json
 .pyre/
 
 # ignore local data
-data/
\ No newline at end of file
+data/
+
+# ignore mlflow local dumps
+mlruns/
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b22440c9..cf81b61e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ lightgbm==3.2.1
 pytest==6.2.4
 pytest-cov==2.12.1
 pytest-mock==3.6.1
+mlflow==1.19.0
\ No newline at end of file
diff --git a/src/common/metrics.py b/src/common/metrics.py
index 95a57e13..4147382d 100644
--- a/src/common/metrics.py
+++ b/src/common/metrics.py
@@ -7,6 +7,43 @@
 import os
 import time
 from functools import wraps
+import mlflow
+
+
+class MetricsLogger():
+    """
+    Class for handling metrics logging in a singleton
+
+    Example:
+    --------
+    >>> from common.metrics import MetricsLogger
+    >>>
+    >>> metrics_logger = MetricsLogger()
+    >>> metrics_logger.log_metric("rmse", 0.456)
+    """
+    _initialized = False
+    _instance = None
+
+    def __new__(cls):
+        """ Create a new instance of the Singleton if necessary """
+        if cls._instance is None:
+            # if this is the first time we're initializing
+            cls._instance = super(MetricsLogger, cls).__new__(cls)
+            print("Initializing MLFLOW")
+            mlflow.start_run()
+        else:
+            # if this is not the first time
+            pass
+
+        return cls._instance
+
+    def log_metric(self, key, value):
+        print(f"mlflow.log_metric({key},{value})")
+        # NOTE: there's a limit to the name of a metric
+        if len(key) > 45:
+            key = key[:45]
+        mlflow.log_metric(key, value)
+
 
 ########################
 ### CODE BLOCK TIMER ###
@@ -54,6 +91,7 @@ def __init__(self, name, **kwargs):
         # internal variables
         self.name = name
         self.start_time = None
+        self.metrics_logger = MetricsLogger()
 
     def __enter__(self):
         """ Starts the timer, gets triggered at beginning of code block """
@@ -69,6 +107,7 @@ def __exit__(self, exc_type, value, traceback):
         if method == "print":
             # just prints nicely
             print(f"--- time elapsed: {self.name} = {run_time:2f} s" + (f" [tags: {self.tags}]" if self.tags else ""))
+            self.metrics_logger.log_metric(self.name, run_time)
         else:
             # Place holder for mlflow
             raise NotImplementedError("Nothing else exists at this point")
@@ -83,12 +122,13 @@ def log_time_function(func):
     """ decorator to log wall time of a function/method """
     @wraps(func)
    def perf_wrapper(*args, **kwargs):
-        log_name = "{}.time".format(func.__qualname__[:45]) # NOTE: there's a limit to the name of a metric
+        log_name = "{}.time".format(func.__qualname__)
         start_time = time.time()
         output = func(*args, **kwargs)
         run_time = time.time() - start_time
 
         print("--- time elapsed: {} = {:2f} s".format(log_name, run_time))
+        self.metrics_logger.log_metric(log_name, run_time)
 
         return output
     return perf_wrapper
\ No newline at end of file
diff --git a/src/scripts/lightgbm_python/conda.yml b/src/scripts/lightgbm_python/conda.yml
index 6f97fda2..ba713caa 100644
--- a/src/scripts/lightgbm_python/conda.yml
+++ b/src/scripts/lightgbm_python/conda.yml
@@ -5,4 +5,6 @@ dependencies:
   - python=3.8
   - pip
   - pip:
-    - lightgbm==3.2.1
\ No newline at end of file
+    - lightgbm==3.2.1
+    - mlflow==1.19.0
+    - azureml-defaults==1.33.0
\ No newline at end of file

From 
250111ecabafe80ad1888e92855bcd36f094a0b9 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Sun, 15 Aug 2021 15:17:57 -0700 Subject: [PATCH 07/27] proper metric logging --- src/common/metrics.py | 38 ++++++++++++++++++++++----- src/scripts/generate_data/generate.py | 27 ++++++++++++++++--- src/scripts/lightgbm_python/conda.yml | 2 +- src/scripts/lightgbm_python/score.py | 28 ++++++++++++++++---- src/scripts/lightgbm_python/train.py | 33 +++++++++++++++++++---- 5 files changed, 107 insertions(+), 21 deletions(-) diff --git a/src/common/metrics.py b/src/common/metrics.py index 4147382d..582efe21 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -23,13 +23,20 @@ class MetricsLogger(): """ _initialized = False _instance = None + _session_name = None - def __new__(cls): + def __new__(cls, session_name=None): """ Create a new instance of the Singleton if necessary """ if cls._instance is None: # if this is the first time we're initializing cls._instance = super(MetricsLogger, cls).__new__(cls) - print("Initializing MLFLOW") + if not cls._session_name: + # if no previously recorded session name + cls._session_name = session_name + elif session_name: + # if new session name specified, overwrite + cls._session_name = session_name + print(f"Initializing MLFLOW [session='{cls._session_name}']") mlflow.start_run() else: # if this is not the first time @@ -37,13 +44,33 @@ def __new__(cls): return cls._instance + def close(self): + print(f"Finalizing MLFLOW [session='{self._session_name}']") + mlflow.end_run() + def log_metric(self, key, value): - print(f"mlflow.log_metric({key},{value})") + print(f"mlflow[session={self._session_name}].log_metric({key},{value})") # NOTE: there's a limit to the name of a metric if len(key) > 45: key = key[:45] mlflow.log_metric(key, value) + def set_properties(self, **kwargs): + """ Set properties/tags for the session """ + print(f"mlflow[session={self._session_name}].set_tags({kwargs})") + mlflow.set_tags(kwargs) + + def log_parameters(self, **kwargs): + """ Set parameters for the session """ + print(f"mlflow[session={self._session_name}].log_params({kwargs})") + mlflow.log_params(kwargs) + + def log_time_block(self, metric_name): + """ [Proxy] Records time of execution for block of code """ + # see class below with proper __enter__ and __exit__ + return LogTimeBlock(metric_name) + + ######################## ### CODE BLOCK TIMER ### @@ -91,7 +118,6 @@ def __init__(self, name, **kwargs): # internal variables self.name = name self.start_time = None - self.metrics_logger = MetricsLogger() def __enter__(self): """ Starts the timer, gets triggered at beginning of code block """ @@ -107,7 +133,7 @@ def __exit__(self, exc_type, value, traceback): if method == "print": # just prints nicely print(f"--- time elapsed: {self.name} = {run_time:2f} s" + (f" [tags: {self.tags}]" if self.tags else "")) - self.metrics_logger.log_metric(self.name, run_time) + MetricsLogger().log_metric(self.name, run_time) else: # Place holder for mlflow raise NotImplementedError("Nothing else exists at this point") @@ -128,7 +154,7 @@ def perf_wrapper(*args, **kwargs): run_time = time.time() - start_time print("--- time elapsed: {} = {:2f} s".format(log_name, run_time)) - self.metrics_logger.log_metric(log_name, run_time) + MetricsLogger().log_metric(log_name, run_time) return output return perf_wrapper \ No newline at end of file diff --git a/src/scripts/generate_data/generate.py b/src/scripts/generate_data/generate.py index 177c1079..11191972 100644 --- 
a/src/scripts/generate_data/generate.py
+++ b/src/scripts/generate_data/generate.py
@@ -10,6 +10,7 @@
 import numpy
 
 from lightgbm import train, Dataset
+import sklearn
 from sklearn.datasets import make_classification, make_regression
 
 # let's add the right PYTHONPATH for common module
@@ -20,7 +21,7 @@
 sys.path.append(str(COMMON_ROOT))
 
 # before doing local import
-from common.metrics import LogTimeBlock
+from common.metrics import MetricsLogger
 
 
 def get_arg_parser(parser=None):
@@ -69,11 +70,27 @@ def run(args, other_args=[]):
     os.makedirs(args.output_test, exist_ok=True)
     os.makedirs(args.output_inference, exist_ok=True)
 
-    metric_tags = {'task':'generate'}
+    # initializes reporting of metrics
+    metrics_logger = MetricsLogger("generate_data.generate")
+
+    # add some properties to the session
+    metrics_logger.set_properties(
+        task = 'generate',
+        sklearn_version = sklearn.__version__
+    )
+    metrics_logger.log_parameters(
+        train_samples = args.train_samples,
+        test_samples = args.test_samples,
+        inferencing_samples = args.inferencing_samples,
+        n_features = args.n_features,
+        n_informative = args.n_informative,
+        n_redundant = args.n_redundant,
+        random_state = args.random_state
+    )
 
     # record a metric
     print(f"Generating data in memory.")
-    with LogTimeBlock("data_generation", methods=['print'], tags=metric_tags):
+    with metrics_logger.log_time_block("data_generation"):
         total_samples = args.train_samples + args.test_samples + args.inferencing_samples
         if args.type == "classification":
             X, y = make_classification(
@@ -111,11 +128,13 @@ def run(args, other_args=[]):
 
     # save as CSV
     print(f"Saving data...")
-    with LogTimeBlock("data_saving", methods=['print'], tags=metric_tags):
+    with metrics_logger.log_time_block("data_saving"):
         numpy.savetxt(os.path.join(args.output_train, "train.txt"), train_data, delimiter=",", newline="\n", fmt='%1.3f')
         numpy.savetxt(os.path.join(args.output_test, "test.txt"), test_data, delimiter=",", newline="\n", fmt='%1.3f')
         numpy.savetxt(os.path.join(args.output_inference, "inference.txt"), inference_data, delimiter=",", newline="\n", fmt='%1.3f')
 
+    # optional: close logging session
+    metrics_logger.close()
 
 def main(cli_args=None):
     """ Component main function, parses arguments and executes run() function.
diff --git a/src/scripts/lightgbm_python/conda.yml b/src/scripts/lightgbm_python/conda.yml
index ba713caa..879ae304 100644
--- a/src/scripts/lightgbm_python/conda.yml
+++ b/src/scripts/lightgbm_python/conda.yml
@@ -7,4 +7,4 @@ dependencies:
   - pip:
     - lightgbm==3.2.1
     - mlflow==1.19.0
-    - azureml-defaults==1.33.0
\ No newline at end of file
+    - azureml-mlflow==1.33.0
\ No newline at end of file
diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py
index 6799739a..207ed1cb 100644
--- a/src/scripts/lightgbm_python/score.py
+++ b/src/scripts/lightgbm_python/score.py
@@ -18,9 +18,10 @@
 sys.path.append(str(COMMON_ROOT))
 
 # before doing local import
-from common.metrics import LogTimeBlock
+from common.metrics import MetricsLogger
 from common.io import input_file_path
 
+
 def get_arg_parser(parser=None):
     """Adds component/module arguments to a given argument parser. 
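
The instrumentation pattern this patch rolls out to every script — open a single `MetricsLogger` session, attach properties and parameters, time code blocks, then close — looks like this in miniature. This is a sketch against the `MetricsLogger` API defined in `src/common/metrics.py` above; the session, property, and metric names here are illustrative:

```python
import time
from common.metrics import MetricsLogger

logger = MetricsLogger("lightgbm_python.example")   # singleton session per script
logger.set_properties(framework="lightgbm_python", task="example")
logger.log_parameters(num_iterations=100)

# wall time of the block is logged as the metric "some_work" on exit
with logger.log_time_block("some_work"):
    time.sleep(0.1)  # stand-in for real work

logger.log_metric("rmse", 0.456)
logger.close()  # ends the underlying mlflow run
```
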
@@ -60,19 +61,36 @@ def run(args, other_args=[]): os.makedirs(args.output, exist_ok=True) args.output = os.path.join(args.output, "predictions.txt") + # initializes reporting of metrics + metrics_logger = MetricsLogger("lightgbm_python.score") + + # add some properties to the session + metrics_logger.set_properties( + framework = 'lightgbm_python', + task = 'score', + lightgbm_version = lightgbm.__version__ + ) + print(f"Loading model from {args.model}") booster = lightgbm.Booster(model_file=args.model) - metric_tags = {'framework':'lightgbm_python','task':'score','lightgbm_version':lightgbm.__version__} - print(f"Loading data for inferencing") - with LogTimeBlock("data_loading", methods=['print'], tags=metric_tags): + with metrics_logger.log_time_block("data_loading"): raw_data = numpy.loadtxt(args.data, delimiter=",") + # capture data shape as property + metrics_logger.set_properties( + inference_data_length = raw_data.shape[0], + inference_data_width = raw_data.shape[1] + ) + print(f"Running .predict()") - with LogTimeBlock("inferencing", methods=['print'], tags=metric_tags): + with metrics_logger.log_time_block("inferencing"): booster.predict(data=raw_data) + # optional: close logging session + metrics_logger.close() + def main(cli_args=None): """ Component main function, parses arguments and executes run() function. diff --git a/src/scripts/lightgbm_python/train.py b/src/scripts/lightgbm_python/train.py index 0f8b07ef..d1f1e9fb 100644 --- a/src/scripts/lightgbm_python/train.py +++ b/src/scripts/lightgbm_python/train.py @@ -19,7 +19,7 @@ sys.path.append(str(COMMON_ROOT)) # before doing local import -from common.metrics import LogTimeBlock +from common.metrics import MetricsLogger from common.io import input_file_path @@ -80,16 +80,36 @@ def run(args, other_args=[]): os.makedirs(args.export_model, exist_ok=True) args.export_model = os.path.join(args.export_model, "model.txt") + # initializes reporting of metrics + metrics_logger = MetricsLogger("lightgbm_python.score") + + # add some properties to the session + metrics_logger.set_properties( + framework = 'lightgbm_python', + task = 'score', + lightgbm_version = lightgbm.__version__ + ) + # add lgbm params to the session lgbm_params = vars(args) - metric_tags = {'framework':'lightgbm_python','task':'train','lightgbm_version':lightgbm.__version__} + lgbm_params['feature_pre_filter'] = False + + metrics_logger.log_parameters(**lgbm_params) print(f"Loading data for training") - with LogTimeBlock("data_loading", methods=['print'], tags=metric_tags): + with metrics_logger.log_time_block("data_loading"): train_data = lightgbm.Dataset(args.train, params=lgbm_params).construct() - val_data = train_data.create_valid(args.test) + val_data = train_data.create_valid(args.test).construct() + + # capture data shape as property + metrics_logger.set_properties( + train_data_length = train_data.num_data(), + train_data_width = train_data.num_feature(), + test_data_length = val_data.num_data(), + test_data_width = val_data.num_feature() + ) print(f"Training LightGBM with parameters: {lgbm_params}") - with LogTimeBlock("training", methods=['print'], tags=metric_tags): + with metrics_logger.log_time_block("training"): booster = lightgbm.train( lgbm_params, train_data, @@ -101,6 +121,9 @@ def run(args, other_args=[]): print(f"Writing model in {args.export_model}") booster.save_model(args.export_model) + # optional: close logging session + metrics_logger.close() + def main(cli_args=None): """ Component main function, parses arguments and executes run() 
function. From 8fce693b4de5364efee82a93757225af86c58133 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 16 Aug 2021 09:07:56 -0700 Subject: [PATCH 08/27] azureml mlflow integration --- src/common/metrics.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/common/metrics.py b/src/common/metrics.py index 582efe21..66175822 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -10,6 +10,23 @@ import mlflow +def _init_azureml_mlflow_client(): + try: + # if any of that fails, fall back to normal + from azureml.core.run import Run + + azureml_run = Run.get_context() + if "_OfflineRun" not in str(azureml_run): + # if we're running this script REMOTELY, get aml and compute args from run context + ws = azureml_run.experiment.workspace + mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) + else: + # if we're running this script LOCALLY, add your own +aml=X +compute=X arguments + return + except: + print(f"Failed at AzureML initialization for some reason.") + + class MetricsLogger(): """ Class for handling metrics logging in a singleton From 3f22207901499df698655c5aeb50aeec3eb649f9 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 16 Aug 2021 09:19:03 -0700 Subject: [PATCH 09/27] merge and resolve --- src/common/metrics.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/common/metrics.py b/src/common/metrics.py index 71ddd9ec..48d4f416 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -25,6 +25,25 @@ class MetricsLogger(): _instance = None _session_name = None + @classmethod + def _initialize_azureml_mlflow_client(cls): + try: + # if any of that fails, fall back to normal + from azureml.core.run import Run + + azureml_run = Run.get_context() + if "_OfflineRun" not in str(azureml_run): + # if we're running this script REMOTELY, get aml and compute args from run context + ws = azureml_run.experiment.workspace + mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) + mlflow.start_run() + cls._initialized = True + else: + # if we're running this script LOCALLY, add your own +aml=X +compute=X arguments + return + except: + print(f"Failed at AzureML initialization for some reason.") + def __new__(cls, session_name=None): """ Create a new instance of the Singleton if necessary """ if cls._instance is None: @@ -37,7 +56,7 @@ def __new__(cls, session_name=None): # if new session name specified, overwrite cls._session_name = session_name print(f"Initializing MLFLOW [session='{cls._session_name}']") - mlflow.start_run() + cls._init_azureml_mlflow_client() else: # if this is not the first time pass @@ -45,8 +64,9 @@ def __new__(cls, session_name=None): return cls._instance def close(self): - print(f"Finalizing MLFLOW [session='{self._session_name}']") - mlflow.end_run() + if self._initialized: + print(f"Finalizing MLFLOW [session='{self._session_name}']") + mlflow.end_run() def log_metric(self, key, value): print(f"mlflow[session={self._session_name}].log_metric({key},{value})") From 4c1ec9e76e3c62def0959442ea601aeadc24e935 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 16 Aug 2021 10:08:43 -0700 Subject: [PATCH 10/27] wip --- src/common/metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/metrics.py b/src/common/metrics.py index 48d4f416..686c14e0 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -27,6 +27,9 @@ class MetricsLogger(): @classmethod def _initialize_azureml_mlflow_client(cls): + if cls._initialized: + return + 
print(f"Initializing MLFLOW [session='{cls._session_name}']") try: # if any of that fails, fall back to normal from azureml.core.run import Run @@ -55,8 +58,7 @@ def __new__(cls, session_name=None): elif session_name: # if new session name specified, overwrite cls._session_name = session_name - print(f"Initializing MLFLOW [session='{cls._session_name}']") - cls._init_azureml_mlflow_client() + cls._initialize_azureml_mlflow_client() else: # if this is not the first time pass From 72798f0ec8a25653e7c3a02fd56ffa6b1ddfac0a Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 24 Aug 2021 00:39:49 -0700 Subject: [PATCH 11/27] fix merge issues --- src/scripts/lightgbm_python/conda.yml | 2 +- src/scripts/lightgbm_python/score.py | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/src/scripts/lightgbm_python/conda.yml b/src/scripts/lightgbm_python/conda.yml index 879ae304..9ea15cb5 100644 --- a/src/scripts/lightgbm_python/conda.yml +++ b/src/scripts/lightgbm_python/conda.yml @@ -7,4 +7,4 @@ dependencies: - pip: - lightgbm==3.2.1 - mlflow==1.19.0 - - azureml-mlflow==1.33.0 \ No newline at end of file + - azureml-mlflow==1.33.0 diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py index 08b61029..97360e45 100644 --- a/src/scripts/lightgbm_python/score.py +++ b/src/scripts/lightgbm_python/score.py @@ -81,9 +81,6 @@ def run(args, other_args=[]): print(f"Loading data for inferencing") with metrics_logger.log_time_block("data_loading"): -<<<<<<< HEAD - raw_data = numpy.loadtxt(args.data, delimiter=",") -======= # NOTE: this is bad, but allows for libsvm format (not just numpy) inference_data = lightgbm.Dataset(args.data, free_raw_data=False).construct() inference_raw_data = inference_data.get_data() @@ -93,7 +90,6 @@ def run(args, other_args=[]): inference_data_length = inference_data.num_data(), inference_data_width = inference_data.num_feature() ) ->>>>>>> main # capture data shape as property metrics_logger.set_properties( @@ -103,14 +99,10 @@ def run(args, other_args=[]): print(f"Running .predict()") with metrics_logger.log_time_block("inferencing"): -<<<<<<< HEAD - booster.predict(data=raw_data) -======= booster.predict(data=inference_raw_data, predict_disable_shape_check=bool(args.predict_disable_shape_check)) # optional: close logging session metrics_logger.close() ->>>>>>> main # optional: close logging session metrics_logger.close() From 3fed9db5d797578d60ffc9e3a4edcc913e446623 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 24 Aug 2021 00:40:56 -0700 Subject: [PATCH 12/27] fix merge issues --- src/common/metrics.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/src/common/metrics.py b/src/common/metrics.py index aea6c73a..686c14e0 100644 --- a/src/common/metrics.py +++ b/src/common/metrics.py @@ -10,23 +10,6 @@ import mlflow -def _init_azureml_mlflow_client(): - try: - # if any of that fails, fall back to normal - from azureml.core.run import Run - - azureml_run = Run.get_context() - if "_OfflineRun" not in str(azureml_run): - # if we're running this script REMOTELY, get aml and compute args from run context - ws = azureml_run.experiment.workspace - mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri()) - else: - # if we're running this script LOCALLY, add your own +aml=X +compute=X arguments - return - except: - print(f"Failed at AzureML initialization for some reason.") - - class MetricsLogger(): """ Class for handling metrics logging in a singleton From 67fa42557f8337d7d7c8300e3ca626298df414ae Mon Sep 17 
00:00:00 2001 From: Jeff Omhover Date: Mon, 30 Aug 2021 23:04:28 -0700 Subject: [PATCH 13/27] merge docs --- .../azureml-cli20-benchmark.md} | 2 +- mkdocs.yml | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) rename docs/{AzureML-CLI20-benchmark.md => quickstart/azureml-cli20-benchmark.md} (95%) diff --git a/docs/AzureML-CLI20-benchmark.md b/docs/quickstart/azureml-cli20-benchmark.md similarity index 95% rename from docs/AzureML-CLI20-benchmark.md rename to docs/quickstart/azureml-cli20-benchmark.md index b569cc35..48963dc6 100644 --- a/docs/AzureML-CLI20-benchmark.md +++ b/docs/quickstart/azureml-cli20-benchmark.md @@ -53,5 +53,5 @@ az ml data create --file ./pipelines/azureml_cli/inference_data.yml ## Run the benchmark ``` -az ml job create --file ./pipelines/azureml_cli20/score.yml --web +az ml job create --file ./pipelines/azureml_cli20/pipelines/score_ab.yml --web ``` diff --git a/mkdocs.yml b/mkdocs.yml index c1d3837a..7c1c030b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,7 +12,8 @@ nav: Reporting Guide: contribute/reporting-guide.md - Run: Install: quickstart/install.md - Run manually: quickstart/manual-benchmark.md + Run Manually: quickstart/manual-benchmark.md + Run in AzureML CLI v2.0: quickstart/azureml-cli20-benchmark.md - Results: Latest: results/latest.md From 310be0f7144ceeda51475c897b9621d72200b58b Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 30 Aug 2021 23:12:34 -0700 Subject: [PATCH 14/27] rename --- .../score.yml} | 4 +-- .../lightgbm_python/score_custom.yml | 31 +++++++++++++++++++ .../{score.yml => pipelines/score_ab.yml} | 4 +-- 3 files changed, 35 insertions(+), 4 deletions(-) rename pipelines/azureml_cli20/components/{lightgbm_python_score.yml => lightgbm_python/score.yml} (86%) create mode 100644 pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml rename pipelines/azureml_cli20/{score.yml => pipelines/score_ab.yml} (89%) diff --git a/pipelines/azureml_cli20/components/lightgbm_python_score.yml b/pipelines/azureml_cli20/components/lightgbm_python/score.yml similarity index 86% rename from pipelines/azureml_cli20/components/lightgbm_python_score.yml rename to pipelines/azureml_cli20/components/lightgbm_python/score.yml index d566a685..57f17f7b 100644 --- a/pipelines/azureml_cli20/components/lightgbm_python_score.yml +++ b/pipelines/azureml_cli20/components/lightgbm_python/score.yml @@ -17,10 +17,10 @@ outputs: description: "Predictions from LightGBM model" code: - local_path: ../../../src/ + local_path: ../../../../src/ environment: - conda_file: file:../../../src/scripts/lightgbm_python/conda.yml + conda_file: file:../../../../src/scripts/lightgbm_python/conda.yml docker: image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1 diff --git a/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml b/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml new file mode 100644 index 00000000..90cfc652 --- /dev/null +++ b/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml @@ -0,0 +1,31 @@ +type: command_component + +name: lightgbm_python_score_custom +display_name: "LightGBM Inferencing (custom build)" +version: 1 + +inputs: + data: + type: path + description: "Inference data (compatible with LightGBM model)" + model: + type: path + description: "LightGBM model exported from training (.txt)" +outputs: + predictions: + type: path + description: "Predictions from LightGBM model" + +code: + local_path: ../../../../src/ + +environment: + conda_file: 
file:../../../../src/scripts/lightgbm_python/conda.yml + docker: + image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1 + +command: >- + python scripts/lightgbm_python/score.py + --data ${{inputs.data}} + --model ${{inputs.model}} + --output ${{outputs.predictions}} diff --git a/pipelines/azureml_cli20/score.yml b/pipelines/azureml_cli20/pipelines/score_ab.yml similarity index 89% rename from pipelines/azureml_cli20/score.yml rename to pipelines/azureml_cli20/pipelines/score_ab.yml index 8be5fbc2..aa83c7fc 100644 --- a/pipelines/azureml_cli20/score.yml +++ b/pipelines/azureml_cli20/pipelines/score_ab.yml @@ -1,4 +1,4 @@ -name: lightgbm_python_benchmark_pipeline +name: lightgbm_master_versus_custom_v0 type: pipeline_job # @@ -28,7 +28,7 @@ jobs: type: component_job # to use below, module must have been registered before - component: file:./components/lightgbm_python_score.yml + component: file:../components/lightgbm_python/score.yml inputs: data: ${{inputs.benchmark_data}} From a229aa6e594ba1cad0ee0b554d728b74d21e32ff Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 30 Aug 2021 23:14:41 -0700 Subject: [PATCH 15/27] resolve old code merge --- src/scripts/lightgbm_python/score.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/scripts/lightgbm_python/score.py b/src/scripts/lightgbm_python/score.py index 541564be..d8631464 100644 --- a/src/scripts/lightgbm_python/score.py +++ b/src/scripts/lightgbm_python/score.py @@ -134,9 +134,6 @@ def run(args, unknown_args=[]): # Important: close logging session before exiting metrics_logger.close() - # optional: close logging session - metrics_logger.close() - def main(cli_args=None): """Component main function, parses arguments and executes run() function. From eee56c9a344f6030d556bdc8a3214636eadbedc3 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 11 Jan 2022 14:00:32 -0800 Subject: [PATCH 16/27] remove reqs temporarily --- requirements.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index e61baf38..cceaaab4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,10 +3,6 @@ pytest==6.2.4 pytest-cov==2.12.1 pytest-mock==3.6.1 mlflow==1.21.0 -shrike[pipeline]==1.14.7 -azure-ml-component==0.9.4.post1 # for component dsl -azureml-train-core==1.36.0 # for azureml.train.hyperdrive -azureml-dataset-runtime==1.36.0 # to register dataset hydra-core~=1.0.3 typing_extensions==4.0.1 # for hydra omegaconf~=2.1 From a4297ffc7f37133643eedffc1aa4e03302507a94 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 11 Jan 2022 14:01:02 -0800 Subject: [PATCH 17/27] revise pipeline helper code --- src/common/pipelines.py | 49 ++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/src/common/pipelines.py b/src/common/pipelines.py index b1991f80..f31b564c 100644 --- a/src/common/pipelines.py +++ b/src/common/pipelines.py @@ -17,9 +17,7 @@ from hydra.core.config_store import ConfigStore from omegaconf import DictConfig, OmegaConf -from azureml.core import Workspace -from azureml.pipeline.core import Pipeline -from shrike.pipeline.aml_connect import azureml_connect as shrike_azureml_connect +from azure.ml import MLClient # when running this script directly, needed to import common from .paths import COMPONENTS_ROOT, CONFIG_PATH @@ -134,17 +132,31 @@ def azureml_connect(config: DictConfig): Returns: workspace (azure.ml.core.Workspace) """ - return shrike_azureml_connect( - aml_subscription_id=config.aml.subscription_id, - 
aml_resource_group=config.aml.resource_group, - aml_workspace_name=config.aml.workspace_name, - aml_auth=config.aml.auth, - aml_tenant=config.aml.tenant + if config.aml.auth == "msi": + from azure.identity import ManagedIdentityCredential + auth = ManagedIdentityCredential() + elif config.aml.auth == "azurecli": + from azure.identity import AzureCliCredential + auth = AzureCliCredential() + elif config.aml.auth == "interactive": + from azure.identity import InteractiveBrowserCredential + + auth = InteractiveBrowserCredential( + tenant_id=config.aml.tenant, force=config.aml.force + ) + else: + auth = None + + return MLClient( + credential=auth, + subscription_id=config.aml.subscription_id, + resource_group_name=config.aml.resource_group, + workspace_name=config.aml.workspace_name ) -def pipeline_submit(workspace: Workspace, +def pipeline_submit(ml_client: MLClient, pipeline_config: DictConfig, - pipeline_instance: Pipeline, + pipeline_instance, experiment_name: str=None, experiment_description: str=None, display_name: str=None, @@ -152,7 +164,7 @@ def pipeline_submit(workspace: Workspace, """Standard helper function to submit a pipeline to AzureML. Args: - workspace (azure.ml.core.Workspace): AzureML workspace (see azureml_connect()) + ml_client (azure.ml.MLClient): AzureML client (see azureml_connect()) pipeline_config (DictConfig): class for hosting the config of pipeline_func pipeline_instance (Pipeline): pipeline object experiment_name (str): override config.experiment.name at runtime @@ -163,21 +175,18 @@ def pipeline_submit(workspace: Workspace, Returns: pipeline (azure.ml.core.PipelineRun) """ - if pipeline_config.run.validate: - pipeline_instance.validate(workspace=workspace) + #if pipeline_config.run.validate: + # pipeline_instance.validate(workspace=workspace) experiment_description = (experiment_description or pipeline_config.experiment.description) if experiment_description and len(experiment_description) > 5000: experiment_description = experiment_description[:5000-50] + "\n<<>>" if pipeline_config.run.submit: - return pipeline_instance.submit( - workspace=workspace, + return ml_client.jobs.create_or_update( + pipeline_instance, experiment_name=(experiment_name or pipeline_config.experiment.name), description=experiment_description, - display_name=(display_name or pipeline_config.experiment.display_name), tags=(tags or pipeline_config.experiment.tags), - default_compute_target=pipeline_config.compute.default_compute_target, - regenerate_outputs=pipeline_config.run.regenerate_outputs, - continue_on_step_failure=pipeline_config.run.continue_on_failure, + continue_run_on_step_failure=pipeline_config.run.continue_on_failure ) From 01781d70ad58f5cdc1585c75af92bbb8766f56b0 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 11 Jan 2022 14:01:40 -0800 Subject: [PATCH 18/27] revise data generation pipeline --- src/pipelines/azureml/data_generation.py | 164 +++++++++++------------ 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py index 60515e2f..b1ba84ae 100644 --- a/src/pipelines/azureml/data_generation.py +++ b/src/pipelines/azureml/data_generation.py @@ -19,9 +19,8 @@ from omegaconf import OmegaConf, MISSING from typing import Optional, List -# AzureML -from azure.ml.component import Component -from azure.ml.component import dsl +# AzureML SDK 2.0 +from azure.ml import dsl # when running this script directly, needed to import common LIGHTGBM_REPO_ROOT = 
os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) @@ -66,7 +65,7 @@ class data_generation_config: # pylint: disable=invalid-name # load those components from local yaml specifications # use COMPONENTS_ROOT as base folder -generate_data_component = Component.from_yaml(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml")) +generate_data_component = dsl.load_component(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml")) ### DATA GENERATION PIPELINE ### @@ -76,10 +75,6 @@ class data_generation_config: # pylint: disable=invalid-name # but `pipeline_cli_main` will need one pipeline function # taking a single config argument, not a pipeline parameter. -@dsl.pipeline( - name="generate_all_datasets", # pythonic name - non_pipeline_parameters=["config"] # required to use config object -) def data_generation_main_pipeline_function(config): """Pipeline's main building function. @@ -90,79 +85,84 @@ def data_generation_main_pipeline_function(config): Returns: None """ - benchmark_custom_properties = json.dumps({ - 'benchmark_name' : config.data_generation_config.benchmark_name - }) - - # for each task provided in the general config - for generation_task in config.data_generation_config.tasks: - - # run a generation step with the right parameters - generate_data_step = generate_data_component( - learning_task = generation_task.task, - train_samples = generation_task.train_samples, - train_partitions = generation_task.train_partitions, - test_samples = generation_task.test_samples, - test_partitions = generation_task.test_partitions, - inferencing_samples = generation_task.inferencing_samples, - inferencing_partitions = generation_task.inferencing_partitions, - n_features = generation_task.n_features, - n_informative = generation_task.n_informative, - delimiter = generation_task.delimiter, - random_state = 5, - verbose = False, - custom_properties = benchmark_custom_properties - ) - # run it on the right compute target - generate_data_step.runsettings.configure(target=config.compute.linux_cpu) - - # if config asks to register the outputs automatically... 
- if config.data_generation_config.register_outputs: - # create a prefix for the dataset - dataset_prefix = "{prefix}-{task}-{cols}cols".format( - prefix=config.data_generation_config.register_outputs_prefix, - task=generation_task.task, - cols=generation_task.n_features - ) - - # register each output (train, test, inference) - generate_data_step.outputs.output_train.register_as( - name=f"{dataset_prefix}-{generation_task.train_samples}samples-train", - create_new_version=True, - tags={ # add tags that will show up in AzureML - 'type':'train', - 'task':generation_task.task, - 'origin':'synthetic', - 'samples':generation_task.train_samples, - 'features':generation_task.n_features, - 'informative':generation_task.n_informative - } - ) - generate_data_step.outputs.output_test.register_as( - name=f"{dataset_prefix}-{generation_task.test_samples}samples-test", - create_new_version=True, - tags={ # add tags that will show up in AzureML - 'type':'test', - 'task':generation_task.task, - 'origin':'synthetic', - 'samples':generation_task.test_samples, - 'features':generation_task.n_features, - 'informative':generation_task.n_informative - } + @dsl.pipeline( + name="generate_all_datasets", # pythonic name + ) + def _data_generation_main_pipeline_function(): + benchmark_custom_properties = json.dumps({ + 'benchmark_name' : config.data_generation_config.benchmark_name + }) + + # for each task provided in the general config + for generation_task in config.data_generation_config.tasks: + + # run a generation step with the right parameters + generate_data_step = generate_data_component( + learning_task = generation_task.task, + train_samples = generation_task.train_samples, + train_partitions = generation_task.train_partitions, + test_samples = generation_task.test_samples, + test_partitions = generation_task.test_partitions, + inferencing_samples = generation_task.inferencing_samples, + inferencing_partitions = generation_task.inferencing_partitions, + n_features = generation_task.n_features, + n_informative = generation_task.n_informative, + delimiter = generation_task.delimiter, + random_state = 5, + verbose = False, + custom_properties = benchmark_custom_properties ) - generate_data_step.outputs.output_inference.register_as( - name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference", - create_new_version=True, - tags={ # add tags that will show up in AzureML - 'type':'inference', - 'task':generation_task.task, - 'origin':'synthetic', - 'samples':generation_task.inferencing_samples, - 'features':generation_task.n_features, - 'informative':generation_task.n_informative - } - ) - + # run it on the right compute target + generate_data_step.compute = config.compute.linux_cpu + + # if config asks to register the outputs automatically... 
+ if config.data_generation_config.register_outputs: + # create a prefix for the dataset + dataset_prefix = "{prefix}-{task}-{cols}cols".format( + prefix=config.data_generation_config.register_outputs_prefix, + task=generation_task.task, + cols=generation_task.n_features + ) + + # register each output (train, test, inference) + generate_data_step.outputs.output_train.register_as( + name=f"{dataset_prefix}-{generation_task.train_samples}samples-train", + create_new_version=True, + tags={ # add tags that will show up in AzureML + 'type':'train', + 'task':generation_task.task, + 'origin':'synthetic', + 'samples':generation_task.train_samples, + 'features':generation_task.n_features, + 'informative':generation_task.n_informative + } + ) + generate_data_step.outputs.output_test.register_as( + name=f"{dataset_prefix}-{generation_task.test_samples}samples-test", + create_new_version=True, + tags={ # add tags that will show up in AzureML + 'type':'test', + 'task':generation_task.task, + 'origin':'synthetic', + 'samples':generation_task.test_samples, + 'features':generation_task.n_features, + 'informative':generation_task.n_informative + } + ) + generate_data_step.outputs.output_inference.register_as( + name=f"{dataset_prefix}-{generation_task.inferencing_samples}samples-inference", + create_new_version=True, + tags={ # add tags that will show up in AzureML + 'type':'inference', + 'task':generation_task.task, + 'origin':'synthetic', + 'samples':generation_task.inferencing_samples, + 'features':generation_task.n_features, + 'informative':generation_task.n_informative + } + ) + + return _data_generation_main_pipeline_function() ### MAIN BLOCK ### @@ -173,7 +173,7 @@ def main(): config = parse_pipeline_config(data_generation_config) # you'll need a workspace object to connect - workspace = azureml_connect(config) + ml_client = azureml_connect(config) # run the pipeline function with the given arguments pipeline_instance = data_generation_main_pipeline_function(config) @@ -189,7 +189,7 @@ def main(): # validate/submit the pipeline (if run.submit=True) pipeline_submit( - workspace, + ml_client, config, pipeline_instance, experiment_description=experiment_description From a64d502f7b8306614771538e9b4257b7c01d5c56 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 11 Jan 2022 14:55:43 -0800 Subject: [PATCH 19/27] modify specs --- .../data_processing/generate_data/spec.yaml | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 9ceb0740..52ddcf8c 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -1,8 +1,8 @@ $schema: http://azureml/sdk-2-0/CommandComponent.json -name: generate_synthetic_data -version: 1.0.5 +name: lightgbm_benchmark.data.generate +version: 2.0.0 display_name: "Generate Synthetic Data" -type: CommandComponent +type: command description: "Generate data for classification or regression." 
is_deterministic: true @@ -14,62 +14,62 @@ tags: inputs: learning_task: - type: Enum + type: string default: "regression" enum: - regression - classification train_samples: - type: Integer + type: integer description: Number of training samples to generate default: 1000 optional: false train_partitions: - type: Integer + type: integer description: Number of partitions to generate for training data default: 1 optional: false test_samples: - type: Integer + type: integer description: Number of testing samples to generate default: 100 optional: false test_partitions: - type: Integer + type: integer description: Number of partitions to generate for testing data default: 1 optional: false inferencing_samples: - type: Integer + type: integer description: Number of inferencing samples to generate default: 1000 optional: false inferencing_partitions: - type: Integer + type: integer description: Number of partitions to generate for inferencing data default: 1 optional: false n_features: - type: Integer + type: integer description: Number of features/columns default: 100 optional: false n_informative: - type: Integer + type: integer description: Number of informative features default: 100 optional: false n_redundant: - type: Integer + type: integer description: number of redundant features (for classification) optional: true random_state: - type: Integer + type: integer description: random seed optional: true delimiter: - type: Enum + type: string default: "comma" enum: - tab @@ -78,21 +78,21 @@ inputs: # generic benchmark parameters verbose: - type: Boolean + type: boolean default: False description: "Show debug logs" custom_properties: - type: String + type: string description: "For benchmark analysis, provide as a json dictionary (ex: {\"foo\":\"bar\"}) anything that will be added as tags to the job" optional: true outputs: output_train: - type: AnyDirectory + type: path output_test: - type: AnyDirectory + type: path output_inference: - type: AnyDirectory + type: path command: >- python generate.py @@ -114,8 +114,8 @@ command: >- --verbose {inputs.verbose} [--custom_properties {inputs.custom_properties}] -environment: - conda: - # conda file path is resolved after additional includes - conda_dependencies_file: conda_env.yaml - os: Linux +environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cuda11-gpu:3 +# conda: +# # conda file path is resolved after additional includes +# conda_dependencies_file: conda_env.yaml +# os: Linux From 94dce49593d582823db38dc15d7efd4bd540b9ac Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Tue, 11 Jan 2022 14:59:44 -0800 Subject: [PATCH 20/27] remove deprecated instructions --- docs/quickstart/azureml-cli20-benchmark.md | 57 ---------------------- 1 file changed, 57 deletions(-) delete mode 100644 docs/quickstart/azureml-cli20-benchmark.md diff --git a/docs/quickstart/azureml-cli20-benchmark.md b/docs/quickstart/azureml-cli20-benchmark.md deleted file mode 100644 index 48963dc6..00000000 --- a/docs/quickstart/azureml-cli20-benchmark.md +++ /dev/null @@ -1,57 +0,0 @@ -# Run benchmark in AzureML (with CLI 2.0) - -**Objectives** - By following this tutorial, you will be able to: - -- Run the LightGBM benchmark scripts on the AzureML platform (CLI 2.0 edition) - -**Requirements** - To enjoy this tutorial, you need to be able to: - -- Install Azure CLI, AzureML extension and enable private features (see below). -- Create or access an AzureML workspace (see [instructions](https://docs.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources). 
-
-## Install requirements
-
-1. Install [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli) (version >= 2.27.0)
-
-2. Install [Azure ML extension](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-cli)
-
-3. Activate the preview features by setting environment variable `AZURE_ML_CLI_PRIVATE_FEATURES_ENABLED=true` (see [instructions](https://github.com/Azure/azureml-previews/tree/main/previews/pipelines#how-to-get-started)).
-
-4. If you don't have an AzureML workspace yet, create one following [those instructions](https://docs.microsoft.com/en-us/azure/machine-learning/quickstart-create-resources).
-
-## Prepare for running the benchmark
-
-### A. Set your default Azure references
-
-Before you get started, we recommend setting your Azure CLI defaults to your specific subscription, resource group and workspace:
-
-```
-az account set --subscription [SUBSCRIPTION]
-az configure --defaults group=[RESOURCE GROUP] workspace=[WORKSPACE]
-```
-
-### B. Publish the modules in your workspace
-
-```
-az ml component create --file ./pipelines/azureml_cli20/components/lightgbm_python_score.yml
-```
-
-### C. Create the datasets
-
-### Option 1: upload manually using AzureML UI
-
-### Option 2: upload manually using CLI
-
-```bash
-az ml data create --file ./pipelines/azureml_cli/inference_data.yml
-```
-
-### Option 3: generate in AzureML
-
-(Work in progress)
-
-## Run the benchmark
-
-```
-az ml job create --file ./pipelines/azureml_cli20/pipelines/score_ab.yml --web
-```
From 479aea2672d19169e3711816dfbde08e217eda73 Mon Sep 17 00:00:00 2001
From: Jeff Omhover
Date: Tue, 11 Jan 2022 15:01:20 -0800
Subject: [PATCH 21/27] remove old yaml files

---
 .../components/lightgbm_python/score.yml      | 31 ---------------
 .../lightgbm_python/score_custom.yml          | 31 ---------------
 pipelines/azureml_cli20/inference_data.yml    |  6 ---
 .../azureml_cli20/pipelines/score_ab.yml      | 38 -------------------
 4 files changed, 106 deletions(-)
 delete mode 100644 pipelines/azureml_cli20/components/lightgbm_python/score.yml
 delete mode 100644 pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml
 delete mode 100644 pipelines/azureml_cli20/inference_data.yml
 delete mode 100644 pipelines/azureml_cli20/pipelines/score_ab.yml

diff --git a/pipelines/azureml_cli20/components/lightgbm_python/score.yml b/pipelines/azureml_cli20/components/lightgbm_python/score.yml
deleted file mode 100644
index 57f17f7b..00000000
--- a/pipelines/azureml_cli20/components/lightgbm_python/score.yml
+++ /dev/null
@@ -1,31 +0,0 @@
-type: command_component
-
-name: lightgbm_python_score
-display_name: "LightGBM Inferencing (python)"
-version: 1
-
-inputs:
-  data:
-    type: path
-    description: "Inference data (compatible with LightGBM model)"
-  model:
-    type: path
-    description: "LightGBM model exported from training (.txt)"
-outputs:
-  predictions:
-    type: path
-    description: "Predictions from LightGBM model"
-
-code:
-  local_path: ../../../../src/
-
-environment:
-  conda_file: file:../../../../src/scripts/lightgbm_python/conda.yml
-  docker:
-    image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1
-
-command: >-
-  python scripts/lightgbm_python/score.py
-  --data ${{inputs.data}}
-  --model ${{inputs.model}}
-  --output ${{outputs.predictions}}
diff --git a/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml b/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml
deleted file mode 100644
index 90cfc652..00000000
--- 
a/pipelines/azureml_cli20/components/lightgbm_python/score_custom.yml +++ /dev/null @@ -1,31 +0,0 @@ -type: command_component - -name: lightgbm_python_score_custom -display_name: "LightGBM Inferencing (custom build)" -version: 1 - -inputs: - data: - type: path - description: "Inference data (compatible with LightGBM model)" - model: - type: path - description: "LightGBM model exported from training (.txt)" -outputs: - predictions: - type: path - description: "Predictions from LightGBM model" - -code: - local_path: ../../../../src/ - -environment: - conda_file: file:../../../../src/scripts/lightgbm_python/conda.yml - docker: - image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210615.v1 - -command: >- - python scripts/lightgbm_python/score.py - --data ${{inputs.data}} - --model ${{inputs.model}} - --output ${{outputs.predictions}} diff --git a/pipelines/azureml_cli20/inference_data.yml b/pipelines/azureml_cli20/inference_data.yml deleted file mode 100644 index 60ae1d6d..00000000 --- a/pipelines/azureml_cli20/inference_data.yml +++ /dev/null @@ -1,6 +0,0 @@ -name: synthetic_inference_4000col -version: 1 -description: Synthetic dataset for inference, for regression task, 4000 cols -datastore: azureml:workspaceblobstore -local_path: ../../data/ -path: /synthetic/inference.txt \ No newline at end of file diff --git a/pipelines/azureml_cli20/pipelines/score_ab.yml b/pipelines/azureml_cli20/pipelines/score_ab.yml deleted file mode 100644 index aa83c7fc..00000000 --- a/pipelines/azureml_cli20/pipelines/score_ab.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: lightgbm_master_versus_custom_v0 -type: pipeline_job - -# -inputs: - benchmark_data: - # using reference to an existing AzureML dataset - data: azureml:synthetic_inference_4000col - benchmark_model: - # using reference to an existing AzureML dataset - data: azureml:model_4000col_100000train_synthetic -outputs: - predictions: - # register the output as another dataset - data: - datastore: azureml:workspaceblobstore - -# default settings that apply to all jobs -defaults: - component_job: - datastore: azureml:workspaceblobstore - compute: - target: azureml:linux-d14v2 - -# -jobs: - benchmark_job: - type: component_job - - # to use below, module must have been registered before - component: file:../components/lightgbm_python/score.yml - - inputs: - data: ${{inputs.benchmark_data}} - model: ${{inputs.benchmark_model}} - - outputs: - predictions: ${{outputs.predictions}} \ No newline at end of file From 4f61e883ba73f95ea5dbbcda48b6a68d7cfa3675 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 12 Jan 2022 14:44:30 -0800 Subject: [PATCH 22/27] align specs --- .../generate_data/spec.additional_includes | 1 - .../data_processing/generate_data/spec.yaml | 16 +++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) delete mode 100644 src/scripts/data_processing/generate_data/spec.additional_includes diff --git a/src/scripts/data_processing/generate_data/spec.additional_includes b/src/scripts/data_processing/generate_data/spec.additional_includes deleted file mode 100644 index 0ad98a8f..00000000 --- a/src/scripts/data_processing/generate_data/spec.additional_includes +++ /dev/null @@ -1 +0,0 @@ -../../../common/ diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 52ddcf8c..fa15196f 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -1,4 +1,4 @@ -$schema: 
http://azureml/sdk-2-0/CommandComponent.json +$schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json name: lightgbm_benchmark.data.generate version: 2.0.0 display_name: "Generate Synthetic Data" @@ -94,8 +94,11 @@ outputs: output_inference: type: path +code: + local_path: "../../../" + command: >- - python generate.py + python scripts/data_processing/generate_data/generate.py --type {inputs.learning_task} --train_samples {inputs.train_samples} --train_partitions {inputs.train_partitions} @@ -114,8 +117,7 @@ command: >- --verbose {inputs.verbose} [--custom_properties {inputs.custom_properties}] -environment: azureml:AzureML-sklearn-0.24-ubuntu18.04-py37-cuda11-gpu:3 -# conda: -# # conda file path is resolved after additional includes -# conda_dependencies_file: conda_env.yaml -# os: Linux +environment: + image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 + conda_file: conda_env.yaml + os_type: linux From 459e45b786aa50754c08a73324661b55a6862127 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 12 Jan 2022 15:26:28 -0800 Subject: [PATCH 23/27] correct input format --- src/pipelines/azureml/data_generation.py | 2 +- .../data_processing/generate_data/spec.yaml | 35 ++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py index b1ba84ae..2c1b3a1f 100644 --- a/src/pipelines/azureml/data_generation.py +++ b/src/pipelines/azureml/data_generation.py @@ -110,7 +110,7 @@ def _data_generation_main_pipeline_function(): delimiter = generation_task.delimiter, random_state = 5, verbose = False, - custom_properties = benchmark_custom_properties + #custom_properties = benchmark_custom_properties # TODO: fails for now ) # run it on the right compute target generate_data_step.compute = config.compute.linux_cpu diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index fa15196f..64954d6c 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -99,25 +99,26 @@ code: command: >- python scripts/data_processing/generate_data/generate.py - --type {inputs.learning_task} - --train_samples {inputs.train_samples} - --train_partitions {inputs.train_partitions} - --test_samples {inputs.test_samples} - --test_partitions {inputs.test_partitions} - --inferencing_samples {inputs.inferencing_samples} - --inferencing_partitions {inputs.inferencing_partitions} - --n_features {inputs.n_features} - --n_informative {inputs.n_informative} - [--n_redundant {inputs.n_redundant}] - [--random_state {inputs.random_state}] - --delimiter {inputs.delimiter} - --output_train {outputs.output_train} - --output_test {outputs.output_test} - --output_inference {outputs.output_inference} - --verbose {inputs.verbose} - [--custom_properties {inputs.custom_properties}] + --type ${{inputs.learning_task}} + --train_samples ${{inputs.train_samples}} + --train_partitions ${{inputs.train_partitions}} + --test_samples ${{inputs.test_samples}} + --test_partitions ${{inputs.test_partitions}} + --inferencing_samples ${{inputs.inferencing_samples}} + --inferencing_partitions ${{inputs.inferencing_partitions}} + --n_features ${{inputs.n_features}} + --n_informative ${{inputs.n_informative}} + [--n_redundant ${{inputs.n_redundant}}] + [--random_state ${{inputs.random_state}}] + --delimiter ${{inputs.delimiter}} + --output_train {outputs.output_train}} + --output_test 
{outputs.output_test}} + --output_inference {outputs.output_inference}} + --verbose ${{inputs.verbose}} + [--custom_properties ${{inputs.custom_properties}}] environment: + name: data_generation_base image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 conda_file: conda_env.yaml os_type: linux From bca20718abd81153ad14a59b2b8c5865dbaec6e6 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 12 Jan 2022 16:45:03 -0800 Subject: [PATCH 24/27] remove portal url --- src/common/pipelines.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/common/pipelines.py b/src/common/pipelines.py index ad1319b5..5756023a 100644 --- a/src/common/pipelines.py +++ b/src/common/pipelines.py @@ -190,22 +190,21 @@ def pipeline_submit(ml_client: MLClient, tags=(tags or pipeline_config.experiment.tags), continue_run_on_step_failure=pipeline_config.run.continue_on_failure ) - - logging.info( - f""" -################################# -################################# -################################# - -Follow link below to access your pipeline run directly: -------------------------------------------------------- -{pipeline_run.get_portal_url()} - -################################# -################################# -################################# - """ - ) +# logging.info( +# f""" +# ################################# +# ################################# +# ################################# + +# Follow link below to access your pipeline run directly: +# ------------------------------------------------------- +# {pipeline_run.get_portal_url()} + +# ################################# +# ################################# +# ################################# +# """ +# ) return pipeline_run else: From cade7fd2d675830c0227068aa03184d5e73d6d9b Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Wed, 12 Jan 2022 16:45:45 -0800 Subject: [PATCH 25/27] escape json in custom properties --- src/pipelines/azureml/data_generation.py | 2 +- src/scripts/data_processing/generate_data/spec.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py index 2c1b3a1f..b1ba84ae 100644 --- a/src/pipelines/azureml/data_generation.py +++ b/src/pipelines/azureml/data_generation.py @@ -110,7 +110,7 @@ def _data_generation_main_pipeline_function(): delimiter = generation_task.delimiter, random_state = 5, verbose = False, - #custom_properties = benchmark_custom_properties # TODO: fails for now + custom_properties = benchmark_custom_properties ) # run it on the right compute target generate_data_step.compute = config.compute.linux_cpu diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 64954d6c..afd58cba 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -115,7 +115,7 @@ command: >- --output_test {outputs.output_test}} --output_inference {outputs.output_inference}} --verbose ${{inputs.verbose}} - [--custom_properties ${{inputs.custom_properties}}] + [--custom_properties '${{inputs.custom_properties}}'] environment: name: data_generation_base From 476c606b083553b116f37067e8ac98232ccd4025 Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 9 May 2022 14:53:15 -0700 Subject: [PATCH 26/27] working data generation pipeline --- requirements.txt | 1 - src/common/aml.py | 2 +- src/common/pipelines.py | 2 +- src/pipelines/azureml/data_generation.py | 
5 +++-- .../data_processing/generate_data/conda_env.yaml | 2 +- src/scripts/data_processing/generate_data/spec.yaml | 13 +++++-------- 6 files changed, 11 insertions(+), 14 deletions(-) diff --git a/requirements.txt b/requirements.txt index 09e75ff6..62ab00e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,6 +20,5 @@ pytest-mock==3.6.1 # pipelines hydra-core~=1.0.3 -typing_extensions==4.0.1 # for hydra azure-ml==0.0.61212840 --extra-index-url https://azuremlsdktestpypi.azureedge.net/sdk-cli-v2 diff --git a/src/common/aml.py b/src/common/aml.py index db3e1449..3835e221 100644 --- a/src/common/aml.py +++ b/src/common/aml.py @@ -7,7 +7,7 @@ """ import logging import re -from azureml.core import Datastore, Dataset +#from azureml.core import Datastore, Dataset def dataset_from_dstore_path(workspace, datastore, datastore_path, validate=True): diff --git a/src/common/pipelines.py b/src/common/pipelines.py index 32452e04..e7daeb35 100644 --- a/src/common/pipelines.py +++ b/src/common/pipelines.py @@ -207,7 +207,7 @@ def pipeline_submit(ml_client: MLClient, Follow link below to access your pipeline run directly: ------------------------------------------------------- -{pipeline_run..services['Studio'].endpoint} +{pipeline_run.services['Studio'].endpoint} ################################# ################################# diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py index 8d8e1924..7cd6f9ae 100644 --- a/src/pipelines/azureml/data_generation.py +++ b/src/pipelines/azureml/data_generation.py @@ -21,6 +21,7 @@ # AzureML SDK 2.0 from azure.ml import dsl +from azure.ml.entities import load_component # when running this script directly, needed to import common LIGHTGBM_REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', '..')) @@ -67,7 +68,7 @@ class data_generation_config: # pylint: disable=invalid-name # load those components from local yaml specifications # use COMPONENTS_ROOT as base folder -generate_data_component = dsl.load_component(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml")) +generate_data_component = load_component(yaml_file=os.path.join(COMPONENTS_ROOT, "data_processing", "generate_data", "spec.yaml")) ### DATA GENERATION PIPELINE ### @@ -118,7 +119,7 @@ def _data_generation_main_pipeline_function(): custom_properties = benchmark_custom_properties ) # run it on the right compute target - generate_data_step.runsettings.configure(target=config.compute.linux_cpu) + generate_data_step.compute = config.compute.linux_cpu # generate a readable run name generate_data_step.node_name = format_run_name("generate_{}_train{}test{}inf{}_feat{}".format( diff --git a/src/scripts/data_processing/generate_data/conda_env.yaml b/src/scripts/data_processing/generate_data/conda_env.yaml index 223c34f7..c4a6ee64 100644 --- a/src/scripts/data_processing/generate_data/conda_env.yaml +++ b/src/scripts/data_processing/generate_data/conda_env.yaml @@ -1,4 +1,4 @@ -name: treelite_conda_env +name: generate_data_env channels: - defaults dependencies: diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index 8900d0ec..ecb58714 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -1,5 +1,5 @@ $schema: https://azuremlschemas.azureedge.net/latest/commandComponent.schema.json -name: lightgbm_benchmark.data.generate +name: lightgbm_benchmark_data_generate 
version: 2.0.0 display_name: "Generate Synthetic Data" type: command @@ -69,12 +69,12 @@ inputs: description: random seed optional: true docs_per_query: - type: Integer + type: integer description: docs per query, used for ranking data default: 20 optional: true n_label_classes: - type: Integer + type: integer description: n_label_classes, used for ranking data default: 10 optional: true @@ -88,7 +88,7 @@ inputs: - comma - space header: - type: Boolean + type: boolean default: False description: "generate header for output files" @@ -112,8 +112,7 @@ outputs: external_header: type: path -code: - local_path: "../../../" +code: "../../../" command: >- python scripts/data_processing/generate_data/generate.py @@ -140,7 +139,5 @@ command: >- [--n_label_classes {inputs.n_label_classes}] environment: - name: data_generation_base image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04 conda_file: conda_env.yaml - os_type: linux From ac3a089afcb1415ea8086804cb7feb19bea9156b Mon Sep 17 00:00:00 2001 From: Jeff Omhover Date: Mon, 9 May 2022 16:16:31 -0700 Subject: [PATCH 27/27] working data generation pipeline --- src/pipelines/azureml/data_generation.py | 4 +- .../data_processing/generate_data/spec.yaml | 42 +++++++++---------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/src/pipelines/azureml/data_generation.py b/src/pipelines/azureml/data_generation.py index 7cd6f9ae..0623a7f2 100644 --- a/src/pipelines/azureml/data_generation.py +++ b/src/pipelines/azureml/data_generation.py @@ -143,13 +143,15 @@ def _data_generation_main_pipeline_function(): # if config asks to register the outputs automatically... if config.data_generation_config.register_outputs: + raise NotImplementedError("automated registering of outputs currently doesn't work in sdkv2") + # create a prefix for the dataset dataset_prefix = "{prefix}-{task}-{cols}cols".format( prefix=config.data_generation_config.register_outputs_prefix, task=generation_task.task, cols=generation_task.n_features ) - + # register each output (train, test, inference) generate_data_step.outputs.output_train.register_as( name=f"{dataset_prefix}-{generation_task.train_samples}samples-train", diff --git a/src/scripts/data_processing/generate_data/spec.yaml b/src/scripts/data_processing/generate_data/spec.yaml index ecb58714..1f739cc0 100644 --- a/src/scripts/data_processing/generate_data/spec.yaml +++ b/src/scripts/data_processing/generate_data/spec.yaml @@ -116,27 +116,27 @@ code: "../../../" command: >- python scripts/data_processing/generate_data/generate.py - --type {inputs.learning_task} - --train_samples {inputs.train_samples} - --train_partitions {inputs.train_partitions} - --test_samples {inputs.test_samples} - --test_partitions {inputs.test_partitions} - --inferencing_samples {inputs.inferencing_samples} - --inferencing_partitions {inputs.inferencing_partitions} - --n_features {inputs.n_features} - --n_informative {inputs.n_informative} - [--n_redundant {inputs.n_redundant}] - [--random_state {inputs.random_state}] - --delimiter {inputs.delimiter} - --generate_header {inputs.header} - --output_train {outputs.output_train} - --output_test {outputs.output_test} - --output_inference {outputs.output_inference} - --external_header {outputs.external_header} - --verbose {inputs.verbose} - [--custom_properties {inputs.custom_properties}] - [--docs_per_query {inputs.docs_per_query}] - [--n_label_classes {inputs.n_label_classes}] + --type ${{inputs.learning_task}} + --train_samples ${{inputs.train_samples}} + --train_partitions 
${{inputs.train_partitions}} + --test_samples ${{inputs.test_samples}} + --test_partitions ${{inputs.test_partitions}} + --inferencing_samples ${{inputs.inferencing_samples}} + --inferencing_partitions ${{inputs.inferencing_partitions}} + --n_features ${{inputs.n_features}} + --n_informative ${{inputs.n_informative}} + [--n_redundant ${{inputs.n_redundant}}] + [--random_state ${{inputs.random_state}}] + --delimiter ${{inputs.delimiter}} + --generate_header ${{inputs.header}} + --output_train ${{outputs.output_train}} + --output_test ${{outputs.output_test}} + --output_inference ${{outputs.output_inference}} + --external_header ${{outputs.external_header}} + --verbose ${{inputs.verbose}} + [--custom_properties '${{inputs.custom_properties}}'] + [--docs_per_query ${{inputs.docs_per_query}}] + [--n_label_classes ${{inputs.n_label_classes}}] environment: image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
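
For readers porting their own components, here is a minimal sketch (not part of the patch series) of how the final `spec.yaml` above could be loaded and wired into a pipeline with the pre-release `azure.ml` SDK v2 pinned in `requirements.txt`. It mirrors the calls made in `src/pipelines/azureml/data_generation.py` in the commits above; the pipeline name, the `cpu-cluster` compute target, and the literal input values are illustrative assumptions, and `dsl.pipeline` is assumed to behave as in the SDK v2 previews.

```python
import os

# pre-release SDK v2 imports, as used in data_generation.py
from azure.ml import dsl
from azure.ml.entities import load_component

# load the component from its local spec, as data_generation.py does
generate_data_component = load_component(
    yaml_file=os.path.join(
        "src", "scripts", "data_processing", "generate_data", "spec.yaml"
    )
)

@dsl.pipeline(name="demo_data_generation")  # hypothetical pipeline name
def demo_data_generation_pipeline():
    # keyword arguments map one-to-one to the inputs section of spec.yaml
    generate_data_step = generate_data_component(
        learning_task="regression",
        train_samples=1000,
        train_partitions=1,
        test_samples=100,
        test_partitions=1,
        inferencing_samples=1000,
        inferencing_partitions=1,
        n_features=100,
        n_informative=100,
        delimiter="comma",
        header=False,
        verbose=False,
    )
    # assign a compute target by name, as data_generation.py does via config
    generate_data_step.compute = "cpu-cluster"

# build the pipeline instance; submission would then go through pipeline_submit()
pipeline_instance = demo_data_generation_pipeline()
```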