-
Notifications
You must be signed in to change notification settings - Fork 296
SNOW-2367850: task integration example update #250
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
0291c04
375d3ab
703488f
f96fd9b
ad5d13b
fd6a7dc
5524f9a
3015500
94e941a
74a1edb
7f992c4
01c160f
9fb7478
071eb1c
f6ab75c
16b0b42
9fe2b7a
6b7c373
cf1e70f
1b5f9b1
d6a9bd0
591d89a
2412da7
cd46e72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why change?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. the evaluation is inside ML Job. That is because we cannot return the model directly in task |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,13 @@ | ||
| import os | ||
| import logging | ||
| from datetime import datetime, timedelta, timezone | ||
| from typing import Any, Dict, Optional, Union | ||
| from typing import Optional | ||
|
|
||
| import cloudpickle as cp | ||
| from xgboost import Booster, DMatrix | ||
| import data | ||
| import ops | ||
| from constants import ( | ||
| COMPUTE_POOL, | ||
| DAG_STAGE, | ||
| DB_NAME, | ||
| JOB_STAGE, | ||
|
|
@@ -18,7 +18,6 @@ | |
| from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score | ||
| from snowflake.ml.data import DataConnector, DatasetInfo, DataSource | ||
| from snowflake.ml.dataset import Dataset, load_dataset | ||
| from snowflake.ml.jobs import remote | ||
| from snowflake.ml.model import ModelVersion | ||
| from snowflake.snowpark import Session | ||
| from snowflake.snowpark.exceptions import SnowparkSQLException | ||
|
|
@@ -144,10 +143,7 @@ def prepare_datasets( | |
| return (ds, train_ds, test_ds) | ||
|
|
||
|
|
||
| # NOTE: Remove `target_instances=2` to run training on a single node | ||
| # See https://docs.snowflake.com/en/developer-guide/snowflake-ml/ml-jobs/distributed-ml-jobs | ||
| @remote(COMPUTE_POOL, stage_name=JOB_STAGE, target_instances=2) | ||
|
Comment on lines
-147
to
-149
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One of the main points of this sample is to demonstrate how easy it is to convert a local pipeline to pushing certain steps down into ML Jobs. Needing to write a separate script file which we
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That is currently
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's hold off on merging this until
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since the
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I am a little confused here. Do you mean we create a job inside the task directly? |
||
| def train_model(session: Session, input_data: DataSource) -> XGBClassifier: | ||
| def train_model(session: Session, input_data: Optional[DataSource] = None) -> XGBClassifier: | ||
| """ | ||
| Train a model on the training dataset. | ||
|
|
||
|
|
@@ -162,18 +158,15 @@ def train_model(session: Session, input_data: DataSource) -> XGBClassifier: | |
| Returns: | ||
| XGBClassifier: Trained XGBoost classifier model | ||
| """ | ||
| input_data_df = DataConnector.from_sources(session, [input_data]).to_pandas() | ||
|
|
||
| input_data_dc = DataConnector.from_sources(session, [input_data]) | ||
| assert isinstance(input_data, DatasetInfo), "Input data must be a DatasetInfo" | ||
| exclude_cols = input_data.exclude_cols | ||
| label_col = exclude_cols[0] | ||
|
|
||
| X_train = input_data_df.drop(exclude_cols, axis=1) | ||
| y_train = input_data_df[label_col].squeeze() | ||
|
|
||
| model_params = dict( | ||
| max_depth=50, | ||
| n_estimators=3, | ||
| max_depth=50, | ||
| learning_rate=0.75, | ||
| objective="binary:logistic", | ||
| booster="gbtree", | ||
|
|
@@ -186,26 +179,29 @@ def train_model(session: Session, input_data: DataSource) -> XGBClassifier: | |
| XGBEstimator, | ||
| XGBScalingConfig, | ||
| ) | ||
| all_cols = input_data_dc.to_pandas(limit=1).columns.tolist() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you file a JIRA to add a
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| input_cols = [c for c in all_cols if c not in exclude_cols] | ||
| estimator = XGBEstimator( | ||
| params=model_params, | ||
| scaling_config=XGBScalingConfig(), | ||
| ) | ||
| model = estimator.fit(input_data_dc,input_cols = input_cols, label_col = label_col) | ||
| return model | ||
sfc-gh-ajiang marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else: | ||
| # Single node training - can use standard XGBClassifier | ||
| estimator = XGBClassifier(**model_params) | ||
|
|
||
| estimator.fit(X_train, y_train) | ||
|
|
||
| # Convert distributed estimator to standard XGBClassifier if needed | ||
| return getattr(estimator, '_sklearn_estimator', estimator) | ||
| df = input_data_dc.to_pandas() | ||
| X_train = df.drop(exclude_cols, axis=1) | ||
| y_train = df[label_col].squeeze() | ||
| estimator = XGBClassifier(**model_params) | ||
| model = estimator.fit(X_train, y_train) | ||
| return model | ||
|
|
||
|
|
||
| def evaluate_model( | ||
| session: Session, | ||
| model: XGBClassifier, | ||
| input_data: DataSource, | ||
| *, | ||
| prefix: str = None, | ||
| prefix: Optional[str] = None, | ||
| ) -> dict: | ||
| """ | ||
| Evaluate a model on the training and test datasets. | ||
|
|
@@ -232,7 +228,12 @@ def evaluate_model( | |
|
|
||
| X_test = input_data_df.drop(exclude_cols, axis=1) | ||
| expected = input_data_df[label_col].squeeze() | ||
| actual = model.predict(X_test) | ||
| # inside evaluate_model | ||
| if isinstance(model, Booster): | ||
| dmatrix = DMatrix(X_test) | ||
| actual = (model.predict(dmatrix) > 0.5).astype(int) | ||
| else: | ||
| actual = model.predict(X_test) | ||
|
Comment on lines
+231
to
+236
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why do we need this?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The model type returned is |
||
|
|
||
| metric_types = [ | ||
| f1_score, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.