From c2df431597f9a4abd4bba464f55e2fd1a50ac480 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:15:33 -0400 Subject: [PATCH 01/11] link: installing `pip` Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- k-fold-cross-validation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k-fold-cross-validation/README.md b/k-fold-cross-validation/README.md index ed52319..4874de1 100644 --- a/k-fold-cross-validation/README.md +++ b/k-fold-cross-validation/README.md @@ -33,7 +33,7 @@ If you are a Python 2 developer and do not already have `virtualenv` and `pip` t sudo apt-get update sudo apt-get install python-pip python-virtualenv -Users of other operating systems and package managers can learn more about installing `pip` [here](http://pip.readthedocs.org/en/stable/installing/), and about installing `virtualenv` [here](http://virtualenv.readthedocs.org/en/latest/installation.html). +Users of other operating systems and package managers can learn more about [installing `pip`](http://pip.readthedocs.org/en/stable/installing/), and about installing `virtualenv` [here](http://virtualenv.readthedocs.org/en/latest/installation.html). 
After you’ve installed the `virtualenv` and `pip` tools, run: From fc5c656d3008368331e7ed1719fe8f268e526dc4 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:15:40 -0400 Subject: [PATCH 02/11] link: installing `virtualenv` Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- k-fold-cross-validation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k-fold-cross-validation/README.md b/k-fold-cross-validation/README.md index 4874de1..3fec4e2 100644 --- a/k-fold-cross-validation/README.md +++ b/k-fold-cross-validation/README.md @@ -33,7 +33,7 @@ If you are a Python 2 developer and do not already have `virtualenv` and `pip` t sudo apt-get update sudo apt-get install python-pip python-virtualenv -Users of other operating systems and package managers can learn more about [installing `pip`](http://pip.readthedocs.org/en/stable/installing/), and about installing `virtualenv` [here](http://virtualenv.readthedocs.org/en/latest/installation.html). +Users of other operating systems and package managers can learn more about [installing `pip`](http://pip.readthedocs.org/en/stable/installing/), and about [installing `virtualenv`](http://virtualenv.readthedocs.org/en/latest/installation.html). 
After you’ve installed the `virtualenv` and `pip` tools, run: From d46f06e55b5bf4d0643a659c45ed9ab036817edb Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:16:02 -0400 Subject: [PATCH 03/11] link: obtain your credentials Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- social-media/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/social-media/README.md b/social-media/README.md index 9c3f87b..da02a29 100644 --- a/social-media/README.md +++ b/social-media/README.md @@ -85,8 +85,7 @@ To gather the training data, run the following command: Substitute your company's twitter handle instead of @awscloud and configure your Twitter API credentials in config.py. Learn how to -obtain your credentials -[here](https://dev.twitter.com/oauth/overview/application-owner-access-tokens). +[obtain your credentials](https://dev.twitter.com/oauth/overview/application-owner-access-tokens). This will produce a file called `line_separated_tweets_json.txt` that other scripts will read later. 
From f7d8e05c3024131cea52de5349a681e44c9103fd Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:13:59 -0400 Subject: [PATCH 04/11] spelling: a Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- cost-based-ml/cost_based_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cost-based-ml/cost_based_ml.py b/cost-based-ml/cost_based_ml.py index a295952..0f4f5af 100755 --- a/cost-based-ml/cost_based_ml.py +++ b/cost-based-ml/cost_based_ml.py @@ -37,7 +37,7 @@ def batch_prediction_data_bucket_key(output_uri_s3, batch_prediction_id): key += "batch-prediction/result/{}-{}.gz".format(batch_prediction_id, datasource_filename) return bucket, key -# read batch prediction results from S3 and turn them into an numpy array +# read batch prediction results from S3 and turn them into a numpy array def read_test_predictions(bucket, key): s3 = boto3.resource('s3') obj = s3.Object(bucket, key) From cd3c3ce9611868f6d6c8fd88586ba5856c868d8f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:17:46 -0400 Subject: [PATCH 05/11] spelling: compute Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- k-fold-cross-validation/collect_perf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k-fold-cross-validation/collect_perf.py b/k-fold-cross-validation/collect_perf.py index 43ae362..8d214d4 100755 --- a/k-fold-cross-validation/collect_perf.py +++ b/k-fold-cross-validation/collect_perf.py @@ -119,7 +119,7 @@ def collect_perf(eval_id_list): kfolds = len(eval_id_list) eval_auc_map = collect_perf(eval_id_list) # start polling & collect - # Comput the mean/variance of auc scores. Casting kfolds to float for + # Compute the mean/variance of auc scores. Casting kfolds to float for # Python 2 compatibility. 
avg_auc = sum([x for x in eval_auc_map.values()]) / float(kfolds) var_auc = sum([(x - avg_auc) ** 2 for x in eval_auc_map.values()]) / float( From dfcbc9659ed4b9808fe054d47e4c3407c9a18956 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:18:25 -0400 Subject: [PATCH 06/11] spelling: doesn't it Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- cost-based-ml/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cost-based-ml/README.md b/cost-based-ml/README.md index cb298e5..a1d9fa0 100644 --- a/cost-based-ml/README.md +++ b/cost-based-ml/README.md @@ -1,6 +1,6 @@ # Cost-based Machine Learning -So you've built an ML model and evaluated it's performance on a testing dataset? In case of binary classification, the evaluation tells you how many mistakes the model made, i.e. the percentage of false positives and false negatives, and the same stats for the correct behavior of the model, namely, true positives and true negatives. Of course, the fewer the errors, the better, but for any realistic application the percentage of errors is substantial and it is often unclear if the model is worth using. Moreover, if you just look at the total error rate (sum of false positives and false negatives) you may convince yourself that the model is useless. For example, suppose that 90% of the data points belong to class 0 and the rest to class 1, and your model gives 15% total error. This means that if you employ the model you will be making a mistake 15% of the time and if you don't use the model at all (and just assume that all data points belong to class 0) you will be making a mistake only in 10% of cases. Seems like the model is useless in this case, doesn'it? +So you've built an ML model and evaluated its performance on a testing dataset? In case of binary classification, the evaluation tells you how many mistakes the model made, i.e. 
the percentage of false positives and false negatives, and the same stats for the correct behavior of the model, namely, true positives and true negatives. Of course, the fewer the errors, the better, but for any realistic application the percentage of errors is substantial and it is often unclear if the model is worth using. Moreover, if you just look at the total error rate (sum of false positives and false negatives) you may convince yourself that the model is useless. For example, suppose that 90% of the data points belong to class 0 and the rest to class 1, and your model gives 15% total error. This means that if you employ the model you will be making a mistake 15% of the time and if you don't use the model at all (and just assume that all data points belong to class 0) you will be making a mistake only in 10% of cases. Seems like the model is useless in this case, doesn't it? This, however, is a rather simplistic way of looking the model evaluation. The truth is that the different types of mistakes the model makes have different intrinsic costs associated with them, depending on the domain and application. Frequently, even when the total error looks bad, when costs are taken into account, the end result clearly favors the use of ML. From 136b2a8b4c17f473521bf121a8b5ed441846e699 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:18:33 -0400 Subject: [PATCH 07/11] spelling: expects Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- social-media/push-json-to-kinesis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/social-media/push-json-to-kinesis.py b/social-media/push-json-to-kinesis.py index 5872acd..503b0b9 100755 --- a/social-media/push-json-to-kinesis.py +++ b/social-media/push-json-to-kinesis.py @@ -14,7 +14,7 @@ """ Utility to call Amazon Kinesis stream using payload from a file that contains line separated json. 
This script is used in conjunction with -create-lambda-function.py, which expectes the Kinesis stream to provide the +create-lambda-function.py, which expects the Kinesis stream to provide the input on which predictions are made. All json data being pushed to kinesis is first converted to string to string key value pairs as that is the expected format by Amazon Machine Learning. From 1d94546467489baa696110f2475f376e4422999f Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:19:43 -0400 Subject: [PATCH 08/11] spelling: histogram Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- cost-based-ml/cost_based_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cost-based-ml/cost_based_ml.py b/cost-based-ml/cost_based_ml.py index 0f4f5af..ed6ae0d 100755 --- a/cost-based-ml/cost_based_ml.py +++ b/cost-based-ml/cost_based_ml.py @@ -52,7 +52,7 @@ def read_test_predictions(bucket, key): data = np.loadtxt(StringIO(predictions_str), dtype = {'names': names, 'formats': formats}, delimiter=',', skiprows=1, usecols=cols) return data -# this historgram replicates what the Amazon ML console is showing for model evaluation +# this histogram replicates what the Amazon ML console is showing for model evaluation def plot_class_histograms(score_n_true_label): class_1_scores = [score for (score, true_label) in score_n_true_label if true_label == 1] class_0_scores = [score for (score, true_label) in score_n_true_label if true_label == 0] From 89784966e45b92f4aa8c00bb6e21e90e40b09029 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 21:48:36 -0400 Subject: [PATCH 09/11] spelling: id Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- social-media/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/social-media/README.md b/social-media/README.md index da02a29..0ee8399 100644 --- 
a/social-media/README.md +++ b/social-media/README.md @@ -217,7 +217,7 @@ This script requires that `config.py` is present and contains appropriate values. Description of the configuration required in `config.py` is as follows: -* *awsAccountId* : The AWS Account Id corresponding to the credentials being used +* *awsAccountId* : The AWS Account ID corresponding to the credentials being used with boto. See [docs](http://docs.aws.amazon.com/general/latest/gr/acct-identifiers.html) for details. * *kinesisStream* : The name being given to the Kinesis stream. See From 6d6f6be66f3754cfa3b74034329e213a4dda8180 Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:20:11 -0400 Subject: [PATCH 10/11] spelling: threshold Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- .../java/com/amazonaws/samples/machinelearning/UseModel.java | 2 +- .../com/amazonaws/samples/machinelearning/UserModel.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java b/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java index b8798f9..1b72b02 100644 --- a/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java +++ b/targeted-marketing-java/src/main/java/com/amazonaws/samples/machinelearning/UseModel.java @@ -54,7 +54,7 @@ public static void main(String[] args) throws IOException { /** * @param args command-line arguments: * mlModelid - * score threshhold + * score threshold * s3:// url where output should go */ public UseModel(String[] args) { diff --git a/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala b/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala index fb4d492..133606a 100644 --- 
a/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala +++ b/targeted-marketing-scala/src/main/scala/com/amazonaws/samples/machinelearning/UserModel.scala @@ -14,13 +14,13 @@ import scala.io.Source * to make batch predictions. * * command-line arguments: - * mlModelid scoreThreshhold s3://url-where-output-should-go + * mlModelid scoreThreshold s3://url-where-output-should-go */ object UserModel extends App { val unscoredDataUrl = "s3://aml-sample-data/banking-batch.csv" val dataSchema = getClass.getResourceAsStream("/banking-batch.csv.schema") - require(args.length == 3, "command-line arguments: mlModelid scoreThreshhold s3://url-where-output-should-go") + require(args.length == 3, "command-line arguments: mlModelid scoreThreshold s3://url-where-output-should-go") val mlModelId = args(0) val threshold = args(1).toFloat val s3OutputUrl = args(2) From 98083ff57f98039e2c0caa44a514631ebc344d7a Mon Sep 17 00:00:00 2001 From: Josh Soref <2119212+jsoref@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:20:18 -0400 Subject: [PATCH 11/11] spelling: usage Signed-off-by: Josh Soref <2119212+jsoref@users.noreply.github.com> --- ml-tools-python/wait_for_entity.py | 2 +- targeted-marketing-python/use_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-tools-python/wait_for_entity.py b/ml-tools-python/wait_for_entity.py index 04e6754..ff89975 100755 --- a/ml-tools-python/wait_for_entity.py +++ b/ml-tools-python/wait_for_entity.py @@ -22,7 +22,7 @@ ev = evaluation bp = batch prediction -Useage: +Usage: python wait_for_entity.py entity_id [entity_type] """ import boto diff --git a/targeted-marketing-python/use_model.py b/targeted-marketing-python/use_model.py index 74a7475..df4e35d 100755 --- a/targeted-marketing-python/use_model.py +++ b/targeted-marketing-python/use_model.py @@ -17,7 +17,7 @@ generate predictions on new data. This script needs the id of the ML Model to use. 
It also requires the score threshold. -Useage: +Usage: python use_model.py ml_model_id score_threshold s3_output_url For example: