From acf87f4527be7da2d109b21f59af57ca609e37c9 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Thu, 12 Dec 2024 14:04:17 -0800 Subject: [PATCH 1/9] added initial folder structure --- samples/notebooks/anaconda_webinar/README.md | 2 ++ .../anaconda_webinar_notebook.ipynb | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 samples/notebooks/anaconda_webinar/README.md create mode 100644 samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb diff --git a/samples/notebooks/anaconda_webinar/README.md b/samples/notebooks/anaconda_webinar/README.md new file mode 100644 index 00000000..f15ffdf1 --- /dev/null +++ b/samples/notebooks/anaconda_webinar/README.md @@ -0,0 +1,2 @@ +# Title +Placeholder title diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb new file mode 100644 index 00000000..e1dfb1c6 --- /dev/null +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -0,0 +1,34 @@ +{ + "metadata": { + "kernelspec": { + "display_name": "Streamlit Notebook", + "name": "streamlit" + } + }, + "nbformat_minor": 5, + "nbformat": 4, + "cells": [ + { + "cell_type": "code", + "id": "3775908f-ca36-4846-8f38-5adca39217f2", + "metadata": { + "language": "python", + "name": "cell1" + }, + "source": "# Import python packages\nimport streamlit as st\nimport pandas as pd\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", + "metadata": { + "language": "sql", + "name": "cell2" + }, + "source": "select\n o_custkey as id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by all", + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file From 1f03ffb2cf66880f986dd7cad2a1e4d3b85e8641 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Thu, 12 Dec 2024 16:23:34 -0800 Subject: [PATCH 2/9] Update anaconda_webinar_notebook.ipynb --- .../anaconda_webinar_notebook.ipynb | 139 +++++++++++++++++- 1 file changed, 132 insertions(+), 7 deletions(-) diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index e1dfb1c6..747841c0 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -13,22 +13,147 @@ "id": "3775908f-ca36-4846-8f38-5adca39217f2", "metadata": { "language": "python", - "name": "cell1" + "name": "cell1", + "collapsed": false, + "resultHeight": 0 }, - "source": "# Import python packages\nimport streamlit as st\nimport pandas as pd\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n", + "source": "# Import python packages\nimport streamlit as st\nimport pandas as pd\nimport os\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n", "execution_count": null, "outputs": [] }, + { + "cell_type": "markdown", + "id": "8ae58f97-bb31-4290-b2dd-2416f3c2ce15", + "metadata": { + "name": "cell9", + "collapsed": false, + "resultHeight": 74 + }, + "source": "# Growth Accounting" + }, { "cell_type": "code", - "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", + "id": "435baefb-25ff-42a1-b4f8-236a98b4afac", "metadata": { "language": "sql", - "name": "cell2" + "name": "cell3", + "collapsed": false, + "resultHeight": 510 }, - "source": "select\n o_custkey as id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by all", - "execution_count": null, - "outputs": [] + "outputs": [], + "source": "select\n o_custkey as id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by all\norder by id, order_year", + "execution_count": null + }, + { + "cell_type": "code", + "id": "20f1dd62-d796-4190-b34a-89a16fea1819", + "metadata": { + "language": "python", + "name": "cell10", + "collapsed": false, + "resultHeight": 0 + }, + "outputs": [], + "source": "df = cell3.to_pandas()\n\n#pivot data to add row for each id:year with no revenue\nresult = df.pivot_table(\n index='ID',\n columns='ORDER_YEAR', \n values='TOTAL',\n fill_value=0\n).reset_index().melt(\n id_vars='ID',\n var_name='ORDER_YEAR',\n value_name='TOTAL'\n)\n\n# save the dataframe as table for SQL querying \ndf = session.create_dataframe(result)\ndf.write.mode(\"overwrite\").save_as_table(\"df\", table_type=\"temporary\")", + "execution_count": null + }, + { + "cell_type": "code", + "id": "52ae5a36-e143-4ebb-b884-e17750b0c77f", + "metadata": { + "language": "sql", + "name": "cell7", + "collapsed": false, + "resultHeight": 426 + }, + "outputs": [], + "source": "select * from df\norder by id, order_year\nlimit 10", + "execution_count": null + }, + { + "cell_type": "code", + "id": "11971c03-53a7-4429-870a-4b51bbef7aca", + "metadata": { + "language": "sql", + "name": "cell6", + "collapsed": false, + "resultHeight": 510 + }, + "outputs": [], + "source": "with windowed as (\n \n select\n *,\n sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n from df\n\n)\n\nselect *,\n case\n when total = previous_year_total and total > 0 then 'retained'\n when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n when total = 0 and previous_year_total > 0 then 'churned'\n when total > previous_year_total and previous_year_total > 0 then 'expanded'\n when total < previous_year_total and previous_year_total > 0 then 'contracted'\n when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n else 'irrelevant' end as category,\n case category\n when 'retained' then 0\n when 'new' then total\n when 'churned' then (-1 * previous_year_total)\n when 'expanded' then total - previous_year_total\n when 'contracted' then (-1 * (previous_year_total - total))\n when 'resurrected' then total\n else 0 end as net_change\nfrom windowed\norder by id, order_year", + "execution_count": null + }, + { + "cell_type": "code", + "id": "13f099e5-4265-438d-ab46-b3315bfc1f1d", + "metadata": { + "language": "sql", + "name": "cell4", + "collapsed": false, + "resultHeight": 438 + }, + "outputs": [], + "source": "select\n date_part(year, order_year) as order_year,\n category,\n round(sum(total)) as total,\n round(sum(net_change)) as net_change\nfrom {{ cell6 }}\ngroup by all", + "execution_count": null + }, + { + "cell_type": "code", + "id": "735da8fc-91c0-4604-8041-1437208a1f01", + "metadata": { + "language": "python", + "name": "cell2", + "collapsed": false, + "resultHeight": 772 + }, + "outputs": [], + "source": "# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(cell4, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", + "execution_count": null + }, + { + "cell_type": "code", + "id": "06f083eb-ae70-42ad-af0d-261138126bed", + "metadata": { + "language": "python", + "name": "cell5", + "collapsed": false, + "resultHeight": 96 + }, + "outputs": [], + "source": "df = cell6.to_pandas()\nbutton_csv = df.to_csv().encode(\"utf-8\")\nst.download_button(label=\"Download\", data=button_csv, file_name=\"growth_accounting.csv\", mime=\"text/csv\")", + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "db63ea18-13d4-43a4-a29c-a734db89e796", + "metadata": { + "name": "cell8", + "collapsed": false, + "resultHeight": 74 + }, + "source": "# Forecasting" + }, + { + "cell_type": "markdown", + "id": "1d9d5e85-1ad1-422d-9859-20025e4b8561", + "metadata": { + "name": "cell11", + "collapsed": false, + "resultHeight": 74 + }, + "source": "# API Enrichment" + }, + { + "cell_type": "code", + "id": "9bd53742-511c-4cf9-9e28-02bdbcaca463", + "metadata": { + "language": "python", + "name": "cell13", + "collapsed": false, + "resultHeight": 6166 + }, + "outputs": [], + "source": "import requests\n\ndef get_wiki_extract(title):\n # Base URL for Wikipedia's API\n url = \"https://en.wikipedia.org/w/api.php\"\n \n # Parameters for the API request\n params = {\n \"action\": \"query\",\n \"format\": \"json\",\n \"titles\": title,\n \"prop\": \"extracts\",\n \"exintro\": True, # Only get the intro section\n \"explaintext\": True, # Get plain text instead of HTML\n }\n \n # Make the request\n response = requests.get(url, params=params)\n \n # Check if request was successful\n if response.status_code == 200:\n data = response.json()\n # Navigate through the JSON response to get the extract\n pages = data[\"query\"][\"pages\"]\n # Get the first (and only) page's extract\n page = list(pages.values())[0]\n return page.get(\"extract\", \"No extract available\")\n else:\n return f\"Error: {response.status_code}\"\n\ncat_breeds = [\n 'Abyssinian_cat',\n 'Aegean_cat',\n 'American_Bobtail',\n 'American_Curl',\n 'American_Ringtail',\n 'American_Shorthair',\n 'American_Wirehair',\n 'Arabian_Mau',\n 'Asian_cat',\n 'Asian_Semi-longhair',\n 'Australian_Mist',\n 'Balinese_cat',\n 'Bambino_cat',\n 'Bengal_cat',\n 'Birman',\n 'Bombay_cat',\n 'Brazilian_Shorthair',\n 'British_Longhair',\n 'British_Shorthair',\n 'Burmese_cat',\n 'Burmilla',\n 'California_Spangled',\n 'Chantilly-Tiffany',\n 'Chartreux',\n 'Chausie',\n 'Colorpoint_Shorthair',\n 'Cornish_Rex',\n 'Cymric_cat',\n 'Cyprus_cat',\n 'Devon_Rex',\n 'Donskoy_cat',\n 'Dragon_Li',\n 'Egyptian_Mau',\n 'European_Shorthair',\n 'Exotic_Shorthair',\n 'Foldex_cat',\n 'German_Rex',\n 'Havana_Brown',\n 'Highlander_cat',\n 'Himalayan_cat',\n 'Japanese_Bobtail',\n 'Javanese_cat',\n 'Kanaani_cat',\n 'Khao_Manee',\n 'Kinkalow',\n 'Korat',\n 'Korean_Bobtail',\n 'Kurilian_Bobtail',\n 'Lambkin_cat',\n 'LaPerm',\n 'Lykoi',\n 'Maine_Coon',\n 'Manx_cat',\n 'Mekong_Bobtail',\n 'Minskin',\n 'Minuet_cat',\n 'Munchkin_cat',\n 'Nebelung',\n 'Neva_Masquerade',\n 'Norwegian_Forest_cat',\n 'Ocicat',\n 'Ojos_Azules',\n 'Oriental_bicolour',\n 'Oriental_Longhair',\n 'Oriental_Shorthair',\n 'Persian_cat',\n 'Traditional_Persian',\n 'Peterbald',\n 'Pixie-bob',\n 'Ragamuffin_cat',\n 'Ragdoll',\n 'Raas_cat',\n 'Russian_Blue',\n 'Savannah_cat',\n 'Scottish_Fold',\n 'Selkirk_Rex',\n 'Serengeti_cat',\n 'Siamese_cat',\n 'Siberian_cat',\n 'Singapura_cat',\n 'Snowshoe_cat',\n 'Sokoke',\n 'Somali_cat',\n 'Sphynx_cat',\n 'Suphalak',\n 'Thai_cat',\n 'Tonkinese_cat',\n 'Toybob',\n 'Toyger',\n 'Turkish_Angora',\n 'Turkish_Van',\n 'Van_cat',\n 'Ukrainian_Levkoy',\n 'York_Chocolate'\n]\ncsv_list = []\n\nfor cat in cat_breeds:\n print(cat)\n extract = get_wiki_extract(cat)\n print(extract)\n csv_list.append((cat, extract))\n\n# Convert to dataframe and save\ndf = pd.DataFrame(csv_list, columns=['breed', 'description'])\ndf.to_csv('cat_breeds.csv', index=False, encoding='utf-8')", + "execution_count": null } ] } \ No newline at end of file From 3bde76158ec24becfa2d9c134756b88fa74ff209 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Wed, 18 Dec 2024 11:32:40 -0800 Subject: [PATCH 3/9] Update anaconda_webinar_notebook.ipynb --- .../anaconda_webinar_notebook.ipynb | 187 +++++++++++++++++- 1 file changed, 177 insertions(+), 10 deletions(-) diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index 747841c0..5a39a4bc 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -17,7 +17,7 @@ "collapsed": false, "resultHeight": 0 }, - "source": "# Import python packages\nimport streamlit as st\nimport pandas as pd\nimport os\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n", + "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()", "execution_count": null, "outputs": [] }, @@ -29,7 +29,7 @@ "collapsed": false, "resultHeight": 74 }, - "source": "# Growth Accounting" + "source": "# Growth Accounting\n" }, { "cell_type": "code", @@ -44,6 +44,19 @@ "source": "select\n o_custkey as id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by all\norder by id, order_year", "execution_count": null }, + { + "cell_type": "code", + "id": "61f451db-8ff2-4d83-b9be-6c1a77365446", + "metadata": { + "language": "python", + "name": "cell12", + "collapsed": false, + "resultHeight": 0 + }, + "outputs": [], + "source": "import pandas as pd", + "execution_count": null + }, { "cell_type": "code", "id": "20f1dd62-d796-4190-b34a-89a16fea1819", @@ -106,7 +119,7 @@ "resultHeight": 772 }, "outputs": [], - "source": "# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(cell4, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", + "source": "import streamlit as st\n# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(cell4, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", "execution_count": null }, { @@ -130,17 +143,118 @@ "collapsed": false, "resultHeight": 74 }, - "source": "# Forecasting" + "source": "# Forecasting\n" + }, + { + "cell_type": "code", + "id": "2a9b9481-4d24-4f6c-9b53-4f50add6458e", + "metadata": { + "language": "sql", + "name": "cell14", + "collapsed": false, + "resultHeight": 438 + }, + "outputs": [], + "source": "select\n date_trunc(day, o_orderdate) as order_date,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by 1\norder by order_date asc", + "execution_count": null + }, + { + "cell_type": "code", + "id": "9d5d7b4a-43cc-4c62-844e-a1954c312cbf", + "metadata": { + "language": "python", + "name": "cell15", + "collapsed": false, + "resultHeight": 0 + }, + "outputs": [], + "source": "from prophet import Prophet\nfrom prophet.plot import plot_plotly, plot_components_plotly", + "execution_count": null + }, + { + "cell_type": "code", + "id": "87ca009b-4da8-46c2-a86c-9cad46fac89f", + "metadata": { + "language": "python", + "name": "cell17", + "collapsed": false, + "resultHeight": 150 + }, + "outputs": [], + "source": "df = cell14.to_pandas()\ndf = df.rename(columns={'ORDER_DATE': 'ds', 'TOTAL': 'y'})\nprint(df.head())", + "execution_count": null + }, + { + "cell_type": "code", + "id": "4efeff4d-da4b-4c1d-b3d5-a892bb2a2bc5", + "metadata": { + "language": "python", + "name": "cell19", + "collapsed": false, + "resultHeight": 372 + }, + "outputs": [], + "source": "st.line_chart(df, x='ds', y='y')", + "execution_count": null }, { "cell_type": "markdown", - "id": "1d9d5e85-1ad1-422d-9859-20025e4b8561", + "id": "cbffd526-a4b0-405b-9718-6c5c2f8f6144", "metadata": { - "name": "cell11", + "name": "cell21", + "collapsed": false, + "resultHeight": 120 + }, + "source": "Waiting on role permission to write UDFs for Prophet library to run properly. Until then, code cell below will return \n``` Failed with error [Errno 1] Operation not permitted: '/usr/lib/python_udf/d212b0f949a4a60cf75395f561f7016ea978bad39b2e60eee12ece87d118e861/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin'```" + }, + { + "cell_type": "code", + "id": "9d2c4877-5815-4f49-a53d-816b38de4eb6", + "metadata": { + "language": "python", + "name": "cell26", + "collapsed": false, + "resultHeight": 95 + }, + "outputs": [], + "source": "m = Prophet()\ntry:\n m.fit(df)\nexcept Exception as err:\n print(Exception, err)", + "execution_count": null + }, + { + "cell_type": "code", + "id": "ce582f14-9490-4a54-8fe0-bbfc8b56f61f", + "metadata": { + "language": "python", + "name": "cell23", + "collapsed": false, + "resultHeight": 1126 + }, + "outputs": [], + "source": "future = m.make_future_dataframe(periods=365)\nforecast = m.predict(future)\nfig1 = m.plot(forecast)\n#fig2 = m.plot_components(forecast)", + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "5dc1abf7-b9ea-4fe4-88ae-109342f6dc05", + "metadata": { + "name": "cell25", "collapsed": false, "resultHeight": 74 }, - "source": "# API Enrichment" + "source": "# Customer Segmentation" + }, + { + "cell_type": "code", + "id": "939a7d50-2679-46ee-a43b-b7d03b627d61", + "metadata": { + "language": "sql", + "name": "cell16", + "collapsed": false, + "resultHeight": 426 + }, + "outputs": [], + "source": "select *\nfrom ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST\nlimit 10", + "execution_count": null }, { "cell_type": "code", @@ -149,11 +263,64 @@ "language": "python", "name": "cell13", "collapsed": false, - "resultHeight": 6166 + "resultHeight": 0 + }, + "outputs": [], + "source": "import requests\n\ndef get_wiki_extract(title):\n # Base URL for Wikipedia's API\n url = \"https://en.wikipedia.org/w/api.php\"\n \n # Parameters for the API request\n params = {\n \"action\": \"query\",\n \"format\": \"json\",\n \"titles\": title,\n \"prop\": \"extracts\",\n \"exintro\": True, # Only get the intro section\n \"explaintext\": True, # Get plain text instead of HTML\n }\n \n # Make the request\n response = requests.get(url, params=params)\n \n # Check if request was successful\n if response.status_code == 200:\n data = response.json()\n # Navigate through the JSON response to get the extract\n pages = data[\"query\"][\"pages\"]\n # Get the first (and only) page's extract\n page = list(pages.values())[0]\n return page.get(\"extract\", \"No extract available\")\n else:\n return f\"Error: {response.status_code}\"", + "execution_count": null + }, + { + "cell_type": "code", + "id": "0557102d-3584-469a-9fdc-be53fd0a249b", + "metadata": { + "language": "python", + "name": "cell22", + "collapsed": false, + "resultHeight": 60 + }, + "outputs": [], + "source": "df = cell16.to_pandas()\ncompany_names = df['NAME'].tolist()\ncsv_list = []\n\nprint(\"extracting descriptions\")\n\nfor name in company_names:\n try:\n extract = get_wiki_extract(name.replace(\" \", \"_\"))\n #print(f'extracted description of {name} from Wikipedia')\n except Exception as e:\n #print(f\"Error getting Wikipedia extract for {name}: {str(e)}\")\n extract = \"None available\"\n \n csv_list.append((name, extract))\n\nprint(\"finished extracting descriptions\")", + "execution_count": null + }, + { + "cell_type": "code", + "id": "e979ca68-494a-46d4-a92d-d106d52980fb", + "metadata": { + "language": "python", + "name": "cell18", + "collapsed": false, + "resultHeight": 0 + }, + "outputs": [], + "source": "# save the dataframe as table for SQL querying \ndf = pd.DataFrame(csv_list, columns=['name', 'description'])\ndf = session.create_dataframe(df)\ndf.write.mode(\"overwrite\").save_as_table(\"prospects\", table_type=\"temporary\")", + "execution_count": null + }, + { + "cell_type": "code", + "id": "3f5d40d9-ca69-4137-affa-905caef97c29", + "metadata": { + "language": "sql", + "name": "cell20", + "resultHeight": 426, + "collapsed": false + }, + "outputs": [], + "source": "select \"name\", \"description\" from prospects limit 10", + "execution_count": null + }, + { + "cell_type": "code", + "id": "51396730-f96a-476b-bb12-d7cac8c02576", + "metadata": { + "language": "sql", + "name": "cell24", + "codeCollapsed": false, + "collapsed": false, + "resultHeight": 135 }, "outputs": [], - "source": "import requests\n\ndef get_wiki_extract(title):\n # Base URL for Wikipedia's API\n url = \"https://en.wikipedia.org/w/api.php\"\n \n # Parameters for the API request\n params = {\n \"action\": \"query\",\n \"format\": \"json\",\n \"titles\": title,\n \"prop\": \"extracts\",\n \"exintro\": True, # Only get the intro section\n \"explaintext\": True, # Get plain text instead of HTML\n }\n \n # Make the request\n response = requests.get(url, params=params)\n \n # Check if request was successful\n if response.status_code == 200:\n data = response.json()\n # Navigate through the JSON response to get the extract\n pages = data[\"query\"][\"pages\"]\n # Get the first (and only) page's extract\n page = list(pages.values())[0]\n return page.get(\"extract\", \"No extract available\")\n else:\n return f\"Error: {response.status_code}\"\n\ncat_breeds = [\n 'Abyssinian_cat',\n 'Aegean_cat',\n 'American_Bobtail',\n 'American_Curl',\n 'American_Ringtail',\n 'American_Shorthair',\n 'American_Wirehair',\n 'Arabian_Mau',\n 'Asian_cat',\n 'Asian_Semi-longhair',\n 'Australian_Mist',\n 'Balinese_cat',\n 'Bambino_cat',\n 'Bengal_cat',\n 'Birman',\n 'Bombay_cat',\n 'Brazilian_Shorthair',\n 'British_Longhair',\n 'British_Shorthair',\n 'Burmese_cat',\n 'Burmilla',\n 'California_Spangled',\n 'Chantilly-Tiffany',\n 'Chartreux',\n 'Chausie',\n 'Colorpoint_Shorthair',\n 'Cornish_Rex',\n 'Cymric_cat',\n 'Cyprus_cat',\n 'Devon_Rex',\n 'Donskoy_cat',\n 'Dragon_Li',\n 'Egyptian_Mau',\n 'European_Shorthair',\n 'Exotic_Shorthair',\n 'Foldex_cat',\n 'German_Rex',\n 'Havana_Brown',\n 'Highlander_cat',\n 'Himalayan_cat',\n 'Japanese_Bobtail',\n 'Javanese_cat',\n 'Kanaani_cat',\n 'Khao_Manee',\n 'Kinkalow',\n 'Korat',\n 'Korean_Bobtail',\n 'Kurilian_Bobtail',\n 'Lambkin_cat',\n 'LaPerm',\n 'Lykoi',\n 'Maine_Coon',\n 'Manx_cat',\n 'Mekong_Bobtail',\n 'Minskin',\n 'Minuet_cat',\n 'Munchkin_cat',\n 'Nebelung',\n 'Neva_Masquerade',\n 'Norwegian_Forest_cat',\n 'Ocicat',\n 'Ojos_Azules',\n 'Oriental_bicolour',\n 'Oriental_Longhair',\n 'Oriental_Shorthair',\n 'Persian_cat',\n 'Traditional_Persian',\n 'Peterbald',\n 'Pixie-bob',\n 'Ragamuffin_cat',\n 'Ragdoll',\n 'Raas_cat',\n 'Russian_Blue',\n 'Savannah_cat',\n 'Scottish_Fold',\n 'Selkirk_Rex',\n 'Serengeti_cat',\n 'Siamese_cat',\n 'Siberian_cat',\n 'Singapura_cat',\n 'Snowshoe_cat',\n 'Sokoke',\n 'Somali_cat',\n 'Sphynx_cat',\n 'Suphalak',\n 'Thai_cat',\n 'Tonkinese_cat',\n 'Toybob',\n 'Toyger',\n 'Turkish_Angora',\n 'Turkish_Van',\n 'Van_cat',\n 'Ukrainian_Levkoy',\n 'York_Chocolate'\n]\ncsv_list = []\n\nfor cat in cat_breeds:\n print(cat)\n extract = get_wiki_extract(cat)\n print(extract)\n csv_list.append((cat, extract))\n\n# Convert to dataframe and save\ndf = pd.DataFrame(csv_list, columns=['breed', 'description'])\ndf.to_csv('cat_breeds.csv', index=False, encoding='utf-8')", + "source": "select \n \"name\",\n \"description\",\n snowflake.cortex.classify_text(\n \"description\",\n ['excellent', 'average', 'poor'],\n {\n 'task_description': 'Return the likelihood that this company would be interested in purchasing manufacturing equipment based on this description.'\n }\n ):label::STRING as hair_type\nfrom prospects\n-- other class. ideas: industry, main product, region", "execution_count": null } ] -} \ No newline at end of file +} From 5f5e01559d1cdc754fd8b8f7806911e3becc5bee Mon Sep 17 00:00:00 2001 From: Will Luna Date: Wed, 18 Dec 2024 16:35:03 -0800 Subject: [PATCH 4/9] Update anaconda_webinar_notebook.ipynb added error/warning suppression and working cortex.classify query --- .../anaconda_webinar/anaconda_webinar_notebook.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index 5a39a4bc..0b3e7c08 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -17,7 +17,7 @@ "collapsed": false, "resultHeight": 0 }, - "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()", + "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n\nimport logging\nlogging.getLogger(\"cmdstanpy\").setLevel(logging.WARNING)\nimport warnings\nwarnings.filterwarnings('ignore', category=FutureWarning)", "execution_count": null, "outputs": [] }, @@ -90,7 +90,7 @@ "language": "sql", "name": "cell6", "collapsed": false, - "resultHeight": 510 + "resultHeight": 159 }, "outputs": [], "source": "with windowed as (\n \n select\n *,\n sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n from df\n\n)\n\nselect *,\n case\n when total = previous_year_total and total > 0 then 'retained'\n when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n when total = 0 and previous_year_total > 0 then 'churned'\n when total > previous_year_total and previous_year_total > 0 then 'expanded'\n when total < previous_year_total and previous_year_total > 0 then 'contracted'\n when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n else 'irrelevant' end as category,\n case category\n when 'retained' then 0\n when 'new' then total\n when 'churned' then (-1 * previous_year_total)\n when 'expanded' then total - previous_year_total\n when 'contracted' then (-1 * (previous_year_total - total))\n when 'resurrected' then total\n else 0 end as net_change\nfrom windowed\norder by id, order_year", @@ -214,7 +214,7 @@ "language": "python", "name": "cell26", "collapsed": false, - "resultHeight": 95 + "resultHeight": 0 }, "outputs": [], "source": "m = Prophet()\ntry:\n m.fit(df)\nexcept Exception as err:\n print(Exception, err)", @@ -227,7 +227,7 @@ "language": "python", "name": "cell23", "collapsed": false, - "resultHeight": 1126 + "resultHeight": 885 }, "outputs": [], "source": "future = m.make_future_dataframe(periods=365)\nforecast = m.predict(future)\nfig1 = m.plot(forecast)\n#fig2 = m.plot_components(forecast)", @@ -316,10 +316,10 @@ "name": "cell24", "codeCollapsed": false, "collapsed": false, - "resultHeight": 135 + "resultHeight": 391 }, "outputs": [], - "source": "select \n \"name\",\n \"description\",\n snowflake.cortex.classify_text(\n \"description\",\n ['excellent', 'average', 'poor'],\n {\n 'task_description': 'Return the likelihood that this company would be interested in purchasing manufacturing equipment based on this description.'\n }\n ):label::STRING as hair_type\nfrom prospects\n-- other class. ideas: industry, main product, region", + "source": "select \n \"name\",\n \"description\",\n snowflake.cortex.classify_text(\n \"description\",\n ['extremely likely', 'somewhat likely', 'unlikely'],\n {\n 'task_description': 'Return the likelihood that this company would be interested in attending a webinar showcasing the GTM utility of Snowflake Notebooks and Anaconda Python Packages.'\n }\n ):label::STRING as persona_likelihood,\n snowflake.cortex.classify_text(\n \"description\",\n ['healthcare', 'finance', 'retail', 'technology', 'communication', 'other'],\n {\n 'task_description': 'Return the most likely industry of the company based on this description.'\n }\n ):label::STRING as industry,\n snowflake.cortex.classify_text(\n \"description\",\n ['California', 'South', 'Northeast', 'Midatlantic', 'Midwest', 'Pacific Northwest', 'Outsite the US'],\n {\n 'task_description': 'Return the most likely region the company is headquartered in based on this description.'\n }\n ):label::STRING as region\nfrom prospects\nwhere \"description\" is not null and \"description\" != ''\nlimit 10\n-- other class. ideas: industry, main product, region", "execution_count": null } ] From 892d00714dd82319a4d3ecfb61291dc9349d53c2 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Thu, 2 Jan 2025 13:54:37 -0600 Subject: [PATCH 5/9] Create synthetic_data_generation.py --- .../synthetic_data_generation.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 samples/notebooks/anaconda_webinar/synthetic_data_generation.py diff --git a/samples/notebooks/anaconda_webinar/synthetic_data_generation.py b/samples/notebooks/anaconda_webinar/synthetic_data_generation.py new file mode 100644 index 00000000..9f638bff --- /dev/null +++ b/samples/notebooks/anaconda_webinar/synthetic_data_generation.py @@ -0,0 +1,208 @@ +import numpy as np +from datetime import datetime, timedelta +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.dates import YearLocator, MonthLocator, DateFormatter + +class OrderGenerator: + def __init__( + self, + # Basic parameters + start_date='1992-01-01', + end_date='1998-12-31', + target_daily_total=100_000_000, + target_daily_orders=500, + + # Trend parameters + annual_growth_rate=0.15, # 15% annual growth + order_value_growth_rate=0.05, # 5% annual growth in order values + + # Seasonal parameters + holiday_peak_day=350, # Peak shopping day (Dec 16) + holiday_effect_magnitude=1.0, # Strength of holiday effect + seasonal_baseline=0.8, # Minimum seasonal multiplier + seasonal_spread=1000, # Controls how spread out the holiday effect is + + # Weekly parameters + weekend_dip=0.85, # Weekend order multiplier + weekday_boost=1.1, # Weekday order multiplier + + # Value distribution parameters + pareto_shape=2.0, # Shape parameter for order values + min_value_factor=0.3, # Minimum order value as fraction of average + value_noise_stddev=0.15, # Standard deviation for order value noise + + # Random seed for reproducibility + random_seed=None + ): + self.start_date = pd.to_datetime(start_date) + self.end_date = pd.to_datetime(end_date) + self.target_daily_total = target_daily_total + self.target_daily_orders = target_daily_orders + + # Store all other parameters + self.annual_growth_rate = annual_growth_rate + self.order_value_growth_rate = order_value_growth_rate + self.holiday_peak_day = holiday_peak_day + self.holiday_effect_magnitude = holiday_effect_magnitude + self.seasonal_baseline = seasonal_baseline + self.seasonal_spread = seasonal_spread + self.weekend_dip = weekend_dip + self.weekday_boost = weekday_boost + self.pareto_shape = pareto_shape + self.min_value_factor = min_value_factor + self.value_noise_stddev = value_noise_stddev + + # Derived parameters + self.avg_order_value = target_daily_total / target_daily_orders + self.min_order_value = self.avg_order_value * self.min_value_factor + + if random_seed is not None: + np.random.seed(random_seed) + + def seasonal_effect(self, day_of_year): + """Stronger effect during holiday season""" + holiday_effect = np.exp( + -((day_of_year - self.holiday_peak_day) ** 2) / + self.seasonal_spread + ) * self.holiday_effect_magnitude + return np.maximum(self.seasonal_baseline + holiday_effect, 0) + + def weekly_effect(self, day_of_week): + """Weekend dips in orders""" + return self.weekend_dip if day_of_week in [5, 6] else self.weekday_boost + + def trend_effect(self, years_passed): + """Long-term growth trend""" + return np.power(1 + self.annual_growth_rate, years_passed) + + def generate_order_value(self, years_passed): + """Generate order values following a Pareto distribution""" + u = np.random.random() + value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape) + value = value * np.power(1 + self.order_value_growth_rate, years_passed) + noise = np.random.normal(1, self.value_noise_stddev) + return round(value * noise) + + def generate_clerk(self): + """Generate clerk IDs matching TPCH format""" + clerk_id = np.random.randint(1000) + return f"Clerk#{clerk_id:09d}" + + def generate_customer(self, num_customers=1500): + """Generate customer IDs matching TPCH format""" + return f"Customer#{np.random.randint(num_customers):09d}" + + def generate_orders(self): + """Generate supplementary orders with realistic patterns""" + orders = [] + current_date = self.start_date + + while current_date <= self.end_date: + day_of_year = current_date.dayofyear + years_passed = (current_date - self.start_date).days / 365 + + seasonal = self.seasonal_effect(day_of_year) + weekly = self.weekly_effect(current_date.weekday()) + trend = self.trend_effect(years_passed) + + target_orders = round( + self.target_daily_orders * + seasonal * weekly * trend + ) + + for _ in range(target_orders): + order = { + 'o_orderdate': current_date, + 'o_totalprice': self.generate_order_value(years_passed), + 'o_orderstatus': 'O', + 'o_clerk': self.generate_clerk(), + 'o_custkey': self.generate_customer() + } + orders.append(order) + + current_date += timedelta(days=1) + + df = pd.DataFrame(orders) + df = df.sort_values('o_orderdate') + df['o_orderkey'] = range(len(df)) + df['o_orderkey'] = df['o_orderkey'] + 1_500_000 # Offset to avoid conflicts + + return df + +def generate_and_save_orders(filename, **generator_params): + """Generate orders and save to CSV""" + generator = OrderGenerator(**generator_params) + df = generator.generate_orders() + df.to_csv(filename, index=False) + print(f"Orders saved to {filename}") + return df + +def plot_daily_patterns(filename, figsize=(15, 8), plot_style='compressed'): + """Load orders from CSV and create visualization""" + df = pd.read_csv(filename) + df['o_orderdate'] = pd.to_datetime(df['o_orderdate']) + + daily_summary = df.groupby('o_orderdate').agg({ + 'o_orderkey': 'count', + 'o_totalprice': 'sum' + }).reset_index() + + fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize) + + # Plot daily totals + ax1.plot(daily_summary['o_orderdate'], daily_summary['o_totalprice'], + color='blue', linewidth=0.5) + ax1.set_title('Daily Order Totals') + ax1.set_ylabel('Daily Total ($)') + ax1.grid(True, alpha=0.3) + + # Set x-axis ticks to show years and months + ax1.xaxis.set_major_locator(YearLocator()) + ax1.xaxis.set_minor_locator(MonthLocator()) + ax1.xaxis.set_major_formatter(DateFormatter('%Y')) + ax1.yaxis.set_major_formatter(lambda x, p: f'${x/1e6:.1f}M') + + # Plot daily order counts + ax2.plot(daily_summary['o_orderdate'], daily_summary['o_orderkey'], + color='green', linewidth=0.5) + ax2.set_title('Daily Order Count') + ax2.set_ylabel('Number of Orders') + ax2.grid(True, alpha=0.3) + + ax2.xaxis.set_major_locator(YearLocator()) + ax2.xaxis.set_minor_locator(MonthLocator()) + ax2.xaxis.set_major_formatter(DateFormatter('%Y')) + + for ax in [ax1, ax2]: + plt.setp(ax.get_xticklabels(), rotation=45) + + plt.tight_layout() + + # Print summary statistics + print("\nSummary Statistics:") + print(f"Date Range: {daily_summary['o_orderdate'].min().date()} to {daily_summary['o_orderdate'].max().date()}") + print(f"Average daily orders: {daily_summary['o_orderkey'].mean():.0f}") + print(f"Average daily total: ${daily_summary['o_totalprice'].mean():,.2f}") + + return fig + +if __name__ == "__main__": + # Example: Generate 2 years of data with pronounced patterns + params = { + 'start_date': '1992-01-01', + 'end_date': '1998-08-02', + 'target_daily_total': 100_000_000, + 'target_daily_orders': 500, + 'holiday_effect_magnitude': 1.2, + 'weekend_dip': 0.8, + 'annual_growth_rate': 0.15, + 'value_noise_stddev': 0.15 + } + + # Generate and save orders + generate_and_save_orders('supplementary_orders.csv', **params) + + # Create visualization + fig = plot_daily_patterns('supplementary_orders.csv') + plt.show() From 40b48afbe969e9caf9fab06004862094a33e7dec Mon Sep 17 00:00:00 2001 From: Will Luna Date: Mon, 6 Jan 2025 19:05:37 -0600 Subject: [PATCH 6/9] Update anaconda_webinar_notebook.ipynb --- .../anaconda_webinar_notebook.ipynb | 241 ++++++++---------- 1 file changed, 111 insertions(+), 130 deletions(-) diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index 0b3e7c08..f471f90f 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -13,231 +13,226 @@ "id": "3775908f-ca36-4846-8f38-5adca39217f2", "metadata": { "language": "python", - "name": "cell1", - "collapsed": false, - "resultHeight": 0 + "name": "session_creation", + "resultHeight": 0, + "collapsed": false }, "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n\nimport logging\nlogging.getLogger(\"cmdstanpy\").setLevel(logging.WARNING)\nimport warnings\nwarnings.filterwarnings('ignore', category=FutureWarning)", "execution_count": null, "outputs": [] }, + { + "cell_type": "code", + "id": "d776341f-464d-4a9b-8c98-ac8e05286559", + "metadata": { + "language": "sql", + "name": "orders_sample", + "resultHeight": 426, + "collapsed": false + }, + "outputs": [], + "source": "select\n o_custkey,\n o_orderdate,\n o_totalprice\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\nlimit 10", + "execution_count": null + }, + { + "cell_type": "code", + "id": "23297335-ae53-477e-af45-1355957bc24e", + "metadata": { + "language": "python", + "name": "generate_synthetic_data", + "resultHeight": 60, + "collapsed": false, + "codeCollapsed": true + }, + "outputs": [], + "source": "import numpy as np\nfrom datetime import datetime, timedelta\nimport pandas as pd\n\nclass OrderGenerator:\n def __init__(\n self,\n # Basic parameters\n start_date='1992-01-01',\n end_date='1998-12-31',\n target_daily_total=100_000_000,\n target_daily_orders=500,\n \n # Trend parameters\n annual_growth_rate=0.15, # 15% annual growth\n order_value_growth_rate=0.05, # 5% annual growth in order values\n \n # Seasonal parameters\n holiday_peak_day=350, # Peak shopping day (Dec 16)\n holiday_effect_magnitude=1.0, # Strength of holiday effect\n seasonal_baseline=0.8, # Minimum seasonal multiplier\n seasonal_spread=1000, # Controls how spread out the holiday effect is\n \n # Weekly parameters\n weekend_dip=0.85, # Weekend order multiplier\n weekday_boost=1.1, # Weekday order multiplier\n \n # Value distribution parameters\n pareto_shape=2.0, # Shape parameter for order values\n min_value_factor=0.3, # Minimum order value as fraction of average\n value_noise_stddev=0.15, # Standard deviation for order value noise\n \n # Random seed for reproducibility\n random_seed=None\n ):\n self.start_date = pd.to_datetime(start_date)\n self.end_date = pd.to_datetime(end_date)\n self.target_daily_total = target_daily_total\n self.target_daily_orders = target_daily_orders\n \n # Store all other parameters\n self.annual_growth_rate = annual_growth_rate\n self.order_value_growth_rate = order_value_growth_rate\n self.holiday_peak_day = holiday_peak_day\n self.holiday_effect_magnitude = holiday_effect_magnitude\n self.seasonal_baseline = seasonal_baseline\n self.seasonal_spread = seasonal_spread\n self.weekend_dip = weekend_dip\n self.weekday_boost = weekday_boost\n self.pareto_shape = pareto_shape\n self.min_value_factor = min_value_factor\n self.value_noise_stddev = value_noise_stddev\n \n # Derived parameters\n self.avg_order_value = target_daily_total / target_daily_orders\n self.min_order_value = self.avg_order_value * self.min_value_factor\n \n if random_seed is not None:\n np.random.seed(random_seed)\n \n def seasonal_effect(self, day_of_year):\n \"\"\"Stronger effect during holiday season\"\"\"\n holiday_effect = np.exp(\n -((day_of_year - self.holiday_peak_day) ** 2) / \n self.seasonal_spread\n ) * self.holiday_effect_magnitude\n return np.maximum(self.seasonal_baseline + holiday_effect, 0)\n \n def weekly_effect(self, day_of_week):\n \"\"\"Weekend dips in orders\"\"\"\n return self.weekend_dip if day_of_week in [5, 6] else self.weekday_boost\n \n def trend_effect(self, years_passed):\n \"\"\"Long-term growth trend\"\"\"\n return np.power(1 + self.annual_growth_rate, years_passed)\n \n def generate_order_value(self, years_passed):\n \"\"\"Generate order values following a Pareto distribution\"\"\"\n u = np.random.random()\n value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape)\n value = value * np.power(1 + self.order_value_growth_rate, years_passed)\n noise = np.random.normal(1, self.value_noise_stddev)\n return round(value * noise)\n \n def generate_clerk(self):\n \"\"\"Generate clerk IDs matching TPCH format\"\"\"\n clerk_id = np.random.randint(1000)\n return f\"Clerk#{clerk_id:09d}\"\n \n def generate_customer(self, num_customers=149999):\n \"\"\"Generate customer IDs matching TPCH format\"\"\"\n return np.random.randint(num_customers)\n \n def generate_orders(self):\n \"\"\"Generate supplementary orders with realistic patterns\"\"\"\n orders = []\n current_date = self.start_date\n \n while current_date <= self.end_date:\n day_of_year = current_date.dayofyear\n years_passed = (current_date - self.start_date).days / 365\n \n seasonal = self.seasonal_effect(day_of_year)\n weekly = self.weekly_effect(current_date.weekday())\n trend = self.trend_effect(years_passed)\n \n target_orders = round(\n self.target_daily_orders * \n seasonal * weekly * trend\n )\n \n for _ in range(target_orders):\n order = {\n 'o_orderdate': current_date,\n 'o_totalprice': self.generate_order_value(years_passed),\n 'o_orderstatus': 'O',\n 'o_clerk': self.generate_clerk(),\n 'o_custkey': self.generate_customer()\n }\n orders.append(order)\n \n current_date += timedelta(days=1)\n \n df = pd.DataFrame(orders)\n df = df.sort_values('o_orderdate')\n df['o_orderkey'] = range(len(df))\n df['o_orderkey'] = df['o_orderkey'] + 1_500_000 # Offset to avoid conflicts\n \n return df\n\ndef generate_and_save_synthetic_data():\n \"\"\"Generate orders and save to CSV\"\"\"\n # Example: Generate 2 years of data with pronounced patterns\n params = {\n 'start_date': '1992-01-01',\n 'end_date': '1998-08-02',\n 'target_daily_total': 100_000_000,\n 'target_daily_orders': 500,\n 'holiday_effect_magnitude': 1.2,\n 'weekend_dip': 0.8,\n 'annual_growth_rate': 0.15,\n 'value_noise_stddev': 0.15\n }\n \n generator = OrderGenerator(**params)\n df = generator.generate_orders()\n #save the synthetic data to a temporary table\n filename = 'synthetic_orders'\n df.to_csv(filename + '.csv', index=False)\n print(f\"Orders saved to CSV {filename}.csv\")\n csv_df = pd.read_csv(filename + '.csv')\n csv_df['o_orderdate'] = pd.to_datetime(df['o_orderdate'])\n table_df = session.create_dataframe(csv_df)\n table_df.write.mode(\"overwrite\").save_as_table(filename, table_type=\"temporary\")\n print(f\"Order saved to temporary table {filename}\")\n return\n\n# Generate and save orders\ngenerate_and_save_synthetic_data()", + "execution_count": null + }, { "cell_type": "markdown", - "id": "8ae58f97-bb31-4290-b2dd-2416f3c2ce15", + "id": "ca0f2f8f-33ae-4934-9064-f44a3e5ef5c9", "metadata": { - "name": "cell9", + "name": "growth_accounting_intro", "collapsed": false, "resultHeight": 74 }, - "source": "# Growth Accounting\n" + "source": "# Growth Accounting" }, { "cell_type": "code", - "id": "435baefb-25ff-42a1-b4f8-236a98b4afac", + "id": "b10ebdb4-78f3-49f3-ab81-529b0afd662d", "metadata": { "language": "sql", - "name": "cell3", + "name": "orders", + "resultHeight": 510, "collapsed": false, - "resultHeight": 510 + "codeCollapsed": false }, "outputs": [], - "source": "select\n o_custkey as id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by all\norder by id, order_year", + "source": "with synthetic as (\n\n select\n \"o_custkey\" as id,\n to_date(\"o_orderdate\") as o_orderdate,\n CAST(\"o_totalprice\" AS NUMERIC) as o_totalprice\n from synthetic_orders\n --SAMPLE (1000000 rows)\n\n),\n\noriginal as (\n \n select\n o_custkey as id,\n o_orderdate,\n o_totalprice\n from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\n --SAMPLE (1000000 rows)\n\n)\n\nselect * from synthetic\nunion all \nselect * from original", "execution_count": null }, { "cell_type": "code", - "id": "61f451db-8ff2-4d83-b9be-6c1a77365446", + "id": "b933a301-0086-4682-9a6b-c0d430f62f87", "metadata": { - "language": "python", - "name": "cell12", - "collapsed": false, - "resultHeight": 0 + "language": "sql", + "name": "annual_customer_orders", + "resultHeight": 510 }, "outputs": [], - "source": "import pandas as pd", + "source": "select\n id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom {{ orders }}\ngroup by all\norder by id, order_year", "execution_count": null }, { "cell_type": "code", - "id": "20f1dd62-d796-4190-b34a-89a16fea1819", + "id": "a789790e-47be-4b57-94a1-53832336abb1", "metadata": { "language": "python", - "name": "cell10", - "collapsed": false, + "name": "add_rows_for_years_without_sales", "resultHeight": 0 }, "outputs": [], - "source": "df = cell3.to_pandas()\n\n#pivot data to add row for each id:year with no revenue\nresult = df.pivot_table(\n index='ID',\n columns='ORDER_YEAR', \n values='TOTAL',\n fill_value=0\n).reset_index().melt(\n id_vars='ID',\n var_name='ORDER_YEAR',\n value_name='TOTAL'\n)\n\n# save the dataframe as table for SQL querying \ndf = session.create_dataframe(result)\ndf.write.mode(\"overwrite\").save_as_table(\"df\", table_type=\"temporary\")", + "source": "annual_customer_orders_df = annual_customer_orders.to_pandas()\n\n#pivot data to add row for each id:year with no revenue\nresult = annual_customer_orders_df.pivot_table(\n index='ID',\n columns='ORDER_YEAR', \n values='TOTAL',\n fill_value=0\n).reset_index().melt(\n id_vars='ID',\n var_name='ORDER_YEAR',\n value_name='TOTAL'\n)\n\n# save the dataframe as table for SQL querying \ndf = session.create_dataframe(result)\ndf.write.mode(\"overwrite\").save_as_table(\"annual_customer_orders\", table_type=\"temporary\")", "execution_count": null }, { "cell_type": "code", - "id": "52ae5a36-e143-4ebb-b884-e17750b0c77f", + "id": "70c25d11-94cb-40f0-985a-89e8d8839d8e", "metadata": { "language": "sql", - "name": "cell7", - "collapsed": false, + "name": "sample_annual_customer_orders", "resultHeight": 426 }, "outputs": [], - "source": "select * from df\norder by id, order_year\nlimit 10", + "source": "select * from annual_customer_orders\norder by id, order_year\nlimit 10", "execution_count": null }, { "cell_type": "code", - "id": "11971c03-53a7-4429-870a-4b51bbef7aca", + "id": "d092b952-57aa-4076-b1cd-575279473bab", "metadata": { "language": "sql", - "name": "cell6", - "collapsed": false, - "resultHeight": 159 + "name": "labeled_annual_customer_orders", + "resultHeight": 510 }, "outputs": [], - "source": "with windowed as (\n \n select\n *,\n sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n from df\n\n)\n\nselect *,\n case\n when total = previous_year_total and total > 0 then 'retained'\n when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n when total = 0 and previous_year_total > 0 then 'churned'\n when total > previous_year_total and previous_year_total > 0 then 'expanded'\n when total < previous_year_total and previous_year_total > 0 then 'contracted'\n when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n else 'irrelevant' end as category,\n case category\n when 'retained' then 0\n when 'new' then total\n when 'churned' then (-1 * previous_year_total)\n when 'expanded' then total - previous_year_total\n when 'contracted' then (-1 * (previous_year_total - total))\n when 'resurrected' then total\n else 0 end as net_change\nfrom windowed\norder by id, order_year", + "source": "with windowed as (\n \n select\n *,\n sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n from annual_customer_orders\n\n)\n\nselect *,\n case\n when total = previous_year_total and total > 0 then 'retained'\n when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n when total = 0 and previous_year_total > 0 then 'churned'\n when total > previous_year_total and previous_year_total > 0 then 'expanded'\n when total < previous_year_total and previous_year_total > 0 then 'contracted'\n when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n else 'irrelevant' end as category,\n case category\n when 'retained' then 0\n when 'new' then total\n when 'churned' then (-1 * previous_year_total)\n when 'expanded' then total - previous_year_total\n when 'contracted' then (-1 * (previous_year_total - total))\n when 'resurrected' then total\n else 0 end as net_change\nfrom windowed\norder by id, order_year", "execution_count": null }, { "cell_type": "code", - "id": "13f099e5-4265-438d-ab46-b3315bfc1f1d", + "id": "4fa6afc9-934a-40fb-a8ef-f6aedaec3ba0", "metadata": { "language": "sql", - "name": "cell4", - "collapsed": false, + "name": "annual_growth_labels", "resultHeight": 438 }, "outputs": [], - "source": "select\n date_part(year, order_year) as order_year,\n category,\n round(sum(total)) as total,\n round(sum(net_change)) as net_change\nfrom {{ cell6 }}\ngroup by all", + "source": "select\n date_part(year, order_year) as order_year,\n category,\n round(sum(total)) as total,\n round(sum(net_change)) as net_change\nfrom {{ labeled_annual_customer_orders }}\ngroup by all", "execution_count": null }, { "cell_type": "code", - "id": "735da8fc-91c0-4604-8041-1437208a1f01", + "id": "9f67f2b4-9c22-453d-abc0-68e5fbbc2e7f", "metadata": { "language": "python", - "name": "cell2", - "collapsed": false, - "resultHeight": 772 + "name": "visualize_growth_framework", + "resultHeight": 239 }, "outputs": [], - "source": "import streamlit as st\n# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(cell4, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", + "source": "import streamlit as st\n# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(annual_growth_labels, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", "execution_count": null }, { "cell_type": "code", - "id": "06f083eb-ae70-42ad-af0d-261138126bed", + "id": "2e2a6a8c-14e5-47f2-997e-fa53600564f2", "metadata": { "language": "python", - "name": "cell5", - "collapsed": false, + "name": "download_growth_accounting_csv", "resultHeight": 96 }, "outputs": [], - "source": "df = cell6.to_pandas()\nbutton_csv = df.to_csv().encode(\"utf-8\")\nst.download_button(label=\"Download\", data=button_csv, file_name=\"growth_accounting.csv\", mime=\"text/csv\")", + "source": "df = labeled_annual_customer_orders.to_pandas()\nbutton_csv = df.to_csv().encode(\"utf-8\")\nst.download_button(label=\"Download\", data=button_csv, file_name=\"growth_accounting.csv\", mime=\"text/csv\")", "execution_count": null }, { "cell_type": "markdown", - "id": "db63ea18-13d4-43a4-a29c-a734db89e796", + "id": "fbd5ea2b-6a4f-423e-8e50-ea5d96eb8140", "metadata": { - "name": "cell8", + "name": "forecasting_intro", "collapsed": false, "resultHeight": 74 }, - "source": "# Forecasting\n" + "source": "# Forecasting" }, { "cell_type": "code", - "id": "2a9b9481-4d24-4f6c-9b53-4f50add6458e", + "id": "16ec54e1-54cf-468c-a2d9-8bb8bd4abaaa", "metadata": { "language": "sql", - "name": "cell14", - "collapsed": false, - "resultHeight": 438 - }, - "outputs": [], - "source": "select\n date_trunc(day, o_orderdate) as order_date,\n sum(o_totalprice) as total\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\ngroup by 1\norder by order_date asc", - "execution_count": null - }, - { - "cell_type": "code", - "id": "9d5d7b4a-43cc-4c62-844e-a1954c312cbf", - "metadata": { - "language": "python", - "name": "cell15", - "collapsed": false, - "resultHeight": 0 + "name": "daily_order_data", + "resultHeight": 438, + "collapsed": false }, "outputs": [], - "source": "from prophet import Prophet\nfrom prophet.plot import plot_plotly, plot_components_plotly", + "source": "select\n date_trunc(day, o_orderdate) as order_date,\n sum(o_totalprice) as sum_revenue,\n count(*) as num_orders\nfrom {{ orders }}\ngroup by 1\norder by order_date asc", "execution_count": null }, { "cell_type": "code", - "id": "87ca009b-4da8-46c2-a86c-9cad46fac89f", + "id": "e1368eea-3b25-46fd-92d9-d890e07dc61e", "metadata": { "language": "python", - "name": "cell17", - "collapsed": false, - "resultHeight": 150 + "name": "prophet_data_preparation", + "resultHeight": 372, + "collapsed": false }, "outputs": [], - "source": "df = cell14.to_pandas()\ndf = df.rename(columns={'ORDER_DATE': 'ds', 'TOTAL': 'y'})\nprint(df.head())", + "source": "from prophet import Prophet\nfrom prophet.plot import plot_plotly, plot_components_plotly\n\ndf = daily_order_data.to_pandas()\nprophet_df = df.rename(columns={'ORDER_DATE': 'ds', 'SUM_REVENUE': 'y'})\nst.line_chart(prophet_df, x='ds', y='y')", "execution_count": null }, { "cell_type": "code", - "id": "4efeff4d-da4b-4c1d-b3d5-a892bb2a2bc5", + "id": "bff69396-4c45-477a-a03a-9c173e9e0a02", "metadata": { "language": "python", - "name": "cell19", - "collapsed": false, - "resultHeight": 372 + "name": "project_future_daily_sales", + "resultHeight": 41 }, "outputs": [], - "source": "st.line_chart(df, x='ds', y='y')", + "source": "m = Prophet()\ntry:\n m.fit(prophet_df)\nexcept Exception as err:\n print(Exception, err)\n\nfuture = m.make_future_dataframe(periods=365)\nforecast = m.predict(future)\nfig1 = m.plot(forecast)", "execution_count": null }, - { - "cell_type": "markdown", - "id": "cbffd526-a4b0-405b-9718-6c5c2f8f6144", - "metadata": { - "name": "cell21", - "collapsed": false, - "resultHeight": 120 - }, - "source": "Waiting on role permission to write UDFs for Prophet library to run properly. Until then, code cell below will return \n``` Failed with error [Errno 1] Operation not permitted: '/usr/lib/python_udf/d212b0f949a4a60cf75395f561f7016ea978bad39b2e60eee12ece87d118e861/lib/python3.9/site-packages/prophet/stan_model/prophet_model.bin'```" - }, { "cell_type": "code", - "id": "9d2c4877-5815-4f49-a53d-816b38de4eb6", + "id": "3ad6456c-376a-409b-a006-a42bfbb005fa", "metadata": { "language": "python", - "name": "cell26", - "collapsed": false, - "resultHeight": 0 + "name": "inspect_forecasting_components", + "resultHeight": 41 }, "outputs": [], - "source": "m = Prophet()\ntry:\n m.fit(df)\nexcept Exception as err:\n print(Exception, err)", + "source": "fig2 = m.plot_components(forecast)", "execution_count": null }, { "cell_type": "code", - "id": "ce582f14-9490-4a54-8fe0-bbfc8b56f61f", + "id": "f30b1c81-80bf-4571-b971-84443f55630d", "metadata": { "language": "python", - "name": "cell23", - "collapsed": false, - "resultHeight": 885 + "name": "simplify_forecast_visualization", + "resultHeight": 372 }, "outputs": [], - "source": "future = m.make_future_dataframe(periods=365)\nforecast = m.predict(future)\nfig1 = m.plot(forecast)\n#fig2 = m.plot_components(forecast)", + "source": "df = pd.DataFrame({\n 'ds': forecast['ds'],\n 'y': m.history['y'],\n # Only show yhat for future dates\n 'yhat': np.where(forecast['ds'] > m.history['ds'].max(), forecast['yhat'], np.nan)\n})\n\nst.line_chart(df, x='ds', y=['y', 'yhat'])", "execution_count": null }, { "cell_type": "markdown", - "id": "5dc1abf7-b9ea-4fe4-88ae-109342f6dc05", + "id": "5232d8e1-8ecb-4bb4-94c2-dd7122caaf30", "metadata": { - "name": "cell25", + "name": "customer_segmentation_introduction", "collapsed": false, "resultHeight": 74 }, @@ -245,12 +240,12 @@ }, { "cell_type": "code", - "id": "939a7d50-2679-46ee-a43b-b7d03b627d61", + "id": "6a901764-40e1-4607-850c-444ad00450ef", "metadata": { "language": "sql", - "name": "cell16", - "collapsed": false, - "resultHeight": 426 + "name": "sample_company_data", + "resultHeight": 426, + "collapsed": false }, "outputs": [], "source": "select *\nfrom ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST\nlimit 10", @@ -258,12 +253,12 @@ }, { "cell_type": "code", - "id": "9bd53742-511c-4cf9-9e28-02bdbcaca463", + "id": "e7acf161-5e2d-4277-89ea-65f1256358e4", "metadata": { "language": "python", - "name": "cell13", - "collapsed": false, - "resultHeight": 0 + "name": "construct_api_request", + "resultHeight": 0, + "collapsed": false }, "outputs": [], "source": "import requests\n\ndef get_wiki_extract(title):\n # Base URL for Wikipedia's API\n url = \"https://en.wikipedia.org/w/api.php\"\n \n # Parameters for the API request\n params = {\n \"action\": \"query\",\n \"format\": \"json\",\n \"titles\": title,\n \"prop\": \"extracts\",\n \"exintro\": True, # Only get the intro section\n \"explaintext\": True, # Get plain text instead of HTML\n }\n \n # Make the request\n response = requests.get(url, params=params)\n \n # Check if request was successful\n if response.status_code == 200:\n data = response.json()\n # Navigate through the JSON response to get the extract\n pages = data[\"query\"][\"pages\"]\n # Get the first (and only) page's extract\n page = list(pages.values())[0]\n return page.get(\"extract\", \"No extract available\")\n else:\n return f\"Error: {response.status_code}\"", @@ -271,56 +266,42 @@ }, { "cell_type": "code", - "id": "0557102d-3584-469a-9fdc-be53fd0a249b", - "metadata": { - "language": "python", - "name": "cell22", - "collapsed": false, - "resultHeight": 60 - }, - "outputs": [], - "source": "df = cell16.to_pandas()\ncompany_names = df['NAME'].tolist()\ncsv_list = []\n\nprint(\"extracting descriptions\")\n\nfor name in company_names:\n try:\n extract = get_wiki_extract(name.replace(\" \", \"_\"))\n #print(f'extracted description of {name} from Wikipedia')\n except Exception as e:\n #print(f\"Error getting Wikipedia extract for {name}: {str(e)}\")\n extract = \"None available\"\n \n csv_list.append((name, extract))\n\nprint(\"finished extracting descriptions\")", - "execution_count": null - }, - { - "cell_type": "code", - "id": "e979ca68-494a-46d4-a92d-d106d52980fb", + "id": "94963e7c-8d39-46e5-a035-4838ebb3617e", "metadata": { "language": "python", - "name": "cell18", - "collapsed": false, - "resultHeight": 0 + "name": "extraxt_wikipedia_descriptions", + "resultHeight": 284, + "collapsed": false }, "outputs": [], - "source": "# save the dataframe as table for SQL querying \ndf = pd.DataFrame(csv_list, columns=['name', 'description'])\ndf = session.create_dataframe(df)\ndf.write.mode(\"overwrite\").save_as_table(\"prospects\", table_type=\"temporary\")", + "source": "df = sample_company_data.to_pandas()\ncompany_names = df['NAME'].tolist()\ncsv_list = []\n\nprint(\"extracting descriptions\")\n\nfor name in company_names:\n try:\n extract = get_wiki_extract(name.replace(\" \", \"_\"))\n print(f'extracted description of {name} from Wikipedia')\n except Exception as e:\n print(f\"Error getting Wikipedia extract for {name}: {str(e)}\")\n extract = \"None available\"\n \n csv_list.append((name, extract))\n\nprint(\"finished extracting descriptions\")\n\n# save the dataframe as table for SQL querying \ndf = pd.DataFrame(csv_list, columns=['name', 'description'])\ndf = session.create_dataframe(df)\ndf.write.mode(\"overwrite\").save_as_table(\"prospects\", table_type=\"temporary\")", "execution_count": null }, { "cell_type": "code", - "id": "3f5d40d9-ca69-4137-affa-905caef97c29", + "id": "81c446dc-5c36-42e3-bb0d-985d397af0ca", "metadata": { "language": "sql", - "name": "cell20", + "name": "display_wikipedia_descriptions", "resultHeight": 426, "collapsed": false }, "outputs": [], - "source": "select \"name\", \"description\" from prospects limit 10", + "source": "select \"name\", \"description\" from prospects", "execution_count": null }, { "cell_type": "code", - "id": "51396730-f96a-476b-bb12-d7cac8c02576", + "id": "6b559934-f89d-418e-9a1f-38ef7faa03ad", "metadata": { "language": "sql", - "name": "cell24", - "codeCollapsed": false, - "collapsed": false, - "resultHeight": 391 + "name": "categorize_descriptions_with_LLM", + "resultHeight": 391, + "collapsed": false }, "outputs": [], "source": "select \n \"name\",\n \"description\",\n snowflake.cortex.classify_text(\n \"description\",\n ['extremely likely', 'somewhat likely', 'unlikely'],\n {\n 'task_description': 'Return the likelihood that this company would be interested in attending a webinar showcasing the GTM utility of Snowflake Notebooks and Anaconda Python Packages.'\n }\n ):label::STRING as persona_likelihood,\n snowflake.cortex.classify_text(\n \"description\",\n ['healthcare', 'finance', 'retail', 'technology', 'communication', 'other'],\n {\n 'task_description': 'Return the most likely industry of the company based on this description.'\n }\n ):label::STRING as industry,\n snowflake.cortex.classify_text(\n \"description\",\n ['California', 'South', 'Northeast', 'Midatlantic', 'Midwest', 'Pacific Northwest', 'Outsite the US'],\n {\n 'task_description': 'Return the most likely region the company is headquartered in based on this description.'\n }\n ):label::STRING as region\nfrom prospects\nwhere \"description\" is not null and \"description\" != ''\nlimit 10\n-- other class. ideas: industry, main product, region", "execution_count": null } ] -} +} \ No newline at end of file From 70bb3499124bd387fc5ca16cfca5938a1e7ee6b6 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Mon, 27 Jan 2025 16:17:07 -0800 Subject: [PATCH 7/9] Updated Notebook and instructions --- samples/notebooks/anaconda_webinar/README.md | 13 +- .../anaconda_webinar_notebook.ipynb | 608 +++++++++++++++--- .../synthetic_data_generation.py | 208 ------ .../package-lock.yml | 4 + 4 files changed, 535 insertions(+), 298 deletions(-) delete mode 100644 samples/notebooks/anaconda_webinar/synthetic_data_generation.py create mode 100644 samples/sap_accounts_receivable_dbt/package-lock.yml diff --git a/samples/notebooks/anaconda_webinar/README.md b/samples/notebooks/anaconda_webinar/README.md index f15ffdf1..6f36fa4b 100644 --- a/samples/notebooks/anaconda_webinar/README.md +++ b/samples/notebooks/anaconda_webinar/README.md @@ -1,2 +1,11 @@ -# Title -Placeholder title +# Summary +This notebook showcases how commonly-requested Analytics processes can be quickly implemented in Snowflake Notebooks. Those processes, and the key Python packages used in their implementation, are: + +1. Growth Accounting (pandas, streamlit) +2. Forecasting (prophet, streamlit) +3. Sales Enrichment (requests, Cortex LLM functions) + +# Permissions + +1. Access to `SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS` is necessary. Instructions are available in the Snowflake Documentation [here](https://docs.snowflake.com/en/user-guide/sample-data-using). +2. Access to wikipedia.org is necessary. Instructions for setting up an External Access Integration is available [here](https://docs.snowflake.com/en/user-guide/ui-snowsight/notebooks-external-access). diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index f471f90f..3056e920 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -1,307 +1,739 @@ { - "metadata": { - "kernelspec": { - "display_name": "Streamlit Notebook", - "name": "streamlit" - } - }, - "nbformat_minor": 5, - "nbformat": 4, "cells": [ { "cell_type": "code", + "execution_count": null, "id": "3775908f-ca36-4846-8f38-5adca39217f2", "metadata": { + "collapsed": false, "language": "python", "name": "session_creation", - "resultHeight": 0, - "collapsed": false + "resultHeight": 0 }, - "source": "from snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n\nimport logging\nlogging.getLogger(\"cmdstanpy\").setLevel(logging.WARNING)\nimport warnings\nwarnings.filterwarnings('ignore', category=FutureWarning)", - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "from snowflake.snowpark.context import get_active_session\n", + "session = get_active_session()\n", + "\n", + "import logging\n", + "logging.getLogger(\"cmdstanpy\").setLevel(logging.WARNING)\n", + "import warnings\n", + "warnings.filterwarnings('ignore', category=FutureWarning)" + ] }, { "cell_type": "code", + "execution_count": null, "id": "d776341f-464d-4a9b-8c98-ac8e05286559", "metadata": { + "collapsed": false, "language": "sql", "name": "orders_sample", - "resultHeight": 426, - "collapsed": false + "resultHeight": 426 }, "outputs": [], - "source": "select\n o_custkey,\n o_orderdate,\n o_totalprice\nfrom SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\nlimit 10", - "execution_count": null + "source": [ + "select\n", + " o_custkey,\n", + " o_orderdate,\n", + " o_totalprice\n", + "from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\n", + "limit 10" + ] }, { "cell_type": "code", + "execution_count": null, "id": "23297335-ae53-477e-af45-1355957bc24e", "metadata": { + "codeCollapsed": true, + "collapsed": false, "language": "python", "name": "generate_synthetic_data", - "resultHeight": 60, - "collapsed": false, - "codeCollapsed": true + "resultHeight": 0 }, "outputs": [], - "source": "import numpy as np\nfrom datetime import datetime, timedelta\nimport pandas as pd\n\nclass OrderGenerator:\n def __init__(\n self,\n # Basic parameters\n start_date='1992-01-01',\n end_date='1998-12-31',\n target_daily_total=100_000_000,\n target_daily_orders=500,\n \n # Trend parameters\n annual_growth_rate=0.15, # 15% annual growth\n order_value_growth_rate=0.05, # 5% annual growth in order values\n \n # Seasonal parameters\n holiday_peak_day=350, # Peak shopping day (Dec 16)\n holiday_effect_magnitude=1.0, # Strength of holiday effect\n seasonal_baseline=0.8, # Minimum seasonal multiplier\n seasonal_spread=1000, # Controls how spread out the holiday effect is\n \n # Weekly parameters\n weekend_dip=0.85, # Weekend order multiplier\n weekday_boost=1.1, # Weekday order multiplier\n \n # Value distribution parameters\n pareto_shape=2.0, # Shape parameter for order values\n min_value_factor=0.3, # Minimum order value as fraction of average\n value_noise_stddev=0.15, # Standard deviation for order value noise\n \n # Random seed for reproducibility\n random_seed=None\n ):\n self.start_date = pd.to_datetime(start_date)\n self.end_date = pd.to_datetime(end_date)\n self.target_daily_total = target_daily_total\n self.target_daily_orders = target_daily_orders\n \n # Store all other parameters\n self.annual_growth_rate = annual_growth_rate\n self.order_value_growth_rate = order_value_growth_rate\n self.holiday_peak_day = holiday_peak_day\n self.holiday_effect_magnitude = holiday_effect_magnitude\n self.seasonal_baseline = seasonal_baseline\n self.seasonal_spread = seasonal_spread\n self.weekend_dip = weekend_dip\n self.weekday_boost = weekday_boost\n self.pareto_shape = pareto_shape\n self.min_value_factor = min_value_factor\n self.value_noise_stddev = value_noise_stddev\n \n # Derived parameters\n self.avg_order_value = target_daily_total / target_daily_orders\n self.min_order_value = self.avg_order_value * self.min_value_factor\n \n if random_seed is not None:\n np.random.seed(random_seed)\n \n def seasonal_effect(self, day_of_year):\n \"\"\"Stronger effect during holiday season\"\"\"\n holiday_effect = np.exp(\n -((day_of_year - self.holiday_peak_day) ** 2) / \n self.seasonal_spread\n ) * self.holiday_effect_magnitude\n return np.maximum(self.seasonal_baseline + holiday_effect, 0)\n \n def weekly_effect(self, day_of_week):\n \"\"\"Weekend dips in orders\"\"\"\n return self.weekend_dip if day_of_week in [5, 6] else self.weekday_boost\n \n def trend_effect(self, years_passed):\n \"\"\"Long-term growth trend\"\"\"\n return np.power(1 + self.annual_growth_rate, years_passed)\n \n def generate_order_value(self, years_passed):\n \"\"\"Generate order values following a Pareto distribution\"\"\"\n u = np.random.random()\n value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape)\n value = value * np.power(1 + self.order_value_growth_rate, years_passed)\n noise = np.random.normal(1, self.value_noise_stddev)\n return round(value * noise)\n \n def generate_clerk(self):\n \"\"\"Generate clerk IDs matching TPCH format\"\"\"\n clerk_id = np.random.randint(1000)\n return f\"Clerk#{clerk_id:09d}\"\n \n def generate_customer(self, num_customers=149999):\n \"\"\"Generate customer IDs matching TPCH format\"\"\"\n return np.random.randint(num_customers)\n \n def generate_orders(self):\n \"\"\"Generate supplementary orders with realistic patterns\"\"\"\n orders = []\n current_date = self.start_date\n \n while current_date <= self.end_date:\n day_of_year = current_date.dayofyear\n years_passed = (current_date - self.start_date).days / 365\n \n seasonal = self.seasonal_effect(day_of_year)\n weekly = self.weekly_effect(current_date.weekday())\n trend = self.trend_effect(years_passed)\n \n target_orders = round(\n self.target_daily_orders * \n seasonal * weekly * trend\n )\n \n for _ in range(target_orders):\n order = {\n 'o_orderdate': current_date,\n 'o_totalprice': self.generate_order_value(years_passed),\n 'o_orderstatus': 'O',\n 'o_clerk': self.generate_clerk(),\n 'o_custkey': self.generate_customer()\n }\n orders.append(order)\n \n current_date += timedelta(days=1)\n \n df = pd.DataFrame(orders)\n df = df.sort_values('o_orderdate')\n df['o_orderkey'] = range(len(df))\n df['o_orderkey'] = df['o_orderkey'] + 1_500_000 # Offset to avoid conflicts\n \n return df\n\ndef generate_and_save_synthetic_data():\n \"\"\"Generate orders and save to CSV\"\"\"\n # Example: Generate 2 years of data with pronounced patterns\n params = {\n 'start_date': '1992-01-01',\n 'end_date': '1998-08-02',\n 'target_daily_total': 100_000_000,\n 'target_daily_orders': 500,\n 'holiday_effect_magnitude': 1.2,\n 'weekend_dip': 0.8,\n 'annual_growth_rate': 0.15,\n 'value_noise_stddev': 0.15\n }\n \n generator = OrderGenerator(**params)\n df = generator.generate_orders()\n #save the synthetic data to a temporary table\n filename = 'synthetic_orders'\n df.to_csv(filename + '.csv', index=False)\n print(f\"Orders saved to CSV {filename}.csv\")\n csv_df = pd.read_csv(filename + '.csv')\n csv_df['o_orderdate'] = pd.to_datetime(df['o_orderdate'])\n table_df = session.create_dataframe(csv_df)\n table_df.write.mode(\"overwrite\").save_as_table(filename, table_type=\"temporary\")\n print(f\"Order saved to temporary table {filename}\")\n return\n\n# Generate and save orders\ngenerate_and_save_synthetic_data()", - "execution_count": null + "source": [ + "import numpy as np\n", + "from datetime import datetime, timedelta\n", + "import pandas as pd\n", + "\n", + "class OrderGenerator:\n", + " def __init__(\n", + " self,\n", + " # Basic parameters\n", + " start_date='1992-01-01',\n", + " end_date='1998-12-31',\n", + " target_daily_total=100_000_000,\n", + " target_daily_orders=500,\n", + " \n", + " # Trend parameters\n", + " annual_growth_rate=0.15, # 15% annual growth\n", + " order_value_growth_rate=0.05, # 5% annual growth in order values\n", + " \n", + " # Seasonal parameters\n", + " holiday_peak_day=350, # Peak shopping day (Dec 16)\n", + " holiday_effect_magnitude=1.0, # Strength of holiday effect\n", + " seasonal_baseline=0.8, # Minimum seasonal multiplier\n", + " seasonal_spread=1000, # Controls how spread out the holiday effect is\n", + " \n", + " # Weekly parameters\n", + " weekend_dip=0.85, # Weekend order multiplier\n", + " weekday_boost=1.1, # Weekday order multiplier\n", + " \n", + " # Value distribution parameters\n", + " pareto_shape=2.0, # Shape parameter for order values\n", + " min_value_factor=0.3, # Minimum order value as fraction of average\n", + " value_noise_stddev=0.15, # Standard deviation for order value noise\n", + " \n", + " # Random seed for reproducibility\n", + " random_seed=None\n", + " ):\n", + " self.start_date = pd.to_datetime(start_date)\n", + " self.end_date = pd.to_datetime(end_date)\n", + " self.target_daily_total = target_daily_total\n", + " self.target_daily_orders = target_daily_orders\n", + " \n", + " # Store all other parameters\n", + " self.annual_growth_rate = annual_growth_rate\n", + " self.order_value_growth_rate = order_value_growth_rate\n", + " self.holiday_peak_day = holiday_peak_day\n", + " self.holiday_effect_magnitude = holiday_effect_magnitude\n", + " self.seasonal_baseline = seasonal_baseline\n", + " self.seasonal_spread = seasonal_spread\n", + " self.weekend_dip = weekend_dip\n", + " self.weekday_boost = weekday_boost\n", + " self.pareto_shape = pareto_shape\n", + " self.min_value_factor = min_value_factor\n", + " self.value_noise_stddev = value_noise_stddev\n", + " \n", + " # Derived parameters\n", + " self.avg_order_value = target_daily_total / target_daily_orders\n", + " self.min_order_value = self.avg_order_value * self.min_value_factor\n", + " \n", + " if random_seed is not None:\n", + " np.random.seed(random_seed)\n", + " \n", + " def seasonal_effect(self, day_of_year):\n", + " \"\"\"Stronger effect during holiday season\"\"\"\n", + " holiday_effect = np.exp(\n", + " -((day_of_year - self.holiday_peak_day) ** 2) / \n", + " self.seasonal_spread\n", + " ) * self.holiday_effect_magnitude\n", + " return np.maximum(self.seasonal_baseline + holiday_effect, 0)\n", + " \n", + " def weekly_effect(self, day_of_week):\n", + " \"\"\"Weekend dips in orders\"\"\"\n", + " return self.weekend_dip if day_of_week in [5, 6] else self.weekday_boost\n", + " \n", + " def trend_effect(self, years_passed):\n", + " \"\"\"Long-term growth trend\"\"\"\n", + " return np.power(1 + self.annual_growth_rate, years_passed)\n", + " \n", + " def generate_order_value(self, years_passed):\n", + " \"\"\"Generate order values following a Pareto distribution\"\"\"\n", + " u = np.random.random()\n", + " value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape)\n", + " value = value * np.power(1 + self.order_value_growth_rate, years_passed)\n", + " noise = np.random.normal(1, self.value_noise_stddev)\n", + " return round(value * noise)\n", + " \n", + " def generate_clerk(self):\n", + " \"\"\"Generate clerk IDs matching TPCH format\"\"\"\n", + " clerk_id = np.random.randint(1000)\n", + " return f\"Clerk#{clerk_id:09d}\"\n", + " \n", + " def generate_customer(self, num_customers=149999):\n", + " \"\"\"Generate customer IDs matching TPCH format\"\"\"\n", + " return np.random.randint(num_customers)\n", + " \n", + " def generate_orders(self):\n", + " \"\"\"Generate supplementary orders with realistic patterns\"\"\"\n", + " orders = []\n", + " current_date = self.start_date\n", + " \n", + " while current_date <= self.end_date:\n", + " day_of_year = current_date.dayofyear\n", + " years_passed = (current_date - self.start_date).days / 365\n", + " \n", + " seasonal = self.seasonal_effect(day_of_year)\n", + " weekly = self.weekly_effect(current_date.weekday())\n", + " trend = self.trend_effect(years_passed)\n", + " \n", + " target_orders = round(\n", + " self.target_daily_orders * \n", + " seasonal * weekly * trend\n", + " )\n", + " \n", + " for _ in range(target_orders):\n", + " order = {\n", + " 'o_orderdate': current_date,\n", + " 'o_totalprice': self.generate_order_value(years_passed),\n", + " 'o_orderstatus': 'O',\n", + " 'o_clerk': self.generate_clerk(),\n", + " 'o_custkey': self.generate_customer()\n", + " }\n", + " orders.append(order)\n", + " \n", + " current_date += timedelta(days=1)\n", + " \n", + " df = pd.DataFrame(orders)\n", + " df = df.sort_values('o_orderdate')\n", + " df['o_orderkey'] = range(len(df))\n", + " df['o_orderkey'] = df['o_orderkey'] + 1_500_000 # Offset to avoid conflicts\n", + " \n", + " return df\n", + "\n", + "def generate_and_save_synthetic_data():\n", + " \"\"\"Generate orders and save to CSV\"\"\"\n", + " # Example: Generate 2 years of data with pronounced patterns\n", + " params = {\n", + " 'start_date': '1992-01-01',\n", + " 'end_date': '1998-08-02',\n", + " 'target_daily_total': 100_000_000,\n", + " 'target_daily_orders': 500,\n", + " 'holiday_effect_magnitude': 1.2,\n", + " 'weekend_dip': 0.8,\n", + " 'annual_growth_rate': 0.15,\n", + " 'value_noise_stddev': 0.15\n", + " }\n", + " \n", + " generator = OrderGenerator(**params)\n", + " df = generator.generate_orders()\n", + " #save the synthetic data to a temporary table\n", + " filename = 'synthetic_orders'\n", + " df.to_csv(filename + '.csv', index=False)\n", + " print(f\"Orders saved to CSV {filename}.csv\")\n", + " csv_df = pd.read_csv(filename + '.csv')\n", + " csv_df['o_orderdate'] = pd.to_datetime(df['o_orderdate'])\n", + " table_df = session.create_dataframe(csv_df)\n", + " table_df.write.mode(\"overwrite\").save_as_table(filename, table_type=\"temporary\")\n", + " print(f\"Order saved to temporary table {filename}\")\n", + " return\n", + "\n", + "# Generate and save orders\n", + "generate_and_save_synthetic_data()" + ] }, { "cell_type": "markdown", "id": "ca0f2f8f-33ae-4934-9064-f44a3e5ef5c9", "metadata": { - "name": "growth_accounting_intro", "collapsed": false, + "name": "growth_accounting_intro", "resultHeight": 74 }, - "source": "# Growth Accounting" + "source": [ + "# Growth Accounting" + ] }, { "cell_type": "code", + "execution_count": null, "id": "b10ebdb4-78f3-49f3-ab81-529b0afd662d", "metadata": { + "codeCollapsed": false, + "collapsed": false, "language": "sql", "name": "orders", - "resultHeight": 510, - "collapsed": false, - "codeCollapsed": false + "resultHeight": 510 }, "outputs": [], - "source": "with synthetic as (\n\n select\n \"o_custkey\" as id,\n to_date(\"o_orderdate\") as o_orderdate,\n CAST(\"o_totalprice\" AS NUMERIC) as o_totalprice\n from synthetic_orders\n --SAMPLE (1000000 rows)\n\n),\n\noriginal as (\n \n select\n o_custkey as id,\n o_orderdate,\n o_totalprice\n from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\n --SAMPLE (1000000 rows)\n\n)\n\nselect * from synthetic\nunion all \nselect * from original", - "execution_count": null + "source": [ + "with synthetic as (\n", + "\n", + " select\n", + " \"o_custkey\" as id,\n", + " to_date(\"o_orderdate\") as o_orderdate,\n", + " CAST(\"o_totalprice\" AS NUMERIC) as o_totalprice\n", + " from synthetic_orders\n", + " --SAMPLE (1000000 rows)\n", + "\n", + "),\n", + "\n", + "\n", + "original as (\n", + " \n", + " select\n", + " o_custkey as id,\n", + " o_orderdate,\n", + " o_totalprice\n", + " from SNOWFLAKE_SAMPLE_DATA.TPCH_SF1.ORDERS\n", + " --SAMPLE (1000000 rows)\n", + "\n", + ")\n", + "\n", + "select * from synthetic\n", + "union all \n", + "select * from original" + ] }, { "cell_type": "code", + "execution_count": null, "id": "b933a301-0086-4682-9a6b-c0d430f62f87", "metadata": { + "collapsed": false, "language": "sql", "name": "annual_customer_orders", "resultHeight": 510 }, "outputs": [], - "source": "select\n id,\n date_trunc(year, o_orderdate) as order_year,\n sum(o_totalprice) as total\nfrom {{ orders }}\ngroup by all\norder by id, order_year", - "execution_count": null + "source": [ + "select\n", + " id,\n", + " date_trunc(year, o_orderdate) as order_year,\n", + " sum(o_totalprice) as total\n", + "from {{ orders }}\n", + "group by all\n", + "order by id, order_year" + ] }, { "cell_type": "code", + "execution_count": null, "id": "a789790e-47be-4b57-94a1-53832336abb1", "metadata": { + "collapsed": false, "language": "python", "name": "add_rows_for_years_without_sales", "resultHeight": 0 }, "outputs": [], - "source": "annual_customer_orders_df = annual_customer_orders.to_pandas()\n\n#pivot data to add row for each id:year with no revenue\nresult = annual_customer_orders_df.pivot_table(\n index='ID',\n columns='ORDER_YEAR', \n values='TOTAL',\n fill_value=0\n).reset_index().melt(\n id_vars='ID',\n var_name='ORDER_YEAR',\n value_name='TOTAL'\n)\n\n# save the dataframe as table for SQL querying \ndf = session.create_dataframe(result)\ndf.write.mode(\"overwrite\").save_as_table(\"annual_customer_orders\", table_type=\"temporary\")", - "execution_count": null + "source": [ + "annual_customer_orders_df = annual_customer_orders.to_pandas()\n", + "\n", + "#pivot data to add row for each id:year with no revenue\n", + "result = annual_customer_orders_df.pivot_table(\n", + " index='ID',\n", + " columns='ORDER_YEAR', \n", + " values='TOTAL',\n", + " fill_value=0\n", + ").reset_index().melt(\n", + " id_vars='ID',\n", + " var_name='ORDER_YEAR',\n", + " value_name='TOTAL'\n", + ")\n", + "\n", + "# save the dataframe as table for SQL querying \n", + "df = session.create_dataframe(result)\n", + "df.write.mode(\"overwrite\").save_as_table(\"annual_customer_orders\", table_type=\"temporary\")" + ] }, { "cell_type": "code", + "execution_count": null, "id": "70c25d11-94cb-40f0-985a-89e8d8839d8e", "metadata": { + "collapsed": false, "language": "sql", "name": "sample_annual_customer_orders", - "resultHeight": 426 + "resultHeight": 438 }, "outputs": [], - "source": "select * from annual_customer_orders\norder by id, order_year\nlimit 10", - "execution_count": null + "source": [ + "select * from annual_customer_orders\n", + "order by id, order_year\n", + "limit 25" + ] }, { "cell_type": "code", + "execution_count": null, "id": "d092b952-57aa-4076-b1cd-575279473bab", "metadata": { + "collapsed": false, "language": "sql", "name": "labeled_annual_customer_orders", "resultHeight": 510 }, "outputs": [], - "source": "with windowed as (\n \n select\n *,\n sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n from annual_customer_orders\n\n)\n\nselect *,\n case\n when total = previous_year_total and total > 0 then 'retained'\n when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n when total = 0 and previous_year_total > 0 then 'churned'\n when total > previous_year_total and previous_year_total > 0 then 'expanded'\n when total < previous_year_total and previous_year_total > 0 then 'contracted'\n when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n else 'irrelevant' end as category,\n case category\n when 'retained' then 0\n when 'new' then total\n when 'churned' then (-1 * previous_year_total)\n when 'expanded' then total - previous_year_total\n when 'contracted' then (-1 * (previous_year_total - total))\n when 'resurrected' then total\n else 0 end as net_change\nfrom windowed\norder by id, order_year", - "execution_count": null + "source": [ + "with windowed as (\n", + " \n", + " select\n", + " *,\n", + " sum(total) over(partition by id order by order_year asc) as lifetime_spend,\n", + " coalesce(lag(total) over(partition by id order by order_year asc), 0) as previous_year_total,\n", + " from annual_customer_orders\n", + "\n", + ")\n", + "\n", + "select *,\n", + " case\n", + " when total = previous_year_total and total > 0 then 'retained'\n", + " when total > 0 and previous_year_total = 0 and lifetime_spend = total then 'new'\n", + " when total = 0 and previous_year_total > 0 then 'churned'\n", + " when total > previous_year_total and previous_year_total > 0 then 'expanded'\n", + " when total < previous_year_total and previous_year_total > 0 then 'contracted'\n", + " when total > 0 and previous_year_total = 0 and lifetime_spend > total then 'resurrected'\n", + " else 'irrelevant' end as category,\n", + " case category\n", + " when 'retained' then 0\n", + " when 'new' then total\n", + " when 'churned' then (-1 * previous_year_total)\n", + " when 'expanded' then total - previous_year_total\n", + " when 'contracted' then (-1 * (previous_year_total - total))\n", + " when 'resurrected' then total\n", + " else 0 end as net_change\n", + "from windowed\n", + "order by id, order_year" + ] }, { "cell_type": "code", + "execution_count": null, "id": "4fa6afc9-934a-40fb-a8ef-f6aedaec3ba0", "metadata": { + "collapsed": false, "language": "sql", "name": "annual_growth_labels", "resultHeight": 438 }, "outputs": [], - "source": "select\n date_part(year, order_year) as order_year,\n category,\n round(sum(total)) as total,\n round(sum(net_change)) as net_change\nfrom {{ labeled_annual_customer_orders }}\ngroup by all", - "execution_count": null + "source": [ + "with final as (\n", + "\n", + "select\n", + " date_part(year, order_year) as order_year,\n", + " category,\n", + " round(sum(total)) as total,\n", + " round(sum(net_change)) as net_change\n", + "from {{ labeled_annual_customer_orders }}\n", + "group by all\n", + "\n", + ")\n", + "\n", + "select * from final\n", + "-- exclude first and last years\n", + "where order_year not in (1992, 1998)" + ] }, { "cell_type": "code", + "execution_count": null, "id": "9f67f2b4-9c22-453d-abc0-68e5fbbc2e7f", "metadata": { + "collapsed": false, "language": "python", "name": "visualize_growth_framework", - "resultHeight": 239 + "resultHeight": 772 }, "outputs": [], - "source": "import streamlit as st\n# Option to define dictionary to color code each category, may need to use matplotlib\n# Option to use altair for better control of ticks on Y axis\nst.bar_chart(annual_growth_labels, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)", - "execution_count": null + "source": [ + "import streamlit as st\n", + "st.bar_chart(annual_growth_labels, x='ORDER_YEAR', y='NET_CHANGE', color='CATEGORY', height=750)" + ] }, { "cell_type": "code", + "execution_count": null, "id": "2e2a6a8c-14e5-47f2-997e-fa53600564f2", "metadata": { + "collapsed": false, "language": "python", "name": "download_growth_accounting_csv", "resultHeight": 96 }, "outputs": [], - "source": "df = labeled_annual_customer_orders.to_pandas()\nbutton_csv = df.to_csv().encode(\"utf-8\")\nst.download_button(label=\"Download\", data=button_csv, file_name=\"growth_accounting.csv\", mime=\"text/csv\")", - "execution_count": null + "source": [ + "df = labeled_annual_customer_orders.to_pandas()\n", + "button_csv = df.to_csv().encode(\"utf-8\")\n", + "st.download_button(label=\"Download\", data=button_csv, file_name=\"growth_accounting.csv\", mime=\"text/csv\")" + ] }, { "cell_type": "markdown", "id": "fbd5ea2b-6a4f-423e-8e50-ea5d96eb8140", "metadata": { - "name": "forecasting_intro", "collapsed": false, + "name": "forecasting_intro", "resultHeight": 74 }, - "source": "# Forecasting" + "source": [ + "# Forecasting" + ] }, { "cell_type": "code", + "execution_count": null, "id": "16ec54e1-54cf-468c-a2d9-8bb8bd4abaaa", "metadata": { + "collapsed": false, "language": "sql", "name": "daily_order_data", - "resultHeight": 438, - "collapsed": false + "resultHeight": 438 }, "outputs": [], - "source": "select\n date_trunc(day, o_orderdate) as order_date,\n sum(o_totalprice) as sum_revenue,\n count(*) as num_orders\nfrom {{ orders }}\ngroup by 1\norder by order_date asc", - "execution_count": null + "source": [ + "select\n", + " date_trunc(day, o_orderdate) as order_date,\n", + " sum(o_totalprice) as sum_revenue,\n", + " count(*) as num_orders\n", + "from {{ orders }}\n", + "group by 1\n", + "order by order_date asc" + ] }, { "cell_type": "code", + "execution_count": null, "id": "e1368eea-3b25-46fd-92d9-d890e07dc61e", "metadata": { + "collapsed": false, "language": "python", "name": "prophet_data_preparation", - "resultHeight": 372, - "collapsed": false + "resultHeight": 372 }, "outputs": [], - "source": "from prophet import Prophet\nfrom prophet.plot import plot_plotly, plot_components_plotly\n\ndf = daily_order_data.to_pandas()\nprophet_df = df.rename(columns={'ORDER_DATE': 'ds', 'SUM_REVENUE': 'y'})\nst.line_chart(prophet_df, x='ds', y='y')", - "execution_count": null + "source": [ + "from prophet import Prophet\n", + "from prophet.plot import plot_plotly, plot_components_plotly\n", + "\n", + "df = daily_order_data.to_pandas()\n", + "prophet_df = df.rename(columns={'ORDER_DATE': 'ds', 'SUM_REVENUE': 'y'})\n", + "st.line_chart(prophet_df, x='ds', y='y')" + ] }, { "cell_type": "code", + "execution_count": null, "id": "bff69396-4c45-477a-a03a-9c173e9e0a02", "metadata": { + "collapsed": false, "language": "python", "name": "project_future_daily_sales", - "resultHeight": 41 + "resultHeight": 981 }, "outputs": [], - "source": "m = Prophet()\ntry:\n m.fit(prophet_df)\nexcept Exception as err:\n print(Exception, err)\n\nfuture = m.make_future_dataframe(periods=365)\nforecast = m.predict(future)\nfig1 = m.plot(forecast)", - "execution_count": null + "source": [ + "m = Prophet()\n", + "try:\n", + " m.fit(prophet_df)\n", + "except Exception as err:\n", + " print(Exception, err)\n", + "\n", + "future = m.make_future_dataframe(periods=365)\n", + "forecast = m.predict(future)\n", + "fig1 = m.plot(forecast)" + ] }, { "cell_type": "code", + "execution_count": null, "id": "3ad6456c-376a-409b-a006-a42bfbb005fa", "metadata": { + "collapsed": false, "language": "python", "name": "inspect_forecasting_components", - "resultHeight": 41 + "resultHeight": 1480 }, "outputs": [], - "source": "fig2 = m.plot_components(forecast)", - "execution_count": null + "source": [ + "fig2 = m.plot_components(forecast)" + ] }, { "cell_type": "code", + "execution_count": null, "id": "f30b1c81-80bf-4571-b971-84443f55630d", "metadata": { + "collapsed": false, "language": "python", "name": "simplify_forecast_visualization", "resultHeight": 372 }, "outputs": [], - "source": "df = pd.DataFrame({\n 'ds': forecast['ds'],\n 'y': m.history['y'],\n # Only show yhat for future dates\n 'yhat': np.where(forecast['ds'] > m.history['ds'].max(), forecast['yhat'], np.nan)\n})\n\nst.line_chart(df, x='ds', y=['y', 'yhat'])", - "execution_count": null + "source": [ + "df = pd.DataFrame({\n", + " 'ds': forecast['ds'],\n", + " 'y': m.history['y'],\n", + " # Only show yhat for future dates\n", + " 'yhat': np.where(forecast['ds'] > m.history['ds'].max(), forecast['yhat'], np.nan)\n", + "})\n", + "\n", + "st.line_chart(df, x='ds', y=['y', 'yhat'])" + ] }, { "cell_type": "markdown", "id": "5232d8e1-8ecb-4bb4-94c2-dd7122caaf30", "metadata": { - "name": "customer_segmentation_introduction", "collapsed": false, + "name": "customer_segmentation_introduction", "resultHeight": 74 }, - "source": "# Customer Segmentation" + "source": [ + "# Customer Segmentation" + ] }, { "cell_type": "code", + "execution_count": null, "id": "6a901764-40e1-4607-850c-444ad00450ef", "metadata": { + "collapsed": false, "language": "sql", "name": "sample_company_data", - "resultHeight": 426, - "collapsed": false + "resultHeight": 438 }, "outputs": [], - "source": "select *\nfrom ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST\nlimit 10", - "execution_count": null + "source": [ + "select *\n", + "from ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST\n", + "limit 20" + ] }, { "cell_type": "code", + "execution_count": null, "id": "e7acf161-5e2d-4277-89ea-65f1256358e4", "metadata": { + "collapsed": false, "language": "python", "name": "construct_api_request", - "resultHeight": 0, - "collapsed": false + "resultHeight": 0 }, "outputs": [], - "source": "import requests\n\ndef get_wiki_extract(title):\n # Base URL for Wikipedia's API\n url = \"https://en.wikipedia.org/w/api.php\"\n \n # Parameters for the API request\n params = {\n \"action\": \"query\",\n \"format\": \"json\",\n \"titles\": title,\n \"prop\": \"extracts\",\n \"exintro\": True, # Only get the intro section\n \"explaintext\": True, # Get plain text instead of HTML\n }\n \n # Make the request\n response = requests.get(url, params=params)\n \n # Check if request was successful\n if response.status_code == 200:\n data = response.json()\n # Navigate through the JSON response to get the extract\n pages = data[\"query\"][\"pages\"]\n # Get the first (and only) page's extract\n page = list(pages.values())[0]\n return page.get(\"extract\", \"No extract available\")\n else:\n return f\"Error: {response.status_code}\"", - "execution_count": null + "source": [ + "import requests\n", + "\n", + "def get_wiki_extract(title):\n", + " # Base URL for Wikipedia's API\n", + " url = \"https://en.wikipedia.org/w/api.php\"\n", + " \n", + " # Parameters for the API request\n", + " params = {\n", + " \"action\": \"query\",\n", + " \"format\": \"json\",\n", + " \"titles\": title,\n", + " \"prop\": \"extracts\",\n", + " \"exintro\": True, # Only get the intro section\n", + " \"explaintext\": True, # Get plain text instead of HTML\n", + " }\n", + " \n", + " # Make the request\n", + " response = requests.get(url, params=params)\n", + " \n", + " # Check if request was successful\n", + " if response.status_code == 200:\n", + " data = response.json()\n", + " # Navigate through the JSON response to get the extract\n", + " pages = data[\"query\"][\"pages\"]\n", + " # Get the first (and only) page's extract\n", + " page = list(pages.values())[0]\n", + " return page.get(\"extract\", \"No extract available\")\n", + " else:\n", + " return f\"Error: {response.status_code}\"" + ] }, { "cell_type": "code", + "execution_count": null, "id": "94963e7c-8d39-46e5-a035-4838ebb3617e", "metadata": { + "collapsed": false, "language": "python", - "name": "extraxt_wikipedia_descriptions", - "resultHeight": 284, - "collapsed": false + "name": "extract_wikipedia_descriptions", + "resultHeight": 508 }, "outputs": [], - "source": "df = sample_company_data.to_pandas()\ncompany_names = df['NAME'].tolist()\ncsv_list = []\n\nprint(\"extracting descriptions\")\n\nfor name in company_names:\n try:\n extract = get_wiki_extract(name.replace(\" \", \"_\"))\n print(f'extracted description of {name} from Wikipedia')\n except Exception as e:\n print(f\"Error getting Wikipedia extract for {name}: {str(e)}\")\n extract = \"None available\"\n \n csv_list.append((name, extract))\n\nprint(\"finished extracting descriptions\")\n\n# save the dataframe as table for SQL querying \ndf = pd.DataFrame(csv_list, columns=['name', 'description'])\ndf = session.create_dataframe(df)\ndf.write.mode(\"overwrite\").save_as_table(\"prospects\", table_type=\"temporary\")", - "execution_count": null + "source": [ + "df = sample_company_data.to_pandas()\n", + "company_names = df['NAME'].tolist()\n", + "csv_list = []\n", + "\n", + "print(\"extracting descriptions\")\n", + "\n", + "for name in company_names:\n", + " try:\n", + " extract = get_wiki_extract(name.replace(\" \", \"_\"))\n", + " print(f'extracted description of {name} from Wikipedia')\n", + " except Exception as e:\n", + " print(f\"Error getting Wikipedia extract for {name}: {str(e)}\")\n", + " extract = \"None available\"\n", + " \n", + " csv_list.append((name, extract))\n", + "\n", + "print(\"finished extracting descriptions\")\n", + "\n", + "# save the dataframe as table for SQL querying \n", + "df = pd.DataFrame(csv_list, columns=['name', 'description'])\n", + "df = session.create_dataframe(df)\n", + "df.write.mode(\"overwrite\").save_as_table(\"prospects\", table_type=\"temporary\")" + ] }, { "cell_type": "code", + "execution_count": null, "id": "81c446dc-5c36-42e3-bb0d-985d397af0ca", "metadata": { + "collapsed": false, "language": "sql", "name": "display_wikipedia_descriptions", - "resultHeight": 426, - "collapsed": false + "resultHeight": 438 }, "outputs": [], - "source": "select \"name\", \"description\" from prospects", - "execution_count": null + "source": [ + "select \"name\", \"description\" from prospects" + ] }, { "cell_type": "code", + "execution_count": null, "id": "6b559934-f89d-418e-9a1f-38ef7faa03ad", "metadata": { + "collapsed": false, "language": "sql", "name": "categorize_descriptions_with_LLM", - "resultHeight": 391, - "collapsed": false + "resultHeight": 426 }, "outputs": [], - "source": "select \n \"name\",\n \"description\",\n snowflake.cortex.classify_text(\n \"description\",\n ['extremely likely', 'somewhat likely', 'unlikely'],\n {\n 'task_description': 'Return the likelihood that this company would be interested in attending a webinar showcasing the GTM utility of Snowflake Notebooks and Anaconda Python Packages.'\n }\n ):label::STRING as persona_likelihood,\n snowflake.cortex.classify_text(\n \"description\",\n ['healthcare', 'finance', 'retail', 'technology', 'communication', 'other'],\n {\n 'task_description': 'Return the most likely industry of the company based on this description.'\n }\n ):label::STRING as industry,\n snowflake.cortex.classify_text(\n \"description\",\n ['California', 'South', 'Northeast', 'Midatlantic', 'Midwest', 'Pacific Northwest', 'Outsite the US'],\n {\n 'task_description': 'Return the most likely region the company is headquartered in based on this description.'\n }\n ):label::STRING as region\nfrom prospects\nwhere \"description\" is not null and \"description\" != ''\nlimit 10\n-- other class. ideas: industry, main product, region", - "execution_count": null + "source": [ + "select\n", + " \"name\",\n", + " snowflake.cortex.classify_text(\n", + " \"description\",\n", + " ['extremely likely', 'somewhat likely', 'unlikely'],\n", + " {\n", + " 'task_description': 'Return the likelihood that this company would be interested in attending a webinar showcasing the GTM utility of Snowflake Notebooks and Anaconda Python Packages.'\n", + " }\n", + " ):label::STRING as persona_likelihood,\n", + " snowflake.cortex.classify_text(\n", + " \"description\",\n", + " ['healthcare', 'finance', 'retail', 'technology', 'communication', 'other'],\n", + " {\n", + " 'task_description': 'Return the most likely industry of the company based on this description.'\n", + " }\n", + " ):label::STRING as industry,\n", + " snowflake.cortex.classify_text(\n", + " \"description\",\n", + " ['California', 'South', 'Northeast', 'Midatlantic', 'Midwest', 'Pacific Northwest', 'Outsite the US'],\n", + " {\n", + " 'task_description': 'Return the most likely region the company is headquartered in based on this description.'\n", + " }\n", + " ):label::STRING as region,\n", + " \"description\"\n", + "from prospects\n", + "where \"description\" is not null and \"description\" != ''\n", + "limit 10\n" + ] } - ] -} \ No newline at end of file + ], + "metadata": { + "kernelspec": { + "display_name": "Streamlit Notebook", + "name": "streamlit" + }, + "lastEditStatus": { + "authorEmail": "wluna@anaconda.com", + "authorId": "405715820451", + "authorName": "WLUNA", + "lastEditTime": 1737744033132, + "notebookId": "2jcfdffhscjksh5ccsf7", + "sessionId": "e4e4ef1b-68d9-44f4-b7b9-1d472664a700" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/samples/notebooks/anaconda_webinar/synthetic_data_generation.py b/samples/notebooks/anaconda_webinar/synthetic_data_generation.py deleted file mode 100644 index 9f638bff..00000000 --- a/samples/notebooks/anaconda_webinar/synthetic_data_generation.py +++ /dev/null @@ -1,208 +0,0 @@ -import numpy as np -from datetime import datetime, timedelta -import pandas as pd -import matplotlib.pyplot as plt -from matplotlib.dates import YearLocator, MonthLocator, DateFormatter - -class OrderGenerator: - def __init__( - self, - # Basic parameters - start_date='1992-01-01', - end_date='1998-12-31', - target_daily_total=100_000_000, - target_daily_orders=500, - - # Trend parameters - annual_growth_rate=0.15, # 15% annual growth - order_value_growth_rate=0.05, # 5% annual growth in order values - - # Seasonal parameters - holiday_peak_day=350, # Peak shopping day (Dec 16) - holiday_effect_magnitude=1.0, # Strength of holiday effect - seasonal_baseline=0.8, # Minimum seasonal multiplier - seasonal_spread=1000, # Controls how spread out the holiday effect is - - # Weekly parameters - weekend_dip=0.85, # Weekend order multiplier - weekday_boost=1.1, # Weekday order multiplier - - # Value distribution parameters - pareto_shape=2.0, # Shape parameter for order values - min_value_factor=0.3, # Minimum order value as fraction of average - value_noise_stddev=0.15, # Standard deviation for order value noise - - # Random seed for reproducibility - random_seed=None - ): - self.start_date = pd.to_datetime(start_date) - self.end_date = pd.to_datetime(end_date) - self.target_daily_total = target_daily_total - self.target_daily_orders = target_daily_orders - - # Store all other parameters - self.annual_growth_rate = annual_growth_rate - self.order_value_growth_rate = order_value_growth_rate - self.holiday_peak_day = holiday_peak_day - self.holiday_effect_magnitude = holiday_effect_magnitude - self.seasonal_baseline = seasonal_baseline - self.seasonal_spread = seasonal_spread - self.weekend_dip = weekend_dip - self.weekday_boost = weekday_boost - self.pareto_shape = pareto_shape - self.min_value_factor = min_value_factor - self.value_noise_stddev = value_noise_stddev - - # Derived parameters - self.avg_order_value = target_daily_total / target_daily_orders - self.min_order_value = self.avg_order_value * self.min_value_factor - - if random_seed is not None: - np.random.seed(random_seed) - - def seasonal_effect(self, day_of_year): - """Stronger effect during holiday season""" - holiday_effect = np.exp( - -((day_of_year - self.holiday_peak_day) ** 2) / - self.seasonal_spread - ) * self.holiday_effect_magnitude - return np.maximum(self.seasonal_baseline + holiday_effect, 0) - - def weekly_effect(self, day_of_week): - """Weekend dips in orders""" - return self.weekend_dip if day_of_week in [5, 6] else self.weekday_boost - - def trend_effect(self, years_passed): - """Long-term growth trend""" - return np.power(1 + self.annual_growth_rate, years_passed) - - def generate_order_value(self, years_passed): - """Generate order values following a Pareto distribution""" - u = np.random.random() - value = self.min_order_value / np.power(1 - u, 1/self.pareto_shape) - value = value * np.power(1 + self.order_value_growth_rate, years_passed) - noise = np.random.normal(1, self.value_noise_stddev) - return round(value * noise) - - def generate_clerk(self): - """Generate clerk IDs matching TPCH format""" - clerk_id = np.random.randint(1000) - return f"Clerk#{clerk_id:09d}" - - def generate_customer(self, num_customers=1500): - """Generate customer IDs matching TPCH format""" - return f"Customer#{np.random.randint(num_customers):09d}" - - def generate_orders(self): - """Generate supplementary orders with realistic patterns""" - orders = [] - current_date = self.start_date - - while current_date <= self.end_date: - day_of_year = current_date.dayofyear - years_passed = (current_date - self.start_date).days / 365 - - seasonal = self.seasonal_effect(day_of_year) - weekly = self.weekly_effect(current_date.weekday()) - trend = self.trend_effect(years_passed) - - target_orders = round( - self.target_daily_orders * - seasonal * weekly * trend - ) - - for _ in range(target_orders): - order = { - 'o_orderdate': current_date, - 'o_totalprice': self.generate_order_value(years_passed), - 'o_orderstatus': 'O', - 'o_clerk': self.generate_clerk(), - 'o_custkey': self.generate_customer() - } - orders.append(order) - - current_date += timedelta(days=1) - - df = pd.DataFrame(orders) - df = df.sort_values('o_orderdate') - df['o_orderkey'] = range(len(df)) - df['o_orderkey'] = df['o_orderkey'] + 1_500_000 # Offset to avoid conflicts - - return df - -def generate_and_save_orders(filename, **generator_params): - """Generate orders and save to CSV""" - generator = OrderGenerator(**generator_params) - df = generator.generate_orders() - df.to_csv(filename, index=False) - print(f"Orders saved to {filename}") - return df - -def plot_daily_patterns(filename, figsize=(15, 8), plot_style='compressed'): - """Load orders from CSV and create visualization""" - df = pd.read_csv(filename) - df['o_orderdate'] = pd.to_datetime(df['o_orderdate']) - - daily_summary = df.groupby('o_orderdate').agg({ - 'o_orderkey': 'count', - 'o_totalprice': 'sum' - }).reset_index() - - fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize) - - # Plot daily totals - ax1.plot(daily_summary['o_orderdate'], daily_summary['o_totalprice'], - color='blue', linewidth=0.5) - ax1.set_title('Daily Order Totals') - ax1.set_ylabel('Daily Total ($)') - ax1.grid(True, alpha=0.3) - - # Set x-axis ticks to show years and months - ax1.xaxis.set_major_locator(YearLocator()) - ax1.xaxis.set_minor_locator(MonthLocator()) - ax1.xaxis.set_major_formatter(DateFormatter('%Y')) - ax1.yaxis.set_major_formatter(lambda x, p: f'${x/1e6:.1f}M') - - # Plot daily order counts - ax2.plot(daily_summary['o_orderdate'], daily_summary['o_orderkey'], - color='green', linewidth=0.5) - ax2.set_title('Daily Order Count') - ax2.set_ylabel('Number of Orders') - ax2.grid(True, alpha=0.3) - - ax2.xaxis.set_major_locator(YearLocator()) - ax2.xaxis.set_minor_locator(MonthLocator()) - ax2.xaxis.set_major_formatter(DateFormatter('%Y')) - - for ax in [ax1, ax2]: - plt.setp(ax.get_xticklabels(), rotation=45) - - plt.tight_layout() - - # Print summary statistics - print("\nSummary Statistics:") - print(f"Date Range: {daily_summary['o_orderdate'].min().date()} to {daily_summary['o_orderdate'].max().date()}") - print(f"Average daily orders: {daily_summary['o_orderkey'].mean():.0f}") - print(f"Average daily total: ${daily_summary['o_totalprice'].mean():,.2f}") - - return fig - -if __name__ == "__main__": - # Example: Generate 2 years of data with pronounced patterns - params = { - 'start_date': '1992-01-01', - 'end_date': '1998-08-02', - 'target_daily_total': 100_000_000, - 'target_daily_orders': 500, - 'holiday_effect_magnitude': 1.2, - 'weekend_dip': 0.8, - 'annual_growth_rate': 0.15, - 'value_noise_stddev': 0.15 - } - - # Generate and save orders - generate_and_save_orders('supplementary_orders.csv', **params) - - # Create visualization - fig = plot_daily_patterns('supplementary_orders.csv') - plt.show() diff --git a/samples/sap_accounts_receivable_dbt/package-lock.yml b/samples/sap_accounts_receivable_dbt/package-lock.yml new file mode 100644 index 00000000..7fa7e893 --- /dev/null +++ b/samples/sap_accounts_receivable_dbt/package-lock.yml @@ -0,0 +1,4 @@ +packages: + - git: https://github.com/dbt-labs/dbt-utils.git + revision: 68b4b4dadc20cd5cc2a894bd2ad62aa1b8176dc7 +sha1_hash: 0f4dc0fb373403efb568c23241b42220d924b872 From ffdc1143b194c74d1a8bcea776b4e10629c69aca Mon Sep 17 00:00:00 2001 From: Will Luna Date: Mon, 27 Jan 2025 16:19:07 -0800 Subject: [PATCH 8/9] Delete package-lock.yml --- samples/sap_accounts_receivable_dbt/package-lock.yml | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 samples/sap_accounts_receivable_dbt/package-lock.yml diff --git a/samples/sap_accounts_receivable_dbt/package-lock.yml b/samples/sap_accounts_receivable_dbt/package-lock.yml deleted file mode 100644 index 7fa7e893..00000000 --- a/samples/sap_accounts_receivable_dbt/package-lock.yml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - - git: https://github.com/dbt-labs/dbt-utils.git - revision: 68b4b4dadc20cd5cc2a894bd2ad62aa1b8176dc7 -sha1_hash: 0f4dc0fb373403efb568c23241b42220d924b872 From a4279f5e91f9d48476e1beb74c0adf6d68f79348 Mon Sep 17 00:00:00 2001 From: Will Luna Date: Tue, 18 Feb 2025 16:24:34 -0800 Subject: [PATCH 9/9] Generates company list via SQL --- .../anaconda_webinar_notebook.ipynb | 59 ++++++++++++++++++- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb index 3056e920..c3e3508d 100644 --- a/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb +++ b/samples/notebooks/anaconda_webinar/anaconda_webinar_notebook.ipynb @@ -579,9 +579,62 @@ }, "outputs": [], "source": [ - "select *\n", - "from ADHOC_ANALYSIS.USER_UPLOADS.SP500_COMPANY_LIST\n", - "limit 20" + "with companies as (\n", + " select column1 as NAME\n", + " from (values\n", + " ('3M'),\n", + " ('A. O. Smith'),\n", + " ('Abbott Laboratories'),\n", + " ('AbbVie'),\n", + " ('Accenture'),\n", + " ('Adobe Inc.'),\n", + " ('Advanced Micro Devices'),\n", + " ('AES Corporation'),\n", + " ('Aflac'),\n", + " ('Agilent Technologies'),\n", + " ('Air Products'),\n", + " ('Airbnb'),\n", + " ('Akamai Technologies'),\n", + " ('Albemarle Corporation'),\n", + " ('Alexandria Real Estate Equities'),\n", + " ('Align Technology'),\n", + " ('Allegion'),\n", + " ('Alliant Energy'),\n", + " ('Allstate'),\n", + " ('Amazon'),\n", + " ('Amcor'),\n", + " ('Amentum'),\n", + " ('Ameren'),\n", + " ('American Electric Power'),\n", + " ('American Express'),\n", + " ('American International Group'),\n", + " ('American Tower'),\n", + " ('American Water Works'),\n", + " ('Ameriprise Financial'),\n", + " ('Ametek'),\n", + " ('Amgen'),\n", + " ('Amphenol'),\n", + " ('Analog Devices'),\n", + " ('Ansys'),\n", + " ('Aon'),\n", + " ('APA Corporation'),\n", + " ('Apple Inc.'),\n", + " ('Applied Materials'),\n", + " ('Aptiv'),\n", + " ('Arch Capital Group'),\n", + " ('Archer Daniels Midland'),\n", + " ('Arista Networks'),\n", + " ('Arthur J. Gallagher & Co.'),\n", + " ('Assurant'),\n", + " ('AT&T'),\n", + " ('Atmos Energy'),\n", + " ('Autodesk'),\n", + " ('Automatic Data Processing'),\n", + " ('AutoZone'),\n", + " ('AvalonBay Communities')\n", + " )\n", + ")\n", + "select NAME from companies limit 20" ] }, {