Commit 56c7a61

29202 - Add affiliation/account ID & refactor notebook (bcgov#3629)
* Update .env sample configuration
* Add affiliation info and optimize notebook with batch query function
* Move ENGINE_NAMES to engine creation cell for clarity
1 parent a65df93 commit 56c7a61

2 files changed: 141 additions & 44 deletions

data-tool/notebooks/corps_onboarding_process_flow/.env.sample
Lines changed: 7 additions & 0 deletions

@@ -12,6 +12,13 @@ DATABASE_LEAR_HOST=
 DATABASE_LEAR_PORT=
 DATABASE_LEAR_NAME=
 
+# --- Auth Database ---
+DATABASE_AUTH_USERNAME=
+DATABASE_AUTH_PASSWORD=
+DATABASE_AUTH_HOST=
+DATABASE_AUTH_PORT=
+DATABASE_AUTH_NAME=
+
 # --- EXCEL EXPORT ---
 EXPORT_OUTPUT_DIR=
 
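The notebook reads these new DATABASE_AUTH_* values into its engine configuration (see the 'auth' block added to the connection dict in the notebook diff below). As a rough standalone sketch of that wiring, assuming PostgreSQL via SQLAlchemy and python-dotenv; engine_from_env is a hypothetical helper, not code from this commit:

import os
from dotenv import load_dotenv
from sqlalchemy import create_engine

load_dotenv()  # read a .env file populated from this sample

def engine_from_env(prefix: str):
    # Assemble a PostgreSQL URL from the DATABASE_<PREFIX>_* variables.
    user = os.getenv(f"DATABASE_{prefix}_USERNAME")
    password = os.getenv(f"DATABASE_{prefix}_PASSWORD")
    host = os.getenv(f"DATABASE_{prefix}_HOST")
    port = os.getenv(f"DATABASE_{prefix}_PORT")
    name = os.getenv(f"DATABASE_{prefix}_NAME")
    return create_engine(f"postgresql+psycopg2://{user}:{password}@{host}:{port}/{name}")

auth_engine = engine_from_env("AUTH")  # the Auth database added in this commit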
data-tool/notebooks/corps_onboarding_process_flow/migration_status_tracking.ipynb
Lines changed: 134 additions & 44 deletions
@@ -4,14 +4,15 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"# Migration Status Spreadsheet Notebook (Part 1)\n",
+"# Migration Status Spreadsheet Notebook\n",
 "\n",
 "## Overview\n",
 "This notebook generates the data for the migration tracking spreadsheet.\n",
 "\n",
 "## What it does\n",
 "- Extracts migration data from COLIN Extract database\n",
-"- Retrieves filing information from LEAR database \n",
+"- Retrieves filing information from LEAR database\n",
+"- Retrieves affiliation information from Auth database\n",
 "- Merges and exports data to Excel format\n",
 "\n",
 "## Output\n",
@@ -31,6 +32,15 @@
 "%pip install openpyxl"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Import Libraries and Load Configuration\n",
+"\n",
+"Import required libraries and load environment variables. "
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -55,6 +65,8 @@
 " \"corp_type\": \"Type\",\n",
 " \"status\": \"Migration Status\",\n",
 " \"date\": \"Migrated Date\",\n",
+" \"affiliated\": \"Affiliated\",\n",
+" \"account\": \"Account ID\",\n",
 " \"filings\": \"Filings Done\",\n",
 " \"filing_date\": \"Last Filing Date\"\n",
 "}\n",
@@ -70,6 +82,8 @@
 " COLUMN_NAMES[\"corp_type\"],\n",
 " COLUMN_NAMES[\"status\"],\n",
 " COLUMN_NAMES[\"date\"],\n",
+" COLUMN_NAMES[\"affiliated\"],\n",
+" COLUMN_NAMES[\"account\"],\n",
 " COLUMN_NAMES[\"filings\"],\n",
 " COLUMN_NAMES[\"filing_date\"]\n",
 " ],\n",
@@ -83,7 +97,6 @@
 "# Configuration\n",
 "BATCH_SIZE = CONFIG['batch_size']\n",
 "FINAL_EXCEL_FIELDS = CONFIG['final_excel_fields']\n",
-"MIG_GROUP_IDS = os.getenv('MIG_GROUP_IDS')\n",
 "MIG_GROUP_IDS = [int(x.strip()) for x in os.getenv('MIG_GROUP_IDS').split(',') if x.strip().isdigit()]\n",
 "\n",
 "if not MIG_GROUP_IDS:\n",
@@ -100,7 +113,7 @@
 "source": [
 "## Database Setup\n",
 "\n",
-"Configure database connections for COLIN Extract and LEAR databases using environment variables."
+"Configure database connections using environment variables."
 ]
 },
 {
@@ -123,6 +136,13 @@
 " 'host': os.getenv(\"DATABASE_LEAR_HOST\"),\n",
 " 'port': os.getenv(\"DATABASE_LEAR_PORT\"),\n",
 " 'name': os.getenv(\"DATABASE_LEAR_NAME\")\n",
+" },\n",
+" 'auth': {\n",
+" 'username': os.getenv(\"DATABASE_AUTH_USERNAME\"),\n",
+" 'password': os.getenv(\"DATABASE_AUTH_PASSWORD\"),\n",
+" 'host': os.getenv(\"DATABASE_AUTH_HOST\"),\n",
+" 'port': os.getenv(\"DATABASE_AUTH_PORT\"),\n",
+" 'name': os.getenv(\"DATABASE_AUTH_NAME\")\n",
 " }\n",
 "}\n",
 "\n",
@@ -173,8 +193,7 @@
 " print(f\"{db_key.upper()} unexpected error: {e}\")\n",
 " raise\n",
 "\n",
-"colin_engine = engines['colin_extract']\n",
-"lear_engine = engines['lear']\n",
+"ENGINE_NAMES = {engine: key for key, engine in engines.items()}\n",
 "\n",
 "print(\"All database engines ready for use.\")\n"
 ]
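ENGINE_NAMES inverts the engines dict so that code holding only an engine object (such as the batch_query function added later in this commit) can still label its log output with the database key. A minimal sketch, with placeholder objects standing in for real engines:

# Placeholders instead of SQLAlchemy engines, just to show the mapping.
engines = {"colin_extract": object(), "lear": object(), "auth": object()}

# Reverse lookup: engine object -> its configuration key.
ENGINE_NAMES = {engine: key for key, engine in engines.items()}

print(ENGINE_NAMES.get(engines["lear"], "Unknown database"))  # -> lear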
@@ -201,7 +220,7 @@
 " mcb.corp_num AS \"{COLUMN_NAMES['corp_num']}\",\n",
 " c.admin_email AS \"{COLUMN_NAMES['email']}\",\n",
 " cn.corp_name AS \"{COLUMN_NAMES['corp_name']}\",\n",
-" cp.corp_type_cd AS \"{COLUMN_NAMES['corp_type']}\",\n",
+" c.corp_type_cd AS \"{COLUMN_NAMES['corp_type']}\",\n",
 " CASE\n",
 " WHEN cp.processed_status = 'COMPLETED' THEN 'Migrated'\n",
 " WHEN cp.processed_status IS NULL THEN 'Pending'\n",
@@ -228,17 +247,18 @@
 " OR cp.processed_status IS NULL\n",
 " )\n",
 "ORDER BY\n",
-" g.name, \n",
-" b.name,\n",
+" g.display_name, \n",
+" b.display_name,\n",
 " CASE\n",
 " WHEN cp.processed_status = 'COMPLETED' THEN 0\n",
 " ELSE 1\n",
 " END, \n",
-" cp.create_date DESC;\n",
+" cp.create_date DESC,\n",
+" cn.corp_name;\n",
 "\"\"\"\n",
 " \n",
 "try:\n",
-" with colin_engine.connect() as conn:\n",
+" with engines['colin_extract'].connect() as conn:\n",
 " colin_extract_df = pd.read_sql(colin_extract_query, conn)\n",
 "\n",
 " if colin_extract_df.empty:\n",
@@ -255,6 +275,55 @@
 " display(colin_extract_df)\n"
 ]
 },
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Batch Query Function\n",
+"A function to perform batch queries across multiple databases."
+]
+},
+{
+"cell_type": "code",
+"execution_count": 15,
+"metadata": {},
+"outputs": [],
+"source": [
+"def batch_query(query_sql, db_engine, batch_size, columns):\n",
+" # Get unique corporation numbers from the dataset\n",
+" unique_corp_nums = colin_extract_df[COLUMN_NAMES['corp_num']].unique().tolist()\n",
+" corp_number_batches = [unique_corp_nums[i:i + batch_size] for i in range(0, len(unique_corp_nums), batch_size)]\n",
+" db_name = ENGINE_NAMES.get(db_engine, \"Unknown database\")\n",
+" batch_results = []\n",
+" \n",
+" # Process each batch of corporation numbers\n",
+" for batch_idx, current_batch_corp_numbers in enumerate(corp_number_batches):\n",
+" if not current_batch_corp_numbers:\n",
+" continue\n",
+" try:\n",
+" with db_engine.connect() as conn:\n",
+" df = pd.read_sql(query_sql, conn, params={'identifiers': current_batch_corp_numbers})\n",
+" \n",
+" # Store results from this batch\n",
+" batch_results.append(df)\n",
+" print(f\"{db_name} Batch {batch_idx+1}: {len(df)} records fetched\")\n",
+" \n",
+" except Exception as e:\n",
+" print(f\"{db_name} Batch {batch_idx+1}/{len(corp_number_batches)} failed: {e}\")\n",
+" continue\n",
+" \n",
+" # Process combined results\n",
+" if batch_results:\n",
+" combined_df = pd.concat(batch_results, ignore_index=True)\n",
+" combined_df = combined_df.drop_duplicates(COLUMN_NAMES['corp_num'], keep='last')\n",
+" print(f\"Total records fetched: {len(combined_df)}\")\n",
+" else:\n",
+" combined_df = pd.DataFrame(columns=columns)\n",
+" print(f\"No records fetched\")\n",
+" \n",
+" return combined_df"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -273,7 +342,7 @@
 "lear_combined_query = f\"\"\"\n",
 "SELECT \n",
 " b.id,\n",
-" b.identifier,\n",
+" b.identifier AS \"{COLUMN_NAMES['corp_num']}\",\n",
 " COALESCE(\n",
 " STRING_AGG(f.filing_type, ', ' ORDER BY f.filing_type), \n",
 " ''\n",
@@ -287,39 +356,56 @@
 "GROUP BY b.id, b.identifier;\n",
 "\"\"\"\n",
 "\n",
-"corp_nums = colin_extract_df[COLUMN_NAMES['corp_num']].unique().tolist()\n",
-"batches_identifiers = [corp_nums[i:i + BATCH_SIZE] for i in range(0, len(corp_nums), BATCH_SIZE)]\n",
+"lear_combined_df = batch_query(\n",
+" query_sql=lear_combined_query,\n",
+" db_engine=engines['lear'],\n",
+" batch_size=BATCH_SIZE,\n",
+" columns=['id', COLUMN_NAMES['corp_num'], COLUMN_NAMES[\"filings\"], COLUMN_NAMES[\"filing_date\"]]\n",
+")\n",
 "\n",
-"# Execute combined query with batch processing\n",
-"lear_combined_results = []\n",
-"for idx, batch_identifiers in enumerate(batches_identifiers):\n",
-" if not batch_identifiers:\n",
-" continue\n",
-" try:\n",
-" with lear_engine.connect() as conn:\n",
-" df = pd.read_sql(\n",
-" lear_combined_query,\n",
-" conn,\n",
-" params={\"identifiers\": batch_identifiers}\n",
-" )\n",
-" \n",
-" lear_combined_results.append(df)\n",
-" print(f\"Batch {idx+1}: {len(df)} records fetched\")\n",
-" except Exception as e:\n",
-" print(f\"Batch {idx+1}/{len(batches_identifiers)} failed: {e}\")\n",
-" continue\n",
+"# Display results\n",
+"with pd.option_context('display.max_rows', None):\n",
+" display(lear_combined_df)\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Get Affiliation Data\n",
+"\n",
+"Query the Auth database to get affiliation information, including whether corporations are affiliated and their account IDs."
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"auth_query = f\"\"\"\n",
+"SELECT\n",
+" e.business_identifier AS \"{COLUMN_NAMES['corp_num']}\",\n",
+" CASE WHEN a.id IS NOT NULL THEN 'Y' ELSE 'N' END AS \"{COLUMN_NAMES['affiliated']}\",\n",
+" a.org_id AS \"{COLUMN_NAMES['account']}\"\n",
+"FROM\n",
+" entities e\n",
+"LEFT JOIN\n",
+" affiliations a ON e.id = a.entity_id\n",
+"WHERE\n",
+" e.business_identifier = ANY(%(identifiers)s)\n",
+"\"\"\"\n",
 "\n",
-"# Process combied results\n",
-"if lear_combined_results:\n",
-" lear_combined_df = pd.concat(lear_combined_results, ignore_index=True)\n",
-" lear_combined_df = lear_combined_df.drop_duplicates('identifier', keep='last')\n",
-" print(f\"Total combined records fetched: {len(lear_combined_df)}\")\n",
-"else:\n",
-" lear_combined_df = pd.DataFrame(columns=['id', 'identifier', 'Filings Done', 'Last Filing Date'])\n",
+"auth_combined_df = batch_query(\n",
+" query_sql=auth_query,\n",
+" db_engine=engines['auth'],\n",
+" batch_size=BATCH_SIZE,\n",
+" columns=[COLUMN_NAMES['corp_num'], COLUMN_NAMES['affiliated'], COLUMN_NAMES['account']]\n",
+")\n",
 "\n",
 "# Display results\n",
 "with pd.option_context('display.max_rows', None):\n",
-" display(lear_combined_df)\n"
+" display(auth_combined_df)"
 ]
 },
 {
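The new auth_query flags a corporation as affiliated whenever its entities row has at least one match in affiliations; the LEFT JOIN keeps unaffiliated corporations with a NULL a.id, which the CASE turns into 'N'. The same pattern sketched with pandas on invented toy data:

import pandas as pd

entities = pd.DataFrame({"id": [1, 2], "business_identifier": ["BC0000001", "BC0000002"]})
affiliations = pd.DataFrame({"id": [10], "entity_id": [1], "org_id": [3040]})

# LEFT JOIN entities -> affiliations, then derive the Y/N flag from the
# null-ness of the joined affiliation id, mirroring the SQL CASE WHEN.
joined = entities.merge(affiliations, left_on="id", right_on="entity_id",
                        how="left", suffixes=("", "_aff"))
joined["Affiliated"] = joined["id_aff"].notna().map({True: "Y", False: "N"})
print(joined[["business_identifier", "Affiliated", "org_id"]])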
@@ -328,7 +414,7 @@
 "source": [
 "## Merge Data\n",
 "\n",
-"Merge COLIN Extract migration data with LEAR filing data into a merged dataset."
+"Combine data from COLIN Extract, LEAR, and Auth databases into a merged dataset."
 ]
 },
 {
@@ -340,12 +426,16 @@
 "try:\n",
 " result = (colin_extract_df\n",
 " .merge(lear_combined_df, \n",
-" left_on=COLUMN_NAMES['corp_num'], \n",
-" right_on='identifier', \n",
-" how='left'))\n",
+" on=COLUMN_NAMES['corp_num'], \n",
+" how='left')\n",
+" .merge(auth_combined_df,\n",
+" on=COLUMN_NAMES['corp_num'],\n",
+" how='left') \n",
+" )\n",
 " \n",
 " # Select final fields\n",
 " merged_df = result[FINAL_EXCEL_FIELDS]\n",
+" \n",
 " print(f\"Data merged successfully: {len(merged_df)} rows\")\n",
 " \n",
 "except Exception as e:\n",
