|
4 | 4 | "cell_type": "markdown", |
5 | 5 | "metadata": {}, |
6 | 6 | "source": [ |
7 | | - "# Migration Status Spreadsheet Notebook (Part 1)\n", |
| 7 | + "# Migration Status Spreadsheet Notebook\n", |
8 | 8 | "\n", |
9 | 9 | "## Overview\n", |
10 | 10 | "This notebook generates the data for the migration tracking spreadsheet.\n", |
11 | 11 | "\n", |
12 | 12 | "## What it does\n", |
13 | 13 | "- Extracts migration data from COLIN Extract database\n", |
14 | | - "- Retrieves filing information from LEAR database \n", |
| 14 | + "- Retrieves filing information from LEAR database\n", |
| 15 | + "- Retrieves affiliation information from Auth database\n", |
15 | 16 | "- Merges and exports data to Excel format\n", |
16 | 17 | "\n", |
17 | 18 | "## Output\n", |
|
31 | 32 | "%pip install openpyxl" |
32 | 33 | ] |
33 | 34 | }, |
| 35 | + { |
| 36 | + "cell_type": "markdown", |
| 37 | + "metadata": {}, |
| 38 | + "source": [ |
| 39 | + "## Import Libraries and Load Configuration\n", |
| 40 | + "\n", |
| 41 | + "Import required libraries and load environment variables. " |
| 42 | + ] |
| 43 | + }, |
34 | 44 | { |
35 | 45 | "cell_type": "code", |
36 | 46 | "execution_count": null, |
|
55 | 65 | " \"corp_type\": \"Type\",\n", |
56 | 66 | " \"status\": \"Migration Status\",\n", |
57 | 67 | " \"date\": \"Migrated Date\",\n", |
| 68 | + " \"affiliated\": \"Affiliated\",\n", |
| 69 | + " \"account\": \"Account ID\",\n", |
58 | 70 | " \"filings\": \"Filings Done\",\n", |
59 | 71 | " \"filing_date\": \"Last Filing Date\"\n", |
60 | 72 | "}\n", |
|
70 | 82 | " COLUMN_NAMES[\"corp_type\"],\n", |
71 | 83 | " COLUMN_NAMES[\"status\"],\n", |
72 | 84 | " COLUMN_NAMES[\"date\"],\n", |
| 85 | + " COLUMN_NAMES[\"affiliated\"],\n", |
| 86 | + " COLUMN_NAMES[\"account\"],\n", |
73 | 87 | " COLUMN_NAMES[\"filings\"],\n", |
74 | 88 | " COLUMN_NAMES[\"filing_date\"]\n", |
75 | 89 | " ],\n", |
|
83 | 97 | "# Configuration\n", |
84 | 98 | "BATCH_SIZE = CONFIG['batch_size']\n", |
85 | 99 | "FINAL_EXCEL_FIELDS = CONFIG['final_excel_fields']\n", |
86 | | - "MIG_GROUP_IDS = os.getenv('MIG_GROUP_IDS')\n", |
87 | 100 | "MIG_GROUP_IDS = [int(x.strip()) for x in os.getenv('MIG_GROUP_IDS').split(',') if x.strip().isdigit()]\n", |
88 | 101 | "\n", |
89 | 102 | "if not MIG_GROUP_IDS:\n", |
|
100 | 113 | "source": [ |
101 | 114 | "## Database Setup\n", |
102 | 115 | "\n", |
103 | | - "Configure database connections for COLIN Extract and LEAR databases using environment variables." |
| 116 | + "Configure database connections using environment variables." |
104 | 117 | ] |
105 | 118 | }, |
106 | 119 | { |
|
123 | 136 | " 'host': os.getenv(\"DATABASE_LEAR_HOST\"),\n", |
124 | 137 | " 'port': os.getenv(\"DATABASE_LEAR_PORT\"),\n", |
125 | 138 | " 'name': os.getenv(\"DATABASE_LEAR_NAME\")\n", |
| 139 | + " },\n", |
| 140 | + " 'auth': {\n", |
| 141 | + " 'username': os.getenv(\"DATABASE_AUTH_USERNAME\"),\n", |
| 142 | + " 'password': os.getenv(\"DATABASE_AUTH_PASSWORD\"),\n", |
| 143 | + " 'host': os.getenv(\"DATABASE_AUTH_HOST\"),\n", |
| 144 | + " 'port': os.getenv(\"DATABASE_AUTH_PORT\"),\n", |
| 145 | + " 'name': os.getenv(\"DATABASE_AUTH_NAME\")\n", |
126 | 146 | " }\n", |
127 | 147 | "}\n", |
128 | 148 | "\n", |
|
173 | 193 | " print(f\"{db_key.upper()} unexpected error: {e}\")\n", |
174 | 194 | " raise\n", |
175 | 195 | "\n", |
176 | | - "colin_engine = engines['colin_extract']\n", |
177 | | - "lear_engine = engines['lear']\n", |
| 196 | + "ENGINE_NAMES = {engine: key for key, engine in engines.items()}\n", |
178 | 197 | "\n", |
179 | 198 | "print(\"All database engines ready for use.\")\n" |
180 | 199 | ] |
|
201 | 220 | " mcb.corp_num AS \"{COLUMN_NAMES['corp_num']}\",\n", |
202 | 221 | " c.admin_email AS \"{COLUMN_NAMES['email']}\",\n", |
203 | 222 | " cn.corp_name AS \"{COLUMN_NAMES['corp_name']}\",\n", |
204 | | - " cp.corp_type_cd AS \"{COLUMN_NAMES['corp_type']}\",\n", |
| 223 | + " c.corp_type_cd AS \"{COLUMN_NAMES['corp_type']}\",\n", |
205 | 224 | " CASE\n", |
206 | 225 | " WHEN cp.processed_status = 'COMPLETED' THEN 'Migrated'\n", |
207 | 226 | " WHEN cp.processed_status IS NULL THEN 'Pending'\n", |
|
228 | 247 | " OR cp.processed_status IS NULL\n", |
229 | 248 | " )\n", |
230 | 249 | "ORDER BY\n", |
231 | | - " g.name, \n", |
232 | | - " b.name,\n", |
| 250 | + " g.display_name, \n", |
| 251 | + " b.display_name,\n", |
233 | 252 | " CASE\n", |
234 | 253 | " WHEN cp.processed_status = 'COMPLETED' THEN 0\n", |
235 | 254 | " ELSE 1\n", |
236 | 255 | " END, \n", |
237 | | - " cp.create_date DESC;\n", |
| 256 | + " cp.create_date DESC,\n", |
| 257 | + " cn.corp_name;\n", |
238 | 258 | "\"\"\"\n", |
239 | 259 | " \n", |
240 | 260 | "try:\n", |
241 | | - " with colin_engine.connect() as conn:\n", |
| 261 | + " with engines['colin_extract'].connect() as conn:\n", |
242 | 262 | " colin_extract_df = pd.read_sql(colin_extract_query, conn)\n", |
243 | 263 | "\n", |
244 | 264 | " if colin_extract_df.empty:\n", |
|
255 | 275 | " display(colin_extract_df)\n" |
256 | 276 | ] |
257 | 277 | }, |
| 278 | + { |
| 279 | + "cell_type": "markdown", |
| 280 | + "metadata": {}, |
| 281 | + "source": [ |
| 282 | + "## Batch Query Function\n", |
| 283 | + "A function to perform batch queries across multiple databases." |
| 284 | + ] |
| 285 | + }, |
| 286 | + { |
| 287 | + "cell_type": "code", |
| 288 | + "execution_count": 15, |
| 289 | + "metadata": {}, |
| 290 | + "outputs": [], |
| 291 | + "source": [ |
| 292 | + "def batch_query(query_sql, db_engine, batch_size, columns):\n", |
| 293 | + " # Get unique corporation numbers from the dataset\n", |
| 294 | + " unique_corp_nums = colin_extract_df[COLUMN_NAMES['corp_num']].unique().tolist()\n", |
| 295 | + " corp_number_batches = [unique_corp_nums[i:i + batch_size] for i in range(0, len(unique_corp_nums), batch_size)]\n", |
| 296 | + " db_name = ENGINE_NAMES.get(db_engine, \"Unknown database\")\n", |
| 297 | + " batch_results = []\n", |
| 298 | + " \n", |
| 299 | + " # Process each batch of corporation numbers\n", |
| 300 | + " for batch_idx, current_batch_corp_numbers in enumerate(corp_number_batches):\n", |
| 301 | + " if not current_batch_corp_numbers:\n", |
| 302 | + " continue\n", |
| 303 | + " try:\n", |
| 304 | + " with db_engine.connect() as conn:\n", |
| 305 | + " df = pd.read_sql(query_sql, conn, params={'identifiers': current_batch_corp_numbers})\n", |
| 306 | + " \n", |
| 307 | + " # Store results from this batch\n", |
| 308 | + " batch_results.append(df)\n", |
| 309 | + " print(f\"{db_name} Batch {batch_idx+1}: {len(df)} records fetched\")\n", |
| 310 | + " \n", |
| 311 | + " except Exception as e:\n", |
| 312 | + " print(f\"{db_name} Batch {batch_idx+1}/{len(corp_number_batches)} failed: {e}\")\n", |
| 313 | + " continue\n", |
| 314 | + " \n", |
| 315 | + " # Process combined results\n", |
| 316 | + " if batch_results:\n", |
| 317 | + " combined_df = pd.concat(batch_results, ignore_index=True)\n", |
| 318 | + " combined_df = combined_df.drop_duplicates(COLUMN_NAMES['corp_num'], keep='last')\n", |
| 319 | + " print(f\"Total records fetched: {len(combined_df)}\")\n", |
| 320 | + " else:\n", |
| 321 | + " combined_df = pd.DataFrame(columns=columns)\n", |
| 322 | + " print(f\"No records fetched\")\n", |
| 323 | + " \n", |
| 324 | + " return combined_df" |
| 325 | + ] |
| 326 | + }, |
258 | 327 | { |
259 | 328 | "cell_type": "markdown", |
260 | 329 | "metadata": {}, |
|
273 | 342 | "lear_combined_query = f\"\"\"\n", |
274 | 343 | "SELECT \n", |
275 | 344 | " b.id,\n", |
276 | | - " b.identifier,\n", |
| 345 | + " b.identifier AS \"{COLUMN_NAMES['corp_num']}\",\n", |
277 | 346 | " COALESCE(\n", |
278 | 347 | " STRING_AGG(f.filing_type, ', ' ORDER BY f.filing_type), \n", |
279 | 348 | " ''\n", |
|
287 | 356 | "GROUP BY b.id, b.identifier;\n", |
288 | 357 | "\"\"\"\n", |
289 | 358 | "\n", |
290 | | - "corp_nums = colin_extract_df[COLUMN_NAMES['corp_num']].unique().tolist()\n", |
291 | | - "batches_identifiers = [corp_nums[i:i + BATCH_SIZE] for i in range(0, len(corp_nums), BATCH_SIZE)]\n", |
| 359 | + "lear_combined_df = batch_query(\n", |
| 360 | + " query_sql=lear_combined_query,\n", |
| 361 | + " db_engine=engines['lear'],\n", |
| 362 | + " batch_size=BATCH_SIZE,\n", |
| 363 | + " columns=['id', COLUMN_NAMES['corp_num'], COLUMN_NAMES[\"filings\"], COLUMN_NAMES[\"filing_date\"]]\n", |
| 364 | + ")\n", |
292 | 365 | "\n", |
293 | | - "# Execute combined query with batch processing\n", |
294 | | - "lear_combined_results = []\n", |
295 | | - "for idx, batch_identifiers in enumerate(batches_identifiers):\n", |
296 | | - " if not batch_identifiers:\n", |
297 | | - " continue\n", |
298 | | - " try:\n", |
299 | | - " with lear_engine.connect() as conn:\n", |
300 | | - " df = pd.read_sql(\n", |
301 | | - " lear_combined_query,\n", |
302 | | - " conn,\n", |
303 | | - " params={\"identifiers\": batch_identifiers}\n", |
304 | | - " )\n", |
305 | | - " \n", |
306 | | - " lear_combined_results.append(df)\n", |
307 | | - " print(f\"Batch {idx+1}: {len(df)} records fetched\")\n", |
308 | | - " except Exception as e:\n", |
309 | | - " print(f\"Batch {idx+1}/{len(batches_identifiers)} failed: {e}\")\n", |
310 | | - " continue\n", |
| 366 | + "# Display results\n", |
| 367 | + "with pd.option_context('display.max_rows', None):\n", |
| 368 | + " display(lear_combined_df)\n" |
| 369 | + ] |
| 370 | + }, |
| 371 | + { |
| 372 | + "cell_type": "markdown", |
| 373 | + "metadata": {}, |
| 374 | + "source": [ |
| 375 | + "## Get Affiliation Data\n", |
| 376 | + "\n", |
| 377 | + "Query the Auth database to get affiliation information, including whether corporations are affiliated and their account IDs." |
| 378 | + ] |
| 379 | + }, |
| 380 | + { |
| 381 | + "cell_type": "code", |
| 382 | + "execution_count": null, |
| 383 | + "metadata": {}, |
| 384 | + "outputs": [], |
| 385 | + "source": [ |
| 386 | + "auth_query = f\"\"\"\n", |
| 387 | + "SELECT\n", |
| 388 | + " e.business_identifier AS \"{COLUMN_NAMES['corp_num']}\",\n", |
| 389 | + " CASE WHEN a.id IS NOT NULL THEN 'Y' ELSE 'N' END AS \"{COLUMN_NAMES['affiliated']}\",\n", |
| 390 | + " a.org_id AS \"{COLUMN_NAMES['account']}\"\n", |
| 391 | + "FROM\n", |
| 392 | + " entities e\n", |
| 393 | + "LEFT JOIN\n", |
| 394 | + " affiliations a ON e.id = a.entity_id\n", |
| 395 | + "WHERE\n", |
| 396 | + " e.business_identifier = ANY(%(identifiers)s)\n", |
| 397 | + "\"\"\"\n", |
311 | 398 | "\n", |
312 | | - "# Process combied results\n", |
313 | | - "if lear_combined_results:\n", |
314 | | - " lear_combined_df = pd.concat(lear_combined_results, ignore_index=True)\n", |
315 | | - " lear_combined_df = lear_combined_df.drop_duplicates('identifier', keep='last')\n", |
316 | | - " print(f\"Total combined records fetched: {len(lear_combined_df)}\")\n", |
317 | | - "else:\n", |
318 | | - " lear_combined_df = pd.DataFrame(columns=['id', 'identifier', 'Filings Done', 'Last Filing Date'])\n", |
| 399 | + "auth_combined_df = batch_query(\n", |
| 400 | + " query_sql=auth_query,\n", |
| 401 | + " db_engine=engines['auth'],\n", |
| 402 | + " batch_size=BATCH_SIZE,\n", |
| 403 | + " columns=[COLUMN_NAMES['corp_num'], COLUMN_NAMES['affiliated'], COLUMN_NAMES['account']]\n", |
| 404 | + ")\n", |
319 | 405 | "\n", |
320 | 406 | "# Display results\n", |
321 | 407 | "with pd.option_context('display.max_rows', None):\n", |
322 | | - " display(lear_combined_df)\n" |
| 408 | + " display(auth_combined_df)" |
323 | 409 | ] |
324 | 410 | }, |
325 | 411 | { |
|
328 | 414 | "source": [ |
329 | 415 | "## Merge Data\n", |
330 | 416 | "\n", |
331 | | - "Merge COLIN Extract migration data with LEAR filing data into a merged dataset." |
| 417 | + "Combine data from COLIN Extract, LEAR, and Auth databases into a merged dataset." |
332 | 418 | ] |
333 | 419 | }, |
334 | 420 | { |
|
340 | 426 | "try:\n", |
341 | 427 | " result = (colin_extract_df\n", |
342 | 428 | " .merge(lear_combined_df, \n", |
343 | | - " left_on=COLUMN_NAMES['corp_num'], \n", |
344 | | - " right_on='identifier', \n", |
345 | | - " how='left'))\n", |
| 429 | + " on=COLUMN_NAMES['corp_num'], \n", |
| 430 | + " how='left')\n", |
| 431 | + " .merge(auth_combined_df,\n", |
| 432 | + " on=COLUMN_NAMES['corp_num'],\n", |
| 433 | + " how='left') \n", |
| 434 | + " )\n", |
346 | 435 | " \n", |
347 | 436 | " # Select final fields\n", |
348 | 437 | " merged_df = result[FINAL_EXCEL_FIELDS]\n", |
| 438 | + " \n", |
349 | 439 | " print(f\"Data merged successfully: {len(merged_df)} rows\")\n", |
350 | 440 | " \n", |
351 | 441 | "except Exception as e:\n", |
|