diff --git a/.github/workflows/run-project3.yml b/.github/workflows/run-project3.yml new file mode 100644 index 0000000000..e8265dcb09 --- /dev/null +++ b/.github/workflows/run-project3.yml @@ -0,0 +1,41 @@ +name: Run Project3 + +on: + push: + branches: + - main + paths: + - project/project3.py # only files under project directory will activate this action + workflow_dispatch: # Manual activation possible + +jobs: + run-project3: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r project/requirements.txt + + - name: Start to run project3 + env: + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + run: | + python project/project3.py + + - name: output + uses: actions/upload-artifact@v3 + with: + name: database_and_logging + path: ./data + diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000..0ee56fb123 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,42 @@ +name: Run Project3 Tests + +on: + push: + branches: + - main # for every push to main + workflow_dispatch: + +jobs: + test-system: + runs-on: ubuntu-latest + + steps: + # check code + - name: Checkout repository + uses: actions/checkout@v3 + + # set up environment + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + + # install all dependencies + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r project/requirements.txt + + - name: Make tests.sh executable + run: chmod +x project/tests.sh + + # run tests.sh + - name: Run tests + run: ./project/tests.sh + + - name: Upload project3.log + if: always() # save the logging anyways + uses: actions/upload-artifact@v3 + with: + name: project3-log + path: ./data diff 
--git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..17d06cadbe --- /dev/null +++ b/LICENSE @@ -0,0 +1,18 @@ +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International + +made-template © 2025 by Zhijian Su is licensed under CC BY-NC-SA 4.0. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ + +You are free to: +Share — copy and redistribute the material in any medium or format +Adapt — remix, transform, and build upon the material +The licensor cannot revoke these freedoms as long as you follow the license terms. + +Under the following terms: +Attribution — You must give appropriate credit , provide a link to the license, and indicate if changes were made . You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. +NonCommercial — You may not use the material for commercial purposes . +ShareAlike — If you remix, transform, or build upon the material, you must distribute your contributions under the same license as the original. +No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits. +Notices: +You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation . + +No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material. 
diff --git a/README.md b/README.md index 99a7ebae1a..36df9e7fc6 100644 --- a/README.md +++ b/README.md @@ -35,3 +35,19 @@ Grading Exercise 1 Shape: 4 of 4 Types: 13 of 13 ``` + ## A visual analysis of the security situation and potential crime influencing factors in California's cities + California is not only the most economically developed state in the U.S., but also the most populous. Its stunning scenery, booming economy, advanced technology and excellent educational resources combine to attract visitors from all over the world. Safety is an important consideration for people before they travel, and the security situation in California is complex. This project therefore poses the following questions about the security situation in California cities and tries to answer them through data engineering methods: + 1. What types of crimes are most common across California cities? Is there a type of crime that is + predominant in most cities? + 2. Which cities in California have the highest crime rate (per 100,000 residents)? + 3. Do all California cities have similar ratios of the number of law enforcement officers to the number + of crimes? Or are the ratios of the number of law enforcement officers to the population similar? + 4. Do California cities with higher median household incomes or high school graduation rates have + lower crime rates? + 5. Do cities with high poverty rates have higher crime rates? + +The answers to these questions can provide a safety index for people who want to travel to or settle in California, and can also help policymakers and government officials develop crime prevention measures tailored to specific cities. See [analysis-report.pdf](https://github.com/Jackie-Soo/made-template/blob/Jackie-Soo-patch-1/project/analysis-report.pdf) for details. 
+ + ## License + This project is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License (CC BY-NC-SA 4.0). + See the [LICENSE](https://github.com/Jackie-Soo/made-template/blob/Jackie-Soo-patch-1/LICENSE) file for details. diff --git a/exercises/exercise1.jv b/exercises/exercise1.jv new file mode 100644 index 0000000000..acc29e45c4 --- /dev/null +++ b/exercises/exercise1.jv @@ -0,0 +1,47 @@ + +pipeline AirportsPipeline { + + AirportsExtractor + -> AirportsTextFileInterpreter + -> AirportsCSVInterpreter + + -> AirportsTableInterpreter + -> AirportsLoader; + + block AirportsExtractor oftype HttpExtractor { + + url: "https://opendata.rhein-kreis-neuss.de/api/explore/v2.1/catalog/datasets/rhein-kreis-neuss-flughafen-weltweit/exports/csv?lang=en&timezone=Europe%2FBerlin&use_labels=true&delimiter=%3B"; + } + + //interpretes a binary file into a text file + block AirportsTextFileInterpreter oftype TextFileInterpreter { } + + //enclosing defines what character surrounds a value, while delimiter defines what separate each value + block AirportsCSVInterpreter oftype CSVInterpreter { + delimiter: ';'; + } + + + block AirportsTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + + "Lfd. 
Nummer" oftype integer, + "Name des Flughafens" oftype text, + "Ort" oftype text, + "Land" oftype text, + "IATA" oftype text, + "ICAO" oftype text, + "Latitude" oftype decimal, + "Longitude" oftype decimal, + "Altitude" oftype integer + + ]; + } + + block AirportsLoader oftype SQLiteLoader { + table: "airports"; + file: "./airports.sqlite"; + } + +} diff --git a/exercises/exercise2.jv b/exercises/exercise2.jv new file mode 100644 index 0000000000..e097bac25d --- /dev/null +++ b/exercises/exercise2.jv @@ -0,0 +1,58 @@ + +valuetype coordinate oftype text { + constraints: [HundredScale]; +} + +constraint HundredScale oftype RegexConstraint { + regex: /^(?:\d{1,3})\.(?:\d+),\s*(?:\d{1,3})\.(?:\d+)$/; + } + +valuetype vor_init oftype text { + constraints: [Vogelsang]; +} + +constraint Vogelsang oftype RegexConstraint { + regex: /^(Vogelsang).*$/; + } + +pipeline TreesPipeline { + + TreesExtractor + -> TreesTextFileInterpreter + -> TreesCSVInterpreter + -> TreesTableInterpreter + -> TreesLoader; + + block TreesExtractor oftype HttpExtractor { + + url: "https://opendata.rhein-kreis-neuss.de/api/v2/catalog/datasets/stadt-neuss-herbstpflanzung-2023/exports/csv"; + } + + //interpretes a binary file into a text file + block TreesTextFileInterpreter oftype TextFileInterpreter { } + + //enclosing defines what character surrounds a value, while delimiter defines what separate each value + block TreesCSVInterpreter oftype CSVInterpreter { + delimiter: ';'; + } + + + block TreesTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + "lfd_nr" oftype integer, + "stadtteil" oftype vor_init, //Start with Vogelsang + "standort" oftype text, + "baumart_botanisch" oftype text, + "id" oftype coordinate, //{geo-coordinate 1}, {geo-coordinate 2}, geo-coordiate format{1-3 numbers}.{numbers} + "baumfamilie" oftype text + + ]; + } + + block TreesLoader oftype SQLiteLoader { + table: "trees"; + file: "./trees.sqlite"; + } + +} \ No newline at end of file diff --git 
a/exercises/exercise3.jv b/exercises/exercise3.jv new file mode 100644 index 0000000000..08b36c1403 --- /dev/null +++ b/exercises/exercise3.jv @@ -0,0 +1,124 @@ +valuetype GDP oftype decimal { + constraints: [ GDPRange ]; +} + +constraint GDPRange on decimal: + value > 0; + +valuetype Share oftype decimal { + constraints: [ ShareRange ]; +} + +constraint ShareRange oftype RangeConstraint { + lowerBound: 0; + lowerBoundInclusive: true; + upperBound: 1; + upperBoundInclusive: true; +} + +pipeline CsPipeline1 { + + CsExtractor + -> CsXLSXInterpreter + -> CsSheetPicker + -> CsHeaderWriter1 + -> CsHeaderWriter2 + -> CsDataSelector // should be after headerwriter + -> CsTableInterpreter + -> CsLoader; + + block CsExtractor oftype HttpExtractor { + url: "https://thedocs.worldbank.org/en/doc/7d852628d96b9411d43e5d36d5dff941-0050062022/original/Graphs-Chapter-5-02082022.xlsx"; + } + + //interpretes a file into a XLSX file + block CsXLSXInterpreter oftype XLSXInterpreter { } + + //Selects the Sheet from the XLSX Workbook + block CsSheetPicker oftype SheetPicker { + sheetName: "Figure S5.1.2"; + } + + //Selects a subset of a Sheet + block CsDataSelector oftype CellRangeSelector { + select: range P2:S45; + } + + //Rename the cells + block CsHeaderWriter1 oftype CellWriter { + at: cell P2; + write: ["Country Code"]; + } + block CsHeaderWriter2 oftype CellWriter { + at: cell S2; + write: ["Bond Issuance Share"]; + } + + block CsTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + "Country Code" oftype CountryCodeAlpha3, // Jayvee 0.6.4 has a CountryCodeAlpha3 value type + "Bond Issuance Share" oftype Share + ]; + } + + block CsLoader oftype SQLiteLoader { + table: "bondIssuance"; + file: "./country-stats.sqlite"; + } + +} +// build two pipelines + +pipeline CsPipeline2 { + + CsExtractor + -> CsXLSXInterpreter + -> CsSheetPicker + -> CsHeaderWriter1 + -> CsHeaderWriter2 + -> CsDataSelector // should be after headerwriter + -> CsTableInterpreter + -> CsLoader; 
+ + block CsExtractor oftype HttpExtractor { + url: "https://thedocs.worldbank.org/en/doc/7d852628d96b9411d43e5d36d5dff941-0050062022/original/Graphs-Chapter-5-02082022.xlsx"; + } + + //interpretes a file into a XLSX file + block CsXLSXInterpreter oftype XLSXInterpreter { } + + //Selects the Sheet from the XLSX Workbook + block CsSheetPicker oftype SheetPicker { + sheetName: "Figure S5.1.2"; + } + + //Selects a subset of a Sheet + block CsDataSelector oftype CellRangeSelector { + select: range P2:S45; + } + + //Rename the cells + block CsHeaderWriter1 oftype CellWriter { + at: cell P2; + write: ["Country Code"]; + } + block CsHeaderWriter2 oftype CellWriter { + at: cell R2; + write: ["GDP per Capita"]; + } + + block CsTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + "Country Code" oftype CountryCodeAlpha3, + "GDP per Capita" oftype GDP + ]; + } + + block CsLoader oftype SQLiteLoader { + table: "gdpPerCapita"; + file: "./country-stats.sqlite"; + } + +} \ No newline at end of file diff --git a/exercises/exercise4.jv b/exercises/exercise4.jv new file mode 100644 index 0000000000..6c1a9a3954 --- /dev/null +++ b/exercises/exercise4.jv @@ -0,0 +1,94 @@ +valuetype id_range oftype integer { + constraints: [ over_zero ]; +} + +constraint over_zero on integer: + value > 0; + +valuetype month_range oftype integer { + constraints: [ range1to12 ]; +} + +constraint range1to12 on integer: + value >= 1 and value <= 12; + +pipeline MobilPipeline1 { + + MobilExtractor + -> ZipArchiveInterpreter + -> ZipFilePicker + -> MobilTextFileInterpreter + -> MobilCSVInterpreter + -> MobilSequenceWriter + -> MobilHeaderWriter + -> MobilTableInterpreter + -> CelsiusToFahrenheitTransformer1 + -> CelsiusToFahrenheitTransformer2 + -> MobilLoader; + + block MobilExtractor oftype HttpExtractor { + url: "https://www.mowesta.com/data/measure/mowesta-dataset-20221107.zip"; + } + + //interpretes a file into a XLSX file + block ZipArchiveInterpreter oftype ArchiveInterpreter { + 
archiveType: "zip"; + } + + block ZipFilePicker oftype FilePicker { + path: "./data.csv"; + } + + block MobilTextFileInterpreter oftype TextFileInterpreter { } + + block MobilCSVInterpreter oftype CSVInterpreter { + delimiter: ';'; + } + + //Rename the cells + //"Geraet", "Hersteller", "Model", "Monat", "Temperatur in °C (DWD)" + block MobilSequenceWriter oftype CellWriter { + at: range A1:E1; + write: ["id", "producer", "model", "month", "temperature"]; + } + + block MobilHeaderWriter oftype CellWriter { + at: cell J1; + write: ["battery_temperature"]; //"Batterietemperatur in °C" + } + + block MobilTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + "id" oftype id_range, // Jayvee 0.6.4 has a CountryCodeAlpha3 value type + "producer" oftype text, + "model" oftype text, + "month" oftype month_range, + "temperature" oftype decimal, + "battery_temperature" oftype decimal + ]; + } + + transform CelsiusToFahrenheit { + from Celsius oftype decimal; + to Fahrenheit oftype decimal; + Fahrenheit: (Celsius * 9/5) + 32; + } + + block CelsiusToFahrenheitTransformer1 oftype TableTransformer { + inputColumns: ["temperature"]; + outputColumn: "temperature"; + uses: CelsiusToFahrenheit; + } + + block CelsiusToFahrenheitTransformer2 oftype TableTransformer { + inputColumns: ["battery_temperature"]; + outputColumn: "battery_temperature"; + uses: CelsiusToFahrenheit; + } + + block MobilLoader oftype SQLiteLoader { + table: "temperatures"; + file: "./temperatures.sqlite"; + } +} \ No newline at end of file diff --git a/exercises/exercise5.jv b/exercises/exercise5.jv new file mode 100644 index 0000000000..c9db8fee0b --- /dev/null +++ b/exercises/exercise5.jv @@ -0,0 +1,69 @@ +valuetype zone oftype integer { + constraints: [ specific_zone ]; +} + +constraint specific_zone on integer: + value == 1925; + +valuetype geographic oftype decimal{ + constraints: [ geo_coordinates ]; +} + +constraint geo_coordinates on decimal : + value >= -90 and value <= 90; + 
+valuetype Umlauts oftype text { + constraints: [ GermanUmlauts ]; +} + +constraint GermanUmlauts oftype RegexConstraint { + regex: /^[A-Za-zÄäÖöÜüß\s]*$/; + } + +pipeline GTFSPipeline { + + GTFSExtractor + -> ZipArchiveInterpreter + -> ZipFilePicker + -> GTFSTextFileInterpreter + -> GTFSCSVInterpreter + -> GTFSTableInterpreter + -> GTFSLoader; + + block GTFSExtractor oftype HttpExtractor { + url: "https://gtfs.rhoenenergie-bus.de/GTFS.zip"; + } + + //unzip the zip file + block ZipArchiveInterpreter oftype ArchiveInterpreter { + archiveType: "zip"; + } + + block ZipFilePicker oftype FilePicker { + path: "./stops.txt"; + } + + block GTFSTextFileInterpreter oftype TextFileInterpreter{} + + + block GTFSCSVInterpreter oftype CSVInterpreter{ + enclosing: '"'; + delimiter: ','; + } + + block GTFSTableInterpreter oftype TableInterpreter { + header: true; + columns: [ + "stop_id" oftype integer, + "stop_name" oftype Umlauts, // German Umlauts + "stop_lat" oftype geographic, + "stop_lon" oftype geographic, + "zone_id" oftype zone, + ]; + } + + block GTFSLoader oftype SQLiteLoader { + table: "stops"; + file: "./gtfs.sqlite"; + } +} diff --git a/project/analysis-report.pdf b/project/analysis-report.pdf new file mode 100644 index 0000000000..bfb3645cf2 Binary files /dev/null and b/project/analysis-report.pdf differ diff --git a/project/data-report.pdf b/project/data-report.pdf new file mode 100644 index 0000000000..4f88510562 Binary files /dev/null and b/project/data-report.pdf differ diff --git a/project/pipeline.sh b/project/pipeline.sh new file mode 100644 index 0000000000..952871a4df --- /dev/null +++ b/project/pipeline.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python3 ./project/project3.py diff --git a/project/project-plan.md b/project/project-plan.md new file mode 100644 index 0000000000..e5adaeb347 --- /dev/null +++ b/project/project-plan.md @@ -0,0 +1,49 @@ +# Project Plan + +## Visual Analysis of the Number of Law Enforcement Officers and the Number of Crimes in California, 
the U.S. + +The project uses data from the FBI's Uniform Crime Reporting program to visualize and analyze the number of law enforcement officers and the number of crimes in various cities in California. + + +## Main Question + + +1. Do all California cities have the same ratio of the number of law enforcement officers to the number of crimes in that city? Or is the number of law enforcement officers greater in some cities? +2. What are the most common types of crimes in California? Are there certain crimes that are more common in a city compared to the rest of the state? If there are, what's the most common crime in each city? + +## Description + + +This project visualizes and analyzes the number of law enforcement officers, the number of crimes, the types of crimes, and the rates of crimes in various cities in California using data provided by the FBI. The safety of a city has an important impact on the quality of life of its inhabitants, economic development and social stability. The number of law enforcement officers and the crime rate of a city are important factors in considering the security of a city. Exploring the above issues through tableau, python, or jayvee on data released by the FBI will not only reveal differences in public safety resource allocation and crime types across cities, but also provide recommendations for policy development, law enforcement efficiency improvement, public safety awareness, and the feasibility of people moving to California cities. + +## Datasources + + + +### Datasource1: ca_law_enforcement_by_city, 2015 +* Metadata URL: https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015/resource-pages/downloads/download-printable-files +* Data URL: https://www.kaggle.com/datasets/fbi-us/california-crime?select=ca_law_enforcement_by_city.csv +* License: us-pd +* Data Type: CSV + +This dataset is published by the Federal Bureau of Investigation's (FBI) Uniform Crime Reporting (UCR) program. 
This dataset contains the number of law enforcement employees (officers and civilians) in California cities. + +### Datasource2: ca_offenses_by_city, 2015 +* Metadata URL: https://ucr.fbi.gov/crime-in-the-u.s/2015/crime-in-the-u.s.-2015/resource-pages/downloads/download-printable-files +* Data URL: https://www.kaggle.com/datasets/fbi-us/california-crime?select=ca_offenses_by_city.csv +* License: us-pd +* Data Type: CSV + +This dataset is published by the Federal Bureau of Investigation's (FBI) Uniform Crime Reporting (UCR) program. This dataset contains the crimes reported in California cities. Categories of crimes reported include violent crime, murder and nonnegligent manslaughter, rape, robbery, aggravated assault, property crime, burglary, larceny-theft, motor vehicle theft, and arson. + +## Work Packages + + +1. Data Conversion & Cleaning [#1][i1] +2. Solutions & Data Analysis [#2][i2] +3. Data Visualization [#3][i3] +4. Final Report & Presentation [#4][i3] + +[i1]: https://github.com/jvalue/made-template/issues/123 +[i2]: https://github.com/jvalue/made-template/issues/128 +[i3]: https://github.com/jvalue/made-template/issues/129 diff --git a/project/project3.py b/project/project3.py new file mode 100644 index 0000000000..cddc45bc9e --- /dev/null +++ b/project/project3.py @@ -0,0 +1,168 @@ +""" +The author of the dataset fatal-police-shootings-in-the-us +is KAROLINA WULLUM: https://www.kaggle.com/datasets/kwullum/fatal-police-shootings-in-the-us +Its license is CC BY-NC-SA 4.0. Website of the license: https://creativecommons.org/licenses/by-nc-sa/4.0/ +This project makes some changes to the original data. + +The license of the dataset california-crime is us-pd, +which is not protected by copyright, +and the public is free to copy, distribute, modify, or use the work as they wish. 
def download_dataset(dataset_name):
    """Download a Kaggle dataset, retrying transient failures.

    The download is attempted up to three times with a two second pause
    between attempts.  The retry decorator has to wrap a callable that
    *raises* on failure; wrapping a function that swallows the exception
    (as the previous version did) means no retry ever happens.

    Args:
        dataset_name: Kaggle dataset handle, e.g. ``"fbi-us/california-crime"``.

    Returns:
        The local directory returned by ``kagglehub.dataset_download``, or
        ``None`` when every attempt failed (the failure is logged).
    """
    # reraise=True surfaces the last real error instead of tenacity's
    # RetryError wrapper, so the log message stays meaningful.
    @retry(stop=stop_after_attempt(3), wait=wait_fixed(2), reraise=True)
    def _download_with_retry():
        return kagglehub.dataset_download(dataset_name)

    try:
        path = _download_with_retry()
    except Exception as e:
        logging.error(f"Failed to download dataset {dataset_name}: {e}")
        return None
    logging.info(f"Successfully download Dataset {dataset_name} to {path}")
    return path


def read_csv(file_path, encoding):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file.
        encoding: Text encoding passed to ``pandas.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when reading failed (the failure
        is logged).
    """
    try:
        data = pd.read_csv(file_path, encoding=encoding)
    except Exception as e:
        logging.error(f"Failed to read CSV file from {file_path}: {e}")
        return None
    logging.info(f"Successfully read CSV file from {file_path}")
    return data


def transform_crime_enforcement_data(c_data, e_data):
    """Merge the offenses table with the law-enforcement table by city.

    Neither input frame is modified.

    Args:
        c_data: Offenses-by-city frame.  ``Rape (legacy definition)`` and
            ``Population`` are dropped from it (``Population`` is taken from
            ``e_data`` instead).
        e_data: Law-enforcement-by-city frame.  Its headers may contain
            carriage returns or repeated whitespace, which are collapsed to
            single spaces.

    Returns:
        The inner join on ``City`` with thousands separators removed, count
        columns cast to ``int32`` and ``City`` cast to ``string``.
    """
    crimes = c_data.drop(columns=['Rape (legacy definition)', 'Population'])
    # City names can carry a trailing footnote digit (e.g. "Rio Vista3").
    crimes['City'] = crimes['City'].str.replace(r'\d+$', '', regex=True)

    # rename() returns a new frame, so the caller's headers stay untouched.
    officers = e_data.rename(columns=lambda name: ' '.join(str(name).split()))

    merged_data = pd.merge(crimes, officers, on='City', how='inner')
    # some numbers are formatted like "1,819"; drop the thousands separator
    merged_data = merged_data.replace(r',(?=\d)', '', regex=True)

    merged_data = merged_data.astype({
        'City': 'string',
        'Violent crime': 'int32',
        'Murder and nonnegligent manslaughter': 'int32',
        'Rape (revised definition)': 'int32',
        'Robbery': 'int32',
        'Aggravated assault': 'int32',
        'Property crime': 'int32',
        'Burglary': 'int32',
        'Larceny-theft': 'int32',
        'Motor vehicle theft': 'int32',
        'Arson': 'int32',
        'Population': 'int32',
        'Total law enforcement employees': 'int32',
        'Total officers': 'int32',
        'Total civilians': 'int32',
    })

    logging.info("Successfully transform crime&enforcement data")
    return merged_data


def transform_factors_data(i_data, p_data, h_data):
    """Merge income, poverty and education data for California cities.

    Only rows whose ``Geographic Area`` is ``CA`` are kept, the three frames
    are inner-joined on ``City``, and CDP regions (not cities) are removed.

    ``Median Income`` handling: thousands separators and a trailing ``+`` or
    ``-`` (top/bottom-coded values such as ``250,000+``) are stripped before
    conversion.  Values that are still not numeric are treated as missing and
    the row is dropped, rather than being reported as 250000.  Non-numeric
    ``poverty_rate`` / ``percent_completed_hs`` cells become NaN.

    Args:
        i_data: Frame with ``Geographic Area``, ``City``, ``Median Income``.
        p_data: Frame with ``Geographic Area``, ``City``, ``poverty_rate``.
        h_data: Frame with ``Geographic Area``, ``City``,
            ``percent_completed_hs``.

    Returns:
        The merged frame with ``City`` as ``string``, ``Median Income`` as
        ``int32`` and both rates as ``float64``.
    """
    california = [
        frame[frame['Geographic Area'] == 'CA'].drop(columns='Geographic Area')
        for frame in (i_data, p_data, h_data)
    ]
    merged_data2 = california[0]
    for frame in california[1:]:
        merged_data2 = pd.merge(merged_data2, frame, on='City', how='inner')

    # delete CDP regions (not cities); copy() so the assignments below write
    # to an independent frame instead of a view of the merge result
    is_cdp = merged_data2['City'].str.contains('CDP', na=False)
    merged_data2 = merged_data2[~is_cdp].copy()

    income_text = (
        merged_data2['Median Income']
        .astype(str)
        .str.strip()
        .str.replace(r',|[+-]$', '', regex=True)
    )
    merged_data2['Median Income'] = pd.to_numeric(income_text, errors='coerce')
    for column in ('poverty_rate', 'percent_completed_hs'):
        merged_data2[column] = pd.to_numeric(merged_data2[column], errors='coerce')

    missing_income = merged_data2['Median Income'].isna()
    if missing_income.any():
        logging.warning(
            f"Dropped {int(missing_income.sum())} cities without a usable median income"
        )
        merged_data2 = merged_data2[~missing_income]

    merged_data2 = merged_data2.astype({
        'City': 'string',
        'Median Income': 'int32',
        'poverty_rate': 'float64',
        'percent_completed_hs': 'float64',
    })

    logging.info("Successfully transform factors data")
    return merged_data2


def load_to_db(conn, dataframe, table_name):
    """Write a DataFrame to a database table, replacing any existing table.

    Args:
        conn: Open database connection accepted by ``DataFrame.to_sql``.
        dataframe: Data to store.
        table_name: Name of the target table.

    Returns:
        ``True`` when the table was written, ``False`` when writing failed
        (the failure is logged).  Callers that ignore the result behave as
        before.
    """
    try:
        dataframe.to_sql(table_name, conn, if_exists='replace', index=False)
    except Exception as e:
        logging.error(f"Failed to load data to table: {table_name}: {e}")
        return False
    logging.info(f"Successfully loaded data to table: {table_name}")
    return True


def main(conn=None):
    """Run the pipeline: download, transform and store both datasets.

    Args:
        conn: Optional open SQLite connection.  When given (as the system
            test does) it is used and left open for the caller; otherwise
            ``data/project3.db`` is opened and closed here.
    """
    # download two different datasets from kaggle (Extract)
    crime_enforcement_path = download_dataset("fbi-us/california-crime")
    print(crime_enforcement_path)
    if not crime_enforcement_path:
        return
    factors_path = download_dataset("kwullum/fatal-police-shootings-in-the-us")
    print(factors_path)
    if not factors_path:
        return

    # Read and transform crime data (Transform)
    crime_path = os.path.join(crime_enforcement_path, 'ca_offenses_by_city.csv')
    enforcement_path = os.path.join(crime_enforcement_path, 'ca_law_enforcement_by_city.csv')
    c_data = read_csv(crime_path, 'UTF-8')
    e_data = read_csv(enforcement_path, 'UTF-8')
    if c_data is None or e_data is None:
        return
    merged_data = transform_crime_enforcement_data(c_data, e_data)

    # Read and transform social factors data
    income_path = os.path.join(factors_path, 'MedianHouseholdIncome2015.csv')
    poverty_path = os.path.join(factors_path, 'PercentagePeopleBelowPovertyLevel.csv')
    education_path = os.path.join(factors_path, 'PercentOver25CompletedHighSchool.csv')
    i_data = read_csv(income_path, 'ISO-8859-1')
    p_data = read_csv(poverty_path, 'ISO-8859-1')
    h_data = read_csv(education_path, 'ISO-8859-1')
    if i_data is None or p_data is None or h_data is None:
        return
    merged_data2 = transform_factors_data(i_data, p_data, h_data)

    # load data to the database with two different table names (Load)
    conn_by_main = conn is None  # a connection passed in belongs to the caller
    try:
        if conn_by_main:
            # connect inside the try so a failure to open the file is logged
            os.makedirs('data', exist_ok=True)
            conn = sqlite3.connect('data/project3.db')
        results = [
            load_to_db(conn, merged_data, 'crime_enforcement'),
            load_to_db(conn, merged_data2, 'factors'),
        ]
        conn.commit()
        if all(results):
            logging.info("Successfully created two datasets.")
        else:
            logging.error("At least one table could not be created.")
    except Exception as e:
        logging.error(f"Failed to connect to database or load changes: {e}")
    finally:
        if conn_by_main:
            if conn is not None:
                conn.close()
            logging.info("connection to database is closed.\n")
        else:
            logging.info("connection to database is not closed yet due to system test.")


if __name__ == "__main__":
    main()
/dev/null +++ b/project/requirements.txt @@ -0,0 +1,5 @@ +chardet==4.0.0 +kagglehub==0.2.7 +pandas==1.5.2 +Requests==2.32.3 +tenacity==9.0.0 diff --git a/project/system_test.py b/project/system_test.py new file mode 100644 index 0000000000..2bbf9302f6 --- /dev/null +++ b/project/system_test.py @@ -0,0 +1,102 @@ +import logging +import unittest +from unittest.mock import patch +import pandas as pd +import sqlite3 +from project3 import main + +# record program running information using log +logging.basicConfig(filename='data/project3.log', + format='%(asctime)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + +class SystemTestDataPipeline(unittest.TestCase): + # mock 2 functions of project3 + @patch('project3.download_dataset') + @patch('project3.read_csv') + def test_full_data_pipeline(self, mock_read_csv, mock_download_dataset): + + # Mock download_dataset to return two mocked file paths + mock_download_dataset.side_effect = [ + "mocked_path/fbi-us/california-crime", + "mocked_path/kwullum/fatal-police-shootings-in-the-us" + ] + logging.info(f"Successfully do mock download dataset") + + # Mock read_csv to return fake DataFrames which each structure is + # identical to the actual Dataframe's structure + mock_read_csv.side_effect = [ + # Dataset1: Crime and enforcement data, two DataFrames + pd.DataFrame({ + 'City': ['Alameda'], + 'Population': ['78,613'], + 'Violent crime': [148], + 'Murder and nonnegligent manslaughter': [2], + 'Rape (revised definition)': [7], + 'Rape (legacy definition)': [0], + 'Robbery': [61], + 'Aggravated assault': [78], + 'Property crime': ['1,819'], + 'Burglary': [228], + 'Larceny-theft': ['1,245'], + 'Motor vehicle theft': [346], + 'Arson': [18] + }), + + pd.DataFrame({ + 'City': ['Alameda'], + 'Population': ['78,613'], + 'Total law\renforcement\remployees': [112], + 'Total \rofficers': [83], + 'Total \rcivilians': [29] + }), + + # Dataset2: Social factors data, three DataFrames + pd.DataFrame({ + 'Geographic Area': ['CA'], 
+ 'City': ['Alameda city'], + 'Median Income': [79312] + }), + + pd.DataFrame({ + 'Geographic Area': ['CA'], + 'City': ['Alameda city'], + 'poverty_rate': [9.8] + }), + + pd.DataFrame({ + 'Geographic Area': ['CA'], + 'City': ['Alameda city'], + 'percent_completed_hs': [91.3] + }), + ] + logging.info(f"Successfully do mock read dataframe") + + # Create a SQLite Database in memory + conn = sqlite3.connect(":memory:") + + # main function execution + main(conn=conn) + logging.info(f"Successfully execute main function") + + # uses cursor object to do validation + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table'") + # save all results in a tuple + tables = {row[0] for row in cursor.fetchall()} + print("Tables in database:", tables) + logging.info(f"Tables in database: {tables}") + + # validates if expected tables are in the tuple + self.assertIn('crime_enforcement', tables) + self.assertIn('factors', tables) + + # connection is closed + conn.close() + + # output result + print("Test passed: All expected functions were called and database operations were successful.") + logging.info(f"Test passed: All expected functions were called and database operations were successful.\n") + +if __name__ == '__main__': + unittest.main() diff --git a/project/tests.sh b/project/tests.sh new file mode 100644 index 0000000000..b665951cfd --- /dev/null +++ b/project/tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +python3 ./project/system_test.py