# NOTE: this file was extracted from a flattened git patch. Besides
# convert-data.py (below), the patch adds:
#   diff --git a/.gitignore b/.gitignore  (new file mode 100644, index 0000000..58461f2)
#   +.ipynb_checkpoints
# and data/data-solution.csv, whose first rows are preserved at the very end.
#
# diff --git a/convert-data.py b/convert-data.py
# new file mode 100644, index 0000000..5b12192, @@ -0,0 +1,182 @@

# =============================================
# Filename: convert-data.py
# Author: Peter Kim
# Date: Nov 28, 2020
# Description: Convert data.tsv into standard CSV file.
# =============================================
'''
See also:
* peterkim-solution.md - plaintext explanation
* notebook/explore-data-challenge.ipynb - exploratory notebook

Dependencies:
* standard Python/Anaconda distribution
* pip install unidecode
* pip install pandas-redshift
'''

# import libraries
# NOTE: `unidecode` (third-party) is imported inside main(), its only user,
# so the helper functions stay importable without that package installed.
import io
import os
import re

import numpy as np
import pandas as pd

# pandas display settings
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

# Regexes used by clean_anomalies_str(); compiled once at import time.
# Field layout of a record (tab-separated):
# * id - one or more digits
# * name - two or three alphabetic names
# * account number - zero or more digits that may contain '-' or '/'
# * email - letters that may contain one or more '@', '.'
_RE_RECORD_2NAMES = re.compile(
    r'(\d+\t[a-zA-Z]+\t[a-zA-Z]+\t[0-9/-]*\t[a-zA-Z@.]*)'
)
_RE_RECORD_3NAMES = re.compile(
    r'(\d+\t[a-zA-Z]+\t[a-zA-Z]+\t[a-zA-Z]+\t[0-9/-]*\t[a-zA-Z@.]*)'
)


def clean_anomalies_str(str_anomalies):
    '''
    Recover well-formed records from text whose lines do not have 4 tabs.

    The text is flattened (newlines become tabs, every space is removed,
    each double-tab is replaced by a single tab) and then scanned with
    regular expressions for records with two or three names.
    If a record has 3 names, the middle name is dropped so only 2 remain.
    Records with fewer than two names are not returned.

    Input:
    * str_anomalies - str, records that should have 4 tab-delimiters with data fields.
    Return:
    * list of str, each item has 4 tab-delimiters and ends with a newline.
      All 2-name records come first, followed by the converted 3-name records.
    '''

    # clean string
    # (a) replace newline with tab, (b) remove spaces, (c) replace double-tab with single-tab
    str_anom_tab_delim_clean = (
        str_anomalies.replace('\n', '\t').replace(' ', '').replace('\t\t', '\t')
    )

    # records that already have exactly two names
    ls_re_find_2names = _RE_RECORD_2NAMES.findall(str_anom_tab_delim_clean)

    # records with three names: drop the middle name (field index 2)
    ls_convert_3to2_names = []
    for each_3name in _RE_RECORD_3NAMES.findall(str_anom_tab_delim_clean):
        fields = each_3name.split('\t')
        del fields[2]
        ls_convert_3to2_names.append('\t'.join(fields))

    # merge and terminate every record with a newline
    return [x + '\n' for x in ls_re_find_2names + ls_convert_3to2_names]


def importlist_cleanpandas_export_csv(list_of_strings, path_to_csv_export=None):
    '''
    Import the list of strings (tab-delimited) into pandas, clean, export as CSV.

    The first item must be the header row; it has to contain the columns
    'id' and 'account_number'. Spaces, dashes and slashes are removed from
    account_number, rows are sorted by 'id', and the result is written
    without the index column.
    Could .fillna() on first_name or last_name if preferable.

    Input:
    * list_of_strings - list of str, each item a tab-delimited row ending in a newline.
    * path_to_csv_export - optional output path; defaults to data/data-solution.csv.
    Return:
    * None (the CSV file is the output).
    '''

    if path_to_csv_export is None:
        path_to_csv_export = os.path.join('data', 'data-solution.csv')

    # items already end with a newline, so join without a separator and
    # wrap in StringIO so pandas can read the text like a file
    io_data_tsv = io.StringIO(''.join(list_of_strings))

    # Read account_number as text: if every value happened to be purely
    # numeric pandas would infer an integer column, the .str accessor below
    # would raise AttributeError and leading zeros would be lost.
    df_4_tabs = pd.read_csv(io_data_tsv, sep='\t', dtype={'account_number': str})

    # clean account_number: remove space, dash, slash in one pass
    df_4_tabs['account_number'] = df_4_tabs['account_number'].str.replace(
        r'[ /-]', '', regex=True
    )

    # sort dataframe by 'id' column
    df_4_tabs = df_4_tabs.sort_values('id')

    # export as csv
    df_4_tabs.to_csv(path_to_csv_export, index=False)


def main():
    '''
    Read data/data.tsv, repair malformed records, write data/data-solution.csv.

    The regex clean-up lives in a separate function so it could be used in
    a parallel algorithm.
    '''
    # third-party dependency, only needed here (pip install unidecode)
    import unidecode

    # ==========================================================
    # Section 1: read text data from file
    # ==========================================================

    # create path to tsv file
    path_to_tsv = os.path.join('data', 'data.tsv')

    # read each line of file into a list, challenge documentation said 'utf-16-le' encoding
    # https://realpython.com/read-write-files-python/
    # https://stackoverflow.com/questions/4190683/python-string-replace-for-utf-16-le-file
    with open(path_to_tsv, 'r', encoding='utf-16-le') as f:
        ls_lines_tsv_utf16le = f.readlines()

    # Split the lines in a single pass: well-formed lines have exactly
    # 4 tabs, everything else is an anomaly to be repaired in section 2.
    ls_lines_tsv_4tabs = []
    ls_lines_tsv_not4tabs = []
    for raw_line in ls_lines_tsv_utf16le:
        # make sure each item ends with a newline
        if not raw_line.endswith('\n'):
            raw_line = raw_line + '\n'

        # remove accents on characters
        # https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
        # https://medium.com/@randombites/how-to-handle-accented-special-strings-175e65d96123
        # https://stackoverflow.com/questions/31207287/converting-utf-16-to-utf-8
        line = unidecode.unidecode(raw_line)

        if line.count('\t') == 4:
            ls_lines_tsv_4tabs.append(line)
        else:
            ls_lines_tsv_not4tabs.append(line)

    # ==========================================================
    # Section 2: clean data without 4 tab-delimiters, merge back into one list
    # Use custom function clean_anomalies_str().
    # ==========================================================

    # convert anomalies list to string and clean it
    ls_lines_tsv_clean4tabs = clean_anomalies_str(''.join(ls_lines_tsv_not4tabs))

    # merge into one list
    ls_lines_tsv_pandas = ls_lines_tsv_4tabs + ls_lines_tsv_clean4tabs

    # ==========================================================
    # Section 3: load list into pandas, clean in pandas, export to CSV
    # Use custom function importlist_cleanpandas_export_csv().
    # ==========================================================

    importlist_cleanpandas_export_csv(ls_lines_tsv_pandas)


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Remainder of the patch, preserved verbatim (the rows continue below):
# diff --git a/data/data-solution.csv b/data/data-solution.csv
# new file mode 100644, index 0000000..ae68887, @@ -0,0 +1,1001 @@
# +id,first_name,last_name,account_number,email
# +1,Addison,Marks,196296,ornare.lectus@et.edu
# +2,Dakota,Garza,409025,scelerisque@Praesentluctus.edu
# +3,Basia,Wolfe,637720,Aliquam@nullaIntegerurna.com
# +4,Germaine,Campbell,826846,id.magna@viverraMaecenas.ca
# +5,Lenore,Pennington,345284,aliquam@Integer.edu
# +6,Stacy,Santos,512759,varius@dictummagna.edu
# +7,Jack,Hancock,644688,magna.sed.dui@atnisi.com
# +8,Dolan,Colon,969175,vel@blandit.com
# +9,Mia,Johnston,789448,auctor.non.feugiat@lectus.ca
# +10,Malachi,Zimmerman,217250,erat.Sed.nunc@auctor.org
# +11,Allen,Paul,644841,Quisque.porttitor.eros@pedePraesent.ca
# +12,Fallon,Dunn,453682,posuere.vulputate@DonecegestasAliquam.org
# +13,Donna,Swanson,727124,dolor@et.ca
# +14,Dante,Caldwell,780133,Phasellus.dolor.elit@viverraMaecenasiaculis.com
# +15,Paul,Graham,153596,auctor.vitae.aliquet@varius.co.uk
# +16,Christen,Cleveland,980913,Aenean.massa.Integer@nisiAenean.net
# +17,Bevis,Hatfield,420064,tempor.lorem.eget@facilisisegetipsum.ca
# +18,Kermit,Hale,690237,orci.consectetuer@in.edu
# ---------------------------------------------------------------------------
+19,Imani,Mcgowan,976722,placerat.augue.Sed@hendrerita.net +20,Hollee,Gray,705875,urna.Ut.tincidunt@Proin.com +21,Ainsley,Meyer,433161,orci@risusquis.net +22,Joseph,Heath,230206,nisi@pellentesquetellussem.ca +23,Thomas,Richardson,118839,Praesent@congueInscelerisque.org +24,Quincy,Buckner,676034,velit@nislelementum.co.uk +25,Lee,Vance,874024,lorem@nonummyultriciesornare.co.uk +26,Nathaniel,Ruiz,327166,magna.Sed.eu@arcuMorbisit.com +27,Kimberley,Parks,319377,vestibulum.lorem.sit@eueuismod.edu +28,Ivory,Downs,133677,sem@sed.ca +29,Adena,Bosley,656184,ac.ipsum.Phasellus@ut.net +30,Laura,Rivera,270464,nascetur.ridiculus.mus@Donecnibhenim.org +31,Clinton,Vincent,677802,adipiscing.lacus.Ut@atlibero.edu +32,aretha,Torres,278324,scelerisque.neque.sed@consequatdolor.ca +33,Kuame,Bruce,295535,metus@hendreritnequeIn.edu +34,Ifeoma,Knight,946108,sem@sagittis.org +35,Jorden,Dodson ,605296,Nulla.aliquet.Proin@vitaesodalesat.ca +36,Holmes,Goodwin,633430,et.ultrices.posuere@Etiam.com +37,Ariana,Dillon,269848,ullamcorper@magna.org +38,Julian,Kramer,815929,Sed@adipiscinglobortisrisus.co.uk +39,Victor,Kaufman,688093,Maecenas.mi@eusem.org +40,Shaeleigh,Pope,687812,Donec.porttitor@felis.org +41,Hope,Mathis,493162,penatibus@elit.ca +42,Brendan,Tyler,348407,pellentesque.tellus.sem@vitae.co.uk +43,Jerry,merritt,440407,lorem.sit.amet@eudolor.co.uk +44,Brittany,Stephens,412405,Nunc.sed@fringillaeuismodenim.com +45,ignacia, downs,505038,Vivamus.euismod.urna@idlibero.co.uk +46,lillith,mcdaniel,204861,ultricies.ligula@gravida.co.uk +47,Brody,George,391293,tortor@et.com +48,Danielle,Mcclure,396229,Aliquam.nisl@ut.ca +49,Alexis,Sullivan,443051,morbi.tristique.senectus@sagittislobortismauris.com +50,Nayda,Gregory,685096,Sed.nulla.ante@arcuCurabiturut.ca +51,Nicole,Riley,877883,lobortis@molestiearcuSed.com +52,Kato,Poole,172510,lorem@vulputatenisisem.org +53,Amity,Ball,323456,Integer@egestasAliquamnec.ca +54,Cynthia,Solis,993712,lacus.Mauris.non@eratvolutpat.com 
+55,Sopoline,Compton,484168,Suspendisse.tristique.neque@vestibulum.org +56,Juliet,Acevedo,189750,velit.eget@ipsum.com +57,Hakeem,Lopez,390418,dui.nec.urna@lectus.net +58,Geoffrey,Barlow,128379,sit.amet@Proin.edu +59,Regina,England,833719,consequat.nec@id.org +60,Aladdin,Morrow,551973,augue.id@PhasellusornareFusce.ca +61,Hedy,Hogan,400266,magna@euismod.org +62,Burton,Glass,743267,eu@Curabitur.co.uk +63,Brenda,Valenzuela,494320,turpis.Nulla.aliquet@at.net +64,Nathaniel,Rojas,707964,malesuada.ut.sem@aliquetodioEtiam.net +65,Hope,Hale,564110,dui.semper.et@nullaat.co.uk +66,Jena,Terry,837174,morbi.tristique.senectus@Donecnibh.com +67,September,Townsend,666325,Proin@Donec.ca +68,Shelley,Mccormick ,326352,Duis.gravida@actellus.edu +69,Demetrius,Nixon,268110,dignissim.Maecenas.ornare@sagittisDuisgravida.net +70,Hilda,Erickson,640660,semper@necleoMorbi.org +71,KayE,Noel,464972,Aenean.massa.Integer@anteMaecenasmi.co.uk +72,Kiayada,Trevino,541127,lectus.a@ametmetus.co.uk +73,Nerea,Marquez,542980,molestie.tellus.Aenean@nonjusto.ca +74,Wilma,Hyde,284416,id.enim@sedhendrerit.co.uk +75,Vielka,Farmer,312641,Phasellus.elit@interdum.net +76,Macey,Reid,103928,rutrum@magnisdisparturient.org +77,Lawrence,Petersen,378757,nunc.interdum@Donec.com +78,Bernard,Meyer,526863,diam.Proin.dolor@aliquet.com +79,Ingrid,Harrell,961348,nec.imperdiet@sed.co.uk +80,Rowan,Buchanan,368777,libero.lacus.varius@orciUt.org +81,Donovan,Greene,561509,enim.commodo.hendrerit@convallisincursus.com +82,Jade,Battle,531695,lectus.justo@lorem.co.uk +83,Nevada,black,504965,enim.condimentum@a.net +84,Beverly,Boyle,625739,Cum@atpedeCras.net +85,Xyla,Buckley,929603,euismod.urna@id.co.uk +86,Linus,Conner,583611,accumsan.neque@sitamet.net +87,Eliana,Knapp,249511,neque.sed.dictum@ligula.co.uk +88,Hedley,Gordon,857599,lorem.auctor.quis@loremeu.com +89,Hayfa,Church,380365,fermentum.risus@Maurisquisturpis.co.uk +90,Judah,May,126611,fringilla.purus.mauris@Nullam.net +91,Hamish,Ware,604869,est.mollis@nislMaecenasmalesuada.net 
+92,Laith,Knight,923896,porttitor.scelerisque.neque@tristiquepharetra.com +93,Avye,Franco,213223,enim.Mauris.quis@lectus.ca +94,Dawn,Travis,237331,accumsan.convallis.ante@non.ca +95,Savannah,Holcomb,471877,Mauris.blandit@elit.com +96,Justina,Branch,972166,lobortis.quam@rutrum.edu +97,Baxter,Strickland,810914,molestie.arcu@Quisquepurussapien.org +98,Yoshio,Henson,936223,vehicula@sem.edu +99,Vaughan,Slater,854240,aliquet.odio.Etiam@euultrices.net +100,Patricia,Alston,537516,tincidunt.nunc@Sedegetlacus.net +101,Hakeem,Ward,274878,Integer@faucibusorci.ca +102,Abra,Olsen,237282,Sed@Donecelementum.edu +103,Jana,Hickman,143425,cursus.non.egestas@sapien.net +104,Clinton,Washington,179247,Duis@nibh.co.uk +105,conan,black,327732,ultricies.ornare.elit@fringillacursuspurus.com +106,calista,castillo,323478,justo@fermentumconvallis.edu +107,Zenaida,Macdonald,590813,egestas.Sed.pharetra@lacus.ca +108,Astra,Mills,442039,et.nunc@nullaanteiaculis.edu +109,Indigo,Nixon,605127,laoreet.posuere.enim@Praesent.edu +110,Aurora,Bishop,994681,egestas.hendrerit.neque@malesuadamalesuada.edu +111,Jacob,Jensen,270394,erat.vel@massa.ca +112,Kylan,Martin,216635,convallis.est.vitae@acmattis.net +113,Keane,England,413437,dui@dui.edu +114,Axel,Duran,231314,facilisi@Donec.co.uk +115,LYNN,CARTER,567019,mi@risusDonec.net +116,Griffin,Chan,263299,Lorem.ipsum.dolor@felis.co.uk +117,Eaton,Beach,646837, fringilla.porttitor@euismod.ca +118,Joelle,Russell,912784,purus@Phasellusliberomauris.net +119,Yasir,Burnett,478087,lectus.Cum@Fuscefermentumfermentum.net +120,Maggie,Little,763296,ac@penatibuset.co.uk +121,Sybil,Donovan,207420,Curabitur.vel.lectus@Inscelerisque.co.uk +122,Madison,Riley,813329,nunc@quismassaMauris.org +123,Colin,Allison,312626,nunc.nulla.vulputate@faucibus.org +124,Tanisha,Johnston,306270,erat.Sed@habitantmorbitristique.com +125,Lysandra,Lane,349336,lacus.Mauris@necmetus.co.uk +126,Macy,Bell,805605,enim@augueSedmolestie.com +127,Violet,Dunn,691767,Proin.vel.nisl@maurisaliquameu.co.uk 
+128,Ina,Sexton,214316,augue.ac.ipsum@ultriciessemmagna.ca +129,Irma,Gilliam,165083,turpis@acmattis.org +130,Dahlia,Ratliff,532422,tincidunt.dui.augue@dolorsitamet.net +131,Miriam,Drake,309169,ut@molestie.edu +132,Nomlanga,Lindsay,832425,in@interdum.edu +133,Tamekah,Blevins,128721,augue.ac@justoPraesent.ca +134,Katelyn,Grimes,331598,dictum.cursus.Nunc@posuerecubilia.co.uk +135,Ann,Dillon,391437,mauris.Suspendisse@Vivamus.org +136,Kennedy,Poole,588173,scelerisque.neque.sed@pedeCumsociis.org +137,Orli,Guerra,177991,elit@ipsum.edu +138,Zeus,Downs,459124,imperdiet@Suspendisseac.ca +139,Genevieve,Hobbs,411801,ullamcorper@risusMorbi.com +140,Conan,Miles,758462,orci.lobortis@elitpellentesque.net +141,Lawrence,Sims,728987,tellus.justo@lacusEtiam.co.uk +142,Randall,Pollard,555802,porttitor.tellus.non@loremut.ca +143,Philip,Chang,903419,tincidunt@a.co.uk +144,Connor,Carr,561948,est@Nulladignissim.edu +145,Eaton,Wright,476577,facilisis.non@molestiepharetranibh.com +146,Thaddeus,Wilcox,376301,tincidunt@Duisatlacus.com +147,Deirdre,Parks,467040,arcu@duinecurna.ca +148,Ayanna,Dillon,717610,interdum@pulvinar.com +149,Ora,Middleton,322955,ornare.sagittis.felis@euismodestarcu.ca +150,Quemby,Barlow,821554,lobortis.augue@aliquamiaculislacus.co.uk +151,Alma,Frost,907669,Cras.vehicula@lacusCras.com +152,Calvin,Vang,158023,et@ultriciessem.com +153,Declan,Ayala,759613,adipiscing.Mauris@mifringilla.edu +154,Vera,Guy,108461,tellus.id@eleifendnecmalesuada.ca +155,Ina,Sykes,133280,sodales@CuraeDonec.org +156,Janna,Gordon,143092,pede@aliquetmetus.co.uk +157,Quinn,Blevins,242971,conubia@orci.edu +158,Teagan,Gardner,711315,sociis.natoque.penatibus@ut.com +159,Freya,Bryan,470348,aliquam.adipiscing@sapien.org +160,Cleo,Lewis,686677,sed@Etiamligulatortor.com +161,Hiroko,Brady,907178,ornare.tortor.at@Aliquam.org +162,Zelenia,Mendoza,289167,et.commodo.at@aliquetnec.ca +163,Lee,Ramsey,968707,est.mollis@augueeutempor.org +164,Lyle,Morin,607560,augue.eu.tellus@vitae.org 
+165,Lillith,Trevino,324262,Nullam.suscipit.est@Curabitur.org +166,Harper,Hooper,551804,rutrum@aliquet.org +167,Caleb,Emerson,219166,consectetuer@massa.net +168,Nyssa,Livingston,897187,at@tortorat.net +169,Rooney,Berry,931245,non.feugiat.nec@interdum.net +170,Cameron,Holland,393567,nec.urna.suscipit@gravidasagittis.co.uk +171,Phyllis,Mccullough,314945,neque.tellus.imperdiet@et.ca +172,Cherokee,Barton,107944,amet.dapibus@nisiCumsociis.com +173,Cade,Gentry,730222,nascetur@tellus.org +174,Melvin,Huff,466023,a.auctor.non@erosNam.org +175,Haley,Monroe,668718,Nunc.commodo@amet.net +176,Rafael,Byers,235198,Duis@hymenaeos.com +177,Sarah,Emerson,354241,Donec.vitae@Nullam.co.uk +178,Maxwell,Romero,701470,imperdiet.nec.leo@utlacus.edu +179,Nayda,Frederick,417522,nec.mollis@facilisisloremtristique.org +180,Juliet,Montoya,160370,mauris@ligulaNullam.co.uk +181,Tate,Conway,550097,fermentum@pedeultrices.ca +182,Gail,Wolf,836814,mauris.Suspendisse@velitjustonec.org +183,Axel,Oneill,105316,sollicitudin@rhoncus.net +184,Grady,Henson,250457,pede.nonummy.ut@lorem.org +185,Halla,Pruitt,749935,a.tortor.Nunc@enimCurabiturmassa.edu +186,Wallace,Wong,363440,commodo.hendrerit@rutrum.ca +187,Eden,Patrick,506668,Duis.ac.arcu@turpis.com +188,Mary,Allen,653247,libero@ipsum.co.uk +189,Nissim,Rowe,589092,pede.Cum.sociis@metusAeneansed.com +190,Marshall,Hines,155941,nec.imperdiet@nonummyFusce.com +191,Forrest,Yates,559097,ipsum@lorem.net +192,Venus,Roberts,706861,ac@mauriseuelit.net +193,Wyoming,Adkins,329497,massa@Nullam.edu +194,Nayda,Andrews,493027,diam@Cumsociisnatoque.co.uk +195,Neil,Powers,751352, Donec.egestas.Duis@iaculisodio.com +196,Merritt,Wright,130559,urna.et.arcu@velitAliquamnisl.com +197,Charissa,Logan,870961,arcu.imperdiet.ullamcorper@nislarcu.edu +198,Ashely,Mills,510619,Duis@leoMorbineque.com +199,Aretha,Lopez,228492,cursus@intempus.net +200,Elvis,Barber,707985,nec.mollis@ullamcorperviverraMaecenas.ca +201,Kuame,Cole,774288,penatibus.et@dolor.org 
+202,Kasper,Bruce,375055,aptent.taciti@tristiquepharetraQuisque.ca +203,DeirdrE,Franco,998247,Fusce.mollis.Duis@urna.com +204,Ross,Obrien,564567,magna.Duis@fermentumrisusat.edu +205,Carlos,Petersen,330307,Mauris.blandit@velit.com +206,Maia,Vang,910824,sem.semper.erat@NuncmaurisMorbi.com +207,Armand,Calhoun,760361,Donec@convalliserat.net +208,Chantale,Compton,687993,cursus@sitamet.net +209,Elijah,Noble,571946,vestibulum@rutrummagnaCras.edu +210,Hasad,Terry,536611,ipsum@loremegetmollis.edu +211,Emery,Nieves,993333,nec@elitAliquam.org +212,Denise,Farley,871811,libero.Proin@lorem.net +213,Victor,Lowe,781930,sociis.natoque@nectempusscelerisque.edu +214,Quinn,Harrington,124421,diam@elitelit.ca +215,Jade,Moss,194990,et.netus.et@lobortis.co.uk +216,Brenden,Valencia,896900,ante.Maecenas@velit.co.uk +217,Boris,Harrington,325378,neque.Nullam.ut@laoreetlectus.edu +218,Ifeoma,Whitney,543699,Proin.nisl.sem@odiovelest.edu +219,Macon,Mcdonald,357273,nisl.elementum@utcursus.net +220,Claudia,Boyle,556300,vitae.risus@sagittisDuisgravida.ca +221,Tyler,Tucker,595247,ornare.In.faucibus@utnisia.edu +222,Paki,Fisher,222290,magna.a.tortor@tempusscelerisque.org +223,Edward,Logan,817668,auctor.nunc.nulla@Aliquamerat.co.uk +224,Lara,Harding,428828,neque.pellentesque.massa@at.co.uk +225,Wade,Morrison,445191,eu.eros.Nam@tincidunt.edu +226,Garrett,Tyson,220472,metus.Vivamus@utpellentesque.edu +227,Phillip,Hodges,117182,consectetuer.ipsum.nunc@incursuset.com +228,Dahlia,Bryant,164133,quis.turpis@magnatellusfaucibus.edu +229,Raja,Rogers,348514,In.at.pede@magnisdisparturient.edu +230,Hannah,Herring,721867,senectus.et.netus@etmalesuadafames.ca +231,Barclay,Hartman,634058,odio@Cumsociis.ca +232,Gwendolyn,Pate,972379,nunc.ac.mattis@estvitae.ca +233,Basia,Perez,833778,egestas@mieleifendegestas.org +234,Celeste,Joseph,205952,urna.Nunc.quis@quisdiam.edu +235,Meghan,Underwood,521135,In.at.pede@quamvel.co.uk +236,Renee,Howard,241348,nulla@In.edu 
+237,Armando,Becker,750503,Curabitur.consequat.lectus@tortorNunccommodo.net +238,Kirk,Brennan,887040,risus.quis.diam@Inscelerisquescelerisque.net +239,August,Christensen,694378,eu.neque@felisDonec.co.uk +240,Blake,Knox,725503,posuere@Donecsollicitudinadipiscing.co.uk +241,Aurelia,Reeves,925612,Pellentesque.habitant.morbi@arcu.com +242,Leah,Hewitt,812708,pede.nec@idrisusquis.edu +243,Gage,Garner,317911,convallis.convallis.dolor@at.org +244,Sloane,Bridges ,793417,ante@etmagnis.net +245,Fleur,Dudley,315646,senectus@disparturient.edu +246,Mariam,Torres,221027,lobortis.Class.aptent@intempus.org +247,Kay,Key,421142,placerat@in.ca +248,Isaac,Hunt,994158,elit.Curabitur.sed@utmiDuis.co.uk +249,Halee,Lester,966367,nec.urna@non.net +250,Phoebe,Robles,253721,Mauris@tempor.co.uk +251,Dara,Baldwin,345619,commodo.tincidunt.nibh@Vivamus.edu +252,Justin,Clay,736404,id.libero@blandit.com +253,Jackson,Workman,658849,nisi@musProinvel.net +254,Xerxes,Spencer,317934,elit.Nulla@euligulaAenean.co.uk +255,Mallory,Gill,111643,Nullam.ut.nisi@Nullamfeugiatplacerat.ca +256,Marsden,Benton,102066,a.neque.Nullam@sem.co.uk +257,Amela,Vaughn,549367,sem.mollis.dui@Proin.ca +258,Tarik,Butler,629716,Etiam.laoreet.libero@vitaeodiosagittis.edu +259,Quintessa,Collins,701782,Integer.in.magna@mollis.net +260,Quon,Hayes,983029,auctor@orciPhasellus.co.uk +261,Astra,Dodson,998620,Pellentesque.ultricies.dignissim@etultrices.ca +262,Kimberly,Fitzgerald,549261,sed.leo.Cras@elitsed.com +263,Shoshana,Craft,908163,vel.vulputate@consectetuer.co.uk +264,Riley,Johnston,345836,risus@semut.co.uk +265,Sasha,Noble,336154,ac@auctorodio.org +266,Howard,Cooley,167868,faucibus@ametultricies.org +267,Travis,Baxter,199047,Nunc.ullamcorper.velit@nulla.com +268,Cole,Olson,641456,egestas@nuncnullavulputate.edu +269,Priscilla,Leon,747764,eu@arcu.net +270,Raphael,Cherry,543735,adipiscing@ornareelitelit.edu +271,Adele,Conrad,281988,Nullam.vitae@aliquamarcuAliquam.co.uk +272,Brenden,Hester,305612,Cras@egestas.ca 
+273,Patrick,Ward,904486,massa@nascetur.org +274,Ocean,Mckinney,697539,tempus@aliquetsemut.net +275,Hilel,Allison,702056,dignissim.Maecenas@velit.edu +276,Willa,Diaz,157615,Aliquam.tincidunt@metuseuerat.co.uk +277,Unity,Pierce,625980,Sed.molestie@urnaNuncquis.co.uk +278,Maisie,Terry,305212,eget.laoreet@nonsapien.ca +279,Patrick,Mills,202984,id@dui.com +280,Beck,Lindsey,775469,eleifend.egestas@justositamet.edu +281,Yvette,Brooks,404844,Aliquam@vitaeposuere.net +282,Lionel,Hernandez,988901,semper@ut.com +283,Stone,Stuart,579927,molestie.pharetra@Nullamsuscipitest.com +284,Lane,Shannon,417044,sit@liberoest.ca +285,Ramona,Romero,281912,dapibus.quam.quis@sedpede.edu +286,Paul,Fitzpatrick,848894,rhoncus.Proin@interdum.com +287,Donna,Herrera,280439,parturient.montes.nascetur@dolorNullasemper.co.uk +288,Jonah,Cooper,688900,diam.lorem.auctor@eleifendnunc.com +289,Talon,Larsen,964146,orci@ut.net +290,Simon,Nguyen,866811,volutpat.nunc.sit@neceuismod.net +291,Kenyon,FOX,809860,ut.dolor@Suspendissealiquetmolestie.edu +292,Lucius,Myers,849864,Nam@acmattissemper.org +293,Montana,Dillon,965302,vulputate.ullamcorper.magna@vestibulummassa.co.uk +294,Wyoming,Williams,454586,odio.a.purus@Suspendisse.co.uk +295,Hadley,Miles,799964,montes@hendreritDonec.edu +296,Adam,Nash,776153,dictum.magna.Ut@Sedeu.edu +297,Blaine,Franklin,259300,at.velit@dignissimmagnaa.net +298,Chaim,Mathews,458103,nunc.interdum@lacinia.co.uk +299,Meghan,Whitaker,352355,leo@tempus.org +300,Jamalia,Bryan,693188, velit.Sed.malesuada@Nullatincidunt.ca +301,Honorato,Serrano,843356,amet.metus.Aliquam@Naminterdum.com +302,,Copeland,547803,Vivamus.non.lorem@sed.ca +303,Cleo,Gallegos,334568,nisi.a.odio@tortordictumeu.net +304,Kay,Doyle,360887,Phasellus.dolor.elit@nequesedsem.ca +305,Cooper,Washington,297316,scelerisque.sed.sapien@adipiscing.com +306,Rhoda,Bruce,562363,mollis@pellentesque.co.uk +307,Whoopi,Vance,781305,Aliquam@risus.co.uk +308,Jermaine,Francis,972382,convallis@vulputateduinec.com +309,Owen, 
Franklin,371448,risus@orciquis.net +310,Howard,Bell,351675,sagittis.felis@Proinvel.co.uk +311,Kenneth,Bird,437680,est@utpharetrased.org +312,Yen,Bush,365274,odio@mollisneccursus.org +313,Leo,Fischer,196672,Vestibulum.accumsan.neque@vestibulum.co.uk +314,Morgan,Britt,419533,sem.consequat@luctus.org +315,Quamar,Henderson,438148,magnis@enimsit.co.uk +316,Hanae,Lancaster,889145,in.felis.Nulla@aliquetodio.net +317,Hyatt,Becker,734683,Fusce@porttitor.net +318,Marvin,Dyer,152047,tellus.non.magna@sitametluctus.ca +319,Isaiah,Salazar,242021,litora@tortorat.org +320,Allistair,Dudley,114908,tellus.justo.sit@lobortis.ca +321,Maisie,Le,766314,rhoncus.Proin.nisl@In.edu +322,Dana,Waters,633792,amet@luctusCurabitur.edu +323,Austin,Fields,425272,pretium.et@posuerecubiliaCurae.ca +324,Keane,Ellis,838037,mauris@erosnectellus.net +325,Risa,Good,154586,felis@velit.net +326,Casey,Barton,712872,ut.nisi.a@sem.com +327,Neil,Sloan,216397,consectetuer@pedesagittisaugue.com +328,ingrid,horne,122102,molestie@Aeneanegetmetus.net +329,Gage,Weiss,171513,eget.lacus.Mauris@augue.edu +330,Justina,Vega,571497,dui.nec@etmalesuadafames.ca +331,Lars,Velazquez,435704,dolor.dolor.tempus@luctus.ca +332,Chloe,Trujillo,290140,montes@tinciduntpedeac.edu +333,Lillian,Chen,909714,risus.Morbi.metus@nullaInteger.net +334,Leslie,Hutchinson,822166,libero.Morbi@sit.edu +335,Dane,Howell,775057,Integer.tincidunt.aliquam@Curabiturutodio.org +336,Jacob,Burns,804620,a.feugiat@afacilisisnon.co.uk +337,NEHRU,MENDOZA,859105,porttitor.interdum.Sed@Loremipsum.co.uk +338,Kerry,Cortez,936316,nulla.Integer.urna@amet.edu +339,Allegra,Roth,594536,et@ategestasa.edu +340,Richard,Baird,825702,auctor.velit.Aliquam@auctorquis.net +341,Ori,Hendricks,397940,nisl.Nulla@dolor.net +342,Caryn,Finch,509571,pede.Nunc.sed@rhoncusDonec.edu +343,Ezekiel,Whitney,400714,sem@sodalespurus.ca +344,Marcia,Luna,989332,vulputate.risus@condimentumegetvolutpat.net +345,Dolan,Obrien,780898,Phasellus.ornare@odioEtiamligula.com 
+346,Imogene,Cummings,430799,bibendum.sed@nislarcuiaculis.net +347,Lunea,Hobbs,422832,vulputate.posuere@loremvitaeodio.org +348,Brent,Barry,535835,scelerisque.neque@tellus.ca +349,Aladdin,Daniel,958898,mollis.Integer.tincidunt@Suspendisseseddolor.com +350,Hedwig,Yates,330896,enim.Etiam.imperdiet@consequat.net +351,Honorato,Oliver,270300,enim.Etiam.gravida@adipiscingelitEtiam.ca +352,Marcia,Hughes,956895,faucibus.orci@sedconsequatauctor.co.uk +353,Ila,Kinney,315162,augue.ac@sed.com +354,Patrick,Hanson,107616,nunc.In@molestiesodalesMauris.co.uk +355,quin,herring,485857,sit.amet@maurisSuspendisse.co.uk +356,Ulla,Sharpe,308306,purus.ac.tellus@variuset.edu +357,Piper,Levine,524519,morbi@augue.com +358,Myra,Francis,200907,nibh@Suspendissesagittis.edu +359,Amanda,Barton,450687,felis.ullamcorper@pharetranibh.ca +360,Orli,Langley,404206,nulla.at@commodo.net +361,Teegan,Knox,354634,Cum.sociis@dolorDonec.co.uk +362,Rhiannon,Garza,294158,tortor.Integer.aliquam@Nullaeget.co.uk +363,McKenzie,Freeman,120226,vulputate.dui.nec@ligula.co.uk +364,Jeremy,Cobb,749524,adipiscing.Mauris.molestie@diam.co.uk +365,Adele,Frederick,871183,tincidunt.orci.quis@sapienNunc.net +366,Rebecca,Lynch,995385,ultrices@mauris.co.uk +367,Summer,Nash,333787,venenatis.lacus@Pellentesque.co.uk +368,Idola,Espinoza,759943,sed.consequat.auctor@utaliquam.net +369,Miriam,Smith,702940,Nunc.lectus.pede@inlobortis.ca +370,Anthony,Haley,213130,euismod.et.commodo@ultriciesornare.com +371,Davis,Hardin,620920,vulputate.mauris@Praesenteu.ca +372,Nigel,Taylor,381294,ante.iaculis.nec@elitNulla.org +373,Samantha,Beck,382531,neque.Nullam@malesuada.ca +374,Hammett,Hurley,557766,Quisque@loremvitaeodio.co.uk +375,Daria,Huffman,352383,malesuada@nibhAliquamornare.co.uk +376,Hillary,Luna,261286,Quisque.nonummy.ipsum@gravida.com +377,Charles,Walton,232113,auctor.odio@sitamet.com +378,Lesley,Salazar,152561,Aliquam@iaculis.com +379,allegra,hendrix,632689,vitae@dictum.co.uk +380,Maxwell,Clements,284241,urna.Ut.tincidunt@varius.org 
+381,Kyra,Thomas,622385,tincidunt.pede.ac@Nunclaoreetlectus.com +382,Jarrod,Vincent,234168,cursus.Integer@tempus.org +383,Whoopi,Nelson,141899,dis@acturpis.org +384,Leslie,Rios,286012,venenatis@mollis.net +385,Keith,Hensley,225178, volutpat.ornare@Nuncacsem.com +386,Jared,Lowe,635445,iaculis@luctusvulputate.ca +387,Palmer,Acevedo,168722,ut.pellentesque.eget@eleifendCrassed.org +388,Hasad,Hampton,577257,semper.cursus.Integer@nequevenenatislacus.edu +389,Olga,Melton,463371,pellentesque@Aliquam.edu +390,Blossom,Berg,444711,Nunc.laoreet@Sed.edu +391,Olympia,Conway,918834, est.congue@tempordiam.ca +392,Sacha,Burris,102476,nec@aliquetlibero.com +393,Baker,Patterson,473992,molestie.tortor@dictummi.org +394,Basil,Hale,257894,euismod.urna@nunc.ca +395,Hollee,Horton,247577,vitae@maurisidsapien.com +396,Quemby,Battle,447345,elit.Nulla@nonarcuVivamus.com +397,Chaney,Harrison,867989,ut.odio@duiquisaccumsan.com +398,Chelsea,Trujillo,914221,tempor.bibendum.Donec@condimentumDonecat.ca +399,Libby,Warner,540499,blandit@aliquet.com +400,Giacomo,Mays,666153,sed.pede@lobortisnisi.edu +401,Hilel,Vasquez,661814,amet.risus.Donec@lacuspedesagittis.edu +402,Stephen,Manning,207724,cursus@miacmattis.net +403,Alexis,Thomas,558496,Integer@Morbi.edu +404,Ori,Fox,350150,posuere@Class.edu +405,Drew,York,490453,nonummy@scelerisque.net +406,Kasper,Gates,415234,cursus@semmolestiesodales.com +407,Price,Kemp,384774,pharetra.sed@rhoncusNullamvelit.org +408,Clark,Travis,305257,penatibus.et.magnis@aliquetmolestietellus.org +409,Margaret,Levine,907761,Nulla.tincidunt@Nunc.edu +410,Madeline,Head,557604,dignissim@sed.org +411,Alana,Morse,302529,in@egestasblanditNam.co.uk +412,Kelly,Savage,104851,sit.amet.orci@euaccumsansed.net +413,Allegra,Benjamin,735727,vitae.posuere.at@cursusInteger.co.uk +414,Jolie,Francis,509057,mauris@pharetra.ca +415,Courtney,Salinas,114392,nec@nunc.net +416,elvis,RIVERA,865008,Quisque.libero@vitaesodales.net +417,Astra,Good,450789,Donec.egestas@arcuVestibulum.org 
+418,Leigh,Berry,714485,taciti.sociosqu.ad@Crasvulputate.net +419,Lareina,Page,827641,cursus.Nunc@dolor.org +420,Freya,Hunt,858990,Donec.fringilla.Donec@sedconsequat.org +421,Candice,Tanner,319493,commodo@dolorelit.org +422,Chancellor,Bean,563449,natoque@orciDonecnibh.ca +423,Savannah,Madden,396985,quis@lectusante.net +424,Barbara,Dale,322237,consequat.lectus.sit@Namnulla.edu +425,Lewis,Martinez,723313,Aenean.gravida@uterat.com +426,ROSALYN,GLENN,854410,magna@Etiam.org +427,Melissa,Hodge,846250,id.mollis@eratvolutpatNulla.co.uk +428,Sebastian,Willis,756832,neque.vitae.semper@In.org +429,Angela,Mitchell,701287,Curabitur.dictum@magna.org +430,Xavier,Coffey,608542,est.congue@Nullam.net +431,Galena,Richardson,495958,et.ultrices@Crassedleo.ca +432,Carter,Nash,973247,convallis.in.cursus@eu.org +433,Kai,Chase,129206,nec.tempus@vulputateposuerevulputate.net +434,Cherokee,Galloway,740378,elit.Aliquam.auctor@velitSedmalesuada.com +435,Howard,Ryan,327228,dignissim@elitCurabitur.net +436,Larissa,Gilliam,959308,libero.Morbi@nec.org +437,Yasir,Castillo,645135,dapibus@utpharetrased.edu +438,Harding,Alvarado,408584,Cum.sociis@arcu.net +439,Ferdinand,Warner,863753,at@malesuadaIntegerid.co.uk +440,Leila,Jacobson,706851,vulputate.posuere@Mauris.net +441,Edward,Goff,876751,sagittis.lobortis@egetmagnaSuspendisse.co.uk +442,Slade,Cotton,725255,congue@Fuscefermentumfermentum.co.uk +443,Anjolie,Lopez,987562,tellus.Phasellus.elit@Curabiturdictum.ca +444,Roary,,607318,mi.pede@orci.ca +445,Mechelle,Weeks,226883,ullamcorper@sit.net +446,Nomlanga,Austin,608139,euismod.mauris@elit.net +447,Cade,Padilla,663467,Fusce.mollis@nonhendrerit.org +448,Bernard,Phelps,847042,eu.ligula.Aenean@parturient.co.uk +449,Kyle,Rivas,857881,feugiat.non@mipede.net +450,Tobias,Thornton,621323,eros.turpis@Aliquam.co.uk +451,Emma,Madden,272050,ac.mattis.semper@Aeneansedpede.org +452,IVAN,WILKINS,932894,mus@massa.net +453,Fredericka,Cunningham,680753,sed@scelerisque.edu +454,Glenna,Avila,114495,per.conubia@utodio.co.uk 
+455,Montana,Velazquez,428025,Integer.vitae.nibh@Suspendissetristique.ca +456,Shelly,Franks,295600,sed.est@amet.edu +457,Roary,Shannon,538130,aliquam.arcu.Aliquam@Nulla.edu +458,Hyatt,Paul,615541,per@augueacipsum.ca +459,Hayes,Daniels,229343,penatibus.et.magnis@eget.co.uk +460,Kim,Spence,496636,eget.metus.In@egetodio.ca +461,Kim,Schmidt,836345,in.felis.Nulla@pedePraesenteu.net +462,Arthur,Wiggins,525139,arcu.Vestibulum@Fuscefeugiat.edu +463,Rylee,Lane,270321,dolor.egestas@sem.org +464,Dexter,Stanley,169979,congue@aliquetdiamSed.com +465,Brian,Dillon,126548,magnis@magnis.edu +466,Adele,Bates,341901,blandit@Donectemporest.edu +467,Melanie,Salazar,356065,Fusce@PhasellusnullaInteger.co.uk +468,Neve,Heath,774221,convallis.ligula.Donec@cursusInteger.com +469,Russell,Simpson,950502,lorem@utdolor.net +470,Clark,Lynn,421053,vel.sapien.imperdiet@enimEtiamgravida.edu +471,Athena,Evans,231014,tellus.faucibus@Namtempordiam.net +472,Joan,Webster,539060,erat.Etiam.vestibulum@eget.org +473,Quon,Mcclure,459436,justo.nec.ante@Phasellus.ca +474,Isabelle,Rowe,181297,euismod.est@luctuslobortisClass.ca +475,Hilda,Mccray,566107,mauris.aliquam@scelerisquedui.com +476,Ivana,Chapman,357130,et@Cras.co.uk +477,Hope,Oneal,893425,ut@ut.org +478,Nita,Hubbard,232371,Etiam@Nullafacilisis.edu +479,Preston,Foley,304656,venenatis.vel@mattis.ca +480,Fritz,Dorsey,338234,libero@vel.ca +481,Jared,Owens,139175,vel.mauris@molestie.edu +482,Ryan,Wolf,507983,nisi.dictum@non.org +483,Brock,Cobb,378204,Integer.in@tinciduntpedeac.co.uk +484,Kyla,Travis,320299,urna.suscipit.nonummy@nisiCum.net +485,Jenette,Knox,443063,vel.quam.dignissim@turpisvitae.co.uk +486,Porter,Dawson,473795,cursus.et.magna@malesuadamalesuada.org +487,Mason,Daniels,405942,mauris.Morbi@volutpatnunc.org +488,George,Conway,211606,Quisque.porttitor@vitaesodales.com +489,Francesca,Lyons,401449,eu.odio.tristique@volutpatnunc.net +490,Phelan,Sanchez,410518,ultricies@aultriciesadipiscing.co.uk 
+491,Frances,Pennington,566279,ipsum@justoPraesentluctus.ca +492,Julian,Clark,228716,risus.Quisque@Etiamvestibulummassa.net +493,Hanae,Baxter,947537,consectetuer.adipiscing@doloregestas.ca +494,Blaine,Barker,874570,elit.pretium@sociisnatoquepenatibus.net +495,Bevis,Ross,953508,gravida.Aliquam@dictumultricies.com +496,Ivy,Moore,479275,aliquet.nec@ipsumleo.org +497,Justine,Prince,945163,gravida.Praesent.eu@sagittisfelis.edu +498,Russell,Franks,390033,sollicitudin@tellussemmollis.org +499,Kerry,Howell,730368,sed.consequat@ac.ca +500,Aline,Roth,342499,sodales.Mauris.blandit@odiotristiquepharetra.ca +501,Clarke,Whitney,519150,urna.suscipit@Integer.net +502,Anastasia,Washington,769094,Donec.dignissim@fermentumrisus.net +503,Beverly,Mcclure,935799,Pellentesque.ut.ipsum@aliquetProin.net +504,Tana,Malone,393227,velit@Suspendissenonleo.co.uk +505,Abraham,Phelps,404553,diam.Pellentesque@adipiscingfringilla.ca +506,Gage,Velazquez,553891,in.molestie.tortor@lectusrutrum.net +507,Charissa,Sampson,477902,sem@Nam.net +508,Francis,Carpenter,941382,non.hendrerit@aliquetnec.ca +509,Portia,Owen,938586,Suspendisse.eleifend@feugiat.co.uk +510,Teegan,Gibson,544890,pharetra.ut.pharetra@ipsum.com +511,Anika,Blackwell,454596, quam.dignissim@egestasa.ca +512,Bert,Douglas,785122,Donec.egestas.Aliquam@bibendumsedest.edu +513,Noble,Nielsen,343536,sit@Quisqueimperdiet.edu +514,Zeph,Ramos,514118,consectetuer.adipiscing@sodalesat.co.uk +515,Daria,Hensley,161501,quis@metusfacilisis.edu +516,Josiah,Thompson,605414,ligula@nonummyipsumnon.ca +517,Wayne,Wilson,973758,nonummy@iaculis.co.uk +518,Upton,Stark,842834,turpis@eterosProin.com +519,,Williamson,231131,Nullam.nisl@massa.ca +520,Sean,Day,184141,est.tempor.bibendum@velitAliquam.co.uk +521,Peter,Buckner,219255,enim@ametdapibus.ca +522,Callie,Coffey,119634,imperdiet.ullamcorper@nec.com +523,Owen,Haney,565606,in.magna.Phasellus@enim.edu +524,Gwendolyn,Joyner,850422,metus@Nullaeuneque.com 
+525,Irene,Moreno,497674,scelerisque.sed.sapien@natoquepenatibus.edu +526,Elijah,Fowler,100410,Nunc.lectus@Ut.com +527,Lynn,Bond,685097,facilisis@posuerecubiliaCurae.ca +528,MacKenzie,Dickson,355792,pharetra@utmi.ca +529,Xavier,Goodwin,890319,litora.torquent@suscipitest.co.uk +530,Noelle,Wyatt,876274,ipsum.ac.mi@vitaesemperegestas.com +531,Cara,Ewing,754039,lorem@magnaet.org +532,Shelley,Norton,932551,vulputate.eu@egestasblanditNam.ca +533,Peter,Mayer,970757,Curabitur.egestas.nunc@mollis.ca +534,Ethan,Carey,413142,nascetur@penatibus.ca +535,Jade,Roy,875316,malesuada@musProinvel.org +536,cain,payne,743983,sem.mollis.dui@doloregestas.org +537,Travis,Kim,153708,porta@arcuiaculisenim.net +538,jana,,824675,nisi.dictum.augue@nonenimcommodo.co.uk +539,Maggy,Barron,912355,augue@acmattisornare.edu +540,Helen,Winters,421764,risus@duiinsodales.net +541,Unity,Hays,652712,In.at.pede@Proinvel.ca +542,Lionel,Payne,927196,orci.lobortis.augue@ettristique.ca +543,Gregory,Hayes,624656,lorem.ut.aliquam@posuereenim.edu +544,Jocelyn,Rodriquez,894404,nibh@luctussit.ca +545,Sacha,Serrano,925977,magna@aliquetmagnaa.net +546,Nolan,Schmidt,209163,dictum.mi@diam.org +547,Brandon,Valdez,564197,consectetuer.mauris@risus.com +548,Kyra,Lambert,247034,adipiscing.fringilla.porttitor@euismodacfermentum.net +549,Leah,Cohen,523078,dui.Fusce@Nam.org +550,Pearl,Mclaughlin,621755,Sed.neque@Proinvelit.edu +551,Priscilla,Powers,963196,litora@semperauctor.co.uk +552,Fitzgerald,,317355,elit.a@velit.co.uk +553,Dai,Sanders,362494,Sed.id.risus@idmollis.com +554,Jamalia,Newman,427497,turpis@Cum.net +555,Nora,Sharpe,416739,mollis.Phasellus.libero@atliberoMorbi.co.uk +556,Hilda,Foley,651615,Quisque.fringilla.euismod@tempordiam.edu +557,Chandler,Stein,842398,sit@aliquet.org +558,Myles,Warner,902645,lectus.Nullam.suscipit@nonsapienmolestie.ca +559,Samantha,White,160200,penatibus.et.magnis@sit.org +560,Clementine,Good,965886,sed@Donecest.com +561,Kirk,Burks,951476,a.scelerisque@odioPhasellus.co.uk 
+562,Jin,Cox,272842,urna@et.com +563,Kathleen,Webster,149511,tristique@sapiencursus.co.uk +564,Quail,Norman,372384,ac.risus.Morbi@feugiat.edu +565,Chantale,Acevedo,996688,porttitor.scelerisque.neque@nunc.net +566,Whoopi,,125742,feugiat.non.lobortis@enimSednulla.net +567,Ryan,Sanchez,101779,urna.et@Sed.ca +568,Peter,Cleveland,719219,lacus@tincidunt.edu +569,Irma,Hartman,537105,erat.Vivamus.nisi@consequat.co.uk +570,Tate,Valdez,792159,lacus.pede@lorem.edu +571,Acton,Copeland,864448,malesuada.Integer@vestibulum.ca +572,Ginger,Nelson,475426,Duis.gravida.Praesent@vulputate.net +573,Carl,Powell,801279,luctus.vulputate@velit.net +574,Patrick,Mcpherson,457877,sollicitudin.commodo.ipsum@diamDuis.net +575,Alisa,Waller,159521,sed@diam.org +576,Tallulah,Sampson,570633,Quisque.varius.Nam@auctorMaurisvel.com +577,Mona,House,352840,Etiam.laoreet.libero@Utsagittislobortis.net +578,Reece,Ford,564400,cubilia.Curae.Donec@sitamet.net +579,Alexis,Hammond,559816,nascetur@Uttincidunt.ca +580,Anika,Padilla,749473,amet.luctus.vulputate@Loremipsum.ca +581,Vladimir,Stephens,159697,vitae.semper@aliquetliberoInteger.edu +582,Brooke,Buckner,328099,Nunc@risusQuisque.com +583,Gregory,Jacobs,182283,magna.Phasellus.dolor@hymenaeosMauris.org +584,Dominic,Lambert,124504,Curabitur@Loremipsumdolor.net +585,Nina,Melendez,894476,et.magnis@ultriciesadipiscing.org +586,Blaze,Torres,780763,euismod.urna.Nullam@Lorem.edu +587,Patience,Mcclain,886444,dui.quis@odio.org +588,Justina,Robles,595773,Vivamus@etnunc.org +589,Bo,Camacho,408656,dis.parturient@sitametfaucibus.edu +590,Uriel,Walsh,744155,sem@Suspendissealiquet.ca +591,Ulysses,Ray,243873,tristique.pellentesque@nibhAliquam.co.uk +592,Rana,Newman,590428,dolor.Fusce@arcu.com +593,Theodore,Underwood,779127,ipsum.dolor.sit@Aenean.edu +594,Sydnee,Lyons,625371,Nam.ligula@Nunc.ca +595,Janna,Mcclain,900655,augue.id.ante@ultricessit.com +596,Bevis,Mcdaniel,218265,et@liberoProinmi.org +597,Dai,Mercado,432831,volutpat@nibhQuisquenonummy.net 
+598,Chantale,Beard,271975,eu@sit.edu +599,Kristen,Wilson,981676,sit@in.org +600,Steel,Roberson,482479,fermentum.risus.at@Crasdictum.net +601,,Newton,601770,Duis.ac@nuncinterdum.edu +602,Shelley,Talley,888035,in@dignissim.edu +603,Yuri,Roach,610231,amet@tinciduntaliquamarcu.com +604,Carissa,Reyes,802530,Suspendisse.ac@urnaUt.co.uk +605,Dane,Sloan,291337,eu.tellus.eu@Sed.edu +606,Armand,Pitts,390966,ligula@variuseteuismod.com +607,Garrett,Leblanc,238108,morbi@nibhlaciniaorci.co.uk +608,Ava,Boyd,176195,In@quisarcuvel.org +609,Gary,Noble,259521,felis.purus.ac@Intincidunt.co.uk +610,Adria,Sparks,120331,tellus.sem@idrisus.com +611,Joelle,Cantrell,153536,molestie.orci.tincidunt@adipiscing.ca +612,Leonard,Robbins,935746,Nunc.quis@Donecsollicitudin.org +613,,,104969,dictum@Suspendisse.net +614,Zenaida,Vazquez,503503,In@Sedeunibh.co.uk +615,Azalia,Greene,623530,ultrices.iaculis.odio@rutrumjustoPraesent.co.uk +616,Jemima,Livingston,720652,nunc@nec.ca +617,Abel,George,821215,Nunc@lectus.net +618,Althea,Simon,208527,Aenean@variusultricesmauris.net +619,Ulla,James,925253,nisi.magna@ipsum.net +620,Walker,Aguirre,453940,molestie.orci.tincidunt@Maecenas.org +621,Kameko,Hester,592235,eleifend@at.co.uk +622,Eugenia,Guerrero,732609,ligula.elit.pretium@porttitor.org +623,Carson,Blackburn,241522,nibh.Quisque@aliquet.net +624,Cassidy,Salas,496535,faucibus.leo@odioNaminterdum.edu +625,Galena,Holland,460962,at.risus@tincidunt.co.uk +626,Harper,Vaughan,847177,ipsum.non@miac.co.uk +627,Wesley,Rodgers,336970,lectus@sodaleseliterat.com +628,Jesse,Wheeler,806566,sem@Mauris.ca +629,,Riggs,242923,nulla.ante.iaculis@erat.net +630,Tate,Cherry,670600,nec.tempus@cubiliaCurae.edu +631,Caesar,Gilliam,472369,massa@Nunc.ca +632,Laith,Morales,402010,dui.nec.tempus@eu.ca +633,Vaughan,Humphrey,372491,enim.Etiam@Nullafacilisi.ca +634,Nicole,Farmer,882755,elementum@scelerisque.com +635,Dominic,Potter,723524,a.dui@anteiaculis.co.uk +636,Wylie,Melton,346793,malesuada.augue.ut@vitaeerat.net 
+637,Graiden,Spears,529767,lorem@luctusCurabitur.org +638,Grace,Bush,227601,Cras.eu@tellusnon.net +639,Alisa,Bowen,360394,ad.litora@eu.org +640,Quon,Hart,386479,semper@ultricesa.org +641,Gavin,Vazquez,677647,malesuada.fames@neque.com +642,Chester,Adams,940100,luctus@rhoncusNullam.edu +643,Ivory,Richards,895136,montes.nascetur.ridiculus@massaSuspendisse.net +644,Ella,Graves,345159,purus.sapien.gravida@laoreetposuereenim.org +645,Hamish,Wise,221534,gravida.nunc.sed@uterosnon.ca +646,Hanna,Rowe,923160,sit.amet.luctus@magnaseddui.net +647,Chaney,Winters,563976,quam@Utsagittislobortis.com +648,Jillian,Warner,948293,purus@necdiam.com +649,Dana,Buckley,691043,convallis@senectusetnetus.org +650,Ursa,Mcclure,601035,commodo.hendrerit@tortorIntegeraliquam.com +651,Tallulah,Monroe,556024,arcu.Sed@liberoInteger.co.uk +652,Alexandra,Rogers,484590,sit.amet@ipsumcursus.org +653,Wendy,Snider,845560,felis@ullamcorpervelit.com +654,Quinn,Mays,272201,Curabitur.consequat.lectus@elitdictum.org +655,Larissa,Norris,581445,ultricies.adipiscing@et.com +656,Michelle,Barnett,111978,lectus.justo@Aeneaneuismodmauris.net +657,Alexandra,Burt,421216,molestie@facilisiSedneque.net +658,Troy,Dickerson,493139,ornare@sapienmolestie.net +659,Nissim,Gordon,159613,Nunc@penatibusetmagnis.net +660,kira,koch,592428,non.justo@sodalesnisi.org +661,Nevada,Guy,202838,Mauris@Integer.edu +662,Karyn,Ewing,746159,posuere@eusem.edu +663,Elliott,Duncan,703131,sit.amet@auctorvitaealiquet.com +664,Amal,Larsen,976465,magna@tellusSuspendissesed.org +665,Sage,Reid,785514,justo.nec@elitsedconsequat.com +666,Athena,Thornton,759990,orci.luctus@molestie.edu +667,Daniel,Nguyen,755618,fringilla.euismod@et.edu +668,Kalia,Christensen,661281,justo.sit.amet@arcu.co.uk +669,Bertha,Mercado,474646,non.lobortis@liberoProin.org +670,Maia,West,573172,enim.diam.vel@seddolor.net +671,Jenette,Nixon,706258,magna.Praesent@nequeMorbi.org +672,Zena,Bentley,872179,id.sapien.Cras@ametluctus.com 
+673,Vielka,,270462,metus.Aenean@lectusNullamsuscipit.edu +674,Jana,Michael,205695,lorem.Donec@variuseteuismod.net +675,Florence,Bird,729671,fringilla.Donec@felispurus.net +676,Odysseus,Alford,772403,aliquam@dapibus.co.uk +677,Chiquita,Cote,237271,et.magnis@nasceturridiculusmus.net +678,Thomas,Carroll,298458,arcu@consectetuer.com +679,Nolan,Lancaster,131937,netus@sitamet.edu +680,Martin,Pruitt,921996,ac@ligulaNullam.edu +681,Phoebe,Rasmussen,173526,eu.dolor@Etiam.com +682,George,Bradshaw,787467,augue.eu@acfeugiatnon.edu +683,Hedda,Erickson,953661,tellus@sem.edu +684,Phyllis,Cameron,895781,scelerisque.sed.sapien@elitfermentum.ca +685,Denton,Norman,844752,Quisque@nonlaciniaat.ca +686,Burke,Webb,589872,eu.odio@Phasellusnulla.org +687,Fay,Irwin,118161,metus@elitsedconsequat.com +688,Lareina,Nicholson,212479,non.lacinia@feugiat.com +689,Rebekah,Whitehead,848138,laoreet.libero@eu.org +690,Talon,Daniel,537714,vulputate@adipiscingligula.com +691,Kylie,Travis,252753,augue.scelerisque@pedemalesuadavel.co.uk +692,April,Luna,202101,tempor.diam.dictum@Crasinterdum.org +693,Colt,Richardson,927520,aliquet.magna@Aeneanegetmetus.com +694,Cathleen,Nieves,830704,ac@Fuscemi.co.uk +695,Rhea,Simmons,451662,Curae.Phasellus@atortor.co.uk +696,Palmer,Tran,739659,vitae@loremsitamet.com +697,Chantale,Shelton,327861,ipsum.Phasellus@Sedauctorodio.org +698,Aretha,Rose,115341,magna.Duis@Donec.ca +699,Serina,Buchanan,416987,id@dolorelitpellentesque.co.uk +700,Xanthus,Brooks,734337,a@ornare.edu +701,Henry,English,571478,malesuada.fames.ac@sitamet.org +702,Yoshi,Dunlap,391466,lacus@nullaIntincidunt.org +703,Josephine,Mcconnell,653622,magna.Praesent@eueuismod.org +704,Doris,Dale,267801,scelerisque.mollis.Phasellus@blanditcongueIn.org +705,Christopher,Jenkins,272116,risus.Donec@Utsagittis.co.uk +706,Rina,Shaw,731200,scelerisque.dui@et.com +707,Vanna,Bates,701827,non@tempusmauriserat.co.uk +708,Blaine,Hogan,661959,iaculis@tristiquepharetraQuisque.co.uk 
+709,Ulla,Howe,271626,sem.elit.pharetra@penatibuset.edu +710,Stone,Cleveland,588083,ac@vulputaterisus.net +711,Breanna,Wooten,185469,dictum@massanon.net +712,Bethany,Britt,847799,nascetur@liberoestcongue.com +713,Farrah,Hancock,111879,nonummy@eros.edu +714,Darrel,Head,106817,Phasellus.dapibus@quam.org +715,Chloe,Avery,671073,lobortis@ligulaNullamfeugiat.org +716,chase,duffy,913722,Proin.mi@afelisullamcorper.com +717,Suki,Phelps,959927,velit.Pellentesque.ultricies@iaculislacus.org +718,Minerva,Carver,914811,quam.elementum@Namporttitor.com +719,Perry,Mcfarland,207782,nisi@Integerinmagna.edu +720,Blossom,Mendez,515461,adipiscing@utnisia.ca +721,Garrett,Walton,784488,sed.facilisis.vitae@sodalesnisi.net +722,Leo,Townsend,572972,natoque@felisadipiscingfringilla.org +723,Dana,Franklin,732831,justo@pedeacurna.net +724,Gwendolyn,Kerr,309482,interdum@duiFusce.org +725,Quemby,Hahn,593955,semper@sociisnatoquepenatibus.net +726,Velma,Woodard,939943,Sed@ametorci.ca +727,Roanna,Frost,347168,a@blanditviverraDonec.net +728,Kasimir,Cox,593917,Proin@faucibusorci.ca +729,Sage,Weaver,958719,dignissim@sedhendrerit.co.uk +730,Iola,Best,294186,bibendum.fermentum.metus@erat.edu +731,Olivia,Landry,571384,ac.turpis@vitae.net +732,Baker,Alvarado,850487,elementum.dui@auguemalesuada.co.uk +733,Yuli,Levine,907142,montes.nascetur@eu.org +734,Blair,Berger,999511,turpis.egestas.Aliquam@fermentum.org +735,Mari,Duncan,516520,luctus.ipsum.leo@velitPellentesqueultricies.net +736,Shana,Barr,714216,arcu.Curabitur@et.ca +737,Kitra,Hudson,172599,Vestibulum.accumsan@euduiCum.org +738,Salvador,Roach,551142,Aliquam.erat.volutpat@natoque.ca +739,Moses,Crosby,670863,Sed.diam.lorem@molestie.org +740,Mona,Hull,580591,rutrum@pellentesquemassalobortis.org +741,Paula,Cash,805539,arcu@ultriciesadipiscingenim.edu +742,Finn,Randolph,646011,mauris.aliquam@duiCum.edu +743,Vladimir,Lee,463223,Curabitur.dictum.Phasellus@amet.net +744,Unity,Decker,456649,egestas@pedesagittis.org 
+745,Aristotle,Mayo,697968,habitant.morbi.tristique@aliquet.co.uk +746,Mona,Vincent,666738,auctor.ullamcorper@Donecegestas.org +747,Adena,Hamilton,235537,nec@Aliquamerat.ca +748,Thaddeus,Hughes,507661,laoreet.posuere.enim@Integer.co.uk +749,Kimberly,Bruce,954208,morbi.tristique.senectus@maurisrhoncus.edu +750,Quynn,Ewing,143660,sociosqu.ad.litora@ornarelectus.ca +751,Craig,Lancaster,807267,ipsum.porta.elit@duiCum.edu +752,Ignatius,Walters,974978,posuere@Donec.net +753,Oren,Ware,165921,consequat.nec.mollis@odioa.org +754,Denise,Tran,610980,tellus.justo.sit@nibh.ca +755,Alec,Marquez,986460,placerat@in.org +756,Rana,Blair,148318,cursus@consequatlectussit.edu +757,Penelope,Hurst,423861,sapien@adipiscingnon.co.uk +758,,,155499,facilisis.facilisis@quamdignissimpharetra.net +759,Nero,Bentley,650062,auctor@aliquamarcuAliquam.com +760,Sharon,Roman,328615,risus.at@dignissimMaecenas.co.uk +761,Wyatt,Castaneda,360433,tincidunt.neque.vitae@orcitinciduntadipiscing.co.uk +762,Petra,Patel,770980,feugiat.Lorem.ipsum@Donecnibh.org +763,Caleb,Adams,850023,et.ultrices@ac.com +764,gloria,moses,763614,lacus.Quisque.imperdiet@ullamcorpernislarcu.ca +765,Faith,Knight,421776,nec@enimSuspendissealiquet.co.uk +766,Sophia,Palmer,754722,in.consequat.enim@fringillaDonec.net +767,Perry,Shepherd,316512,felis.Nulla@Cras.com +768,Nathaniel,Erickson,139299,luctus.et.ultrices@Sed.edu +769,Palmer,Berger,149515,ipsum@Nullamvitaediam.ca +770,Quail,Washington,694177,sit.amet.risus@Pellentesquehabitantmorbi.edu +771,,,935186,amet.consectetuer.adipiscing@purusgravida.net +772,Tiger,Nash,319727,semper@pretiumetrutrum.org +773,Blake,Beard,796308,molestie.arcu.Sed@intempuseu.org +774,Ryan ,Hart,426438,eu.tellus@morbi.co.uk +775,Barbara,Hurley,691210,enim.Mauris.quis@magna.net +776,Sigourney,Richmond,327527,metus@Cras.edu +777,Fallon,David,249474,risus@Cumsociisnatoque.org +778,Raphael,Olsen,758424,Nunc.mauris.elit@sitametmetus.org +779,Casey,Hubbard,978445,mi.Aliquam@Ut.org 
+780,Uma,Shields,208440,semper.erat@auctorvitae.org +781,Claire,Steele,840107,vulputate.eu@venenatisamagna.ca +782,Vanna,Stone,569988,erat.volutpat.Nulla@aliquet.net +783,Xerxes,Gonzalez ,801556,mus@tristiquealiquet.com +784,Kalia,Morgan,109181,at.auctor@Mauriseuturpis.net +785,Deirdre,Pope,528011,ligula.Nullam.enim@Fuscealiquamenim.edu +786,Xerxes,Franklin,470142,semper.cursus.Integer@Curabitur.edu +787,Kieran,Clayton,138622,rutrum@at.com +788,Jolie,Hobbs,929351,suscipit.nonummy@Mauris.co.uk +789,Shad,Gregory,475023,tellus@Pellentesquetincidunttempus.net +790,George,Luna,333704,metus@risusaultricies.org +791,Janna,Sullivan,795523,Nullam@placerat.org +792,Joseph,Leon,861931,ullamcorper@ultrices.com +793,Paki, Hudson,919367,erat@urnaet.org +794,Whilemina,Justice,870037,aliquet.diam@gravidamauris.net +795,Rhoda,Gregory,358467,massa.Integer.vitae@feugiat.edu +796,Giacomo,Pollard,663873,lobortis.quis@lectussitamet.net +797,Tad,Kirkland,412739,ipsum.dolor.sit@adipiscingelit.co.uk +798,Silas,West,928949,semper.Nam.tempor@velit.com +799,Lavinia,Pruitt,232896,nonummy.ac.feugiat@magnaaneque.co.uk +800,Alexa,Love,294797,Mauris.vestibulum@lectusantedictum.co.uk +801,Moana,Ortiz,756794,Nunc@nonfeugiatnec.edu +802,Ignacia,Gilliam,496080,Curabitur.dictum.Phasellus@risus.ca +803,Wesley,Riggs,356136,Fusce.fermentum@Morbisitamet.org +804,Kerry,Perkins,643610,odio.Aliquam.vulputate@pede.edu +805,Alfonso,Guerra,719634,nec.cursus@Naminterdum.edu +806,Colt,Rice,763856,eleifend@risusvariusorci.org +807,Nelle,Moses,418065,eu@euismod.edu +808,Martin,Bradshaw,641853,quis.turpis@orciUt.org +809,Yolanda,Beasley,303871,orci.adipiscing.non@semPellentesqueut.org +810,,Bishop,934203,Sed.et@at.net +811,Melvin,Washington,284820,est@magnased.ca +812,Jacqueline,Mann,761134,luctus.vulputate.nisi@hendreritconsectetuer.ca +813,Samuel,Travis,692116,lectus.pede@nonsollicitudin.org +814,Hamish,Reese,255295,eu@adipiscing.co.uk +815,Bianca,Vaughn,277596,dis.parturient.montes@sit.co.uk 
+816,Igor,Vance,875433,quis.pede.Praesent@mollis.com +817,Stephen,Bell,320480,Morbi.quis@loremluctus.edu +818,Carolyn,Mercer,776260,eu.metus.In@malesuadafames.net +819,Reuben,Conner,405844,massa.Mauris.vestibulum@aliquet.edu +820,Inez,Horn,104145,nibh.lacinia@blandit.com +821,Dolan,Villarreal,116828,dui.in@Nullaegetmetus.net +822,Richard,Slater,911686,urna.et.arcu@nequesed.com +823,Ali,Jennings,646424,mauris@Crasdictum.com +824,Harper,Valentine,269087,a.enim@pellentesqueSeddictum.co.uk +825,Garrison,Guerra,916208,iaculis.quis.pede@Nulla.ca +826,Casey,Beach,842656,convallis.in.cursus@eget.co.uk +827,Amelia,Hewitt,191021,Donec.est.Nunc@apurus.edu +828,Erica,Glass,185928,augue@quislectus.ca +829,Unity,Carroll,487234,augue@egestas.net +830,Candace,Mclean,296635,In.nec.orci@sit.org +831,Sheila,Copeland,923053,lobortis.mauris@maurisblandit.edu +832,September,Stanley,512166,et@Nullaeu.edu +833,Melinda,Ramsey,838379,eget@Crasdolor.edu +834,Wang,Jenkins,510684,vel@egestas.edu +835,Lester,Burns,340323,Aliquam.nec.enim@egetnisi.edu +836,Keegan,McCoy,93128,feugiat.nec@eu.co.uk +837,Kennedy,House,564714,libero.mauris@eu.edu +838,Tara,Savage,371310,fringilla.porttitor@aceleifend.org +839,Sylvia,Cantu,850728,malesuada.malesuada@lorem.co.uk +840,Hedley,Gaines,736791,sodales.purus.in@scelerisquenequeNullam.edu +841,Ashely,Mills,200213,fringilla@facilisisfacilisis.com +842,Alec,Mcdaniel,246702,imperdiet.nec.leo@rhoncusDonecest.org +843,Porter,Hoffman,343013,ut.nisi@mollisPhaselluslibero.co.uk +844,Orlando,Ryan,416930,orci.Donec.nibh@sapienmolestie.net +845,Timothy,Bright,357997,mauris@Utsagittis.edu +846,Igor,Britt,649677,non@SuspendisseeleifendCras.edu +847,Rama,Lowery,895040,odio.sagittis@commodo.edu +848,,Hess,340666,interdum.Nunc.sollicitudin@elementum.org +849,Shannon,Abbott,599196,sollicitudin.orci@est.edu +850,wendy,davis,465382,ut@leoMorbi.co.uk +851,Desirae,White,172913,id.erat.Etiam@pharetrasedhendrerit.com +852,Callum,Cunningham,304089,id.mollis@penatibus.edu 
+853,Xena,Mcfadden,164330,non@Fuscealiquam.net +854,Desirae,Hebert,780381,vestibulum@sagittissemper.co.uk +855,Janna,Graham,664096,porttitor.vulputate.posuere@Integer.ca +856,Nehru,Rocha,109153,Proin.eget.odio@penatibusetmagnis.com +857,David,Walker,913448,et@Quisque.com +858,Winifred,Wade,772283,Duis.volutpat.nunc@DuisgravidaPraesent.net +859,Steven,Nicholson,720437,justo.Praesent@tempor.co.uk +860,Talon,Patterson,645824,placerat.Cras.dictum@eratEtiamvestibulum.net +861,Willa,Rush,305144,non.quam.Pellentesque@egestasSedpharetra.org +862,Vielka,Dickerson,417530,nulla.vulputate.dui@magnis.co.uk +863,Hayes,Flynn,639187,sodales@Nullamscelerisqueneque.net +864,Camille,Mooney,850285,et.magnis@nisiCumsociis.edu +865,Gavin,Lang,542404,eu.enim@eget.edu +866,Kyle,Ratliff,846969,dignissim.lacus@adipiscingnonluctus.co.uk +867,Margaret,Morrow,549884,libero.nec@turpis.com +868,Clare,Williams,160165,sagittis.semper.Nam@Nullamfeugiatplacerat.ca +869,Charlotte,Wilder,903549,Quisque.purus.sapien@acmattis.ca +870,Quin,Hays,127427,mi.Duis@mollisPhaselluslibero.ca +871,kelsey,holman,728789,feugiat.placerat@ametrisusDonec.co.uk +872,Joelle,Morrow,701417,magna@enim.net +873,Lionel,Thornton,117164,dolor.Fusce@quisdiam.edu +874,Ryan,Mcleod,730772,consequat.auctor@velitSedmalesuada.com +875,Anjolie,Kirk,662211,est@gravida.ca +876,Tanya,Pace,946370,posuere.vulputate.lacus@dolor.org +877,Quintessa,Knox,998765,In@suscipitest.edu +878,Jason,Burton,837164,Maecenas.iaculis@vitae.edu +879,Mira,Oneal,203734,Proin.non.massa@ametorci.co.uk +880,Bryar,Landry,983624,tempus.eu@vitaealiquet.com +881,Amela,Figueroa,492816,et.malesuada@adipiscingelit.ca +882,Zenaida,Pacheco,399345,elit.pede@lorem.edu +883,Deirdre,Russo,850855,porttitor@lorem.net +884,Lars,Crawford,174515,diam.luctus.lobortis@scelerisqueloremipsum.co.uk +885,Ramona,Cox,406292,gravida.Praesent@nectempus.com +886,Jessica,Stevenson,600526,mauris.a@tortor.com +887,Zoe,Sweeney,544816,arcu@fringilla.net 
+888,Connor,Patton,627202,et.euismod.et@loremeumetus.com +889,,Eaton,292781,ac@idnunc.edu +890,Kimberly,Mcdonald,301174,enim.gravida@nonegestasa.net +891,Florence,Vega,827540,senectus.et@bibendum.net +892, Shay,Mcclain,591017,Cum.sociis@mauris.org +893,Zorita,Warner,632203,in@Integer.com +894,Russell,Booth,509718,quam.vel@consectetueripsumnunc.ca +895,Rahim,Klein,282554,mattis.ornare@enimSuspendisse.ca +896,Dane,Dean,849924,mauris.blandit.mattis@loremacrisus.co.uk +897,Callie,Guzman,929670,sagittis.semper@Morbimetus.com +898,Sonia,Marshall,837600,sed.dictum@portaelita.com +899,Ahmed,Stanton,257438,sodales.Mauris@CraspellentesqueSed.ca +900,Phoebe,Miles,711365,gravida.molestie.arcu@quis.net +901,Chantale,Hewitt,312734,consectetuer.rhoncus.Nullam@pedeCrasvulputate.edu +902,Roanna,Fitzpatrick,165138,pede.ultrices@loremut.co.uk +903,Maite,Garner,120702,augue.eu.tempor@Pellentesqueutipsum.com +904,Lavinia,Wilson,476139,ridiculus.mus@enimcondimentumeget.org +905,Alice,Dyer,973758,Vivamus.nibh.dolor@gravidamauris.co.uk +906,Kelsie,Booth,365050,interdum.Nunc.sollicitudin@apurus.edu +907,Maite,Porter,199442,metus@urnaet.com +908,Kerry,Fulton,571839,elit.pretium.et@DonecegestasAliquam.com +909,Valentine,Reynolds,219167,lorem.eu.metus@nibh.ca +910,Tatum,Mcfadden,913067,dolor@orcilobortis.edu +911,Lydia,Cash,680747,Ut.tincidunt.vehicula@dignissimtemporarcu.org +912,Blossom,Guerra,139412,cursus.et@netus.com +913,Dolan,Conner,800164,eget@ultricesposuere.org +914,Nathan,Walsh,860060,vehicula.risus.Nulla@anteipsum.ca +915, Ulric ,Le,666944,Suspendisse.sed.dolor@purus.org +916,Chloe,Wells,136891,luctus@placerat.edu +917,Nichole,Duffy,551597,a@eget.org +918,Felicia,Bowman,420188,elit.pellentesque@lacusMauris.ca +919,Nathaniel,Church,301078,malesuada.fames@hendrerit.ca +920,Ifeoma,Wiggins,643328,velit.dui@Etiambibendumfermentum.org +921,Victor,Koch,351492,dui@eu.edu +922,Olga,Mcintosh,482631,semper.cursus@ac.org +923,Chaney,Stephens,539285,enim@acorci.net 
+924,Walker,Olson,767653,pharetra.Nam.ac@laciniavitaesodales.co.uk +925,Alexa,Schultz,914749,eget.ipsum@acmetusvitae.com +926,Xenos,Lyons,326309,in@Cum.org +927,Isabelle,Christensen,730117,habitant.morbi.tristique@Fuscediamnunc.ca +928,April,Nash,181746,egestas.Sed@laciniavitaesodales.edu +929,Uriah,Bailey,608600,Nulla.facilisi.Sed@vitaepurus.co.uk +930,Mia,Camacho,372985,malesuada@utipsum.com +931,Mollie,Hayden,960804,enim@enim.com +932,Gary,Reeves,664649,Aliquam.fringilla.cursus@tempusmauriserat.edu +933,Aimee,Knight,592749,diam.luctus.lobortis@augueSed.edu +934,Nora,Leonard,114392,ornare.elit@libero.com +935,Cassandra,Glover,412969,et@ipsumdolor.com +936,Michael,Walters,408988,libero.mauris.aliquam@ultricesposuere.ca +937,Wang,Maxwell,782879,a.magna.Lorem@molestiearcu.co.uk +938,Barclay,Montoya,547698,Nulla.facilisi.Sed@et.co.uk +939,Kuame,Le,922053,placerat@non.co.uk +940,Kirsten ,French,760086,malesuada.Integer.id@necurna.edu +941,Ann,Fernandez,722651,luctus.ipsum@amet.ca +942,Brendan,Hood,141602,tellus.eu@placerateget.com +943,Madaline,Harris,543315,pharetra.ut.pharetra@faucibus.org +944,Illana,Dominguez,817915,erat.vel@fermentummetus.ca +945,Solomon,Bright,728577,et@liberoatauctor.edu +946,Brady,Duncan,258909,Curabitur.sed.tortor@nonummyacfeugiat.org +947,Inga,Le,710604,mollis.Duis@Loremipsumdolor.org +948,Sopoline,Ratliff,224834,Nam@fringillami.org +949,Yvonne,Mccarty,650033,posuere.at.velit@augueut.ca +950,Bradley,Nicholson,174673,adipiscing@urna.ca +951,Chester,Mckenzie,879786,odio.Phasellus@feugiattellus.org +952,Keiko,Shepard,386378,Aliquam.nec@etrutrumnon.ca +953,Penelope,Lester,641084,Mauris.molestie@sedsemegestas.ca +954,Nina,Garrison,321511,purus.ac.tellus@eunulla.org +955,Veronica,Barr-Novel,953228,sollicitudin@Nunc.org +956,Kalia,Roth,490775,arcu.Vestibulum.ut@ligula.org +957,Aquila,Sexton,671975,sed@duiquisaccumsan.co.uk +958, Reuben,Ferrell,589150,interdum@sit.edu +959,Ignacia,Benjamin,774017,eget.dictum.placerat@enimMauris.com 
+960,Taylor,Wallace,75182,orci@dolor.net +961,Candice,Ramos,595914,eget.metus@eueuismod.co.uk +962,Mohammad,Dominguez,223157,Curabitur.vel.lectus@nisimagna.edu +963,Nigel,Park,883607,Aenean@sociosqu.net +964,Uriel,Osborne,245362,Etiam.gravida@eleifend.ca +965,Chloe,Bishop,172716,mauris.Integer@metus.net +966,Anjolie,Peters,223948,tortor.dictum.eu@vestibulum.com +967,Jaime,Levy,981016,ac@Aliquam.co.uk +968,Sonya,Curtis,284805,neque.In.ornare@pedeblanditcongue.ca +969,Steven,Sims,151438,litora.torquent.per@fringillami.edu +970,Gretchen,Mcintyre,243757,ornare@eratvel.edu +971,Dai,Duffy,487306,dolor@In.com +972,Dorothy,Roy,734417,quis@Sedetlibero.ca +973,Dacey,Hall,161112,viverra.Maecenas@etmalesuada.co.uk +974,Kerry,Stevenson,665141,tortor.dictum@vitaedolorDonec.org +975,Casey,Harrell,155224,nec.malesuada.ut@maurisa.org +976,Hector,Lancaster,951250,ante@nullamagna.co.uk +977,Sloane,Rocha,753759,a.felis.ullamcorper@magna.com +978,,,958519,Aliquam.auctor.velit@fringillapurus.co.uk +979,Kenyon,Cannon,225309,elit@iaculisnec.org +980,Willow,Leonard,904752,Etiam.laoreet@ipsumCurabitur.org +981,Kasper,Mayer,700039,porttitor.tellus@vitaealiquetnec.edu +982,Gemma,Hartman,137599,luctus@ProinmiAliquam.edu +983,Whilemina,Rosales,991184,augue@felisorci.edu +984,Ignatius ,Monroe,445886,Suspendisse@necligula.org +985,Cherokee,Indian,157172,enim@disparturient.edu +986,Tanya,Howard,295481,Donec.consectetuer.mauris@venenatislacus.co.uk +987,,Burris,233269,Sed.et@Proin.ca +988,Giacomo,Hardy,951442,justo.nec@augue.ca +989,Gage,Oneal,708749,a@velit.edu +990,Jayme,Dillard,139811,eu@egestas.edu +991,Hunter,Leon,562993,elit.pede@euarcu.ca +992,Mannix,Chambers,492950,sit.amet.nulla@DuisgravidaPraesent.org +993,Quinn,Marquez,179211,ultrices.Duis@sociis.com +994,Patricia,Franks,957295,enim@ipsum.edu +995,Celeste,Craig,369639,magnis@adipiscing.ca +996,Kirestin,Potts,653131,in@accumsan.co.uk +997,Kristen,Stout,628705,tempus@Loremipsumdolor.edu 
+998,Daniel,Snider,535804,porttitor.tellus.non@maurisMorbinon.com +999,Simone,Mcleod,407847,et.risus@consectetuereuismod.ca +1000,Hermione,Morales,478506,eu.nulla@Donec.com diff --git a/notebook/explore-data-challenge.html b/notebook/explore-data-challenge.html new file mode 100644 index 0000000..5ef2d75 --- /dev/null +++ b/notebook/explore-data-challenge.html @@ -0,0 +1,16273 @@ + + + + +explore-data-challenge + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+
+

Explore Data Challenge

Assumptions:

    +
  1. Data in tab-delimited format (fields separated by tabs)
  2. +
  3. Fields with quotes contain reserved characters such as \t, \r, and \n
  4. +
  5. Data is UTF-16LE encoded => convert to UTF-8
  6. +
  7. Data will have anomalies and will require judgment calls
  8. +
  9. Upload data to Redshift
  10. +
  11. Parallel algorithm:
      +
    • break up data into multiple parts (position, length)
    • +
    • process the multiple parts, output each to separate TSV file
    • +
    • re-assemble multiple TSV files into a single TSV file
    • +
    +
  12. +
+

Sections:

    +
  • Section 1 - explore data.tsv
  • +
  • Section 2 - read non-anomalous data into pandas dataframe, explore
  • +
  • Section 3 - experiment with algorithms to clean anomalous data
  • +
+

References:

+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
In [ ]:
+
+
+
# # install unidecode
+# ! pip install unidecode
+
+# # install pandas-redshift library if not already installed
+# ! pip install pandas-redshift
+
+ +
+
+
+ +
+
+
+
In [ ]:
+
+
+
import os
+from collections import Counter
+import io
+
+ +
+
+
+ +
+
+
+
In [3]:
+
+
+
import boto3
+import pandas as pd
+import numpy as np
+import pandas_redshift as pr
+
+ +
+
+
+ +
+
+
+
In [4]:
+
+
+
# pandas display settings
+pd.set_option("display.max_columns", 999)
+pd.set_option("display.max_rows", 999)
+
+ +
+
+
+ +
+
+
+
In [5]:
+
+
+
import unidecode
+import re
+
+ +
+
+
+ +
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
+

Section 1: Explore data

+
+
+
+
+
+
In [6]:
+
+
+
# location of the raw challenge file, relative to the notebook folder
path_to_tsv = os.path.join('..', 'data', 'data.tsv')

# show what is available in the data folder
print(os.listdir(os.path.dirname(path_to_tsv)))

# load the file as a list with one string per physical line;
# the challenge documentation states the file is 'utf-16-le' encoded
# https://realpython.com/read-write-files-python/
# https://stackoverflow.com/questions/4190683/python-string-replace-for-utf-16-le-file
with open(path_to_tsv, mode='r', encoding='utf-16-le') as f:
    ls_lines_tsv_utf16le = list(f)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
['data.tsv']
+
+
+
+ +
+
+ +
+
+
+
In [7]:
+
+
+
# remove accents on characters
+# https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string
+# https://medium.com/@randombites/how-to-handle-accented-special-strings-175e65d96123
+# https://stackoverflow.com/questions/31207287/converting-utf-16-to-utf-8
+ls_lines_tsv = [unidecode.unidecode(x) for x in ls_lines_tsv_utf16le]
+
+ +
+
+
+ +
+
+
+
In [8]:
+
+
+
print(f'Number of lines in file: {len(ls_lines_tsv)}\n\n')
+
+# print a few lines
+for idx, each_line in enumerate(ls_lines_tsv[:5]):
+    print(idx, each_line)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Number of lines in file: 1008
+
+
+0 id	first_name	last_name	account_number	email
+
+1 1	Addison	Marks	196296	ornare.lectus@et.edu
+
+2 2	Dakota	Garza	409025	scelerisque@Praesentluctus.edu
+
+3 3	Basia	Wolfe	637720	Aliquam@nullaIntegerurna.com
+
+4 4	Germaine	Campbell	826846	id.magna@viverraMaecenas.ca
+
+
+
+
+ +
+
+ +
+
+
+
In [9]:
+
+
+
# The challenge documentation promises tab-delimited rows with the same number
# of fields per line, so every line should contain the same number of tabs.
# https://www.programiz.com/python-programming/methods/string/count
ls_num_tabdelim = [each_line.count('\t') for each_line in ls_lines_tsv]

# tally how many lines share each tab count
# https://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item
counter_numtabs = Counter(ls_num_tabdelim)

# report as (tabs per line, number of lines), highest tab count first
# https://www.w3schools.com/python/ref_func_sorted.asp
ls_tab_frequencies = sorted(counter_numtabs.items(), reverse=True)
for x in ls_tab_frequencies:
    print(x)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
(4, 995)
+(3, 3)
+(2, 5)
+(1, 5)
+
+
+
+ +
+
+ +
+
+
+
In [10]:
+
+
+
def count_num_per_item(ls_input, str_input):
    '''
    Tally how many times a substring occurs in each item of a list.

    For every string in `ls_input`, count the (non-overlapping) occurrences
    of `str_input`, then build a frequency table of those counts.
    Used here to scan the lines of a text file for anomalies, e.g. the number of:
     * tabs per line
     * newline characters per line

    Prints the total number of items and the frequency table (sorted by
    occurrences-per-item, descending), and returns the table.

    Dependencies:
        from collections import Counter

    Input:
        ls_input - list of str, e.g. the output of file.readlines()
        str_input - str, the substring to search for in each item
    Return:
        counter_num_per_item - Counter object (dict-like),
                                where key is the number of occurrences within an item,
                                and value is the number of items with that many occurrences
    '''

    # number of occurrences of the substring in each item
    ls_num_occurences_per_item = [x.count(str_input) for x in ls_input]

    # frequency of each occurrence count
    counter_num_per_item = Counter(ls_num_occurences_per_item)

    # print out the frequency table; a plain loop is used because the
    # printing is a side effect, not a value worth collecting into a list
    print(f'Total number of items in list: {len(ls_input)}\n')
    print('Frequency of occurences:')
    for pair_count_freq in sorted(counter_num_per_item.items(), reverse=True):
        print(pair_count_freq)

    return counter_num_per_item
+
+ +
+
+
+ +
+
+
+
In [11]:
+
+
+
# not all of the lines have 4 tab-delimiters
+_ = count_num_per_item(ls_lines_tsv, '\t')
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(4, 995)
+(3, 3)
+(2, 5)
+(1, 5)
+
+
+
+ +
+
+ +
+
+
+
In [12]:
+
+
+
# almost all lines have newline char
+_ = count_num_per_item(ls_lines_tsv, "\n")
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(1, 1007)
+(0, 1)
+
+
+
+ +
+
+ +
+
+
+
In [13]:
+
+
+
# surprised none of lines have double-quote
+_ = count_num_per_item(ls_lines_tsv, '\"')
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(0, 1008)
+
+
+
+ +
+
+ +
+
+
+
In [14]:
+
+
+
# surprised none of lines have single-quote
+_ = count_num_per_item(ls_lines_tsv, "\'")
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(0, 1008)
+
+
+
+ +
+
+ +
+
+
+
In [15]:
+
+
+
# none of lines have carriage-return
+_ = count_num_per_item(ls_lines_tsv, "\r")
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(0, 1008)
+
+
+
+ +
+
+ +
+
+
+
In [16]:
+
+
+
# 6 of the lines have 1 dash
+_ = count_num_per_item(ls_lines_tsv, "-")
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(1, 6)
+(0, 1002)
+
+
+
+ +
+
+ +
+
+
+
In [17]:
+
+
+
# one of the lines has 1 slash
+_ = count_num_per_item(ls_lines_tsv, "/")
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Total number of items in list: 1008
+
+Frequency of occurences:
+(1, 1)
+(0, 1007)
+
+
+
+ +
+
+ +
+
+
+
In [18]:
+
+
+
# most items have 4 tab-delimiters; anomalies don't have 4 tab-delimiters
+# view lines with anomalies '\t', print out line number (0-index)
+_ = [print(f'Line number {idx} => ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('\t') != 4]
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Line number 29 =>  29	Adena	Hobbs
+
+Line number 30 =>  Bosley	656184
+
+Line number 31 =>  	ac.ipsum.Phasellus@ut.net
+
+Line number 84 =>  82	Jade	Battle
+
+Line number 85 =>       	531695	lectus.justo@lorem.co.uk
+
+Line number 220 =>  217	Boris
+
+Line number 221 =>  Harrington	Harrington	325378	neque.Nullam.ut@laoreetlectus.edu
+
+Line number 341 =>  337	NEHRU	MENDOZA	  859105
+
+Line number 342 =>  	porttitor.interdum.Sed@Loremipsum.co.uk
+
+Line number 780 =>  775	
+
+Line number 781 =>  Barbara	Hurley	691210	enim.Mauris.quis@magna.net
+
+Line number 991 =>  985	Cherokee	Indian
+
+Line number 992 =>  	157172	enim@disparturient.edu
+
+
+
+
+ +
+
+ +
+
+
+
In [19]:
+
+
+
# view lines with anomalies on '\n' ... this is the last line in the data file 
+_ = [print(f'Line number {idx} ... ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('\n') != 1]
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Line number 1007 ...  1000	Hermione	Morales	478506	eu.nulla@Donec.com
+
+
+
+ +
+
+ +
+
+
+
In [20]:
+
+
+
# view lines with anomalies on '-' ... 5 out of 6 are in the account_number, one is in the last_name
+_ = [print(f'Line number {idx} ... ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('-') > 0]
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Line number 298 ...  294	Wyoming	Williams	454-586	odio.a.purus@Suspendisse.co.uk
+
+Line number 392 ...  387	Palmer	Acevedo	168-722	ut.pellentesque.eget@eleifendCrassed.org
+
+Line number 421 ...  416	elvis	RIVERA	865-008	Quisque.libero@vitaesodales.net
+
+Line number 481 ...  476	Ivana	Chapman	357-130	et@Cras.co.uk
+
+Line number 692 ...  687	Fay	Irwin	1181-61	metus@elitsedconsequat.com
+
+Line number 961 ...  955	Veronica	Barr-Novel	953228	sollicitudin@Nunc.org
+
+
+
+
+ +
+
+ +
+
+
+
In [21]:
+
+
+
# view lines with anomalies on '/' ... one account number
+_ = [print(f'Line number {idx} ... ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('/') > 0]
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Line number 315 ...  311	Kenneth	Bird	437/680	est@utpharetrased.org
+
+
+
+
+ +
+
+ +
+
+
+
+

Observations in Section 1:

    +
  1. Total number of lines is 1008, which includes header line
  2. +
  3. Most of the lines have 4 tab-delimiters, although not true of 13 lines
  4. +
  5. The anomalies have fewer than 4 tab-delimiters because the data was split amongst multiple lines
  6. +
  7. One of the anomalous records has repeat last name 'Harrington'
  8. +
  9. Correcting the anomalies should be straightforward with a set of rules, but read the remaining lines into dataframe to examine
  10. +
+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
+

Section 2: Read into dataframe (exception of anomalies w/o 4 tab-delimiters), and explore dataframe

+
+
+
+
+
+
In [22]:
+
+
+
# create list with only items with 4 tabs
+ls_lines_tsv_4tabs = [x for x in ls_lines_tsv if x.count('\t')==4]
+
+# convert list to string, already has newline character, no need to add to join
+str_lines_tsv_4tabs = ''.join(ls_lines_tsv_4tabs)
+
+# convert str to io.StringIO object so it can be read as CSV file
+# https://www.kite.com/python/answers/how-to-create-a-pandas-dataframe-from-a-string-in-python
+io_data_tsv = io.StringIO(str_lines_tsv_4tabs)
+
+# create dataframe, tab-delimited
+df_4_tabs = pd.read_csv(io_data_tsv, sep='\t')
+
+print(f"Shape of dataframe: {df_4_tabs.shape}")
+df_4_tabs.head()
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Shape of dataframe: (994, 5)
+
+
+
+ +
+ +
Out[22]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemail
01AddisonMarks196296ornare.lectus@et.edu
12DakotaGarza409025scelerisque@Praesentluctus.edu
23BasiaWolfe637720Aliquam@nullaIntegerurna.com
34GermaineCampbell826846id.magna@viverraMaecenas.ca
45LenorePennington345284aliquam@Integer.edu
+
+
+ +
+ +
+
+ +
+
+
+
In [23]:
+
+
+
print(df_4_tabs.nunique())
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
id                994
+first_name        686
+last_name         652
+account_number    992
+email             994
+dtype: int64
+
+
+
+ +
+
+ +
+
+
+
In [24]:
+
+
+
# how many null values per column
+print(df_4_tabs.isna().sum())
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
id                 0
+first_name        12
+last_name          9
+account_number     0
+email              0
+dtype: int64
+
+
+
+ +
+
+ +
+
+
+
In [25]:
+
+
+
print(df_4_tabs.dtypes)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
id                 int64
+first_name        object
+last_name         object
+account_number    object
+email             object
+dtype: object
+
+
+
+ +
+
+ +
+
+
+
In [26]:
+
+
+
# is the account_number always numeric?
+print('Number of records in dataframe: ', df_4_tabs.shape[0])
+print('Number of records `account_number` is numeric: ', df_4_tabs['account_number'].str.isnumeric().sum())
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Number of records in dataframe:  994
+Number of records `account_number` is numeric:  976
+
+
+
+ +
+
+ +
+
+
+
In [27]:
+
+
+
# examine rows with null first_name or last_name
+mask_null_name = (
+    df_4_tabs['first_name'].isna()
+    | df_4_tabs['last_name'].isna()
+)
+df_4_tabs.loc[mask_null_name]
+
+ +
+
+
+ +
+
+ + +
+ +
Out[27]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemail
298302NaNCopeland547803Vivamus.non.lorem@sed.ca
439444RoaryNaN607318mi.pede@orci.ca
514519NaNWilliamson231131Nullam.nisl@massa.ca
533538janaNaN824675nisi.dictum.augue@nonenimcommodo.co.uk
547552FitzgeraldNaN317355elit.a@velit.co.uk
561566WhoopiNaN125742feugiat.non.lobortis@enimSednulla.net
596601NaNNewton601770Duis.ac@nuncinterdum.edu
608613NaNNaN104969dictum@Suspendisse.net
624629NaNRiggs242923nulla.ante.iaculis@erat.net
668673VielkaNaN270462metus.Aenean@lectusNullamsuscipit.edu
753758NaNNaN155499facilisis.facilisis@quamdignissimpharetra.net
766771NaNNaN935186amet.consectetuer.adipiscing@purusgravida.net
804810NaNBishop934203Sed.et@at.net
842848NaNHess340666interdum.Nunc.sollicitudin@elementum.org
883889NaNEaton292781ac@idnunc.edu
972978NaNNaN958519Aliquam.auctor.velit@fringillapurus.co.uk
980987NaNBurris233269Sed.et@Proin.ca
+
+
+ +
+ +
+
+ +
+
+
+
In [28]:
+
+
+
# create columns with flags on null names
+df_4_tabs['null_firstname'] = df_4_tabs['first_name'].isna()
+df_4_tabs['null_lastname'] = df_4_tabs['last_name'].isna()
+df_4_tabs['null_name'] = df_4_tabs['null_firstname'] | df_4_tabs['null_lastname']
+
+# create column with length of account_number
+df_4_tabs['len_acct_num'] = df_4_tabs['account_number'].apply(len)
+
+# create column with email domain name
+# https://stackoverflow.com/questions/12504976/get-last-column-after-str-split-operation-on-column-in-pandas-dataframe
+df_4_tabs['domain_name'] = df_4_tabs['email'].str.split('@').str[-1]
+
+ +
+
+
+ +
+
+
+
In [29]:
+
+
+
df_4_tabs.head()
+
+ +
+
+
+ +
+
+ + +
+ +
Out[29]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
01AddisonMarks196296ornare.lectus@et.eduFalseFalseFalse6et.edu
12DakotaGarza409025scelerisque@Praesentluctus.eduFalseFalseFalse6Praesentluctus.edu
23BasiaWolfe637720Aliquam@nullaIntegerurna.comFalseFalseFalse6nullaIntegerurna.com
34GermaineCampbell826846id.magna@viverraMaecenas.caFalseFalseFalse6viverraMaecenas.ca
45LenorePennington345284aliquam@Integer.eduFalseFalseFalse6Integer.edu
+
+
+ +
+ +
+
+ +
+
+
+
In [30]:
+
+
+
# examine each column for number of unique values ...
+# ... look for .value_counts() situations where there are fewer than 10 counts per column
+# print the number of unique values for every column; also print .value_counts() when there are fewer than 10
+print(f'Shape of dataframe: {df_4_tabs.shape}\n')
+print(f'Columns in dataframe: \n{list(df_4_tabs.columns)}\n')
+
+for each_col in df_4_tabs.columns:
+    num_counts = df_4_tabs[each_col].value_counts().shape[0]
+    print(f'*** Number of value counts in column: {each_col} ===> {num_counts}\n')
+    
+    if num_counts < 10:
+        print(f'Value counts of column: {each_col}')
+        print(df_4_tabs[each_col].value_counts())
+        print()
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Shape of dataframe: (994, 10)
+
+Columns in dataframe: 
+['id', 'first_name', 'last_name', 'account_number', 'email', 'null_firstname', 'null_lastname', 'null_name', 'len_acct_num', 'domain_name']
+
+*** Number of value counts in column: id ===> 994
+
+*** Number of value counts in column: first_name ===> 686
+
+*** Number of value counts in column: last_name ===> 652
+
+*** Number of value counts in column: account_number ===> 992
+
+*** Number of value counts in column: email ===> 994
+
+*** Number of value counts in column: null_firstname ===> 2
+
+Value counts of column: null_firstname
+False    982
+True      12
+Name: null_firstname, dtype: int64
+
+*** Number of value counts in column: null_lastname ===> 2
+
+Value counts of column: null_lastname
+False    985
+True       9
+Name: null_lastname, dtype: int64
+
+*** Number of value counts in column: null_name ===> 2
+
+Value counts of column: null_name
+False    977
+True      17
+Name: null_name, dtype: int64
+
+*** Number of value counts in column: len_acct_num ===> 6
+
+Value counts of column: len_acct_num
+6     974
+7      14
+8       2
+5       2
+12      1
+9       1
+Name: len_acct_num, dtype: int64
+
+*** Number of value counts in column: domain_name ===> 904
+
+
+
+
+ +
+
+ +
+
+
+
In [31]:
+
+
+
# most values in account_number field are 6 characters long ... anomalies are not ...
+# ... examine records where account number is not 6 digits long
+df_4_tabs.query(" len_acct_num != 6 ")
+
+ +
+
+
+ +
+
+ + +
+ +
Out[31]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
165168NyssaLivingston897187at@tortorat.netFalseFalseFalse7tortorat.net
214218IfeomaWhitney543699Proin.nisl.sem@odiovelest.eduFalseFalseFalse7odiovelest.edu
262266HowardCooley167868faucibus@ametultricies.orgFalseFalseFalse7ametultricies.org
272276WillaDiaz157615Aliquam.tincidunt@metuseuerat.co.ukFalseFalseFalse7metuseuerat.co.uk
290294WyomingWilliams454-586odio.a.purus@Suspendisse.co.ukFalseFalseFalse7Suspendisse.co.uk
295299MeghanWhitaker352355leo@tempus.orgFalseFalseFalse7tempus.org
307311KennethBird437/680est@utpharetrased.orgFalseFalseFalse7utpharetrased.org
382387PalmerAcevedo168-722ut.pellentesque.eget@eleifendCrassed.orgFalseFalseFalse7eleifendCrassed.org
411416elvisRIVERA865-008Quisque.libero@vitaesodales.netFalseFalseFalse7vitaesodales.net
471476IvanaChapman357-130et@Cras.co.ukFalseFalseFalse7Cras.co.uk
472477HopeOneal893425ut@ut.orgFalseFalseFalse8ut.org
489494BlaineBarker874570elit.pretium@sociisnatoquepenatibus.netFalseFalseFalse7sociisnatoquepenatibus.net
494499KerryHowell730368sed.consequat@ac.caFalseFalseFalse9ac.ca
567572GingerNelson475426Duis.gravida.Praesent@vulputate.netFalseFalseFalse7vulputate.net
620625GalenaHolland460962at.risus@tincidunt.co.ukFalseFalseFalse8tincidunt.co.uk
682687FayIrwin1181-61metus@elitsedconsequat.comFalseFalseFalse7elitsedconsequat.com
830836KeeganMcCoy93128feugiat.nec@eu.co.ukFalseFalseFalse5eu.co.uk
954960TaylorWallace75182orci@dolor.netFalseFalseFalse5dolor.net
966972DorothyRoy734417quis@Sedetlibero.caFalseFalseFalse7Sedetlibero.ca
976982GemmaHartman137599luctus@ProinmiAliquam.eduFalseFalseFalse12ProinmiAliquam.edu
+
+
+ +
+ +
+
+ +
+
+
+
In [32]:
+
+
+
# examine non-numeric account numbers by creating separate dataframe
+# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.isnumeric.html
+mask_numeric_acct_num = df_4_tabs['account_number'].str.isnumeric()
+df_4_tabs_nonnumeric_acct_num = df_4_tabs.loc[~mask_numeric_acct_num]
+
+print('Show dataframe of non-numeric account numbers: \n')
+display(df_4_tabs_nonnumeric_acct_num)
+print()
+
+print('Show the actual account_numbers: ')
+print(df_4_tabs_nonnumeric_acct_num['account_number'].values)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Show dataframe of non-numeric account numbers: 
+
+
+
+
+ +
+ +
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
165168NyssaLivingston897187at@tortorat.netFalseFalseFalse7tortorat.net
214218IfeomaWhitney543699Proin.nisl.sem@odiovelest.eduFalseFalseFalse7odiovelest.edu
262266HowardCooley167868faucibus@ametultricies.orgFalseFalseFalse7ametultricies.org
272276WillaDiaz157615Aliquam.tincidunt@metuseuerat.co.ukFalseFalseFalse7metuseuerat.co.uk
290294WyomingWilliams454-586odio.a.purus@Suspendisse.co.ukFalseFalseFalse7Suspendisse.co.uk
295299MeghanWhitaker352355leo@tempus.orgFalseFalseFalse7tempus.org
307311KennethBird437/680est@utpharetrased.orgFalseFalseFalse7utpharetrased.org
382387PalmerAcevedo168-722ut.pellentesque.eget@eleifendCrassed.orgFalseFalseFalse7eleifendCrassed.org
411416elvisRIVERA865-008Quisque.libero@vitaesodales.netFalseFalseFalse7vitaesodales.net
471476IvanaChapman357-130et@Cras.co.ukFalseFalseFalse7Cras.co.uk
472477HopeOneal893425ut@ut.orgFalseFalseFalse8ut.org
489494BlaineBarker874570elit.pretium@sociisnatoquepenatibus.netFalseFalseFalse7sociisnatoquepenatibus.net
494499KerryHowell730368sed.consequat@ac.caFalseFalseFalse9ac.ca
567572GingerNelson475426Duis.gravida.Praesent@vulputate.netFalseFalseFalse7vulputate.net
620625GalenaHolland460962at.risus@tincidunt.co.ukFalseFalseFalse8tincidunt.co.uk
682687FayIrwin1181-61metus@elitsedconsequat.comFalseFalseFalse7elitsedconsequat.com
966972DorothyRoy734417quis@Sedetlibero.caFalseFalseFalse7Sedetlibero.ca
976982GemmaHartman137599luctus@ProinmiAliquam.eduFalseFalseFalse12ProinmiAliquam.edu
+
+
+ +
+ +
+ +
+ + +
+
+Show the actual account_numbers: 
+[' 897187' '543699 ' '167868 ' ' 157615' '454-586' ' 352355' '437/680'
+ '168-722' '865-008' '357-130' '  893425' ' 874570' '730368   ' ' 475426'
+ '460962  ' '1181-61' ' 734417' '137599      ']
+
+
+
+ +
+
+ +
+
+
+
In [33]:
+
+
+
# reasons account_number non-numeric: leading/trailing whitespace, dash, slash ...
+# ... clean up the account number, remove space and dashes and slashes => save as new column
+df_4_tabs['clean_account_number'] = df_4_tabs['account_number'].str.replace(' ', '').str.replace('-', '').str.replace('/', '')
+
+# in new clean column, any account numbers less than 6 digits? create new column to measure length
+df_4_tabs['len_clean_acct_num'] = df_4_tabs['clean_account_number'].apply(len)
+
+print('Number of characters in `clean_account_number` column: ')
+print(df_4_tabs['len_clean_acct_num'].value_counts())
+print()
+
+print('Show the `clean_account_number` that is not 6 characters long: ')
+print(df_4_tabs.query(" len_clean_acct_num != 6 ")['clean_account_number'].values)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Number of characters in `clean_account_number` column: 
+6    992
+5      2
+Name: len_clean_acct_num, dtype: int64
+
+Show the `clean_account_number` that is not 6 characters long: 
+['93128' '75182']
+
+
+
+ +
+
+ +
+
+
+
In [34]:
+
+
+
# check for anomalies on first or last character of account number
+df_4_tabs['first_char_clean_an'] = df_4_tabs['clean_account_number'].str[0]
+df_4_tabs['last_char_clean_an'] = df_4_tabs['clean_account_number'].str[-1]
+
+display(df_4_tabs['first_char_clean_an'].value_counts())
+print()
+display(df_4_tabs['last_char_clean_an'].value_counts())
+
+ +
+
+
+ +
+
+ + +
+ +
+ + + + +
+
2    120
+3    120
+1    114
+4    113
+9    113
+7    112
+5    109
+8     97
+6     96
+Name: first_char_clean_an, dtype: int64
+
+ +
+ +
+ +
+ + +
+
+
+
+
+ +
+ +
+ + + + +
+
7    116
+4    105
+6    105
+1    103
+9     97
+2     95
+8     95
+0     95
+3     93
+5     90
+Name: last_char_clean_an, dtype: int64
+
+ +
+ +
+
+ +
+
+
+
In [35]:
+
+
+
# is the `clean_account_number` a unique primary key?  Or any duplicates?
+# look for non-unique account numbers using .groupby().transform()
+# https://pbpython.com/pandas_transform.html
+df_4_tabs['nuniq_acct_num'] = df_4_tabs.groupby('clean_account_number')['clean_account_number'].transform(len)
+df_4_tabs.query(" nuniq_acct_num>1 ")
+
+ +
+
+
+ +
+
+ + +
+ +
Out[35]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_nameclean_account_numberlen_clean_acct_numfirst_char_clean_anlast_char_clean_annuniq_acct_num
410415CourtneySalinas114392nec@nunc.netFalseFalseFalse6nunc.net1143926122
512517WayneWilson973758nonummy@iaculis.co.ukFalseFalseFalse6iaculis.co.uk9737586982
899905AliceDyer973758Vivamus.nibh.dolor@gravidamauris.co.ukFalseFalseFalse6gravidamauris.co.uk9737586982
928934NoraLeonard114392ornare.elit@libero.comFalseFalseFalse6libero.com1143926122
+
+
+ +
+ +
+
+ +
+
+
+
In [36]:
+
+
+
# look for account numbers (after cleaning) that are not length 6 characters
+df_4_tabs.query(" len_clean_acct_num !=6 ")
+
+ +
+
+
+ +
+
+ + +
+ +
Out[36]:
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_nameclean_account_numberlen_clean_acct_numfirst_char_clean_anlast_char_clean_annuniq_acct_num
830836KeeganMcCoy93128feugiat.nec@eu.co.ukFalseFalseFalse5eu.co.uk931285981
954960TaylorWallace75182orci@dolor.netFalseFalseFalse5dolor.net751825721
+
+
+ +
+ +
+
+ +
+
+
+
+

Observations in Section 2:

    +
  1. The data with 4 tab-delimiters loads into dataframe easily
  2. +
  3. There are 994 rows, 5 columns (not counting header)
  4. +
  5. Some null values in first_name, last_name, or both
  6. +
  7. id column is numeric, but remaining columns convert to str (initially surprised account_number didn't convert numeric)
  8. +
  9. The id and email columns are completely unique, but the rest are not; even account_number has a few non-uniques
      +
    • account_number: 114392
    • +
    • account_number: 973758
    • +
    +
  10. +
  11. The account_number column has length of 6 characters in 974 records, but remaining have different lengths
  12. +
  13. After cleaning account_number by removing spaces, dashes, and slashes, two of the records are length 5 (possibly a leading zero was removed?)
      +
    • account_number: 93128
    • +
    • account_number: 75182
    • +
    +
  14. +
  15. The first and last character in account_number appear random
  16. +
  17. Surprisingly, the email domains are almost unique, 904 uniques out of 994 rows
  18. +
+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
+

Section 3: clean up tab-delimiters

+
+
+
+
+
+
In [37]:
+
+
+
# create list with anomalies '\t'
+ls_lines_anom_tab_delim = [x for x in ls_lines_tsv if x.count('\t') != 4]
+ls_lines_anom_tab_delim
+
+ +
+
+
+ +
+
+ + +
+ +
Out[37]:
+ + + + +
+
['29\tAdena\tHobbs\n',
+ 'Bosley\t656184\n',
+ '\tac.ipsum.Phasellus@ut.net\n',
+ '82\tJade\tBattle\n',
+ '     \t531695\tlectus.justo@lorem.co.uk\n',
+ '217\tBoris\n',
+ 'Harrington\tHarrington\t325378\tneque.Nullam.ut@laoreetlectus.edu\n',
+ '337\tNEHRU\tMENDOZA\t  859105\n',
+ '\tporttitor.interdum.Sed@Loremipsum.co.uk\n',
+ '775\t\n',
+ 'Barbara\tHurley\t691210\tenim.Mauris.quis@magna.net\n',
+ '985\tCherokee\tIndian\n',
+ '\t157172\tenim@disparturient.edu\n']
+
+ +
+ +
+
+ +
+
+
+
In [38]:
+
+
+
# create string with anomalies '\t'
+str_anom_tab_delim = ''.join(ls_lines_anom_tab_delim)
+str_anom_tab_delim
+
+ +
+
+
+ +
+
+ + +
+ +
Out[38]:
+ + + + +
+
'29\tAdena\tHobbs\nBosley\t656184\n\tac.ipsum.Phasellus@ut.net\n82\tJade\tBattle\n     \t531695\tlectus.justo@lorem.co.uk\n217\tBoris\nHarrington\tHarrington\t325378\tneque.Nullam.ut@laoreetlectus.edu\n337\tNEHRU\tMENDOZA\t  859105\n\tporttitor.interdum.Sed@Loremipsum.co.uk\n775\t\nBarbara\tHurley\t691210\tenim.Mauris.quis@magna.net\n985\tCherokee\tIndian\n\t157172\tenim@disparturient.edu\n'
+
+ +
+ +
+
+ +
+
+
+
In [39]:
+
+
+
# clean string: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab
+str_anom_tab_delim_clean = str_anom_tab_delim.replace('\n', '\t').replace(' ', '').replace('\t\t', '\t')
+str_anom_tab_delim_clean
+
+ +
+
+
+ +
+
+ + +
+ +
Out[39]:
+ + + + +
+
'29\tAdena\tHobbs\tBosley\t656184\tac.ipsum.Phasellus@ut.net\t82\tJade\tBattle\t531695\tlectus.justo@lorem.co.uk\t217\tBoris\tHarrington\tHarrington\t325378\tneque.Nullam.ut@laoreetlectus.edu\t337\tNEHRU\tMENDOZA\t859105\tporttitor.interdum.Sed@Loremipsum.co.uk\t775\tBarbara\tHurley\t691210\tenim.Mauris.quis@magna.net\t985\tCherokee\tIndian\t157172\tenim@disparturient.edu\t'
+
+ +
+ +
+
+ +
+
+
+
In [40]:
+
+
+
# examine string split on tab-delimiter
+str_anom_tab_delim_clean.split('\t')
+
+ +
+
+
+ +
+
+ + +
+ +
Out[40]:
+ + + + +
+
['29',
+ 'Adena',
+ 'Hobbs',
+ 'Bosley',
+ '656184',
+ 'ac.ipsum.Phasellus@ut.net',
+ '82',
+ 'Jade',
+ 'Battle',
+ '531695',
+ 'lectus.justo@lorem.co.uk',
+ '217',
+ 'Boris',
+ 'Harrington',
+ 'Harrington',
+ '325378',
+ 'neque.Nullam.ut@laoreetlectus.edu',
+ '337',
+ 'NEHRU',
+ 'MENDOZA',
+ '859105',
+ 'porttitor.interdum.Sed@Loremipsum.co.uk',
+ '775',
+ 'Barbara',
+ 'Hurley',
+ '691210',
+ 'enim.Mauris.quis@magna.net',
+ '985',
+ 'Cherokee',
+ 'Indian',
+ '157172',
+ 'enim@disparturient.edu',
+ '']
+
+ +
+ +
+
+ +
+
+
+
In [41]:
+
+
+
# convert list of text to str
+# # create str that joins all records: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab
+# str_tsv = ''.join(ls_lines_tsv).replace('\n', '\t').replace(' ', '').replace('\t\t', '\t')
+ls_clean_tsv = [x.replace('\n', '\t').replace(' ', '').replace('\t\t', '\t') for x in ls_lines_tsv]
+str_tsv = ''.join(ls_clean_tsv)
+
+# look for characters not letters or numbers
+print('Unique characters not numbers or letters:')
+print(set(re.findall('[^a-zA-Z0-9]', str_tsv)))
+print()
+
+# look for characters not letters or numbers ... in the raw utf-16-le string
+print('Unique characters not numbers or letters (utf-16-le):')
+print(set(re.findall('[^a-zA-Z0-9]', ''.join(ls_lines_tsv_utf16le))))
+print()
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Unique characters not numbers or letters:
+{'@', '_', '.', '-', '\t', '/'}
+
+Unique characters not numbers or letters (utf-16-le):
+{'@', '_', '\n', '.', '-', '\t', '/', 'È', ' '}
+
+
+
+
+ +
+
+ +
+
+
+
In [42]:
+
+
+
# use regex groups to identify records
+# https://www.tutorialspoint.com/What-is-the-groups-method-in-regular-expressions-in-Python
+'''
+* id - one or more digits
+* name - may include 0, 1, 2, 3 names
+* account number - one or more digits that may contain '-' or '/'
+* email may contain one or more '@', '.'
+'''
+r_str_match_0names = '(\d+\\t[0-9-/]*\\t[a-zA-Z@.]*)'
+r_str_match_1names = '(\d+\\t[a-zA-Z]+\\t[0-9-/]*\\t[a-zA-Z@.]*)'
+r_str_match_2names = '(\d+\\t[a-zA-Z]+\\t[a-zA-Z]+\\t[0-9-/]*\\t[a-zA-Z@.]*)'
+r_str_match_3names = '(\d+\\t[a-zA-Z]+\\t[a-zA-Z]+\\t[a-zA-Z]+\\t[0-9-/]*\\t[a-zA-Z@.]*)'
+
+# create list of records
+ls_re_find_0names = re.findall(r_str_match_0names, str_anom_tab_delim_clean)
+ls_re_find_1names = re.findall(r_str_match_1names, str_anom_tab_delim_clean)
+ls_re_find_2names = re.findall(r_str_match_2names, str_anom_tab_delim_clean)
+ls_re_find_3names = re.findall(r_str_match_3names, str_anom_tab_delim_clean)
+
+print('Number of records with 0 names: ', len(ls_re_find_0names))
+print('Number of records with 1 names: ', len(ls_re_find_1names))
+print('Number of records with 2 names: ', len(ls_re_find_2names))
+print('Number of records with 3 names: ', len(ls_re_find_3names))
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Number of records with 0 names:  0
+Number of records with 1 names:  0
+Number of records with 2 names:  4
+Number of records with 3 names:  2
+
+
+
+ +
+
+ +
+
+
+
In [43]:
+
+
+
print('Clean 3 names to 2 names: ')
+print(ls_re_find_3names)
+print()
+
+# clean 3 names by removing middle name
+for each_3name in ls_re_find_3names:
+    
+    # convert to list, split on tab delimiter
+    temp_list = each_3name.split('\t')
+    print(temp_list)
+    
+    # remove middle name
+    del temp_list[2]
+    print(temp_list)
+    
+    print()
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Clean 3 names to 2 names: 
+['29\tAdena\tHobbs\tBosley\t656184\tac.ipsum.Phasellus@ut.net', '217\tBoris\tHarrington\tHarrington\t325378\tneque.Nullam.ut@laoreetlectus.edu']
+
+['29', 'Adena', 'Hobbs', 'Bosley', '656184', 'ac.ipsum.Phasellus@ut.net']
+['29', 'Adena', 'Bosley', '656184', 'ac.ipsum.Phasellus@ut.net']
+
+['217', 'Boris', 'Harrington', 'Harrington', '325378', 'neque.Nullam.ut@laoreetlectus.edu']
+['217', 'Boris', 'Harrington', '325378', 'neque.Nullam.ut@laoreetlectus.edu']
+
+
+
+
+ +
+
+ +
+
+
+
In [44]:
+
+
+
# find id for any records that don't match 2 name pattern ...
+# ... this is how I discovered the accented characters
+ls_re_all_find_2names = re.findall(r_str_match_2names, str_tsv)
+ls_id_regex = [x.split('\t')[0] for x in ls_re_all_find_2names]
+for idx, item in enumerate(ls_id_regex):
+    if idx == len(ls_id_regex)-1:
+        print('End of List')
+    elif int(item)+1 != int(ls_id_regex[idx+1]):
+        print(item)
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
28
+216
+301
+443
+518
+537
+551
+565
+600
+612
+628
+672
+757
+770
+809
+847
+888
+954
+977
+986
+End of List
+
+
+
+ +
+
+ +
+
+
+
+

Observations in Section 3:

    +
  1. Six records do not have 4 tab-delimiters
  2. +
  3. Many of the issues fall into 3 categories: (a) newline instead of tab, (b) extra whitespaces, (c) double tab-delimiter
      +
    • Therefore correcting many of the mistakes by: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab
    • +
    • Join the list into a single string, then use regex to extract records
    • +
    +
  4. +
  5. Two records continue to have issues because there are 3 names instead of 2 names
      +
    • Correct these records by removing middle name
    • +
    +
  6. +
  7. Using regex allowed me to detect accented characters, so I went back and corrected them at the beginning using unidecode
  8. +
+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
+

Algorithm Steps

    +
  1. Read the tsv file into list of strings
  2. +
  3. Remove records without 4 tab-delimiters => clean up so they will fit into dataframe (e.g. regex to remove middle names) => add back to list
  4. +
  5. Load entire list into pandas dataframe
  6. +
  7. Perform cleanup in pandas dataframe (mostly account_number, although minor issue of null names)
  8. +
  9. Export pandas dataframe to .tsv file
  10. +
+ +
+
+
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+
+

Check AWS Cloud Resources

+
+
+
+
+
+
In [45]:
+
+
+
# create s3 resource
+s3_resource = boto3.resource('s3')
+
+# print available AWS S3 buckets
+# https://stackoverflow.com/questions/49372761/boto3-using-boto3-resources3-to-list-all-s3-buckets
+ls_buckets = [bucket.name for bucket in s3_resource.buckets.all()]
+for idx, each_bucket in enumerate(ls_buckets):
+    # print(idx, each_bucket)
+    pass
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
Pandas Redshift | 2020-11-28 14:13:31,191 | botocore.credentials | INFO | Found credentials in shared credentials file: ~/.aws/credentials
+
+
+
+ +
+
+ +
+
+
+
In [46]:
+
+
+
# create redshift client
+rs_client = boto3.client('redshift')
+
+# print available redshift clusters
+# https://stackoverflow.com/questions/34309151/get-list-clusters-amazon-redshift-using-python-with-boto3
+# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html
+di_rs_clusters = rs_client.describe_clusters()
+print(di_rs_clusters['Clusters'])
+
+ +
+
+
+ +
+
+ + +
+ +
+ + +
+
[]
+
+
+
+ +
+
+ +
+
+
+
In [ ]:
+
+
+
 
+
+ +
+
+
+ +
+
+
+ + + + + + diff --git a/notebook/explore-data-challenge.ipynb b/notebook/explore-data-challenge.ipynb new file mode 100644 index 0000000..818af34 --- /dev/null +++ b/notebook/explore-data-challenge.ipynb @@ -0,0 +1,2787 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Explore Data Challenge\n", + "\n", + "### Assumptions:\n", + "1. Data in tab-delimited format (fields separated by tabs)\n", + "1. Fields with quotes contain reserved characters such as `\\t`, `\\r`, and `\\n`\n", + "1. Data is UTF-16LE encoded => convert to UTF-8\n", + "1. Data will have anomalies and will require judgment calls\n", + "1. Upload data to Redshift\n", + "1. Parallel algorithm: \n", + " * break up data into multiple parts (position, length) \n", + " * process the multiple parts, output each to separate TSV file\n", + " * re-assemble multiple TSV files into a single TSV file\n", + "\n", + "### Sections:\n", + "* Section 1 - explore data.tsv\n", + "* Section 2 - read non-anomalous data into pandas dataframe, explore\n", + "* Section 3 - experiment with algorithms to clean anomalous data\n", + "\n", + "### References:\n", + "* Matthew Ropp (RingDNA) data challenge repo. https://github.com/msropp/data-challenge\n", + "* Pandas-Redshift library repo. https://github.com/agawronski/pandas_redshift" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # install unidecode\n", + "# ! pip install unidecode\n", + "\n", + "# # install pandas-redshift library if not already installed\n", + "# ! 
pip install pandas-redshift" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from collections import Counter\n", + "import io" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import pandas as pd\n", + "import numpy as np\n", + "import pandas_redshift as pr" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# pandas display settings\n", + "pd.set_option(\"display.max_columns\", 999)\n", + "pd.set_option(\"display.max_rows\", 999)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import unidecode\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Section 1: Explore data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['data.tsv']\n" + ] + } + ], + "source": [ + "print(os.listdir(os.path.join('..', 'data')))\n", + "\n", + "# create path to tsv file\n", + "path_to_tsv = os.path.join('..', 'data', 'data.tsv')\n", + "\n", + "# read each line of file into a list, challenge documentation said 'utf-16-le' encoding\n", + "# https://realpython.com/read-write-files-python/\n", + "# https://stackoverflow.com/questions/4190683/python-string-replace-for-utf-16-le-file\n", + "with open(path_to_tsv, 'r', encoding='utf-16-le') as f:\n", + " ls_lines_tsv_utf16le = f.readlines()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# remove accents on characters\n", + "# 
https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-normalize-in-a-python-unicode-string\n", + "# https://medium.com/@randombites/how-to-handle-accented-special-strings-175e65d96123\n", + "# https://stackoverflow.com/questions/31207287/converting-utf-16-to-utf-8\n", + "ls_lines_tsv = [unidecode.unidecode(x) for x in ls_lines_tsv_utf16le]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of lines in file: 1008\n", + "\n", + "\n", + "0 id\tfirst_name\tlast_name\taccount_number\temail\n", + "\n", + "1 1\tAddison\tMarks\t196296\tornare.lectus@et.edu\n", + "\n", + "2 2\tDakota\tGarza\t409025\tscelerisque@Praesentluctus.edu\n", + "\n", + "3 3\tBasia\tWolfe\t637720\tAliquam@nullaIntegerurna.com\n", + "\n", + "4 4\tGermaine\tCampbell\t826846\tid.magna@viverraMaecenas.ca\n", + "\n" + ] + } + ], + "source": [ + "print(f'Number of lines in file: {len(ls_lines_tsv)}\\n\\n')\n", + "\n", + "# print a few lines\n", + "for idx, each_line in enumerate(ls_lines_tsv[:5]):\n", + " print(idx, each_line)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(4, 995)\n", + "(3, 3)\n", + "(2, 5)\n", + "(1, 5)\n" + ] + } + ], + "source": [ + "# challenge documentation says tab delimiters, same number of fields per line ...\n", + "# count number of tab delimiters per line\n", + "# https://www.programiz.com/python-programming/methods/string/count\n", + "ls_num_tabdelim = [x.count('\\t') for x in ls_lines_tsv]\n", + "\n", + "# what is frequency of num_tabs per line?\n", + "# https://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item\n", + "# https://www.w3schools.com/python/ref_func_sorted.asp\n", + "counter_numtabs = Counter(ls_num_tabdelim)\n", + "for x in sorted(counter_numtabs.items(), reverse=True):\n", + " 
print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def count_num_per_item(ls_input, str_input):\n", + " '''\n", + " Count number of occurrences per item in list. \n", + " Using this function to read a text file into a list and search for anomalies.\n", + " Print the results, and output the counter-dictionary.\n", + " \n", + " For example, search the number of:\n", + " * tabs per item string\n", + " * newline characters per item string \n", + " \n", + " Dependencies:\n", + " from collections import Counter\n", + " \n", + " Input:\n", + " ls_input - list, output of file.readlines()\n", + " str_input - str, searching for substring\n", + " Return:\n", + " counter_num_per_item - Counterobject, looks like a dictionary, \n", + " where key is number of occurances per item,\n", + " and value is number of items of this occurance\n", + " '''\n", + " \n", + " # list of number of occurences per item\n", + " ls_num_occurences_per_item = [x.count(str_input) for x in ls_input]\n", + " \n", + " # check frequency of occurences\n", + " counter_num_per_item = Counter(ls_num_occurences_per_item)\n", + " \n", + " # print out frequency of occurences\n", + " print(f'Total number of items in list: {len(ls_input)}\\n')\n", + " print(f'Frequency of occurences:')\n", + " _ = [print(x) for x in sorted(counter_num_per_item.items(), reverse=True)]\n", + " \n", + " return counter_num_per_item" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(4, 995)\n", + "(3, 3)\n", + "(2, 5)\n", + "(1, 5)\n" + ] + } + ], + "source": [ + "# not all of the lines have 4 tab-delimiters\n", + "_ = count_num_per_item(ls_lines_tsv, '\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", 
+ "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(1, 1007)\n", + "(0, 1)\n" + ] + } + ], + "source": [ + "# almost all lines have newline char\n", + "_ = count_num_per_item(ls_lines_tsv, \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(0, 1008)\n" + ] + } + ], + "source": [ + "# surprised none of lines have double-quote\n", + "_ = count_num_per_item(ls_lines_tsv, '\\\"')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(0, 1008)\n" + ] + } + ], + "source": [ + "# surprised none of lines have single-quote\n", + "_ = count_num_per_item(ls_lines_tsv, \"\\'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(0, 1008)\n" + ] + } + ], + "source": [ + "# none of lines have carriage-return\n", + "_ = count_num_per_item(ls_lines_tsv, \"\\r\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + "\n", + "Frequency of occurences:\n", + "(1, 6)\n", + "(0, 1002)\n" + ] + } + ], + "source": [ + "# 6 of the lines have 1 dash\n", + "_ = count_num_per_item(ls_lines_tsv, \"-\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total number of items in list: 1008\n", + 
"\n", + "Frequency of occurences:\n", + "(1, 1)\n", + "(0, 1007)\n" + ] + } + ], + "source": [ + "# one of the lines have 1 slash\n", + "_ = count_num_per_item(ls_lines_tsv, \"/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Line number 29 => 29\tAdena\tHobbs\n", + "\n", + "Line number 30 => Bosley\t656184\n", + "\n", + "Line number 31 => \tac.ipsum.Phasellus@ut.net\n", + "\n", + "Line number 84 => 82\tJade\tBattle\n", + "\n", + "Line number 85 => \t531695\tlectus.justo@lorem.co.uk\n", + "\n", + "Line number 220 => 217\tBoris\n", + "\n", + "Line number 221 => Harrington\tHarrington\t325378\tneque.Nullam.ut@laoreetlectus.edu\n", + "\n", + "Line number 341 => 337\tNEHRU\tMENDOZA\t 859105\n", + "\n", + "Line number 342 => \tporttitor.interdum.Sed@Loremipsum.co.uk\n", + "\n", + "Line number 780 => 775\t\n", + "\n", + "Line number 781 => Barbara\tHurley\t691210\tenim.Mauris.quis@magna.net\n", + "\n", + "Line number 991 => 985\tCherokee\tIndian\n", + "\n", + "Line number 992 => \t157172\tenim@disparturient.edu\n", + "\n" + ] + } + ], + "source": [ + "# most items have 4 tab-delimters; anomalies don't have 4 tab-delimiters\n", + "# view lines with anomalies '\\t', print out line number (0-index)\n", + "_ = [print(f'Line number {idx} => ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('\\t') != 4]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Line number 1007 ... 1000\tHermione\tMorales\t478506\teu.nulla@Donec.com\n" + ] + } + ], + "source": [ + "# view lines with anomalies on '\\n' ... this is the last line in the data file \n", + "_ = [print(f'Line number {idx} ... 
', item) for idx, item in enumerate(ls_lines_tsv) if item.count('\\n') != 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Line number 298 ... 294\tWyoming\tWilliams\t454-586\todio.a.purus@Suspendisse.co.uk\n", + "\n", + "Line number 392 ... 387\tPalmer\tAcevedo\t168-722\tut.pellentesque.eget@eleifendCrassed.org\n", + "\n", + "Line number 421 ... 416\telvis\tRIVERA\t865-008\tQuisque.libero@vitaesodales.net\n", + "\n", + "Line number 481 ... 476\tIvana\tChapman\t357-130\tet@Cras.co.uk\n", + "\n", + "Line number 692 ... 687\tFay\tIrwin\t1181-61\tmetus@elitsedconsequat.com\n", + "\n", + "Line number 961 ... 955\tVeronica\tBarr-Novel\t953228\tsollicitudin@Nunc.org\n", + "\n" + ] + } + ], + "source": [ + "# view lines with anomalies on '-' ... 5 out of 6 are in the account_number, one is in the last_name\n", + "_ = [print(f'Line number {idx} ... ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('-') > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Line number 315 ... 311\tKenneth\tBird\t437/680\test@utpharetrased.org\n", + "\n" + ] + } + ], + "source": [ + "# view lines with anomalies on '/' ... one account number\n", + "_ = [print(f'Line number {idx} ... ', item) for idx, item in enumerate(ls_lines_tsv) if item.count('/') > 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Observations in `Section 1`:\n", + "1. Total number of lines is 1008, which includes header line\n", + "1. Most of the lines have 4 tab-delimiters, although not true of 13 lines\n", + "1. The anomalies have fewer than 4 tab-delimiters because the data was split amongst multiple lines\n", + "1. One of the anomalous records has repeat last name 'Harrington'\n", + "1. 
Correcting the anomalies should be straightforward with a set of rules, but read the remaining lines into dataframe to examine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Section 2: Read into dataframe (exception of anomalies w/o 4 tab-delimiters), and explore dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of dataframe: (994, 5)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemail
01AddisonMarks196296ornare.lectus@et.edu
12DakotaGarza409025scelerisque@Praesentluctus.edu
23BasiaWolfe637720Aliquam@nullaIntegerurna.com
34GermaineCampbell826846id.magna@viverraMaecenas.ca
45LenorePennington345284aliquam@Integer.edu
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number email\n", + "0 1 Addison Marks 196296 ornare.lectus@et.edu\n", + "1 2 Dakota Garza 409025 scelerisque@Praesentluctus.edu\n", + "2 3 Basia Wolfe 637720 Aliquam@nullaIntegerurna.com\n", + "3 4 Germaine Campbell 826846 id.magna@viverraMaecenas.ca\n", + "4 5 Lenore Pennington 345284 aliquam@Integer.edu" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create list with only items with 4 tabs\n", + "ls_lines_tsv_4tabs = [x for x in ls_lines_tsv if x.count('\\t')==4]\n", + "\n", + "# convert list to string, already has newline character, no need to add to join\n", + "str_lines_tsv_4tabs = ''.join(ls_lines_tsv_4tabs)\n", + "\n", + "# convert str to io.StringIO object so it can be read as CSV file\n", + "# https://www.kite.com/python/answers/how-to-create-a-pandas-dataframe-from-a-string-in-python\n", + "io_data_tsv = io.StringIO(str_lines_tsv_4tabs)\n", + "\n", + "# create dataframe, tab-delimited\n", + "df_4_tabs = pd.read_csv(io_data_tsv, sep='\\t')\n", + "\n", + "print(f\"Shape of dataframe: {df_4_tabs.shape}\")\n", + "df_4_tabs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 994\n", + "first_name 686\n", + "last_name 652\n", + "account_number 992\n", + "email 994\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "print(df_4_tabs.nunique())" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "first_name 12\n", + "last_name 9\n", + "account_number 0\n", + "email 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# how many null values per column\n", + "print(df_4_tabs.isna().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "id int64\n", + "first_name object\n", + "last_name object\n", + "account_number object\n", + "email object\n", + "dtype: object\n" + ] + } + ], + "source": [ + "print(df_4_tabs.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records in dataframe: 994\n", + "Number of records `account_number` is numeric: 976\n" + ] + } + ], + "source": [ + "# is the account_number always numeric?\n", + "print('Number of records in dataframe: ', df_4_tabs.shape[0])\n", + "print('Number of records `account_number` is numeric: ', df_4_tabs['account_number'].str.isnumeric().sum())" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemail
298302NaNCopeland547803Vivamus.non.lorem@sed.ca
439444RoaryNaN607318mi.pede@orci.ca
514519NaNWilliamson231131Nullam.nisl@massa.ca
533538janaNaN824675nisi.dictum.augue@nonenimcommodo.co.uk
547552FitzgeraldNaN317355elit.a@velit.co.uk
561566WhoopiNaN125742feugiat.non.lobortis@enimSednulla.net
596601NaNNewton601770Duis.ac@nuncinterdum.edu
608613NaNNaN104969dictum@Suspendisse.net
624629NaNRiggs242923nulla.ante.iaculis@erat.net
668673VielkaNaN270462metus.Aenean@lectusNullamsuscipit.edu
753758NaNNaN155499facilisis.facilisis@quamdignissimpharetra.net
766771NaNNaN935186amet.consectetuer.adipiscing@purusgravida.net
804810NaNBishop934203Sed.et@at.net
842848NaNHess340666interdum.Nunc.sollicitudin@elementum.org
883889NaNEaton292781ac@idnunc.edu
972978NaNNaN958519Aliquam.auctor.velit@fringillapurus.co.uk
980987NaNBurris233269Sed.et@Proin.ca
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number \\\n", + "298 302 NaN Copeland 547803 \n", + "439 444 Roary NaN 607318 \n", + "514 519 NaN Williamson 231131 \n", + "533 538 jana NaN 824675 \n", + "547 552 Fitzgerald NaN 317355 \n", + "561 566 Whoopi NaN 125742 \n", + "596 601 NaN Newton 601770 \n", + "608 613 NaN NaN 104969 \n", + "624 629 NaN Riggs 242923 \n", + "668 673 Vielka NaN 270462 \n", + "753 758 NaN NaN 155499 \n", + "766 771 NaN NaN 935186 \n", + "804 810 NaN Bishop 934203 \n", + "842 848 NaN Hess 340666 \n", + "883 889 NaN Eaton 292781 \n", + "972 978 NaN NaN 958519 \n", + "980 987 NaN Burris 233269 \n", + "\n", + " email \n", + "298 Vivamus.non.lorem@sed.ca \n", + "439 mi.pede@orci.ca \n", + "514 Nullam.nisl@massa.ca \n", + "533 nisi.dictum.augue@nonenimcommodo.co.uk \n", + "547 elit.a@velit.co.uk \n", + "561 feugiat.non.lobortis@enimSednulla.net \n", + "596 Duis.ac@nuncinterdum.edu \n", + "608 dictum@Suspendisse.net \n", + "624 nulla.ante.iaculis@erat.net \n", + "668 metus.Aenean@lectusNullamsuscipit.edu \n", + "753 facilisis.facilisis@quamdignissimpharetra.net \n", + "766 amet.consectetuer.adipiscing@purusgravida.net \n", + "804 Sed.et@at.net \n", + "842 interdum.Nunc.sollicitudin@elementum.org \n", + "883 ac@idnunc.edu \n", + "972 Aliquam.auctor.velit@fringillapurus.co.uk \n", + "980 Sed.et@Proin.ca " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# examine rows with null first_name or last_name\n", + "mask_null_name = (\n", + " df_4_tabs['first_name'].isna()\n", + " | df_4_tabs['last_name'].isna()\n", + ")\n", + "df_4_tabs.loc[mask_null_name]" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# create columns with flags on null names\n", + "df_4_tabs['null_firstname'] = df_4_tabs['first_name'].isna()\n", + "df_4_tabs['null_lastname'] = df_4_tabs['last_name'].isna()\n", + "df_4_tabs['null_name'] = 
df_4_tabs['null_firstname'] | df_4_tabs['null_lastname']\n", + "\n", + "# create column with length of account_number\n", + "df_4_tabs['len_acct_num'] = df_4_tabs['account_number'].apply(len)\n", + "\n", + "# create column with email domain name\n", + "# https://stackoverflow.com/questions/12504976/get-last-column-after-str-split-operation-on-column-in-pandas-dataframe\n", + "df_4_tabs['domain_name'] = df_4_tabs['email'].str.split('@').str[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
01AddisonMarks196296ornare.lectus@et.eduFalseFalseFalse6et.edu
12DakotaGarza409025scelerisque@Praesentluctus.eduFalseFalseFalse6Praesentluctus.edu
23BasiaWolfe637720Aliquam@nullaIntegerurna.comFalseFalseFalse6nullaIntegerurna.com
34GermaineCampbell826846id.magna@viverraMaecenas.caFalseFalseFalse6viverraMaecenas.ca
45LenorePennington345284aliquam@Integer.eduFalseFalseFalse6Integer.edu
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number email \\\n", + "0 1 Addison Marks 196296 ornare.lectus@et.edu \n", + "1 2 Dakota Garza 409025 scelerisque@Praesentluctus.edu \n", + "2 3 Basia Wolfe 637720 Aliquam@nullaIntegerurna.com \n", + "3 4 Germaine Campbell 826846 id.magna@viverraMaecenas.ca \n", + "4 5 Lenore Pennington 345284 aliquam@Integer.edu \n", + "\n", + " null_firstname null_lastname null_name len_acct_num \\\n", + "0 False False False 6 \n", + "1 False False False 6 \n", + "2 False False False 6 \n", + "3 False False False 6 \n", + "4 False False False 6 \n", + "\n", + " domain_name \n", + "0 et.edu \n", + "1 Praesentluctus.edu \n", + "2 nullaIntegerurna.com \n", + "3 viverraMaecenas.ca \n", + "4 Integer.edu " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_4_tabs.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape of dataframe: (994, 10)\n", + "\n", + "Columns in dataframe: \n", + "['id', 'first_name', 'last_name', 'account_number', 'email', 'null_firstname', 'null_lastname', 'null_name', 'len_acct_num', 'domain_name']\n", + "\n", + "*** Number of value counts in column: id ===> 994\n", + "\n", + "*** Number of value counts in column: first_name ===> 686\n", + "\n", + "*** Number of value counts in column: last_name ===> 652\n", + "\n", + "*** Number of value counts in column: account_number ===> 992\n", + "\n", + "*** Number of value counts in column: email ===> 994\n", + "\n", + "*** Number of value counts in column: null_firstname ===> 2\n", + "\n", + "Value counts of column: null_firstname\n", + "False 982\n", + "True 12\n", + "Name: null_firstname, dtype: int64\n", + "\n", + "*** Number of value counts in column: null_lastname ===> 2\n", + "\n", + "Value counts of column: null_lastname\n", + "False 985\n", + "True 9\n", + "Name: 
null_lastname, dtype: int64\n", + "\n", + "*** Number of value counts in column: null_name ===> 2\n", + "\n", + "Value counts of column: null_name\n", + "False 977\n", + "True 17\n", + "Name: null_name, dtype: int64\n", + "\n", + "*** Number of value counts in column: len_acct_num ===> 6\n", + "\n", + "Value counts of column: len_acct_num\n", + "6 974\n", + "7 14\n", + "8 2\n", + "5 2\n", + "12 1\n", + "9 1\n", + "Name: len_acct_num, dtype: int64\n", + "\n", + "*** Number of value counts in column: domain_name ===> 904\n", + "\n" + ] + } + ], + "source": [ + "# examine each column for number of unique values ...\n", + "# ... look for .value_counts() situations where there are fewer than 10 counts per column\n", + "# print out number of unique values per column if > 10, print out .value_counts() if < 10\n", + "print(f'Shape of dataframe: {df_4_tabs.shape}\\n')\n", + "print(f'Columns in dataframe: \\n{list(df_4_tabs.columns)}\\n')\n", + "\n", + "for each_col in df_4_tabs.columns:\n", + " num_counts = df_4_tabs[each_col].value_counts().shape[0]\n", + " print(f'*** Number of value counts in column: {each_col} ===> {num_counts}\\n')\n", + " \n", + " if num_counts < 10:\n", + " print(f'Value counts of column: {each_col}')\n", + " print(df_4_tabs[each_col].value_counts())\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
165168NyssaLivingston897187at@tortorat.netFalseFalseFalse7tortorat.net
214218IfeomaWhitney543699Proin.nisl.sem@odiovelest.eduFalseFalseFalse7odiovelest.edu
262266HowardCooley167868faucibus@ametultricies.orgFalseFalseFalse7ametultricies.org
272276WillaDiaz157615Aliquam.tincidunt@metuseuerat.co.ukFalseFalseFalse7metuseuerat.co.uk
290294WyomingWilliams454-586odio.a.purus@Suspendisse.co.ukFalseFalseFalse7Suspendisse.co.uk
295299MeghanWhitaker352355leo@tempus.orgFalseFalseFalse7tempus.org
307311KennethBird437/680est@utpharetrased.orgFalseFalseFalse7utpharetrased.org
382387PalmerAcevedo168-722ut.pellentesque.eget@eleifendCrassed.orgFalseFalseFalse7eleifendCrassed.org
411416elvisRIVERA865-008Quisque.libero@vitaesodales.netFalseFalseFalse7vitaesodales.net
471476IvanaChapman357-130et@Cras.co.ukFalseFalseFalse7Cras.co.uk
472477HopeOneal893425ut@ut.orgFalseFalseFalse8ut.org
489494BlaineBarker874570elit.pretium@sociisnatoquepenatibus.netFalseFalseFalse7sociisnatoquepenatibus.net
494499KerryHowell730368sed.consequat@ac.caFalseFalseFalse9ac.ca
567572GingerNelson475426Duis.gravida.Praesent@vulputate.netFalseFalseFalse7vulputate.net
620625GalenaHolland460962at.risus@tincidunt.co.ukFalseFalseFalse8tincidunt.co.uk
682687FayIrwin1181-61metus@elitsedconsequat.comFalseFalseFalse7elitsedconsequat.com
830836KeeganMcCoy93128feugiat.nec@eu.co.ukFalseFalseFalse5eu.co.uk
954960TaylorWallace75182orci@dolor.netFalseFalseFalse5dolor.net
966972DorothyRoy734417quis@Sedetlibero.caFalseFalseFalse7Sedetlibero.ca
976982GemmaHartman137599luctus@ProinmiAliquam.eduFalseFalseFalse12ProinmiAliquam.edu
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number \\\n", + "165 168 Nyssa Livingston 897187 \n", + "214 218 Ifeoma Whitney 543699 \n", + "262 266 Howard Cooley 167868 \n", + "272 276 Willa Diaz 157615 \n", + "290 294 Wyoming Williams 454-586 \n", + "295 299 Meghan Whitaker 352355 \n", + "307 311 Kenneth Bird 437/680 \n", + "382 387 Palmer Acevedo 168-722 \n", + "411 416 elvis RIVERA 865-008 \n", + "471 476 Ivana Chapman 357-130 \n", + "472 477 Hope Oneal 893425 \n", + "489 494 Blaine Barker 874570 \n", + "494 499 Kerry Howell 730368 \n", + "567 572 Ginger Nelson 475426 \n", + "620 625 Galena Holland 460962 \n", + "682 687 Fay Irwin 1181-61 \n", + "830 836 Keegan McCoy 93128 \n", + "954 960 Taylor Wallace 75182 \n", + "966 972 Dorothy Roy 734417 \n", + "976 982 Gemma Hartman 137599 \n", + "\n", + " email null_firstname null_lastname \\\n", + "165 at@tortorat.net False False \n", + "214 Proin.nisl.sem@odiovelest.edu False False \n", + "262 faucibus@ametultricies.org False False \n", + "272 Aliquam.tincidunt@metuseuerat.co.uk False False \n", + "290 odio.a.purus@Suspendisse.co.uk False False \n", + "295 leo@tempus.org False False \n", + "307 est@utpharetrased.org False False \n", + "382 ut.pellentesque.eget@eleifendCrassed.org False False \n", + "411 Quisque.libero@vitaesodales.net False False \n", + "471 et@Cras.co.uk False False \n", + "472 ut@ut.org False False \n", + "489 elit.pretium@sociisnatoquepenatibus.net False False \n", + "494 sed.consequat@ac.ca False False \n", + "567 Duis.gravida.Praesent@vulputate.net False False \n", + "620 at.risus@tincidunt.co.uk False False \n", + "682 metus@elitsedconsequat.com False False \n", + "830 feugiat.nec@eu.co.uk False False \n", + "954 orci@dolor.net False False \n", + "966 quis@Sedetlibero.ca False False \n", + "976 luctus@ProinmiAliquam.edu False False \n", + "\n", + " null_name len_acct_num domain_name \n", + "165 False 7 tortorat.net \n", + "214 False 7 odiovelest.edu \n", + "262 False 7 
ametultricies.org \n", + "272 False 7 metuseuerat.co.uk \n", + "290 False 7 Suspendisse.co.uk \n", + "295 False 7 tempus.org \n", + "307 False 7 utpharetrased.org \n", + "382 False 7 eleifendCrassed.org \n", + "411 False 7 vitaesodales.net \n", + "471 False 7 Cras.co.uk \n", + "472 False 8 ut.org \n", + "489 False 7 sociisnatoquepenatibus.net \n", + "494 False 9 ac.ca \n", + "567 False 7 vulputate.net \n", + "620 False 8 tincidunt.co.uk \n", + "682 False 7 elitsedconsequat.com \n", + "830 False 5 eu.co.uk \n", + "954 False 5 dolor.net \n", + "966 False 7 Sedetlibero.ca \n", + "976 False 12 ProinmiAliquam.edu " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# most values in account_number field are 6 characters long ... anomalies are not ...\n", + "# ... examine records where account number is not 6 digits long\n", + "df_4_tabs.query(\" len_acct_num != 6 \")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Show dataframe of non-numeric account numbers: \n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_name
165168NyssaLivingston897187at@tortorat.netFalseFalseFalse7tortorat.net
214218IfeomaWhitney543699Proin.nisl.sem@odiovelest.eduFalseFalseFalse7odiovelest.edu
262266HowardCooley167868faucibus@ametultricies.orgFalseFalseFalse7ametultricies.org
272276WillaDiaz157615Aliquam.tincidunt@metuseuerat.co.ukFalseFalseFalse7metuseuerat.co.uk
290294WyomingWilliams454-586odio.a.purus@Suspendisse.co.ukFalseFalseFalse7Suspendisse.co.uk
295299MeghanWhitaker352355leo@tempus.orgFalseFalseFalse7tempus.org
307311KennethBird437/680est@utpharetrased.orgFalseFalseFalse7utpharetrased.org
382387PalmerAcevedo168-722ut.pellentesque.eget@eleifendCrassed.orgFalseFalseFalse7eleifendCrassed.org
411416elvisRIVERA865-008Quisque.libero@vitaesodales.netFalseFalseFalse7vitaesodales.net
471476IvanaChapman357-130et@Cras.co.ukFalseFalseFalse7Cras.co.uk
472477HopeOneal893425ut@ut.orgFalseFalseFalse8ut.org
489494BlaineBarker874570elit.pretium@sociisnatoquepenatibus.netFalseFalseFalse7sociisnatoquepenatibus.net
494499KerryHowell730368sed.consequat@ac.caFalseFalseFalse9ac.ca
567572GingerNelson475426Duis.gravida.Praesent@vulputate.netFalseFalseFalse7vulputate.net
620625GalenaHolland460962at.risus@tincidunt.co.ukFalseFalseFalse8tincidunt.co.uk
682687FayIrwin1181-61metus@elitsedconsequat.comFalseFalseFalse7elitsedconsequat.com
966972DorothyRoy734417quis@Sedetlibero.caFalseFalseFalse7Sedetlibero.ca
976982GemmaHartman137599luctus@ProinmiAliquam.eduFalseFalseFalse12ProinmiAliquam.edu
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number \\\n", + "165 168 Nyssa Livingston 897187 \n", + "214 218 Ifeoma Whitney 543699 \n", + "262 266 Howard Cooley 167868 \n", + "272 276 Willa Diaz 157615 \n", + "290 294 Wyoming Williams 454-586 \n", + "295 299 Meghan Whitaker 352355 \n", + "307 311 Kenneth Bird 437/680 \n", + "382 387 Palmer Acevedo 168-722 \n", + "411 416 elvis RIVERA 865-008 \n", + "471 476 Ivana Chapman 357-130 \n", + "472 477 Hope Oneal 893425 \n", + "489 494 Blaine Barker 874570 \n", + "494 499 Kerry Howell 730368 \n", + "567 572 Ginger Nelson 475426 \n", + "620 625 Galena Holland 460962 \n", + "682 687 Fay Irwin 1181-61 \n", + "966 972 Dorothy Roy 734417 \n", + "976 982 Gemma Hartman 137599 \n", + "\n", + " email null_firstname null_lastname \\\n", + "165 at@tortorat.net False False \n", + "214 Proin.nisl.sem@odiovelest.edu False False \n", + "262 faucibus@ametultricies.org False False \n", + "272 Aliquam.tincidunt@metuseuerat.co.uk False False \n", + "290 odio.a.purus@Suspendisse.co.uk False False \n", + "295 leo@tempus.org False False \n", + "307 est@utpharetrased.org False False \n", + "382 ut.pellentesque.eget@eleifendCrassed.org False False \n", + "411 Quisque.libero@vitaesodales.net False False \n", + "471 et@Cras.co.uk False False \n", + "472 ut@ut.org False False \n", + "489 elit.pretium@sociisnatoquepenatibus.net False False \n", + "494 sed.consequat@ac.ca False False \n", + "567 Duis.gravida.Praesent@vulputate.net False False \n", + "620 at.risus@tincidunt.co.uk False False \n", + "682 metus@elitsedconsequat.com False False \n", + "966 quis@Sedetlibero.ca False False \n", + "976 luctus@ProinmiAliquam.edu False False \n", + "\n", + " null_name len_acct_num domain_name \n", + "165 False 7 tortorat.net \n", + "214 False 7 odiovelest.edu \n", + "262 False 7 ametultricies.org \n", + "272 False 7 metuseuerat.co.uk \n", + "290 False 7 Suspendisse.co.uk \n", + "295 False 7 tempus.org \n", + "307 False 7 utpharetrased.org \n", 
+ "382 False 7 eleifendCrassed.org \n", + "411 False 7 vitaesodales.net \n", + "471 False 7 Cras.co.uk \n", + "472 False 8 ut.org \n", + "489 False 7 sociisnatoquepenatibus.net \n", + "494 False 9 ac.ca \n", + "567 False 7 vulputate.net \n", + "620 False 8 tincidunt.co.uk \n", + "682 False 7 elitsedconsequat.com \n", + "966 False 7 Sedetlibero.ca \n", + "976 False 12 ProinmiAliquam.edu " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Show the actual account_numbers: \n", + "[' 897187' '543699 ' '167868 ' ' 157615' '454-586' ' 352355' '437/680'\n", + " '168-722' '865-008' '357-130' ' 893425' ' 874570' '730368 ' ' 475426'\n", + " '460962 ' '1181-61' ' 734417' '137599 ']\n" + ] + } + ], + "source": [ + "# examine non-numeric account numbers by creating separate dataframe\n", + "# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.isnumeric.html\n", + "mask_numeric_acct_num = df_4_tabs['account_number'].str.isnumeric()\n", + "df_4_tabs_nonnumeric_acct_num = df_4_tabs.loc[~mask_numeric_acct_num]\n", + "\n", + "print('Show dataframe of non-numeric account numbers: \\n')\n", + "display(df_4_tabs_nonnumeric_acct_num)\n", + "print()\n", + "\n", + "print('Show the actual account_numbers: ')\n", + "print(df_4_tabs_nonnumeric_acct_num['account_number'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of characters in `clean_account_number` column: \n", + "6 992\n", + "5 2\n", + "Name: len_clean_acct_num, dtype: int64\n", + "\n", + "Show the `clean_account_number` that is not 6 characters long: \n", + "['93128' '75182']\n" + ] + } + ], + "source": [ + "# reasons account_number non-numeric: leading/trailing whitespace, dash, slash ...\n", + "# ... 
clean up the account number, remove space and dashes and slashes => save as new column\n", + "df_4_tabs['clean_account_number'] = df_4_tabs['account_number'].str.replace(' ', '').str.replace('-', '').str.replace('/', '')\n", + "\n", + "# in new clean column, any account numbers less than 6 digits? create new column to measure length\n", + "df_4_tabs['len_clean_acct_num'] = df_4_tabs['clean_account_number'].apply(len)\n", + "\n", + "print('Number of characters in `clean_account_number` column: ')\n", + "print(df_4_tabs['len_clean_acct_num'].value_counts())\n", + "print()\n", + "\n", + "print('Show the `clean_account_number` that is not 6 characters long: ')\n", + "print(df_4_tabs.query(\" len_clean_acct_num != 6 \")['clean_account_number'].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 120\n", + "3 120\n", + "1 114\n", + "4 113\n", + "9 113\n", + "7 112\n", + "5 109\n", + "8 97\n", + "6 96\n", + "Name: first_char_clean_an, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "7 116\n", + "4 105\n", + "6 105\n", + "1 103\n", + "9 97\n", + "2 95\n", + "8 95\n", + "0 95\n", + "3 93\n", + "5 90\n", + "Name: last_char_clean_an, dtype: int64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# check for anomalies on first or last character of account number\n", + "df_4_tabs['first_char_clean_an'] = df_4_tabs['clean_account_number'].str[0]\n", + "df_4_tabs['last_char_clean_an'] = df_4_tabs['clean_account_number'].str[-1]\n", + "\n", + "display(df_4_tabs['first_char_clean_an'].value_counts())\n", + "print()\n", + "display(df_4_tabs['last_char_clean_an'].value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_nameclean_account_numberlen_clean_acct_numfirst_char_clean_anlast_char_clean_annuniq_acct_num
410415CourtneySalinas114392nec@nunc.netFalseFalseFalse6nunc.net1143926122
512517WayneWilson973758nonummy@iaculis.co.ukFalseFalseFalse6iaculis.co.uk9737586982
899905AliceDyer973758Vivamus.nibh.dolor@gravidamauris.co.ukFalseFalseFalse6gravidamauris.co.uk9737586982
928934NoraLeonard114392ornare.elit@libero.comFalseFalseFalse6libero.com1143926122
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number \\\n", + "410 415 Courtney Salinas 114392 \n", + "512 517 Wayne Wilson 973758 \n", + "899 905 Alice Dyer 973758 \n", + "928 934 Nora Leonard 114392 \n", + "\n", + " email null_firstname null_lastname \\\n", + "410 nec@nunc.net False False \n", + "512 nonummy@iaculis.co.uk False False \n", + "899 Vivamus.nibh.dolor@gravidamauris.co.uk False False \n", + "928 ornare.elit@libero.com False False \n", + "\n", + " null_name len_acct_num domain_name clean_account_number \\\n", + "410 False 6 nunc.net 114392 \n", + "512 False 6 iaculis.co.uk 973758 \n", + "899 False 6 gravidamauris.co.uk 973758 \n", + "928 False 6 libero.com 114392 \n", + "\n", + " len_clean_acct_num first_char_clean_an last_char_clean_an nuniq_acct_num \n", + "410 6 1 2 2 \n", + "512 6 9 8 2 \n", + "899 6 9 8 2 \n", + "928 6 1 2 2 " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# is the `clean_account_number` a unique primary key? Or any duplicates?\n", + "# look for non-unique account numbers using .groupby().transform()\n", + "# https://pbpython.com/pandas_transform.html\n", + "df_4_tabs['nuniq_acct_num'] = df_4_tabs.groupby('clean_account_number')['clean_account_number'].transform(len)\n", + "df_4_tabs.query(\" nuniq_acct_num>1 \")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfirst_namelast_nameaccount_numberemailnull_firstnamenull_lastnamenull_namelen_acct_numdomain_nameclean_account_numberlen_clean_acct_numfirst_char_clean_anlast_char_clean_annuniq_acct_num
830836KeeganMcCoy93128feugiat.nec@eu.co.ukFalseFalseFalse5eu.co.uk931285981
954960TaylorWallace75182orci@dolor.netFalseFalseFalse5dolor.net751825721
\n", + "
" + ], + "text/plain": [ + " id first_name last_name account_number email \\\n", + "830 836 Keegan McCoy 93128 feugiat.nec@eu.co.uk \n", + "954 960 Taylor Wallace 75182 orci@dolor.net \n", + "\n", + " null_firstname null_lastname null_name len_acct_num domain_name \\\n", + "830 False False False 5 eu.co.uk \n", + "954 False False False 5 dolor.net \n", + "\n", + " clean_account_number len_clean_acct_num first_char_clean_an \\\n", + "830 93128 5 9 \n", + "954 75182 5 7 \n", + "\n", + " last_char_clean_an nuniq_acct_num \n", + "830 8 1 \n", + "954 2 1 " + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# look for account numbers (after cleaning) that are not length 6 characters\n", + "df_4_tabs.query(\" len_clean_acct_num !=6 \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Observations in `Section 2`:\n", + "1. The data with 4 tab-delimiters loads into dataframe easily\n", + "1. There are 994 rows, 5 columns (not counting header)\n", + "1. Some null values in first_name, last_name, or both\n", + "1. `id` column is numeric, but remaining columns convert to str (initially surprised `account_number` didn't convert numeric)\n", + "1. The `id` and `email` columns are completely unique, but the rest are not; even `account_number` has a few non-uniques\n", + " * `account_number`: 114392\n", + " * `account_number`: 973758\n", + "1. The `account_number` column has length of 6 characters in 974 records, but remaining have different lengths\n", + "1. After cleaning `account_number` by removing spaces, dashes, and slashes; two of the records are length 5 (consider leading zero that was removed?)\n", + " * `account_number`: 93128\n", + " * `account_number`: 75182\n", + "1. The first and last character in `account_number` appear random\n", + "1. 
Surprisingly, the `email` domains are almost unique, 904 uniques out of 994 rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Section 3: clean up tab-delimiters" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['29\\tAdena\\tHobbs\\n',\n", + " 'Bosley\\t656184\\n',\n", + " '\\tac.ipsum.Phasellus@ut.net\\n',\n", + " '82\\tJade\\tBattle\\n',\n", + " ' \\t531695\\tlectus.justo@lorem.co.uk\\n',\n", + " '217\\tBoris\\n',\n", + " 'Harrington\\tHarrington\\t325378\\tneque.Nullam.ut@laoreetlectus.edu\\n',\n", + " '337\\tNEHRU\\tMENDOZA\\t 859105\\n',\n", + " '\\tporttitor.interdum.Sed@Loremipsum.co.uk\\n',\n", + " '775\\t\\n',\n", + " 'Barbara\\tHurley\\t691210\\tenim.Mauris.quis@magna.net\\n',\n", + " '985\\tCherokee\\tIndian\\n',\n", + " '\\t157172\\tenim@disparturient.edu\\n']" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create list with anomalies '\\t'\n", + "ls_lines_anom_tab_delim = [x for x in ls_lines_tsv if x.count('\\t') != 4]\n", + "ls_lines_anom_tab_delim" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'29\\tAdena\\tHobbs\\nBosley\\t656184\\n\\tac.ipsum.Phasellus@ut.net\\n82\\tJade\\tBattle\\n \\t531695\\tlectus.justo@lorem.co.uk\\n217\\tBoris\\nHarrington\\tHarrington\\t325378\\tneque.Nullam.ut@laoreetlectus.edu\\n337\\tNEHRU\\tMENDOZA\\t 859105\\n\\tporttitor.interdum.Sed@Loremipsum.co.uk\\n775\\t\\nBarbara\\tHurley\\t691210\\tenim.Mauris.quis@magna.net\\n985\\tCherokee\\tIndian\\n\\t157172\\tenim@disparturient.edu\\n'" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create string with anomalies '\\t'\n", + 
"str_anom_tab_delim = ''.join(ls_lines_anom_tab_delim)\n", + "str_anom_tab_delim" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'29\\tAdena\\tHobbs\\tBosley\\t656184\\tac.ipsum.Phasellus@ut.net\\t82\\tJade\\tBattle\\t531695\\tlectus.justo@lorem.co.uk\\t217\\tBoris\\tHarrington\\tHarrington\\t325378\\tneque.Nullam.ut@laoreetlectus.edu\\t337\\tNEHRU\\tMENDOZA\\t859105\\tporttitor.interdum.Sed@Loremipsum.co.uk\\t775\\tBarbara\\tHurley\\t691210\\tenim.Mauris.quis@magna.net\\t985\\tCherokee\\tIndian\\t157172\\tenim@disparturient.edu\\t'" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# clean string: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab\n", + "str_anom_tab_delim_clean = str_anom_tab_delim.replace('\\n', '\\t').replace(' ', '').replace('\\t\\t', '\\t')\n", + "str_anom_tab_delim_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['29',\n", + " 'Adena',\n", + " 'Hobbs',\n", + " 'Bosley',\n", + " '656184',\n", + " 'ac.ipsum.Phasellus@ut.net',\n", + " '82',\n", + " 'Jade',\n", + " 'Battle',\n", + " '531695',\n", + " 'lectus.justo@lorem.co.uk',\n", + " '217',\n", + " 'Boris',\n", + " 'Harrington',\n", + " 'Harrington',\n", + " '325378',\n", + " 'neque.Nullam.ut@laoreetlectus.edu',\n", + " '337',\n", + " 'NEHRU',\n", + " 'MENDOZA',\n", + " '859105',\n", + " 'porttitor.interdum.Sed@Loremipsum.co.uk',\n", + " '775',\n", + " 'Barbara',\n", + " 'Hurley',\n", + " '691210',\n", + " 'enim.Mauris.quis@magna.net',\n", + " '985',\n", + " 'Cherokee',\n", + " 'Indian',\n", + " '157172',\n", + " 'enim@disparturient.edu',\n", + " '']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# examine string split on tab-delimeter\n", + 
"str_anom_tab_delim_clean.split('\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique characters not numbers or letters:\n", + "{'@', '_', '.', '-', '\\t', '/'}\n", + "\n", + "Unique characters not numbers or letters (utf-16-le):\n", + "{'@', '_', '\\n', '.', '-', '\\t', '/', 'È', ' '}\n", + "\n" + ] + } + ], + "source": [ + "# convert list of text to str\n", + "# # create str that joins all records: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab\n", + "# str_tsv = ''.join(ls_lines_tsv).replace('\\n', '\\t').replace(' ', '').replace('\\t\\t', '\\t')\n", + "ls_clean_tsv = [x.replace('\\n', '\\t').replace(' ', '').replace('\\t\\t', '\\t') for x in ls_lines_tsv]\n", + "str_tsv = ''.join(ls_clean_tsv)\n", + "\n", + "# look for characters not letters or numbers\n", + "print('Unique characters not numbers or letters:')\n", + "print(set(re.findall('[^a-zA-Z0-9]', str_tsv)))\n", + "print()\n", + "\n", + "# look for characters not letters or numbers ... 
in the raw utf-16-le string\n", + "print('Unique characters not numbers or letters (utf-16-le):')\n", + "print(set(re.findall('[^a-zA-Z0-9]', ''.join(ls_lines_tsv_utf16le))))\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of records with 0 names: 0\n", + "Number of records with 1 names: 0\n", + "Number of records with 2 names: 4\n", + "Number of records with 3 names: 2\n" + ] + } + ], + "source": [ + "# use regex groups to identify records\n", + "# https://www.tutorialspoint.com/What-is-the-groups-method-in-regular-expressions-in-Python\n", + "'''\n", + "* id - one or more digits\n", + "* name - may include 0, 1, 2, 3 names\n", + "* account number - one or more digits that may contain '-' or '/'\n", + "* email may contain one or more '@', '.'\n", + "'''\n", + "r_str_match_0names = '(\\d+\\\\t[0-9-/]*\\\\t[a-zA-Z@.]*)'\n", + "r_str_match_1names = '(\\d+\\\\t[a-zA-Z]+\\\\t[0-9-/]*\\\\t[a-zA-Z@.]*)'\n", + "r_str_match_2names = '(\\d+\\\\t[a-zA-Z]+\\\\t[a-zA-Z]+\\\\t[0-9-/]*\\\\t[a-zA-Z@.]*)'\n", + "r_str_match_3names = '(\\d+\\\\t[a-zA-Z]+\\\\t[a-zA-Z]+\\\\t[a-zA-Z]+\\\\t[0-9-/]*\\\\t[a-zA-Z@.]*)'\n", + "\n", + "# create list of records\n", + "ls_re_find_0names = re.findall(r_str_match_0names, str_anom_tab_delim_clean)\n", + "ls_re_find_1names = re.findall(r_str_match_1names, str_anom_tab_delim_clean)\n", + "ls_re_find_2names = re.findall(r_str_match_2names, str_anom_tab_delim_clean)\n", + "ls_re_find_3names = re.findall(r_str_match_3names, str_anom_tab_delim_clean)\n", + "\n", + "print('Number of records with 0 names: ', len(ls_re_find_0names))\n", + "print('Number of records with 1 names: ', len(ls_re_find_1names))\n", + "print('Number of records with 2 names: ', len(ls_re_find_2names))\n", + "print('Number of records with 3 names: ', len(ls_re_find_3names))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": 
{}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clean 3 names to 2 names: \n", + "['29\\tAdena\\tHobbs\\tBosley\\t656184\\tac.ipsum.Phasellus@ut.net', '217\\tBoris\\tHarrington\\tHarrington\\t325378\\tneque.Nullam.ut@laoreetlectus.edu']\n", + "\n", + "['29', 'Adena', 'Hobbs', 'Bosley', '656184', 'ac.ipsum.Phasellus@ut.net']\n", + "['29', 'Adena', 'Bosley', '656184', 'ac.ipsum.Phasellus@ut.net']\n", + "\n", + "['217', 'Boris', 'Harrington', 'Harrington', '325378', 'neque.Nullam.ut@laoreetlectus.edu']\n", + "['217', 'Boris', 'Harrington', '325378', 'neque.Nullam.ut@laoreetlectus.edu']\n", + "\n" + ] + } + ], + "source": [ + "print('Clean 3 names to 2 names: ')\n", + "print(ls_re_find_3names)\n", + "print()\n", + "\n", + "# clean 3 names by removing middle name\n", + "for each_3name in ls_re_find_3names:\n", + " \n", + " # convert to list, split on tab delimiter\n", + " temp_list = each_3name.split('\\t')\n", + " print(temp_list)\n", + " \n", + " # remove middle name\n", + " del temp_list[2]\n", + " print(temp_list)\n", + " \n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "28\n", + "216\n", + "301\n", + "443\n", + "518\n", + "537\n", + "551\n", + "565\n", + "600\n", + "612\n", + "628\n", + "672\n", + "757\n", + "770\n", + "809\n", + "847\n", + "888\n", + "954\n", + "977\n", + "986\n", + "End of List\n" + ] + } + ], + "source": [ + "# find id for any records that don't match 2 name pattern ...\n", + "# ... 
this is how I discovered the accented characters\n", + "ls_re_all_find_2names = re.findall(r_str_match_2names, str_tsv)\n", + "ls_id_regex = [x.split('\\t')[0] for x in ls_re_all_find_2names]\n", + "for idx, item in enumerate(ls_id_regex):\n", + " if idx == len(ls_id_regex)-1:\n", + " print('End of List')\n", + " elif int(item)+1 != int(ls_id_regex[idx+1]):\n", + " print(item)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Observations in `Section 3`:\n", + "1. Six records do not have 4 tab-delimiters\n", + "1. Many of the issues fall into 3 categories: (a) newline instead of tab, (b) extra whitespaces, (c) double tab-delimiter\n", + " * Therefore correcting many of the mistakes by: (a) replace newline with tab, (b) remove whitespaces, (c) replace double-tab with single-tab\n", + " * Join the list into a single string, then use regex to extract records\n", + "1. Two records continue to have issues because there are 3 names instead of 2 names\n", + " * Correct these records by removing middle name\n", + "1. Using regex allowed me to detect accented characters, so I went back to correct in beginning using unidecode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Algorithm Steps\n", + "1. Read the tsv file into list of strings\n", + "1. Remove records without 4 tab-delimiters => clean up so they will fit into dataframe (e.g. regex to remove middle names) => add back to list\n", + "1. Load entire list into pandas dataframe\n", + "1. Perform cleanup in pandas dataframe (mostly account_number, although minor issue of null names)\n", + "1. 
Export pandas dataframe to .tsv file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check AWS Cloud Resources" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Pandas Redshift | 2020-11-28 14:13:31,191 | botocore.credentials | INFO | Found credentials in shared credentials file: ~/.aws/credentials\n" + ] + } + ], + "source": [ + "# create s3 resource\n", + "s3_resource = boto3.resource('s3')\n", + "\n", + "# print available AWS S3 buckets\n", + "# https://stackoverflow.com/questions/49372761/boto3-using-boto3-resources3-to-list-all-s3-buckets\n", + "ls_buckets = [bucket.name for bucket in s3_resource.buckets.all()]\n", + "for idx, each_bucket in enumerate(ls_buckets):\n", + " # print(idx, each_bucket)\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + } + ], + "source": [ + "# create redshift client\n", + "rs_client = boto3.client('redshift')\n", + "\n", + "# print available redshift clusters\n", + "# https://stackoverflow.com/questions/34309151/get-list-clusters-amazon-redshift-using-python-with-boto3\n", + "# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html\n", + "di_rs_clusters = rs_client.describe_clusters()\n", + "print(di_rs_clusters['Clusters'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/peterkim-solution.md b/peterkim-solution.md new file mode 100644 index 0000000..ad81141 --- /dev/null +++ b/peterkim-solution.md @@ -0,0 +1,31 @@ +# Peter Kim Solution + +### Approach Overview +Examined the data.tsv file. Rather than immediately working on the solution, I performed significant exploratory data analysis (EDA) on the data using Python Jupyter Notebook; allocated approximately 4-6 hours to simply understanding the data. Analyzed 1,008 lines by reading text into a list of strings. Created a 'notebook' folder which contains the EDA in 'explore-data-challenge.ipynb'. Also added a .gitignore file to ignore folders .ipynb_checkpoints from Jupyter Notebook. + +### Major Insight from EDA +Out of 1,008 lines in the dataset, 995 lines have 4 tab-delimiters. Filtering only those 995 lines, the data is easily read into a pandas dataframe using io.StringIO(), and specifying pandas 'sep' as tab-delimiter. Once data fits into a dataframe, the remaining steps to clean-transform-export can be performed in pandas. For example, the 'account_number' column showed some anomalies that would benefit from cleaning. Also, some records had null values (e.g. first_name, last_name, or both) which can be filled if preferable. + +### Difficult Cleaning +But what about the 13 lines (out of 1,008 lines) that do not have 4 tab-delimiters? The difficult task is to clean-transform those 13 lines so that they fit into the dataframe too (along with the other 995 lines). Used str cleaning and regular expressions (regex) match groups to extract the records from the 13 lines in a structured way; specifically there were two major types of problems: + +1. 
Delimiter str issues: + * Presence of newline characters, where it should have been tab-delimiters + * Presence of unnecessary extra whitespaces + * Presence of double-tab-delimiters, where it should have been single-tab-delimiters +2. Data issues: some records had 3 names (where there should be a maximum of 2 names) + +### Minor Issue of Accented Character +In the course of experimenting with regex, discovered the presence of an accented character (understandable given UTF-16-le encoding). Added conversion to UTF-8 to the beginning of the data pipeline. + +### Solution Algorithm +1. Read data.tsv into list of strings, clean accented character. +1. Divide the list of strings into 2 lists: (a) records with 4 tab-delimiters, (b) records without 4 tab-delimiters. +1. Extract data from the records without 4 tab-delimiters using string cleaning and regular expressions, so the list output has 4 tab-delimiters. +1. Merge the list output together so all records have 4 tab-delimiters. +1. Read the list into pandas dataframe, perform cleaning in pandas. +1. Export the final solution to data-solution.csv + +### Bonus +1. Uploading data to Redshift usually involves using the psycopg2 library; provide AWS IAM credentials for S3 access, and AWS Redshift credentials. A convenient wrapper exists in the pandas-redshift library for automating pandas-to-redshift upload, and redshift-to-pandas download. Described in 'explore-data-challenge.ipynb'. +1. Parallelizing the algorithm could be solved by extracting the string text using .read() instead of .readlines(), and performing regex match groups on the extracted text. The final step would then read the list from regex .findall() into a pandas dataframe for further cleaning and then export to a .tsv file. \ No newline at end of file