Skip to content

csv_example.py - program terminated without error message #128

@surianisha

Description

@surianisha

I couldn't find 'csv_example_training.json' in the repo, so I used 'csv_input_with_true_ids.csv' as the training file instead. There was no settings file either, so I couldn't use one (that line is commented out in the code shared below).
Made sure to use consoleLabel() instead of console_label().

I followed the steps in csv_example.py. Active learning was initiated, but the program then terminated without any error message.
Screen Shot 2022-06-09 at 7 59 19 PM

The code is below:
##################################################
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

def preProcess(column):
    """Clean one raw CSV cell so trivial formatting differences are ignored.

    The value is transliterated with ``unidecode``, runs of two or more
    spaces are collapsed to a single space, newlines are replaced by
    spaces, and surrounding whitespace and quote characters are stripped
    before the text is lower-cased.

    Returns the cleaned string, or ``None`` when nothing is left after
    cleaning.
    """
    column = unidecode(column)
    column = re.sub(r'  +', ' ', column)
    column = re.sub(r'\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # An empty result is reported as None rather than as ''.
    if not column:
        column = None
    return column

def readData(filename):
    """Read a CSV file into a dict of cleaned rows keyed by record id.

    Every value of every row is passed through ``preProcess``. The row's
    ``'Id'`` column, converted to ``int``, is used as the key of the
    returned dict; the value is the cleaned row as a ``dict`` mapping
    column name to cleaned value.
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = {k: preProcess(v) for k, v in row.items()}
            # The key comes from the raw (uncleaned) 'Id' column.
            row_id = int(row['Id'])
            data_d[row_id] = clean_row

    return data_d

# example

# Directory (with trailing separator) and file name of the messy input CSV;
# the two are joined by plain string concatenation.
path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
filename = 'csv_example_messy_input.csv'

#######################################

if __name__ == '__main__':

    # Command-line handling: each -v raises the logging verbosity one step
    # (WARNING by default, INFO for -v, DEBUG for -vv or more).
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = path + filename
    output_file = path + 'output.csv'
    #settings_file = 'csv_example_learned_settings'
    # NOTE(review): this same path is read as labeled examples below (when it
    # exists) and later overwritten by write_training(). Presumably
    # prepare_training() expects the format that write_training() produces,
    # not a raw CSV of true ids -- confirm against the dedupe documentation.
    training_file = path + 'csv_input_with_true_ids.csv'

    print('importing data ...')
    data_d = readData(input_file)

    # Fields the deduper compares; 'has missing' marks fields that may be None.
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]

    deduper = dedupe.Dedupe(fields)

    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')

    # As of dedupe 2.0 the labeler is called console_label(); in 1.x it was
    # called consoleLabel(). Use whichever name the installed version offers
    # so a version mismatch does not abort the run here.
    label = getattr(dedupe, 'console_label', None) or dedupe.consoleLabel
    label(deduper)

    deduper.train()

    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map every record id to its cluster number and per-record score.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    # Re-read the input and write it back out with the two cluster columns
    # prepended. newline='' lets the csv writer control line endings itself.
    with open(output_file, 'w', newline='') as f_output, open(input_file) as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Same 'Id' column that readData() keys on; 'id' would not match.
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions