-
Notifications
You must be signed in to change notification settings - Fork 215
Description
I couldn't find 'csv_example_training.json' in the repo, so I used 'csv_input_with_true_ids.csv' as the training file instead. There was no settings file either, so I couldn't use one (the settings line is commented out in the code shared below).
Made sure to use consoleLabel() instead of console_label().
I followed the steps in csv_example.py. Active learning was initiated, but the program terminated without any error message.

The code is below:
##################################################
import os
import csv
import re
import logging
import optparse
import dedupe
from unidecode import unidecode
def preProcess(column):
    """Normalise one CSV cell value before it is handed to dedupe.

    The value is run through ``unidecode``, runs of spaces are collapsed,
    newlines become spaces, surrounding whitespace and quote characters are
    stripped, and the result is lower-cased.

    Args:
        column: Raw cell text, or ``None``. ``csv.DictReader`` supplies
            ``None`` for cells missing from a short row.

    Returns:
        The cleaned string, or ``None`` when the input is ``None`` or
        nothing is left after cleaning.
    """
    # Short rows make csv.DictReader fill the missing cells with None;
    # without this guard unidecode/re.sub would raise on them.
    if column is None:
        return None
    column = unidecode(column)
    column = re.sub(r' +', ' ', column)
    column = re.sub(r'\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # An empty result is reported as None rather than as an empty string.
    if not column:
        column = None
    return column
def readData(filename):
    """Load a CSV file into a dict of cleaned records keyed by integer id.

    Every value of every row is passed through ``preProcess``. The key of
    each record is the row's ``'Id'`` column converted to ``int``.

    Args:
        filename: Path of the CSV file to read; it must have an ``'Id'``
            column.

    Returns:
        A dict mapping each integer id to a dict of column name to cleaned
        value.
    """
    records = {}
    with open(filename) as handle:
        for raw in csv.DictReader(handle):
            cleaned = {key: preProcess(value) for key, value in raw.items()}
            # The id is taken from the raw row, not from the cleaned copy.
            records[int(raw['Id'])] = cleaned
    return records
# example
# Directory and file name of the input data. The script builds full file
# names with plain string concatenation (path + name), so `path` has to end
# with a path separator.
path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
filename = 'csv_example_messy_input.csv'
#######################################
# Script entry point. The guard must use the dunder names: a bare
# `name == 'main'` raises NameError as soon as the module is loaded.
if __name__ == '__main__':

    # -- Command line: every -v raises the logging verbosity one step. --
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    # -- File locations, all relative to the module-level `path`. --
    input_file = path + filename
    output_file = path + 'output.csv'
    #settings_file = 'csv_example_learned_settings'
    # NOTE(review): this points at a CSV file. It is handed to
    # prepare_training() below and later overwritten by write_training().
    # Presumably prepare_training() expects the training data that
    # write_training() produces, not a CSV -- confirm against the dedupe
    # docs, and consider using a separate file name.
    training_file = path + 'csv_input_with_true_ids.csv'

    print('importing data ...')
    data_d = readData(input_file)

    # Field definitions dedupe uses to compare records.
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]
    deduper = dedupe.Dedupe(fields)

    # Reuse previously labelled examples when the training file exists.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')
    # As of dedupe 2.0 this function is called console_label(); in 1.x it
    # was consoleLabel(). The 1.x spelling is used here.
    # NOTE(review): the rest of this script uses prepare_training /
    # write_training / partition -- verify that consoleLabel exists in the
    # installed dedupe version.
    dedupe.consoleLabel(deduper)

    deduper.train()

    # Persist the labelled examples (this overwrites training_file).
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map every record id to its cluster id and its score in that cluster.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    # Write the input rows back out with the two cluster columns prepended.
    with open(output_file, 'w') as f_output, open(input_file) as f_input:
        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames
        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:
            # Use the same 'Id' column readData() keys the records by;
            # the lower-case 'id' used before is not that column.
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)