csv_example.py - program terminated without error message

Couldn't find 'csv_example_training.json' in the repo, so used 'csv_input_with_true_ids.csv'. There was no setting file either so couldn't use that (commented out in code as shared below). 
Made sure to use consoleLabel() instead of console_label().

Followed the steps in csv_example.py. Active learning got initiated but the program terminates without error message. 
<img width="735" alt="Screen Shot 2022-06-09 at 7 59 19 PM" src="https://user-images.githubusercontent.com/106691199/172981629-df45f3b8-8572-4b92-8f33-813f340f6c92.png">

The code is below:
##################################################
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode


def preProcess(column):
#
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    
    
    if not column:
        column = None
    return column

def readData(filename):
#
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)
            
    return data_d



### example
path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
filename = 'csv_example_messy_input.csv'


#######################################


if __name__ == '__main__':
        
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = path + filename
    output_file = path + 'output.csv'
    #settings_file = 'csv_example_learned_settings'
    training_file = path + 'csv_input_with_true_ids.csv'

    print('importing data ...')
    data_d = readData(input_file)
        
    
        
    
    fields = [
            {'field': 'Site name', 'type': 'String'},
            {'field': 'Address', 'type': 'String'},
            {'field': 'Zip', 'type': 'Exact', 'has missing': True},
            {'field': 'Phone', 'type': 'String', 'has missing': True},
            ]
    
   
        
    deduper = dedupe.Dedupe(fields)
    
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d,f)
    else:
        deduper.prepare_training(data_d)
        
    print('starting active labeling...')
    
    
 #as  of 2.0 this method is called console_label() but in 1.x it was called consoleLabel(), that difference may account for the error. Now updated to consoleLabel


    
    dedupe.consoleLabel(deduper)
    
    deduper.train()
    
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)
            
print('clustering...')
clustered_dupes = deduper.partition(data_d, 0.5)
print('# duplicate sets', len(clustered_dupes))
    
    
cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

with open(output_file, 'w') as f_output, open(input_file) as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        row_id = int(row['id'])
        row.update(cluster_membership[row_id])
        writer.writerow(row)
    
      
    



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

csv_example.py - program terminated without error message #128

example

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

csv_example.py - program terminated without error message #128

Description

example

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions