dedupeio / dedupe-examples

:id: Examples for using the dedupe library
MIT License
404 stars 216 forks source link

csv_example.py - program terminated without error message #128

Open surianisha opened 2 years ago

surianisha commented 2 years ago

I couldn't find 'csv_example_training.json' in the repo, so I used 'csv_input_with_true_ids.csv' as the training file instead. There was no settings file either, so I couldn't use one (it is commented out in the code shared below). I made sure to use consoleLabel() instead of console_label().

I followed the steps in csv_example.py. Active learning starts, but the program then terminates without any error message.

Screen Shot 2022-06-09 at 7 59 19 PM

The code is below: ################################################## import os import csv import re import logging import optparse

import dedupe from unidecode import unidecode

def preProcess(column):
    """Normalise one raw CSV cell for comparison.

    Collapses runs of spaces, replaces newlines with a space, strips
    surrounding whitespace and quote characters, and lower-cases the text.

    Args:
        column: The raw cell value as a string.

    Returns:
        The cleaned string, or ``None`` when nothing is left after cleaning.
    """
    # NOTE(review): the unidecode call appears commented out in the pasted
    # code; confirm whether that was intentional before re-enabling it.
    # column = unidecode(column)
    column = re.sub(' +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # An empty value is reported as None rather than as an empty string.
    if not column:
        column = None
    return column

def readData(filename):
    """Load a CSV file into a dictionary of cleaned records.

    Every value of every row is passed through ``preProcess``. Rows are keyed
    by the integer value of their ``'Id'`` column.

    Args:
        filename: Path of the CSV file to read; it must have an ``'Id'``
            column whose values parse as integers.

    Returns:
        A dict mapping each row's integer id to a dict of
        ``{column name: cleaned value}``.
    """
    # The accumulator must be created here; without it the assignment in the
    # loop below has no dictionary to write into.
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = int(row['Id'])
            data_d[row_id] = dict(clean_row)

    return data_d

# example

# Directory holding the example data (note the trailing slash: file names are
# appended to it by plain string concatenation) and the input file name.
path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
filename = 'csv_example_messy_input.csv'

#######################################

if __name__ == '__main__':

    # Verbosity from the command line:
    # no flag -> WARNING, -v -> INFO, -vv or more -> DEBUG.
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = path + filename
    output_file = path + 'output.csv'
    # settings_file = 'csv_example_learned_settings'
    # The training file is both read by prepare_training() and rewritten by
    # write_training() below, so it must be a dedicated file. Pointing it at
    # 'csv_input_with_true_ids.csv' would feed a CSV to the training loader
    # and then overwrite that CSV. The file need not exist on the first run;
    # it is created after labeling.
    training_file = path + 'csv_example_training.json'

    print('importing data ...')
    data_d = readData(input_file)

    # Fields the model compares, and how each one is compared.
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]

    deduper = dedupe.Dedupe(fields)

    # Reuse labeled examples from a previous run when they exist.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')

    # as of 2.0 this method is called console_label() but in 1.x it was called
    # consoleLabel(), that difference may account for the error. Now updated
    # to consoleLabel
    dedupe.consoleLabel(deduper)

    deduper.train()

    # Persist the labeled examples so the next run can skip labeling.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map every record id to the cluster it was assigned to and its score.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    # Write the input rows back out with the cluster columns prepended.
    # newline='' is what the csv module asks for on files handed to a writer.
    with open(output_file, 'w', newline='') as f_output, open(input_file) as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Must use the same 'Id' column that readData() keys records on;
            # cluster_membership is indexed by those ids.
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)
surianisha commented 2 years ago

Internal error details:

Traceback (most recent call last): File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/base_frontend_mixin.py", line 138, in _dispatch handler(msg) File "/opt/anaconda3/lib/python3.9/site-packages/spyder/plugins/ipythonconsole/widgets/debugging.py", line 278, in _handle_input_request return super(DebuggingWidget, self)._handle_input_request(msg) File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/frontend_widget.py", line 512, in _handle_input_request self._readline(msg['content']['prompt'], callback=callback, password=msg['content']['password']) File "/opt/anaconda3/lib/python3.9/site-packages/qtconsole/console_widget.py", line 2422, in _readline self._show_prompt(prompt, newline=False, separator=False) TypeError: _show_prompt() got an unexpected keyword argument 'separator'