2022-10-27 17:07:07 -04:00
|
|
|
import csv
|
|
|
|
|
from Levenshtein import distance as lev
|
2022-11-07 10:36:13 -05:00
|
|
|
import pandas as pd
|
|
|
|
|
import itertools
|
2022-10-27 17:07:07 -04:00
|
|
|
import sys
|
|
|
|
|
|
2022-11-07 10:36:13 -05:00
|
|
|
# Path of the learner export CSV handed to readCsv() when this file is run as
# a script; its 'Email' column is the one scanned for near-duplicate addresses.
peopleCsv = "/Users/normrasmussen/Downloads/TalkspaceAllLearners.csv"
|
2022-10-27 17:07:07 -04:00
|
|
|
|
2022-11-07 10:36:13 -05:00
|
|
|
def readCsv(peopleCsv):
    """Load the learner export and start the near-duplicate email comparison.

    Only the 'Learner Full Name' and 'Email' columns are read from the CSV.
    The list of email addresses is then handed to startCompare() together
    with the loaded frame.

    Args:
        peopleCsv: Path of the CSV export to read.

    Returns:
        None. The result is produced by the downstream startCompare() call.
    """
    readExport = pd.read_csv(
        peopleCsv,
        usecols=['Learner Full Name', 'Email'],
        skipinitialspace=True,
    )

    # Blank email cells are read as NaN (a float). They are not strings that
    # an edit-distance comparison can work on, so leave them out of the list
    # that gets compared. The frame itself is passed on untouched.
    people = readExport['Email'].dropna().tolist()

    startCompare(peopleCsv, people, readExport)
|
2022-10-27 17:07:07 -04:00
|
|
|
|
2022-11-07 10:36:13 -05:00
|
|
|
# itertools combinations
|
|
|
|
|
def startCompare(peopleCsv, people, readExport):
    """Find every pair of entries in `people` that differ by exactly one edit.

    Each unordered pair from `people` is scored with the Levenshtein
    distance; pairs whose distance is greater than 0 and less than 2 are
    kept. The first and second members of the kept pairs are passed, as two
    parallel lists, to writenewColumn().

    Args:
        peopleCsv: Forwarded unchanged to writenewColumn().
        people: Sequence of strings to compare pairwise.
        readExport: Forwarded unchanged to writenewColumn().
    """
    nearMatches = [
        (first, second)
        for first, second in itertools.combinations(people, 2)
        if 0 < lev(first, second) < 2
    ]

    # Split the kept pairs back into the two parallel lists expected downstream.
    email1 = [first for first, _ in nearMatches]
    email2 = [second for _, second in nearMatches]

    writenewColumn(email1, email2, peopleCsv, readExport)
|
|
|
|
|
|
|
|
|
|
def writenewColumn(
    email1,
    email2,
    peopleCsv,
    readExport,
    outputCsv='/Users/normrasmussen/Downloads/TalkspaceDupes_singlechange.csv',
):
    """Add the matched email pairs to the export as two columns and save it.

    `email1` and `email2` become the 'Email1' and 'Email2' columns, placed
    next to the export's own columns by row position. Rows that repeat an
    'Email1' value, and then rows that repeat an 'Email2' value, are dropped
    (first occurrence kept) before the frame is written to `outputCsv`.

    Args:
        email1: First address of each matched pair.
        email2: Second address of each matched pair.
        peopleCsv: Unused; kept so existing callers keep working.
        readExport: DataFrame holding the loaded export. It is not modified.
            Expected to carry the default 0..n-1 row index that
            `pd.read_csv` produces.
        outputCsv: Destination CSV path. Defaults to the original
            hard-coded location.

    Returns:
        None.
    """
    print(readExport)

    pairs = pd.DataFrame({
        'Email1': pd.Series(email1, dtype=object),
        'Email2': pd.Series(email2, dtype=object),
    })

    # Join side by side on the row index, keeping the union of rows. Assigning
    # the Series straight into the export would align on the export's index
    # and silently discard any pairs beyond its row count. Building a new
    # frame also leaves the caller's `readExport` without extra columns.
    df = pd.concat([readExport, pairs], axis=1)

    # After these two passes no two rows share an 'Email1' value, so a further
    # de-duplication on the ('Email1', 'Email2') pair could never remove
    # anything and is omitted.
    df = df.drop_duplicates('Email1').drop_duplicates('Email2')

    df.to_csv(outputCsv)
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the whole pipeline on the module-level default
    # export path (read the CSV, compare emails, write the result file).
    readCsv(peopleCsv)
|