Files
Gainsight/Scripts/API_Tests/usercomparison.py

55 lines
1.5 KiB
Python
Raw Normal View History

2022-10-27 17:07:07 -04:00
import csv
from Levenshtein import distance as lev
2022-11-07 10:36:13 -05:00
import pandas as pd
import itertools
2022-10-27 17:07:07 -04:00
import sys
2022-11-07 10:36:13 -05:00
# Path of the learner export CSV that is passed to readCsv when this file is
# run as a script.
peopleCsv = "/Users/normrasmussen/Downloads/TalkspaceAllLearners.csv"
2022-10-27 17:07:07 -04:00
2022-11-07 10:36:13 -05:00
def readCsv(peopleCsv):
    """Load learner emails from a CSV export and start the duplicate scan.

    Args:
        peopleCsv: Path to a CSV export containing "Learner Full Name" and
            "Email" columns.

    Returns:
        None. The email list and the loaded frame are handed to
        startCompare.
    """
    readExport = pd.read_csv(
        peopleCsv,
        usecols=["Learner Full Name", "Email"],
        skipinitialspace=True,
    )
    # pandas reads blank Email cells as NaN (a float). Drop them so that only
    # real string values reach the string-distance comparison.
    people = readExport["Email"].dropna().tolist()
    startCompare(peopleCsv, people, readExport)
2022-10-27 17:07:07 -04:00
2022-11-07 10:36:13 -05:00
# itertools combinations
def startCompare(peopleCsv, people, readExport):
    """Collect every pair of entries that differ by exactly one edit.

    Every unordered pair drawn from ``people`` is scored with ``lev``. Pairs
    whose distance is greater than 0 and less than 2 are kept; the first and
    second members of the kept pairs are passed to writenewColumn as two
    parallel lists.

    Args:
        people: Values to compare pairwise.
        peopleCsv: Forwarded unchanged to writenewColumn.
        readExport: Forwarded unchanged to writenewColumn.

    Returns:
        None.
    """
    nearMatches = [
        (left, right)
        for left, right in itertools.combinations(people, 2)
        if 0 < lev(left, right) < 2
    ]
    firstEmails = [left for left, _ in nearMatches]
    secondEmails = [right for _, right in nearMatches]
    writenewColumn(firstEmails, secondEmails, peopleCsv, readExport)
2022-11-07 10:36:13 -05:00
def writenewColumn(
    email1,
    email2,
    peopleCsv,
    readExport,
    outputCsv="/Users/normrasmussen/Downloads/TalkspaceDupes_singlechange.csv",
):
    """Write the near-duplicate email pairs to a CSV file.

    Each output row is one pair: ``email1[i]`` in the "Email1" column and
    ``email2[i]`` in the "Email2" column. Rows repeating an earlier
    (Email1, Email2) combination are dropped. The row index is written as
    the first, unnamed column.

    Args:
        email1: First email of each pair.
        email2: Second email of each pair, parallel to ``email1``.
        peopleCsv: Unused; kept so existing callers keep working.
        readExport: Unused and never modified; kept so existing callers keep
            working.
        outputCsv: Destination path of the CSV. Defaults to the path this
            script has always written to.

    Returns:
        None.
    """
    # Build the pairs in their own frame. Adding them as columns of the
    # source export aligns on the row index, which ties each pair to an
    # unrelated learner row and silently discards every pair positioned past
    # the export's last row.
    pairs = pd.DataFrame(
        {
            "Email1": pd.Series(email1, dtype=object),
            "Email2": pd.Series(email2, dtype=object),
        }
    )
    # Deduplicate on the pair as a whole so that distinct pairs sharing one
    # email survive. drop_duplicates returns a new frame; keep the result.
    pairs = pairs.drop_duplicates(subset=["Email1", "Email2"])
    pairs = pairs.reset_index(drop=True)
    pairs.to_csv(outputCsv)
2022-11-07 10:36:13 -05:00
# Run the scan on the configured export only when this file is executed as a
# script.
if __name__ == "__main__":
    readCsv(peopleCsv)