Gainsight/Scripts/API_Tests/usercomparison.py

import csv
from Levenshtein import distance as lev
import pandas as pd
import itertools
import sys

# Default learner export that readCsv is given when this file runs as a script.
# The file must contain the "Learner Full Name" and "Email" columns.
peopleCsv = "/Users/normrasmussen/Downloads/TalkspaceAllLearners.csv"


def readCsv(peopleCsv):
    """Load the learner export and start the email comparison.

    Only the "Learner Full Name" and "Email" columns are read from the CSV
    at ``peopleCsv``. The email column is flattened to a plain list and
    handed, together with the path and the loaded frame, to
    ``startCompare``.

    Args:
        peopleCsv: Path of the CSV export to read.

    Returns:
        None. The results are produced further down the call chain.
    """
    export_frame = pd.read_csv(
        peopleCsv,
        skipinitialspace=True,
        usecols=["Learner Full Name", "Email"],
    )
    # The comparison works on a plain Python list of the email values.
    email_list = export_frame["Email"].tolist()
    startCompare(peopleCsv, email_list, export_frame)


def startCompare(peopleCsv, people, readExport):
    """Collect every pair of entries whose edit distance is exactly one.

    Each unordered pair from ``people`` (via ``itertools.combinations``) is
    scored with ``lev``. Pairs with a distance strictly between 0 and 2 are
    kept, and the two sides of the kept pairs are passed as parallel lists to
    ``writenewColumn``.

    Args:
        people: Sequence of values to compare pairwise.
        peopleCsv: Forwarded unchanged to ``writenewColumn``.
        readExport: Forwarded unchanged to ``writenewColumn``.

    Returns:
        None.
    """
    near_pairs = [
        (left, right)
        for left, right in itertools.combinations(people, 2)
        if 0 < lev(left, right) < 2
    ]
    # Split the kept pairs back into two position-aligned lists.
    first_side = [left for left, _ in near_pairs]
    second_side = [right for _, right in near_pairs]
    writenewColumn(first_side, second_side, peopleCsv, readExport)


def writenewColumn(email1, email2, peopleCsv, readExport):
    """Attach the near-duplicate pairs to the export and write it to CSV.

    ``email1`` and ``email2`` are parallel lists: position ``i`` of each
    forms one flagged pair. They are added as the "Email1" and "Email2"
    columns, rows are de-duplicated on each of those columns in turn, and the
    result is written to the fixed ``TalkspaceDupes_singlechange.csv`` path.

    Args:
        email1: First member of each flagged pair.
        email2: Second member of each flagged pair.
        peopleCsv: Path of the source export; not used by this function.
        readExport: Frame (or frame-like data) holding the loaded export.

    Returns:
        None.
    """
    # Work on a copy so the caller's frame does not gain the extra columns.
    df = pd.DataFrame(readExport).copy()
    print(df)

    # Assigning a Series aligns on the index, so pairs positioned past the
    # last existing row would be dropped silently. Grow the frame first when
    # there are more pairs than rows; the added rows have empty learner data.
    pair_count = max(len(email1), len(email2))
    if pair_count > len(df):
        df = df.reset_index(drop=True).reindex(range(pair_count))

    df["Email1"] = pd.Series(email1)
    df["Email2"] = pd.Series(email2)

    # Keep the first row for each distinct value in each pair column. Missing
    # values compare as duplicates of one another, so rows without a pair
    # collapse to the first such row. Once Email1 is unique, every
    # (Email1, Email2) combination is unique too, so no combined pass is needed.
    df = df.drop_duplicates("Email1")
    df = df.drop_duplicates("Email2")

    df.to_csv(
        "/Users/normrasmussen/Downloads/TalkspaceDupes_singlechange.csv",
    )


# Run the comparison on the default export only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    readCsv(peopleCsv)
small changes 2022-10-27 17:07:07 -04:00			`import csv`
			`from Levenshtein import distance as lev`
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`import pandas as pd`
			`import itertools`
small changes 2022-10-27 17:07:07 -04:00			`import sys`

DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`peopleCsv = "/Users/normrasmussen/Downloads/TalkspaceAllLearners.csv"`
small changes 2022-10-27 17:07:07 -04:00
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`def readCsv(peopleCsv):`
			`people = []`
			`readExport = pd.read_csv(`
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00			`peopleCsv,`
			`usecols=["Learner Full Name", "Email"],`
			`skipinitialspace=True,`
			`# index_col=True,`
			`)`
			`people.extend(readExport["Email"].tolist())`
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`startCompare(peopleCsv, people, readExport)`
small changes 2022-10-27 17:07:07 -04:00
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`# itertools combinations`
			`def startCompare(peopleCsv, people, readExport):`
			`email1 = []`
			`email2 = []`
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00			`for (`
			`name1,`
			`name2,`
			`) in itertools.combinations(people, 2):`
			`# print(name1, name2) - prints all pairs, working so far.`
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`distance = lev(name1, name2)`
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00			`# print(distance) - successfully returns numbers`
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`if distance > 0 and distance < 2:`
			`email1.append(name1)`
			`email2.append(name2)`
			`writenewColumn(email1, email2, peopleCsv, readExport)`

Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`def writenewColumn(email1, email2, peopleCsv, readExport):`
			`df = pd.DataFrame(readExport)`
			`print(df)`
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00			`df["Email1"] = pd.Series(email1)`
			`df["Email2"] = pd.Series(email2)`
			`df.drop_duplicates("Email1", inplace=True)`
			`df.drop_duplicates("Email2", inplace=True)`
			`df.drop_duplicates(subset=["Email1", "Email2"])`
			`# keep = 'last').reset_index(drop=True)`
DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00			`writeLst = df.to_csv(`
Walmart script in production! And more notes. 2023-02-01 18:11:39 -05:00			`"/Users/normrasmussen/Downloads/TalkspaceDupes_singlechange.csv",`
			`)`

DV,Walmart, API Tests 2022-11-07 10:36:13 -05:00
			`if __name__ == "__main__":`
			`readCsv(peopleCsv)`