Started the reorganization of Scripts for GitHub. Some changes to notes.

2023-05-04 16:07:06 -04:00
parent 6a674ca24b
commit 18a266746b
209 changed files with 121 additions and 12181 deletions
--- a/Scripts/API_Tests/usercomparison.py
+++ b/Scripts/API_Tests/usercomparison.py
@ -0,0 +1,54 @@
+import csv
+from Levenshtein import distance as lev
+import pandas as pd
+import itertools
+import sys
+
+# Default learner export handed to readCsv, which loads its
+# "Learner Full Name" and "Email" columns.
+peopleCsv = "/Users/normrasmussen/Downloads/TalkspaceAllLearners.csv"
+
+
+def readCsv(peopleCsv):
+    """Load the learner export and pass its emails to the comparison step.
+
+    Only the "Learner Full Name" and "Email" columns are read from the CSV
+    at ``peopleCsv``. The "Email" column is converted to a plain list, and
+    the path, that list and the loaded frame are given to ``startCompare``.
+    """
+    export_frame = pd.read_csv(
+        peopleCsv,
+        skipinitialspace=True,
+        usecols=["Learner Full Name", "Email"],
+    )
+    email_list = export_frame["Email"].tolist()
+    startCompare(peopleCsv, email_list, export_frame)
+
+
+def startCompare(peopleCsv, people, readExport):
+    """Find email pairs that are exactly one edit apart and report them.
+
+    Every unordered pair drawn from ``people`` (via
+    ``itertools.combinations``) is compared with the Levenshtein distance.
+    Pairs at distance 1 are split across two parallel lists, which are
+    handed to ``writenewColumn`` together with ``peopleCsv`` and
+    ``readExport``.
+
+    Args:
+        peopleCsv: Path of the source CSV; only forwarded.
+        people: Email values to compare. Entries that are not strings
+            are skipped.
+        readExport: The loaded export; only forwarded.
+    """
+    # pandas represents blank cells as float NaN, which lev() cannot
+    # compare, so keep only real text before pairing.
+    emails = [entry for entry in people if isinstance(entry, str)]
+    email1 = []
+    email2 = []
+    for name1, name2 in itertools.combinations(emails, 2):
+        # The edit distance is never smaller than the length difference,
+        # so these pairs cannot be at distance 1; skip the costly call.
+        if abs(len(name1) - len(name2)) > 1:
+            continue
+        if lev(name1, name2) == 1:
+            email1.append(name1)
+            email2.append(name2)
+    writenewColumn(email1, email2, peopleCsv, readExport)
+
+
+def writenewColumn(
+    email1,
+    email2,
+    peopleCsv,
+    readExport,
+    outputCsv="/Users/normrasmussen/Downloads/TalkspaceDupes_singlechange.csv",
+):
+    """Write the near-duplicate email pairs next to the export as a CSV.
+
+    ``email1[i]`` and ``email2[i]`` go into new "Email1" and "Email2"
+    columns, aligned to the export's rows by index label. Rows are then
+    de-duplicated on "Email1" and on "Email2" (first occurrence kept) and
+    the result is written to ``outputCsv``.
+
+    Args:
+        email1: First email of each pair.
+        email2: Second email of each pair.
+        peopleCsv: Path of the source CSV; not used here.
+        readExport: DataFrame loaded from the source CSV; left unmodified.
+        outputCsv: Destination path of the report.
+    """
+    # Explicit dtype keeps empty pair lists from picking a numeric dtype.
+    first = pd.Series(email1, dtype="object")
+    second = pd.Series(email2, dtype="object")
+    print(readExport)
+    # Column assignment aligns on the index and silently discards labels
+    # the frame lacks, so extend the frame first in case there are more
+    # pairs than export rows. reindex returns a new frame, which also
+    # keeps the caller's readExport untouched.
+    labels = readExport.index.union(first.index).union(second.index)
+    df = readExport.reindex(labels)
+    df["Email1"] = first
+    df["Email2"] = second
+    # NaN counts as a value here, so all rows without a pair collapse
+    # into a single row.
+    # NOTE(review): these passes also drop later pairs that share an
+    # email with an earlier pair - confirm that is intended.
+    df.drop_duplicates("Email1", inplace=True)
+    df.drop_duplicates("Email2", inplace=True)
+    # A former third pass on ["Email1", "Email2"] discarded its result and
+    # is redundant anyway: once "Email1" is unique, every pair is unique.
+    df.to_csv(outputCsv)
+
+
+# Run the comparison on the default export only when executed as a script.
+if __name__ == "__main__":
+    readCsv(peopleCsv)