Files
Gainsight/Scripts/Auto_Update_Excel/walmart.py

97 lines
3.7 KiB
Python
Raw Normal View History

2022-10-31 10:47:43 -04:00
import os
from datetime import date
import glob
import re
2022-10-31 10:47:43 -04:00
import shutil
import pandas as pd
rootdir = "/Users/normrasmussen/Documents/Resources/Walmart/"
downloadir = (
"/Users/normrasmussen/Google Drive/My Drive/Shared with Clients/Walmart_Looker/"
)
2022-12-22 18:15:51 -05:00
basefile = "Walmart_Weekly_Base.xlsx"
2022-10-31 10:47:43 -04:00
def copytemplate(rootdir, basefile):
segments = ["Group1", "Group2", "Group3", "Group4", "Group5"]
2022-10-31 10:47:43 -04:00
today = date.today()
today = today.strftime("%m.%d.%Y")
2022-12-22 18:15:51 -05:00
template = rootdir + basefile
listfiles = glob.glob(downloadir + "WeeklyMCA*.csv")
2022-12-22 18:15:51 -05:00
if os.path.exists(rootdir):
for segment in segments:
todayFile = f"Walmart-{segment}-{today}.xlsx"
shutil.copy2(template, rootdir + todayFile)
currentDash = rootdir + f"Walmart-{segment}-{today}.xlsx"
for latestdownload in listfiles:
if re.search('(?:Group1)', currentDash) and re.search('(?:Group1)', latestdownload):
copytoDash(latestdownload, currentDash)
elif re.search('(?:Group2)', currentDash) and re.search('(?:Group2)', latestdownload):
copytoDash(latestdownload, currentDash)
elif re.search('(?:Group3)', currentDash) and re.search('(?:Group3)', latestdownload):
copytoDash(latestdownload, currentDash)
elif re.search('(?:Group4)', currentDash) and re.search('(?:Group4)', latestdownload):
copytoDash(latestdownload, currentDash)
elif re.search('(?:Group5)', currentDash) and re.search('(?:Group5)', latestdownload):
copytoDash(latestdownload, currentDash)
else:
print("No matching Looker Pulls or Templates Found!")
# TODO: Update these if statements to loop through the groups in a better fashion
2022-10-31 10:47:43 -04:00
2022-12-22 18:15:51 -05:00
2022-10-31 10:47:43 -04:00
def copytoDash(latestdownload, currentDash):
readExport = pd.read_csv(
2022-12-22 18:15:51 -05:00
latestdownload,
index_col=False,
header=0,
low_memory=False,
# dtype={"Progress": float},
)
2022-10-31 15:36:34 -04:00
print(readExport)
readExport["Progress"] = readExport["Progress"].str[:-1].apply(pd.to_numeric)
2022-12-22 18:15:51 -05:00
# readExport['Progress'].apply(pd.to_numeric, errors='ignore')
print(readExport["Progress"])
2022-12-22 18:15:51 -05:00
readExport.drop(readExport.filter(regex="Unname"), axis=1, inplace=True)
2022-10-31 10:47:43 -04:00
copiedData = readExport.copy()
2022-10-31 15:36:34 -04:00
bringtoExcel(latestdownload, currentDash, copiedData)
2022-10-31 10:47:43 -04:00
2022-12-22 18:15:51 -05:00
2022-10-31 15:36:34 -04:00
def bringtoExcel(latestdownload, currentDash, copiedData):
2022-10-31 10:47:43 -04:00
with pd.ExcelWriter(
2022-12-22 18:15:51 -05:00
currentDash,
mode="a",
engine="openpyxl",
if_sheet_exists="overlay",
# engine_kwargs={'options': {'strings_to_numbers': True}}
2022-10-31 10:47:43 -04:00
) as writer:
copiedData.to_excel(
2022-12-22 18:15:51 -05:00
writer,
engine="xlsxwriter",
sheet_name="Data",
index=False,
)
2022-10-31 10:47:43 -04:00
def cleanitUp(currentDash):
cleanExcel = pd.read_excel(currentDash, sheet_name="Data", index_col=None)
cleanExcel.columns.values[0] = "tmp"
cleanExcel.drop(columns="tmp", axis=1, inplace=True)
print(cleanExcel)
2022-12-22 18:15:51 -05:00
2022-10-31 10:47:43 -04:00
if __name__ == "__main__":
copytemplate(rootdir, basefile)
# TODO: Dataset is too large. Add the math from the excel into the script so that the dataset references the final data and a much smaller set. Commented below is the math as far as I can tell.
"""
Math:
Courses:
Enrolled: For each unique course name, count number of "Enrolled" (Col. C) fields if != null.
Started: For each unique course name, count number of "Attempt Start" (Col. D) fields if != null.
Completed: For each unique course name, count number of "Progress" (Col. G) fields if == 100%
Activity Completions:
Started first activity == Started number above in Course
Activity >= 1 Completions:
"""