2022-10-31 10:47:43 -04:00
import os
from datetime import date
import glob
2023-02-07 18:07:54 -05:00
import re
2022-10-31 10:47:43 -04:00
import shutil
import pandas as pd
rootdir = " /Users/normrasmussen/Documents/Resources/Walmart/ "
2023-02-01 18:11:39 -05:00
downloadir = (
" /Users/normrasmussen/Google Drive/My Drive/Shared with Clients/Walmart_Looker/ "
)
2022-12-22 18:15:51 -05:00
basefile = " Walmart_Weekly_Base.xlsx "
2022-10-31 10:47:43 -04:00
def copytemplate ( rootdir , basefile ) :
2023-02-07 18:07:54 -05:00
segments = [ " Group1 " , " Group2 " , " Group3 " , " Group4 " , " Group5 " ]
2022-10-31 10:47:43 -04:00
today = date . today ( )
today = today . strftime ( " % m. %d . % Y " )
2022-12-22 18:15:51 -05:00
template = rootdir + basefile
2023-02-07 18:07:54 -05:00
listfiles = glob . glob ( downloadir + " WeeklyMCA*.csv " )
2022-12-22 18:15:51 -05:00
2023-02-07 18:07:54 -05:00
if os . path . exists ( rootdir ) :
for segment in segments :
todayFile = f " Walmart- { segment } - { today } .xlsx "
shutil . copy2 ( template , rootdir + todayFile )
currentDash = rootdir + f " Walmart- { segment } - { today } .xlsx "
for latestdownload in listfiles :
if re . search ( ' (?:Group1) ' , currentDash ) and re . search ( ' (?:Group1) ' , latestdownload ) :
copytoDash ( latestdownload , currentDash )
elif re . search ( ' (?:Group2) ' , currentDash ) and re . search ( ' (?:Group2) ' , latestdownload ) :
copytoDash ( latestdownload , currentDash )
elif re . search ( ' (?:Group3) ' , currentDash ) and re . search ( ' (?:Group3) ' , latestdownload ) :
copytoDash ( latestdownload , currentDash )
elif re . search ( ' (?:Group4) ' , currentDash ) and re . search ( ' (?:Group4) ' , latestdownload ) :
copytoDash ( latestdownload , currentDash )
elif re . search ( ' (?:Group5) ' , currentDash ) and re . search ( ' (?:Group5) ' , latestdownload ) :
copytoDash ( latestdownload , currentDash )
else :
print ( " No matching Looker Pulls or Templates Found! " )
# TODO: Update these if statements to loop through the groups in a better fashion
2022-10-31 10:47:43 -04:00
2022-12-22 18:15:51 -05:00
2022-10-31 10:47:43 -04:00
def copytoDash ( latestdownload , currentDash ) :
2022-11-01 15:19:04 -04:00
readExport = pd . read_csv (
2022-12-22 18:15:51 -05:00
latestdownload ,
index_col = False ,
header = 0 ,
low_memory = False ,
# dtype={"Progress": float},
)
2022-10-31 15:36:34 -04:00
print ( readExport )
2023-02-01 18:11:39 -05:00
readExport [ " Progress " ] = readExport [ " Progress " ] . str [ : - 1 ] . apply ( pd . to_numeric )
2022-12-22 18:15:51 -05:00
# readExport['Progress'].apply(pd.to_numeric, errors='ignore')
2023-02-01 18:11:39 -05:00
print ( readExport [ " Progress " ] )
2022-12-22 18:15:51 -05:00
readExport . drop ( readExport . filter ( regex = " Unname " ) , axis = 1 , inplace = True )
2022-10-31 10:47:43 -04:00
copiedData = readExport . copy ( )
2022-10-31 15:36:34 -04:00
bringtoExcel ( latestdownload , currentDash , copiedData )
2022-10-31 10:47:43 -04:00
2022-12-22 18:15:51 -05:00
2022-10-31 15:36:34 -04:00
def bringtoExcel ( latestdownload , currentDash , copiedData ) :
2022-10-31 10:47:43 -04:00
with pd . ExcelWriter (
2022-12-22 18:15:51 -05:00
currentDash ,
mode = " a " ,
engine = " openpyxl " ,
if_sheet_exists = " overlay " ,
# engine_kwargs={'options': {'strings_to_numbers': True}}
2022-10-31 10:47:43 -04:00
) as writer :
copiedData . to_excel (
2022-12-22 18:15:51 -05:00
writer ,
engine = " xlsxwriter " ,
sheet_name = " Data " ,
index = False ,
)
2022-10-31 10:47:43 -04:00
def cleanitUp ( currentDash ) :
cleanExcel = pd . read_excel ( currentDash , sheet_name = " Data " , index_col = None )
cleanExcel . columns . values [ 0 ] = " tmp "
cleanExcel . drop ( columns = " tmp " , axis = 1 , inplace = True )
print ( cleanExcel )
2022-12-22 18:15:51 -05:00
2022-10-31 10:47:43 -04:00
if __name__ == " __main__ " :
copytemplate ( rootdir , basefile )
2023-02-07 18:07:54 -05:00
# TODO: Dataset is too large. Add the math from the excel into the script so that the dataset references the final data and a much smaller set. Commented below is the math as far as I can tell.
"""
Math :
Courses :
Enrolled : For each unique course name , count number of " Enrolled " ( Col . C ) fields if != null .
Started : For each unique course name , count number of " Attempt Start " ( Col . D ) fields if != null .
Completed : For each unique course name , count number of " Progress " ( Col . G ) fields if == 100 %
Activity Completions :
Started first activity == Started number above in Course
Activity > = 1 Completions :
"""