"""Screenshot every page of a Northpass course and bundle the shots into a PDF.

The course id is read from the first command-line argument.  The script loads
the course page, takes a full-page screenshot of it and of every link found
in the course outline, merges the screenshots into one PDF per course title
under ``PDFs/`` and finally deletes the PNG screenshots from ``DIR``.
"""

import glob
import os
import re
import sys
from datetime import date

from PIL import Image
from playwright.sync_api import Playwright, expect, sync_playwright

BASEURL = "https://walmart.northpass.com/app/courses/"
# NOTE(review): this uid is appended to the course URL as a query parameter;
# if it identifies a real account it probably should not be hard-coded here.
USERID = "?uid=7beg87y4-fh24-4929-3rt5-24kdn87s5241"
COURSEID = sys.argv[1]
DIR = "./"

# Screenshots are written as "<title>_<page number>.png".  The greedy title
# group keeps any underscores that belong to the title itself, and DOTALL lets
# the title contain any character the file system accepted.
_SCREENSHOT_NAME = re.compile(r"^(?P<title>.*)_(?P<page>\d+)\.png$", re.DOTALL)


def run(playwright: Playwright):
    """Screenshot the course landing page and every page linked from its outline.

    Screenshots are saved in ``DIR`` as ``-<course title>_<n>.png`` where ``n``
    is 0 for the landing page and 1, 2, ... for the outline links in order.

    Args:
        playwright: An active Playwright instance used to launch WebKit.
    """
    print("running")
    browser = playwright.webkit.launch()
    try:
        context = browser.new_context(viewport={"width": 390, "height": 844})
        page = context.new_page()

        # Load the title page and read the course name from its header.
        page.goto(f"{BASEURL}{COURSEID}{USERID}")
        course_name = page.locator(".header-title").evaluate(
            "node => node.innerText"
        )
        print(course_name)
        # The title becomes part of a file name, so path separators in it
        # would otherwise point the screenshot at a different directory.
        file_stem = re.sub(r"[\\/]", "_", course_name)

        # Collect all hrefs from the course outline before navigating away.
        course_links = page.locator(
            ".np-course-outline-content-section a"
        ).evaluate_all("elements => elements.map(el => el.href)")

        page.screenshot(
            path=os.path.join(DIR, f"-{file_stem}_0.png"), full_page=True
        )
        print("Course outline links:")
        for screen_num, link in enumerate(course_links, start=1):
            print(f"{screen_num}. {link}")
            page.goto(link)
            page.screenshot(
                path=os.path.join(DIR, f"-{file_stem}_{screen_num}.png"),
                full_page=True,
            )
    finally:
        # Close the browser even when navigation or a screenshot fails.
        browser.close()


def find_pictures(DIR):
    """Collect the PNG file names in ``DIR`` and turn them into PDFs.

    Args:
        DIR: Directory that holds the screenshots.
    """
    # Only the base names are passed on; the directory travels separately.
    names = [
        os.path.basename(path) for path in glob.glob(os.path.join(DIR, "*.png"))
    ]
    split_resources(names, DIR)


def split_resources(files, DIR):
    """Group screenshot file names by title and build one PDF per group.

    Names are expected to look like ``<title>_<page number>.png``.  Pages are
    ordered by their numeric suffix, so ``_10`` follows ``_9`` instead of
    sorting between ``_1`` and ``_2``.  A name without a numeric suffix forms
    its own single-page group so that it still ends up in a PDF.

    Args:
        files: Screenshot file names (without directory).
        DIR: Directory that holds the screenshots.
    """
    groups = {}
    for name in files:
        match = _SCREENSHOT_NAME.match(name)
        if match:
            title = match.group("title")
            page_number = int(match.group("page"))
        else:
            title = os.path.splitext(name)[0]
            page_number = 0
        groups.setdefault(title, []).append((page_number, name))

    for title in sorted(groups):
        ordered = [name for _, name in sorted(groups[title])]
        process_pictures(ordered, title, DIR)


def process_pictures(new_list, resource_title, DIR):
    """Merge the given screenshots into ``PDFs/<title>_<MM.DD.YYYY>.pdf``.

    Args:
        new_list: Screenshot file names, in page order, relative to ``DIR``.
        resource_title: Title used for the PDF file name; ``?`` is removed.
        DIR: Directory that holds the screenshots and the ``PDFs`` folder.
    """
    if not new_list:
        return

    resource_title = re.sub(r"[?]", "", resource_title)
    today = date.today().strftime("%m.%d.%Y")

    pages = []
    for picture in new_list:
        # convert() returns an independent RGB copy, so the source file can be
        # closed straight away instead of staying open until exit.
        with Image.open(os.path.join(DIR, picture)) as image:
            pages.append(image.convert("RGB"))

    pdf_dir = os.path.join(DIR, "PDFs")
    # The save below fails if the output folder is missing.
    os.makedirs(pdf_dir, exist_ok=True)
    pages[0].save(
        os.path.join(pdf_dir, f"{resource_title}_{today}.pdf"),
        save_all=True,
        append_images=pages[1:],
    )


def delete_originals(DIR):
    """Delete every PNG file in ``DIR``, reporting files that cannot be removed.

    Args:
        DIR: Directory whose ``*.png`` files are deleted.
    """
    for path in glob.glob(os.path.join(DIR, "*.png")):
        try:
            os.remove(path)
        except OSError as e:
            # os.remove signals failures (missing file, permissions) as OSError.
            print("Error!")
            print(e)
    print("All Done")


with sync_playwright() as playwright:
    run(playwright)
find_pictures(DIR)
delete_originals(DIR)