"""Inventory Liquid tags, includes, CSS classes and ids used by HTML templates.

Walks a directory tree of ``*.html.liquid`` files, collects a few details
about each template, and writes the result to a CSV file through pandas.
"""

from html.parser import HTMLParser
import json
import pprint
import re
from pathlib import Path

import pandas as pd

pp = pprint.PrettyPrinter(indent=4)
MASDICT = {}
MASLIST = {}
PARS = HTMLParser

# Defaults keep the script's original hard-coded locations.
_DEFAULT_TEMPLATE_DIR = (
    "/Users/normrasmussen/Documents/Work/Custom_Templates/"
    "current_templates_2-15-2024/"
)
_DEFAULT_CSV_PATH = "/Users/normrasmussen/Downloads/example_liquid.csv"

# Non-greedy so that two tags on one line ("{{ a }} x {{ b }}") are reported
# as two matches instead of one span running from the first "{{" to the last
# "}}".  "." never matches a newline, so a match cannot cross lines.
_LIQUID_OUTPUT_RE = re.compile(r"\{\{.*?\}\}")

# Captures the token that follows "{% include", wherever the tag sits in the
# line.  Only spaces/tabs are allowed in between so a match stays on one line.
_INCLUDE_RE = re.compile(r"\{% include[ \t]+(\S+)")


def _liquid_outputs(text):
    """Return every ``{{ ... }}`` expression in *text*, in document order."""
    return _LIQUID_OUTPUT_RE.findall(text)


def _include_targets(text):
    """Return the target of every ``{% include ... %}`` tag in *text*.

    Double quotes are removed from each target; nothing else is stripped.
    """
    return [target.replace('"', "") for target in _INCLUDE_RE.findall(text)]


def main(template_dir=_DEFAULT_TEMPLATE_DIR, csv_path=_DEFAULT_CSV_PATH):
    """Scan all ``*.html.liquid`` files below *template_dir* and export a CSV.

    For each template the following is stored in the module-level ``MASLIST``
    dictionary: the include targets, the ``{{ ... }}`` expressions, the
    ``class`` values containing the substring ``"np"``, and all ``id`` values.
    The collected data is then handed to :func:`to_pandas`.

    Args:
        template_dir: Root directory that is searched recursively.
        csv_path: Destination file passed on to :func:`to_pandas`.
    """
    # Imported here rather than at module level so that flatten() and
    # to_pandas() remain importable where beautifulsoup4 is not installed.
    from bs4 import BeautifulSoup

    root = Path(template_dir)
    # Sorted so the row order (and the winner of a key collision, see below)
    # does not depend on the order the file system happens to return.
    for template in sorted(root.glob("**/*.html.liquid")):
        # Read once and close the handle; the text feeds both the regex
        # scans and the HTML parser.
        with open(template, "r", encoding="utf-8") as handle:
            text = handle.read()

        soup = BeautifulSoup(text, "html.parser")
        tags = soup.find_all()
        classes = flatten([tag["class"] for tag in tags if tag.has_attr("class")])
        ids = flatten([tag["id"] for tag in tags if tag.has_attr("id")])

        # First path component below the root: a sub-directory name, or the
        # file name for templates that sit directly in the root.
        # NOTE(review): templates sharing a first-level directory share this
        # key, so a later one replaces an earlier one - confirm that this is
        # the intended granularity.
        key = template.relative_to(root).parts[0]

        MASLIST[key] = {
            "includes": _include_targets(text),
            "liquid": _liquid_outputs(text),
            "class": [name for name in classes if "np" in name],
            "id": ids,
        }

    to_pandas({"data": MASLIST}, csv_path)


def flatten(xs):
    """Return a flat list with the items of arbitrarily nested lists in *xs*.

    Only ``list`` instances are descended into; every other value (strings,
    tuples, ...) is kept as a single item.  Order is preserved.
    """
    flat_list = []
    for x in xs:
        if isinstance(x, list):
            flat_list.extend(flatten(x))
        else:
            flat_list.append(x)
    return flat_list


def to_pandas(obj, csv_path=_DEFAULT_CSV_PATH):
    """Print the collected data as a DataFrame and write it to *csv_path*.

    Args:
        obj: Mapping whose ``"data"`` entry maps a name to a dict of column
            values; each name becomes one row.
        csv_path: File the CSV is written to.
    """
    results = obj["data"]
    df = pd.DataFrame.from_dict(results, orient="index")
    # Turn the index (the dictionary keys) into a regular "filename" column.
    df = df.reset_index().rename(columns={"index": "filename"})
    print(df)
    df.to_csv(csv_path)


if __name__ == "__main__":
    main()