2024-10-11 16:50:08 -04:00
|
|
|
from html.parser import HTMLParser
|
2024-10-16 17:15:05 -04:00
|
|
|
import json
|
|
|
|
|
import pandas as pd
|
2024-10-11 16:50:08 -04:00
|
|
|
from pathlib import Path
|
2024-10-15 17:02:33 -04:00
|
|
|
from bs4 import BeautifulSoup
|
2024-10-16 17:15:05 -04:00
|
|
|
import pprint
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
|
|
# from collections import Iterable
|
2024-10-11 16:50:08 -04:00
|
|
|
|
|
|
|
|
# MASDICT = {'file': "", 'attributes' : { 'id': [], 'class': [] } }
|
2024-10-16 17:15:05 -04:00
|
|
|
# Shared pretty-printer, handy for dumping the collected data while debugging.
pp = pprint.PrettyPrinter(indent=4)

# Module-level result containers; both start out empty.
MASDICT = {}
# NOTE: despite the name this is a dict, not a list.  main() fills it via
# ``MASLIST.update(...)`` with one entry per template key.
MASLIST = {}

# Alias for the HTMLParser *class* itself (no instance is created here).
PARS = HTMLParser
|
|
|
|
|
|
2024-10-15 17:02:33 -04:00
|
|
|
|
2024-10-16 17:15:05 -04:00
|
|
|
# Matches a Liquid output tag.  The greedy ``.*`` is kept on purpose so the
# captured text is unchanged: with several tags on one line the match runs
# from the first "{{" to the last "}}".
_LIQUID_OUTPUT_RE = re.compile(r"{{(.*)}}")

# Literal text that marks a line as containing a Liquid include tag.
_INCLUDE_MARKER = "{% include"


def _liquid_outputs(lines):
    """Return the ``{{ ... }}`` span of every line that contains one."""
    matches = (_LIQUID_OUTPUT_RE.search(line) for line in lines)
    return [match.group(0) for match in matches if match]


def _include_targets(lines):
    """Return the include target of every line containing ``{% include``.

    The target is the first whitespace-separated token that follows the
    marker, with double quotes removed.  Looking *after* the marker (instead
    of at a fixed token position in the whole line) keeps the result correct
    when other markup precedes the tag, and lines with nothing after the
    marker are skipped rather than raising ``IndexError``.
    """
    targets = []
    for line in lines:
        _, marker, rest = line.partition(_INCLUDE_MARKER)
        if not marker:
            continue
        tokens = rest.split()
        if tokens:
            targets.append(tokens[0].replace('"', ""))
    return targets


def main():
    """Scan every ``*.html.liquid`` template and export what it references.

    For each template under the hard-coded root directory this collects:

    * ``includes`` - targets of ``{% include ... %}`` tags,
    * ``liquid``   - ``{{ ... }}`` output expressions,
    * ``class``    - CSS class names containing the substring ``"np"``,
    * ``id``       - element ids,

    stores them in the module-level ``MASLIST`` dict and finally hands
    ``{"data": MASLIST}`` to ``to_pandas``.
    """
    root = Path(
        "/Users/normrasmussen/Documents/Work/Custom_Templates/current_templates_2-15-2024/"
    )

    for template in root.glob("**/*.html.liquid"):
        # First path component below the root (a sub-directory name, or the
        # file name for templates sitting directly in the root).
        # NOTE(review): templates sharing a top-level directory share a key,
        # so later files overwrite earlier ones - confirm this is intended.
        key = template.relative_to(root).parts[0]

        # Read the file once and close it promptly; the same text feeds both
        # the line-based Liquid scan and the HTML parser.
        with template.open("r", encoding="utf-8") as handle:
            lines = handle.readlines()

        soup = BeautifulSoup("".join(lines), "html.parser")
        nodes = soup.find_all()

        # Beautiful Soup returns multi-valued attributes such as ``class`` as
        # lists, hence the flatten() before filtering.
        class_names = flatten(
            [node["class"] for node in nodes if node.has_attr("class")]
        )
        class_names = [name for name in class_names if "np" in name]
        ids = flatten([node["id"] for node in nodes if node.has_attr("id")])

        MASLIST.update(
            {
                key: {
                    "includes": _include_targets(lines),
                    "liquid": _liquid_outputs(lines),
                    "class": class_names,
                    "id": ids,
                }
            }
        )

    to_pandas({"data": MASLIST})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def flatten(xs):
    """Return a new flat list with every nested ``list`` in *xs* expanded.

    Only ``list`` instances are descended into; any other element (strings,
    tuples, numbers, ...) is kept as a single item.  Order is preserved.
    """

    def _walk(items):
        # Depth-first traversal that yields leaves in their original order.
        for item in items:
            if isinstance(item, list):
                yield from _walk(item)
            else:
                yield item

    return list(_walk(xs))
|
|
|
|
|
|
|
|
|
|
def to_pandas(obj, *, csv_path='/Users/normrasmussen/Downloads/example_liquid.csv'):
    """Tabulate the collected template data, print it and save it as CSV.

    Args:
        obj: Mapping with a ``"data"`` key whose value maps a template key to
            a dict of column values (as built by ``main``).
        csv_path: Destination of the CSV file.  Defaults to the location the
            script has always written to, so existing callers are unaffected.

    The resulting frame has one row per template key, with the key exposed
    in a ``filename`` column.  The frame's integer index is written to the
    CSV as well (first, unnamed column).
    """
    results = obj['data']

    # orient='index' turns each outer key into a row; reset_index() then
    # moves those keys out of the index into a regular column.
    df = pd.DataFrame.from_dict(results, orient='index')
    df = df.reset_index().rename(columns={"index": "filename"})

    print(df)
    df.to_csv(csv_path)
|
2024-10-11 16:50:08 -04:00
|
|
|
|
2024-10-16 17:15:05 -04:00
|
|
|
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|