# File: api_scraper/apiscraper/spiders/dev_spider.py
# (85 lines, 3.2 KiB, Python; last changed 2023-02-20 14:41:34 -05:00)
from pathlib import Path
import scrapy
from scrapy_playwright.page import PageMethod
from apiscraper.items import EndpointItems
class ApiSpider(scrapy.Spider):
    """Scrape endpoint details from the Northpass API reference pages.

    Every request is issued with the scrapy-playwright meta flags and waits
    for the page's navigation bar before the response is handed to ``parse``.
    ``parse`` yields one dict per endpoint page and then follows the
    navigation link to the next reference page.
    """

    name = "api"

    # Navigation bar the page methods wait for before returning the response.
    _NAV_XPATH = '//*[@id="content"]/div[4]/nav'

    def _playwright_request(self, url):
        """Return a Playwright-enabled request for *url*.

        The response is routed to ``parse`` (Scrapy's default callback) and
        failures are routed to ``errback``.
        """
        return scrapy.Request(
            url,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('wait_for_selector', self._NAV_XPATH),
                ],
            ),
            # errback is an argument of Request, not a meta key. Placed
            # inside ``meta`` it is never called and failed requests leave
            # their Playwright page open.
            errback=self.errback,
        )

    def start_requests(self):
        """Yield the initial request(s) that seed the crawl."""
        urls = [
            "https://developers.northpass.com/reference/get_v2-activities"
        ]
        for url in urls:
            yield self._playwright_request(url)

    async def parse(self, response):
        """Extract the endpoint shown on the page and queue the next page.

        Yields:
            A dict of the form ``{title: {'method', 'endpoint', 'req_param',
            'inclusions'}}`` for each ``#Explorer`` element, an empty
            ``EndpointItems`` after it, and finally a request for the next
            reference page when a next link is present.
        """
        page = response.meta["playwright_page"]
        # Everything below reads from ``response`` only, so the browser page
        # can be released before extraction starts.
        await page.close()

        northpass_endpoints = {}
        end_item = EndpointItems()
        end_item['inclusions'] = []
        end_item['req_param'] = []

        for _ in response.xpath('//*[@id="Explorer"]'):
            end_item['title'] = response.xpath(
                '//*[@id="content"]/header[1]/div[1]/h1/text()'
            ).get()
            end_item['method'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[1]/text()'
            ).get()
            end_item['endpoint'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]'
            ).xpath("@title").extract()
            end_item['req_param'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]/label/text()'
            ).get()

            # NOTE(review): this id is specific to a single endpoint page, so
            # on other pages ``required`` is presumably always None --
            # confirm whether a generic "path-*" match was intended.
            required = response.xpath(
                '//*[@id="path-getV2GroupsGroup_uuidMemberships"]'
                '/div/div[1]/div/div[2]/text()'
            ).get()
            # Always bind req_param: previously a non-None marker without the
            # word "required" left it undefined (UnboundLocalError below).
            if required is not None and "required" in required:
                req_param = end_item['req_param']
            else:
                req_param = ""

            inclusion = response.xpath(
                '//*[starts-with(@id,"query")]/div/p/code/'
                'span[position() >= 1 and not(position() > 15)]/text()'
            ).get()
            # Guard the extracted value rather than the list (which is never
            # None) so a missing inclusion is not recorded as ``None``.
            if inclusion is not None:
                end_item['inclusions'].append(inclusion)
            inclusions = end_item['inclusions']

            northpass_endpoints = {
                end_item['title']: {
                    'method': end_item['method'],
                    'endpoint': end_item['endpoint'],
                    'req_param': req_param,
                    'inclusions': inclusions,
                }
            }
            yield northpass_endpoints
            # NOTE(review): this emits an empty item alongside the dict;
            # kept for output compatibility -- confirm it is intentional.
            yield EndpointItems()

        self.log(northpass_endpoints)

        next_page = response.xpath(
            '//*[@id="content"]/div[4]/nav/a[2]/@href'
        ).get()
        if next_page is not None:
            yield self._playwright_request(response.urljoin(next_page))

    async def errback(self, failure):
        """Close the Playwright page attached to a failed request, if any."""
        # The page is absent from meta when the failure occurred before it
        # was attached, so look it up defensively instead of indexing.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()