from pathlib import Path

import scrapy
from scrapy_playwright.page import PageMethod

from apiscraper.items import EndpointItems


class ApiSpider(scrapy.Spider):
    """Scrape endpoint metadata from rendered API reference pages.

    Each page is rendered through Playwright. ``parse`` extracts the title,
    HTTP method, endpoint and query inclusions from the page, yields them as
    a nested dict keyed by title, and then follows the "next" link found in
    the pagination nav.
    """

    name = "api"

    # XPath of the pagination nav. Playwright waits for it before handing the
    # page to Scrapy, and ``parse`` reads the next-page link from inside it.
    _NAV_XPATH = '//*[@id="content"]/div[4]/nav'

    def _playwright_request(self, url):
        """Build a Playwright-rendered request for *url*.

        ``errback`` is passed as a ``Request`` argument (not as a ``meta``
        entry) so that Scrapy actually calls it when the request fails.
        """
        return scrapy.Request(
            url=url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod('wait_for_selector', self._NAV_XPATH),
                ],
            },
            errback=self.errback,
        )

    def start_requests(self):
        """Yield the initial Playwright requests for the seed URLs."""
        urls = [
            "https://developers.northpass.com/reference/get_v2-activities",
        ]
        for url in urls:
            yield self._playwright_request(url)

    async def parse(self, response):
        """Extract endpoint data from *response* and follow the next page.

        Yields:
            A dict of the form
            ``{title: {'method': ..., 'endpoint': ..., 'inclusions': [...]}}``
            for each ``#Explorer`` element found, followed by a request for
            the next page when a next-page link is present.
        """
        # The rendered HTML is already captured in ``response``; close the
        # Playwright page right away so it is not left open.
        page = response.meta["playwright_page"]
        await page.close()

        northpass_endpoints = {}
        inclusions = []

        # The XPaths below are evaluated against the whole response, not
        # relative to the matched element; the loop only gates extraction on
        # the presence of an ``#Explorer`` element.
        for _explorer in response.xpath('//*[@id="Explorer"]'):
            title = response.xpath(
                '//*[@id="content"]/header[1]/div[1]/h1/text()'
            ).get()
            method = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[1]/text()'
            ).get()
            endpoint = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]/text()[2]'
            ).get()
            inclusion = response.xpath(
                '//*[starts-with(@id,"query")]/div/p/code/'
                'span[position() >= 1 and not(position() > 15)]/text()'
            ).get()
            # ``.get()`` returns None when nothing matches; do not store that
            # placeholder in the inclusions list.
            if inclusion is not None:
                inclusions.append(inclusion)
            # TODO: path-parameter ("params") extraction is not implemented.

            northpass_endpoints = {
                title: {
                    'method': method,
                    'endpoint': endpoint,
                    'inclusions': inclusions,
                }
            }
            yield northpass_endpoints

        self.log(northpass_endpoints)

        next_page = response.xpath(f'{self._NAV_XPATH}/a[2]/@href').get()
        self.log(f"The next page is {next_page}")
        if next_page is not None:
            yield self._playwright_request(response.urljoin(next_page))

    async def errback(self, failure):
        """Close the Playwright page attached to a failed request, if any."""
        # The page key may be absent when the request failed before a page
        # was attached to it, so look it up defensively.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()