# api_scraper/apiscraper/spiders/dev_spider.py

from pathlib import Path
import scrapy
from scrapy_playwright.page import PageMethod
from apiscraper.items import EndpointItems

class ApiSpider(scrapy.Spider):
    """Spider that walks the Northpass API reference pages.

    Every request is sent with the scrapy-playwright meta keys and waits for
    the pagination ``nav`` element before the response is handed to
    :meth:`parse`. Each parsed page yields a single-key dict of the form
    ``{title: {"method", "endpoint", "req_param", "inclusions"}}`` and then
    follows the "next" link found in that ``nav`` element.
    """

    name = "api"

    # Pagination nav; requests wait for it so the page is rendered before
    # parsing, and ``parse`` reads the next-page link from inside it.
    _NAV_XPATH = '//*[@id="content"]/div[4]/nav'

    def start_requests(self):
        """Yield the initial Playwright-enabled request(s) for the crawl."""
        urls = [
            "https://developers.northpass.com/reference/get_v2-activities",
        ]
        for url in urls:
            yield self._playwright_request(url)

    def _playwright_request(self, url):
        """Build a request for ``url`` with the Playwright meta and errback set."""
        return scrapy.Request(
            url=url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
                "playwright_page_methods": [
                    PageMethod("wait_for_selector", self._NAV_XPATH),
                ],
            },
            # ``errback`` is a Request argument; placed inside ``meta`` it is
            # only stored as data and never called on failure.
            errback=self.errback,
        )

    async def parse(self, response):
        """Extract the endpoint description from a page and follow the next link.

        Yields:
            A dict keyed by the page title for each ``#Explorer`` element
            found, followed by a request for the next reference page when the
            pagination nav contains one.
        """
        # The response body is already captured; release the browser page
        # before doing any parsing.
        page = response.meta["playwright_page"]
        await page.close()

        northpass_endpoints = {}

        for _ in response.xpath('//*[@id="Explorer"]'):
            # A fresh item per iteration so yielded records never share the
            # same mutable ``inclusions`` list.
            end_item = EndpointItems()
            end_item['inclusions'] = []

            end_item['title'] = response.xpath(
                '//*[@id="content"]/header[1]/div[1]/h1/text()'
            ).get()
            end_item['method'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[1]/text()'
            ).get()
            end_item['endpoint'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]'
            ).xpath("@title").extract()
            end_item['req_param'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]/label/text()'
            ).get()

            # NOTE(review): this id is hard-coded to a single element
            # ("path-getV2GroupsGroup_uuidMemberships"); it presumably will
            # not match on other reference pages -- confirm the intended
            # selector.
            required = response.xpath(
                '//*[@id="path-getV2GroupsGroup_uuidMemberships"]'
                '/div/div[1]/div/div[2]/text()'
            ).get()
            # Default to "" in every non-required case so ``req_param`` is
            # always bound (previously unbound when the marker text existed
            # but did not contain "required").
            if required is not None and "required" in required:
                req_param = end_item['req_param']
            else:
                req_param = ""

            # ``.get()`` returns only the first matching text node, or None
            # when nothing matches; skip None instead of storing it.
            first_inclusion = response.xpath(
                '//*[starts-with(@id,"query")]/div/p/code'
                '/span[position() >= 1 and not(position() > 15)]/text()'
            ).get()
            if first_inclusion is not None:
                end_item['inclusions'].append(first_inclusion)

            northpass_endpoints = {
                end_item['title']: {
                    'method': end_item['method'],
                    'endpoint': end_item['endpoint'],
                    'req_param': req_param,
                    'inclusions': end_item['inclusions'],
                },
            }

            # Only the populated record is emitted; the former extra
            # ``yield EndpointItems()`` pushed an empty item downstream.
            yield northpass_endpoints

        self.log(northpass_endpoints)

        next_page = response.xpath(self._NAV_XPATH + '/a[2]/@href').get()

        if next_page is not None:
            yield self._playwright_request(response.urljoin(next_page))

    async def errback(self, failure):
        """Close the Playwright page attached to a failed request, if any."""
        # The page may be missing when the request failed before one was
        # attached to its meta, so do not index the key directly.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()