# api_scraper/apiscraper/spiders/dev_spider.py

from pathlib import Path
import scrapy
from scrapy_playwright.page import PageMethod
from apiscraper.items import EndpointItems

class ApiSpider(scrapy.Spider):
    """Scrape endpoint details from the Northpass API reference pages.

    Each page is rendered through scrapy-playwright. ``parse`` extracts the
    title, HTTP method, endpoint, required path parameter and inclusions of
    the page, yields them as a nested dict keyed by the page title, and then
    follows the "next" link of the pagination nav.
    """

    name = "api"

    # Pagination nav at the bottom of a reference page. Requests wait for it
    # to appear so the page is rendered before ``parse`` reads it.
    _NAV_XPATH = '//*[@id="content"]/div[4]/nav'

    def _playwright_request(self, url):
        """Return a Playwright-rendered request for *url*.

        ``errback`` is passed as a ``Request`` argument. Placing it inside
        ``meta`` only stores it as data and never registers it, so pages
        opened for failed requests would be left unclosed.
        """
        return scrapy.Request(
            url=url,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('wait_for_selector', self._NAV_XPATH),
                ],
            ),
            errback=self.errback,
        )

    def start_requests(self):
        """Yield the initial request(s) that seed the crawl."""
        urls = [
            "https://developers.northpass.com/reference/get_v2-activities",
        ]
        for url in urls:
            yield self._playwright_request(url)

    async def parse(self, response):
        """Extract the endpoint record of a page and follow the next link.

        Yields:
            A dict ``{title: {'method', 'endpoint', 'req_param',
            'inclusions'}}`` for each ``#Explorer`` element found, followed
            by a request for the next page when a next link exists.
        """
        # The rendered HTML is already in ``response``; release the browser
        # page right away so it does not stay open.
        page = response.meta["playwright_page"]
        await page.close()

        northpass_endpoints = {}

        for _ in response.xpath('//*[@id="Explorer"]'):
            # Fresh item per record so yielded dicts never share a list.
            end_item = EndpointItems()
            end_item['title'] = response.xpath(
                '//*[@id="content"]/header[1]/div[1]/h1/text()').get()
            end_item['method'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[1]/text()').get()
            end_item['endpoint'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]'
            ).xpath("@title").extract()
            end_item['req_param'] = response.xpath(
                '//*[@id="content"]/header[1]/div[2]/span[2]/label/text()'
            ).get()

            # NOTE(review): this id is specific to one endpoint page, so on
            # other pages the lookup presumably returns None - confirm.
            required = response.xpath(
                '//*[@id="path-getV2GroupsGroup_uuidMemberships"]'
                '/div/div[1]/div/div[2]/text()').get()
            # Always bind req_param: the path parameter is reported only when
            # the page marks it as required, otherwise an empty string.
            if required is not None and "required" in required:
                req_param = end_item['req_param']
            else:
                req_param = ""

            # NOTE(review): ``.get()`` returns only the first matching span;
            # use ``.getall()`` if every inclusion (up to 15) is wanted.
            inclusion = response.xpath(
                '//*[starts-with(@id,"query")]/div/p/code'
                '/span[position() >= 1 and not(position() > 15)]/text()'
            ).get()
            # Skip a missing match instead of storing a None placeholder.
            end_item['inclusions'] = [] if inclusion is None else [inclusion]
            inclusions = end_item['inclusions']

            northpass_endpoints = {
                end_item['title']: {
                    'method': end_item['method'],
                    'endpoint': end_item['endpoint'],
                    'req_param': req_param,
                    'inclusions': inclusions,
                },
            }

            yield northpass_endpoints

        self.log(northpass_endpoints)

        next_page = response.xpath(f'{self._NAV_XPATH}/a[2]/@href').get()

        if next_page is not None:
            yield self._playwright_request(response.urljoin(next_page))

    async def errback(self, failure):
        """Log a failed request and close its Playwright page, if any."""
        self.logger.error(repr(failure))
        # The page may be absent when the failure happened before
        # scrapy-playwright attached it to the request meta.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()
first commit 2023-02-20 14:41:34 -05:00			`from pathlib import Path`
			`import scrapy`
			`from scrapy_playwright.page import PageMethod`
			`from apiscraper.items import EndpointItems`

			`class ApiSpider(scrapy.Spider):`
			`name = "api"`

			`def start_requests(self):`
			`urls = [`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00			`"https://developers.northpass.com/reference/get_v2-activities"`
first commit 2023-02-20 14:41:34 -05:00			`]`
			`for url in urls:`
			`yield scrapy.Request(url=url, meta=dict(`
			`playwright = True,`
			`playwright_include_page = True,`
			`playwright_page_methods =[PageMethod('wait_for_selector', '//*[@id="content"]/div[4]/nav'),`
			`],`
			`errback=self.errback,`
			`))`

			`async def parse(self, response):`
			`page = response.meta["playwright_page"]`
			`await page.close()`

			`northpass_endpoints = {}`

			`end_item = EndpointItems()`
			`end_item['inclusions'] = []`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00			`end_item['req_param'] = []`
first commit 2023-02-20 14:41:34 -05:00			`for item in response.xpath('//*[@id="Explorer"]'):`
			`end_item['title'] = response.xpath('//*[@id="content"]/header[1]/div[1]/h1/text()').get()`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00
first commit 2023-02-20 14:41:34 -05:00			`end_item['method'] = response.xpath('//*[@id="content"]/header[1]/div[2]/span[1]/text()').get()`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00
			`end_item['endpoint'] = response.xpath('//*[@id="content"]/header[1]/div[2]/span[2]').xpath("@title").extract()`

			`end_item['req_param'] = response.xpath('//*[@id="content"]/header[1]/div[2]/span[2]/label/text()').get()`
			`required = response.xpath('//*[@id="path-getV2GroupsGroup_uuidMemberships"]/div/div[1]/div/div[2]/text()').get()`
			`if required is not None:`
			`if "required" in required:`
			`req_param = end_item['req_param']`
			`else:`
			`pass`
			`else:`
			`req_param = ""`

first commit 2023-02-20 14:41:34 -05:00			`end_item['inclusions'].append(response.xpath('//*[starts-with(@id,"query")]/div/p/code/span[position() >= 1 and not(position() > 15)]/text()').get())`
			`if end_item['inclusions'] is not None:`
			`inclusions = end_item['inclusions']`
			`else:`
			`pass`

			`northpass_endpoints = {`
			`end_item['title'] : {`
			`'method' : end_item['method'],`
			`'endpoint' : end_item['endpoint'],`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00			`'req_param' : req_param,`
first commit 2023-02-20 14:41:34 -05:00			`'inclusions' : inclusions`
			`}`
			`}`

			`yield northpass_endpoints`
			`yield EndpointItems()`

			`self.log(northpass_endpoints)`

			`next_page = response.xpath('//*[@id="content"]/div[4]/nav/a[2]/@href').get()`
Figured out most data points I would like. Need to clean it up and figure out adding it to a database. 2023-02-20 21:35:44 -05:00			`# self.log(f"The next page is {next_page}")`
first commit 2023-02-20 14:41:34 -05:00
			`if next_page is not None:`
			`nexturl = response.urljoin(next_page)`
			`yield scrapy.Request(nexturl, meta=dict(`
			`playwright = True,`
			`playwright_include_page = True,`
			`playwright_page_methods =[`
			`PageMethod('wait_for_selector', '//*[@id="content"]/div[4]/nav'),`
			`],`
			`errback=self.errback,`
			`))`

			`async def errback(self, failure):`
			`page = failure.request.meta["playwright_page"]`
			`await page.close()`