commit 4fb10c8ff34a32c7efd2394b60af6c5894e88038
Author: Normanras <44226464+Normanras@users.noreply.github.com>
Date:   Mon Feb 20 14:41:34 2023 -0500

    first commit

diff --git a/apiscraper/__init__.py b/apiscraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/apiscraper/__pycache__/__init__.cpython-311.pyc b/apiscraper/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..ac94e95
Binary files /dev/null and b/apiscraper/__pycache__/__init__.cpython-311.pyc differ
diff --git a/apiscraper/__pycache__/items.cpython-311.pyc b/apiscraper/__pycache__/items.cpython-311.pyc
new file mode 100644
index 0000000..1ee7fe6
Binary files /dev/null and b/apiscraper/__pycache__/items.cpython-311.pyc differ
diff --git a/apiscraper/__pycache__/pipelines.cpython-311.pyc b/apiscraper/__pycache__/pipelines.cpython-311.pyc
new file mode 100644
index 0000000..03d8bd3
Binary files /dev/null and b/apiscraper/__pycache__/pipelines.cpython-311.pyc differ
diff --git a/apiscraper/__pycache__/settings.cpython-311.pyc b/apiscraper/__pycache__/settings.cpython-311.pyc
new file mode 100644
index 0000000..daf1843
Binary files /dev/null and b/apiscraper/__pycache__/settings.cpython-311.pyc differ
diff --git a/apiscraper/items.py b/apiscraper/items.py
new file mode 100644
index 0000000..1786649
--- /dev/null
+++ b/apiscraper/items.py
@@ -0,0 +1,10 @@
+import scrapy
+from scrapy.item import Item, Field
+
+
+class EndpointItems(Item):
+    title = Field()
+    method = Field()
+    endpoint = Field()
+    inclusions = Field()
+    params = Field()
diff --git a/apiscraper/middlewares.py b/apiscraper/middlewares.py
new file mode 100644
index 0000000..444e8e8
--- /dev/null
+++ b/apiscraper/middlewares.py
@@ -0,0 +1,103 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+# useful for handling different item types with a single interface
+from itemadapter import is_item, ItemAdapter
+
+
+class ApiscraperSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Builds the middleware instance and hooks spider_opened to its signal.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class ApiscraperDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # Builds the middleware instance and hooks spider_opened to its signal.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either:
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
diff --git a/apiscraper/pipelines.py b/apiscraper/pipelines.py
new file mode 100644
index 0000000..142ad54
--- /dev/null
+++ b/apiscraper/pipelines.py
@@ -0,0 +1,31 @@
+from itemadapter import ItemAdapter
+#import sqlite3
+
+
+class ApiscraperPipeline:
+    #def __init__(self):
+    #    self.con = sqlite3.connect('api.db')
+    #    self.cur = self.con.cursor()
+#
+#        self.cur.execute("""
+#        CREATE TABLE IF NOT EXISTS api(
+#            title TEXT,
+#            method TEXT,
+#            endpoint TEXT,
+#            inclusions TEXT
+#        )
+#        """)
+
+    def process_item(self, item, spider):
+        # self.cur.execute("""
+        #     INSERT INTO api (title, method, endpoint, inclusions) VALUES (?,?,?,?)
+        # """,
+        # (
+        #     item['title'],
+        #     item['method'],
+        #     item['endpoint'],
+        #     item['inclusions'],
+        # ))
+
+        # self.con.commit()
+        return item
diff --git a/apiscraper/settings.py b/apiscraper/settings.py
new file mode 100644
index 0000000..bed019d
--- /dev/null
+++ b/apiscraper/settings.py
@@ -0,0 +1,34 @@
+BOT_NAME = "apiscraper"
+
+SPIDER_MODULES = ["apiscraper.spiders"]
+NEWSPIDER_MODULE = "apiscraper.spiders"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Settings required by scrapy-playwright
+DOWNLOAD_HANDLERS = {
+    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+}
+PLAYWRIGHT_LAUNCH_OPTIONS = {"headless":True}
+PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 100000
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+#ITEM_PIPELINES = {
+#    'apiscraper.pipelines.ApiscraperPipeline':10000,
+#    }
+
+#FEEDS = {
+#    'northpass_api.json': {
+        #'pages' : {
+#            'method' : 'string',
+#            'endpoint' : 'string',
+#            'headers' : 'headers',
+#            }
+#        }
+#    }
diff --git a/apiscraper/spiders/.DS_Store b/apiscraper/spiders/.DS_Store
new file mode 100644
index 0000000..5008ddf
Binary files /dev/null and b/apiscraper/spiders/.DS_Store differ
diff --git a/apiscraper/spiders/.null-ls_505405_dev_spider.py b/apiscraper/spiders/.null-ls_505405_dev_spider.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/apiscraper/spiders/.null-ls_505405_dev_spider.py
@@ -0,0 +1 @@
+
diff --git a/apiscraper/spiders/__init__.py b/apiscraper/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/apiscraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/apiscraper/spiders/__pycache__/__init__.cpython-311.pyc b/apiscraper/spiders/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..febcc10
Binary files /dev/null and b/apiscraper/spiders/__pycache__/__init__.cpython-311.pyc differ
diff --git a/apiscraper/spiders/__pycache__/dev_spider.cpython-311.pyc b/apiscraper/spiders/__pycache__/dev_spider.cpython-311.pyc
new file mode 100644
index 0000000..f61f678
Binary files /dev/null and b/apiscraper/spiders/__pycache__/dev_spider.cpython-311.pyc differ
diff --git a/apiscraper/spiders/api.db b/apiscraper/spiders/api.db
new file mode 100644
index 0000000..17954a5
Binary files /dev/null and b/apiscraper/spiders/api.db differ
diff --git a/apiscraper/spiders/dev_spider.py b/apiscraper/spiders/dev_spider.py
new file mode 100644
index 0000000..8940968
--- /dev/null
+++ b/apiscraper/spiders/dev_spider.py
@@ -0,0 +1,80 @@
+from pathlib import Path
+import scrapy
+from scrapy_playwright.page import PageMethod
+from apiscraper.items import EndpointItems
+
+
+class ApiSpider(scrapy.Spider):
+    """Walk the Northpass API reference and emit one record per endpoint.
+
+    Pages are rendered with Playwright. Each page yields a
+    ``{title: {method, endpoint, inclusions}}`` dict, after which the spider
+    follows the "next" link of the footer navigation.
+    """
+
+    name = "api"
+
+    # Footer navigation; once it exists the reference page has rendered.
+    NAV_XPATH = '//*[@id="content"]/div[4]/nav'
+
+    def start_requests(self):
+        """Yield the request for the first page of the reference."""
+        urls = [
+            "https://developers.northpass.com/reference/get_v2-activities",
+        ]
+        for url in urls:
+            yield self._playwright_request(url)
+
+    def _playwright_request(self, url):
+        """Build a Playwright-rendered request for ``url``."""
+        return scrapy.Request(
+            url=url,
+            meta=dict(
+                playwright=True,
+                playwright_include_page=True,
+                playwright_page_methods=[
+                    PageMethod('wait_for_selector', self.NAV_XPATH),
+                ],
+            ),
+            # errback is an argument of Request. Inside ``meta`` it is never
+            # called, so the page of a failed request was left open.
+            errback=self.errback,
+        )
+
+    async def parse(self, response):
+        """Yield this page's endpoint, then request the next page."""
+        # Selectors work on the captured response body, so the browser page
+        # can be released before parsing.
+        page = response.meta["playwright_page"]
+        await page.close()
+
+        title = response.xpath('//*[@id="content"]/header[1]/div[1]/h1/text()').get()
+        if title is not None and response.xpath('//*[@id="Explorer"]'):
+            northpass_endpoints = {
+                title: {
+                    'method': response.xpath('//*[@id="content"]/header[1]/div[2]/span[1]/text()').get(),
+                    'endpoint': response.xpath('//*[@id="content"]/header[1]/div[2]/span[2]/text()[2]').get(),
+                    # getall(): every matching span, and [] when there is none.
+                    'inclusions': response.xpath('//*[starts-with(@id,"query")]/div/p/code/span[position() >= 1 and not(position() > 15)]/text()').getall(),
+                }
+            }
+            # TODO: also collect the path parameters into a 'params' entry.
+            self.log(northpass_endpoints)
+            yield northpass_endpoints
+        else:
+            # Skip pages without an endpoint instead of raising or emitting {}.
+            self.logger.warning("No endpoint found on %s", response.url)
+
+        next_page = response.xpath('//*[@id="content"]/div[4]/nav/a[2]/@href').get()
+        self.log(f"The next page is {next_page}")
+
+        if next_page is not None:
+            yield self._playwright_request(response.urljoin(next_page))
+
+    async def errback(self, failure):
+        """Log a failed request and close its Playwright page, if it has one."""
+        self.logger.error("Request failed: %r", failure)
+        # A request that fails before its page is created has no page in meta.
+        page = failure.request.meta.get("playwright_page")
+        if page is not None:
+            await page.close()
diff --git a/apiscraper/spiders/outputtest.json b/apiscraper/spiders/outputtest.json
new file mode 100644
index 0000000..f0331a9
--- /dev/null
+++ b/apiscraper/spiders/outputtest.json
@@ -0,0 +1,202 @@
+[
+{"List activities": {"method": "get", "endpoint": "/v2/activities"}},
+{},
+{"Fetch an activity": {"method": "get", "endpoint": "/v2/activities/"}},
+{},
+{"BambooHR configuration": {"method": "get", "endpoint": "/v2/apps/bamboo_hr"}},
+{},
+{"Update BambooHR configuration": {"method": "put", "endpoint": "/v2/apps/bamboo_hr"}},
+{},
+{"Delete BambooHR configuration": {"method": "delete", "endpoint": "/v2/apps/bamboo_hr"}},
+{},
+{"List assignment submissions": {"method": "get", "endpoint": "/v2/assignments/"}},
+{},
+{"Fetch an assignment submission": {"method": "get", "endpoint": "/v2/assignments/"}},
+{},
+{"List all assignments": {"method": "get",
"endpoint": "/v2/assignments"}}, +{}, +{"Bulk add courses to groups": {"method": "post", "endpoint": "/v2/bulk/groups/courses"}}, +{}, +{"Bulk create groups for a school": {"method": "post", "endpoint": "/v2/bulk/groups"}}, +{}, +{"Bulk add people to groups": {"method": "post", "endpoint": "/v2/bulk/people/membership"}}, +{}, +{"Bulk enroll people to courses [deprecated]": {"method": "post", "endpoint": "/v2/bulk/enrollments"}}, +{}, +{"Bulk enroll people to courses": {"method": "post", "endpoint": "/v2/bulk/people/course"}}, +{}, +{"Bulk invite people to a school": {"method": "post", "endpoint": "/v2/bulk/people"}}, +{}, +{"Bulk resend school inviation to people": {"method": "post", "endpoint": "/v2/bulk/people/resend_invitation"}}, +{}, +{"List categories": {"method": "get", "endpoint": "/v2/categories"}}, +{}, +{"Create a category": {"method": "post", "endpoint": "/v2/categories"}}, +{}, +{"Fetch a category": {"method": "get", "endpoint": "/v2/categories/"}}, +{}, +{"Update a category": {"method": "put", "endpoint": "/v2/categories/"}}, +{}, +{"Delete a category": {"method": "delete", "endpoint": "/v2/categories/"}}, +{}, +{"Resend communication": {"method": "post", "endpoint": "/v2/communications/deliveries/"}}, +{}, +{"List deliveries": {"method": "get", "endpoint": "/v2/communications/deliveries/"}}, +{}, +{"Update attendance confirmation notification": {"method": "put", "endpoint": "/v2/communications/emails/attendance_confirmation"}}, +{}, +{"Update courses incomplete notification": {"method": "put", "endpoint": "/v2/communications/emails/courses_incomplete_notification"}}, +{}, +{"Update new courses notification": {"method": "put", "endpoint": "/v2/communications/emails/new_courses_notification"}}, +{}, +{"Update training session registration notification": {"method": "put", "endpoint": "/v2/communications/emails/training_session_registration_confirmation"}}, +{}, +{"List activities for a course": {"method": "get", "endpoint": "/v2/courses/"}}, +{}, 
+{"List groups not yet associated with course": {"method": "get", "endpoint": "/v2/courses/"}}, +{}, +{"List people not yet enrolled with course": {"method": "get", "endpoint": "/v2/courses/"}}, +{}, +{"List enrollments for a course": {"method": "get", "endpoint": "/v2/courses/"}}, +{}, +{"Retake a course": {"method": "post", "endpoint": "/v2/courses/"}}, +{}, +{"List courses": {"method": "get", "endpoint": "/v2/courses"}}, +{}, +{"List credential achievements": {"method": "get", "endpoint": "/v2/credentials/"}}, +{}, +{"List credential courses": {"method": "get", "endpoint": "/v2/credentials/"}}, +{}, +{"Delete course credential": {"method": "delete", "endpoint": "/v2/credentials/"}}, +{}, +{"List credentials": {"method": "get", "endpoint": "/v2/credentials"}}, +{}, +{"Delete a credential": {"method": "delete", "endpoint": "/v2/credentials/"}}, +{}, +{"List custom templates": {"method": "get", "endpoint": "/v2/custom_templates"}}, +{}, +{"Create a custom template": {"method": "post", "endpoint": "/v2/custom_templates"}}, +{}, +{"Delete a custom template": {"method": "delete", "endpoint": "/v2/custom_templates/"}}, +{}, +{"List email senders": {"method": "get", "endpoint": "/v2/email_domains/"}}, +{}, +{"Update an email sender": {"method": "put", "endpoint": "/v2/email_domains/"}}, +{}, +{"Delete an email sender": {"method": "delete", "endpoint": "/v2/email_domains/"}}, +{}, +{"List email domains": {"method": "get", "endpoint": "/v2/email_domains"}}, +{}, +{"Create an email domain": {"method": "post", "endpoint": "/v2/email_domains"}}, +{}, +{"Delete an email domain": {"method": "delete", "endpoint": "/v2/email_domains/"}}, +{}, +{"List events": {"method": "get", "endpoint": "/v2/events"}}, +{}, +{"List group's courses": {"method": "get", "endpoint": "/v2/groups/"}}, +{}, +{"List group's memberships": {"method": "get", "endpoint": "/v2/groups/"}}, +{}, +{"Add courses to a group": {"method": "post", "endpoint": "/v2/groups/"}}, +{}, +{"Add people to a group": 
{"method": "post", "endpoint": "/v2/groups/"}}, +{}, +{"List groups": {"method": "get", "endpoint": "/v2/groups"}}, +{}, +{"Create a group": {"method": "post", "endpoint": "/v2/groups"}}, +{}, +{"Fetch a group": {"method": "get", "endpoint": "/v2/groups/"}}, +{}, +{"Update a group": {"method": "put", "endpoint": "/v2/groups/"}}, +{}, +{"Delete a group": {"method": "delete", "endpoint": "/v2/groups/"}}, +{}, +{"Deactivate a person": {"method": "post", "endpoint": "/v2/people/"}}, +{}, +{"Reactivate a person": {"method": "delete", "endpoint": "/v2/people/"}}, +{}, +{"Associate a person with a course": {"method": "post", "endpoint": "/v2/people/"}}, +{}, +{"Remove a person from a course": {"method": "delete", "endpoint": "/v2/people/"}}, +{}, +{"Add groups to a person": {"method": "post", "endpoint": "/v2/people/"}}, +{}, +{"Remove a person from a group": {"method": "delete", "endpoint": "/v2/people/"}}, +{}, +{"List people": {"method": "get", "endpoint": "/v2/people"}}, +{}, +{"Create a person": {"method": "post", "endpoint": "/v2/people"}}, +{}, +{"Fetch a person": {"method": "get", "endpoint": "/v2/people/"}}, +{}, +{"Update a person": {"method": "put", "endpoint": "/v2/people/"}}, +{}, +{"Delete a person": {"method": "delete", "endpoint": "/v2/people/"}}, +{}, +{"List question banks": {"method": "get", "endpoint": "/v2/question_banks"}}, +{}, +{"Create a question bank": {"method": "post", "endpoint": "/v2/question_banks"}}, +{}, +{"Fetch a question bank": {"method": "get", "endpoint": "/v2/question_banks/"}}, +{}, +{"List learner quiz answers": {"method": "get", "endpoint": "/v2/quiz_attempts/"}}, +{}, +{"List quizzes": {"method": "get", "endpoint": "/v2/quizzes"}}, +{}, +{"List all assignment submissions": {"method": "get", "endpoint": "/v2/submissions"}}, +{}, +{"List all configured webhook endpoints": {"method": "get", "endpoint": "/v2/webhook_endpoints"}}, +{}, +{"Creates new webhook endpoint": {"method": "post", "endpoint": "/v2/webhook_endpoints"}}, +{}, 
+{"Updates webhook endpoint": {"method": "put", "endpoint": "/v2/webhook_endpoints/"}}, +{}, +{"Delete a webhook endpoint": {"method": "delete", "endpoint": "/v2/webhook_endpoints/"}}, +{}, +{"List all sent webhooks": {"method": "get", "endpoint": "/v2/webhooks"}}, +{}, +{"Bulk fill courses property values": {"method": "post", "endpoint": "/v2/properties/courses/bulk"}}, +{}, +{"Bulk clear courses property values": {"method": "delete", "endpoint": "/v2/properties/courses/bulk"}}, +{}, +{"Bulk fill people property values": {"method": "post", "endpoint": "/v2/properties/people/bulk"}}, +{}, +{"Bulk clear people property values": {"method": "delete", "endpoint": "/v2/properties/people/bulk"}}, +{}, +{"List properties available for courses": {"method": "get", "endpoint": "/v2/properties/courses/properties"}}, +{}, +{"Get available course properties with values": {"method": "get", "endpoint": "/v2/properties/courses/"}}, +{}, +{"List created property definitions": {"method": "get", "endpoint": "/v2/properties/conditional"}}, +{}, +{"Create conditional property with mapping": {"method": "post", "endpoint": "/v2/properties/conditional"}}, +{}, +{"Delete conditional property": {"method": "delete", "endpoint": "/v2/properties/conditional/"}}, +{}, +{"Update conditional property": {"method": "patch", "endpoint": "/v2/properties/conditional/"}}, +{}, +{"List properties available for people": {"method": "get", "endpoint": "/v2/properties/people/properties"}}, +{}, +{"Get available person properties with values": {"method": "get", "endpoint": "/v2/properties/people/"}}, +{}, +{"List unique values for property": {"method": "get", "endpoint": "/v2/properties/property_definitions/"}}, +{}, +{"List created property definitions": {"method": "get", "endpoint": "/v2/properties/property_definitions"}}, +{}, +{"Create property definition": {"method": "post", "endpoint": "/v2/properties/property_definitions"}}, +{}, +{"Get property definition": {"method": "get", "endpoint": 
"/v2/properties/property_definitions/"}}, +{}, +{"Delete property definition": {"method": "delete", "endpoint": "/v2/properties/property_definitions/"}}, +{}, +{"Create property definition": {"method": "patch", "endpoint": "/v2/properties/property_definitions/"}}, +{}, +{"List properties available for school": {"method": "get", "endpoint": "/v2/properties/school/properties"}}, +{}, +{"Get available school properties with values": {"method": "get", "endpoint": "/v2/properties/school"}}, +{}, +{"Clear school properties with values": {"method": "delete", "endpoint": "/v2/properties/school"}}, +{}, +{"Fill school properties with values": {"method": "patch", "endpoint": "/v2/properties/school"}}, +{} +] \ No newline at end of file diff --git a/scrapy.cfg b/scrapy.cfg new file mode 100644 index 0000000..5380221 --- /dev/null +++ b/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = apiscraper.settings + +[deploy] +#url = http://localhost:6800/ +project = apiscraper