first commit

This commit is contained in:
Normanras
2023-02-20 14:41:34 -05:00
commit 4fb10c8ff3
18 changed files with 467 additions and 0 deletions

0
apiscraper/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

10
apiscraper/items.py Normal file
View File

@ -0,0 +1,10 @@
import scrapy
from scrapy.item import Item, Field
class EndpointItems(scrapy.Item):
    """Container for the data scraped from one API reference page."""

    # Heading text of the endpoint's documentation page.
    title = scrapy.Field()
    # HTTP verb shown next to the endpoint path.
    method = scrapy.Field()
    # Endpoint path as displayed on the page.
    endpoint = scrapy.Field()
    # List of values taken from the page's query-parameter section.
    inclusions = scrapy.Field()
    # List reserved for path parameters.
    params = scrapy.Field()

103
apiscraper/middlewares.py Normal file
View File

@ -0,0 +1,103 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ApiscraperSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards what it receives without modification; the class is
    a scaffold to extend. Not all methods need to be defined: if a method is
    missing, Scrapy acts as if the spider middleware does not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: Object exposing ``signals.connect``.

        Returns:
            The new middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider. Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield everything the spider produced for ``response``, unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request or item
        objects.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Decline to handle the exception.

        Called when a spider or ``process_spider_input()`` method (from
        another spider middleware) raises an exception. Should return
        either None or an iterable of Request or item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works like ``process_spider_output()`` except that there is no
        associated response. Must return only requests (not items).
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        # Pass the name as a logging argument so formatting is deferred
        # until the record is actually emitted.
        spider.logger.info("Spider opened: %s", spider.name)
class ApiscraperDownloaderMiddleware:
    """Pass-through downloader middleware.

    Every hook leaves requests, responses and exceptions untouched; the
    class is a scaffold to extend. Not all methods need to be defined: if a
    method is missing, Scrapy acts as if the downloader middleware does not
    modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        Args:
            crawler: Object exposing ``signals.connect``.

        Returns:
            The new middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    def process_request(self, request, spider):
        """Let the request continue through the downloader untouched.

        Called for each request that goes through the downloader
        middleware. Must either:
          - return None: continue processing this request
          - return a Response object
          - return a Request object
          - raise IgnoreRequest: process_exception() methods of installed
            downloader middleware will be called
        """
        return None

    def process_response(self, request, response, spider):
        """Return ``response`` unchanged.

        Called with the response returned from the downloader. Must either:
          - return a Response object
          - return a Request object
          - raise IgnoreRequest
        """
        return response

    def process_exception(self, request, exception, spider):
        """Decline to handle the exception.

        Called when a download handler or a process_request() (from another
        downloader middleware) raises an exception. Must either:
          - return None: continue processing this exception
          - return a Response object: stops the process_exception() chain
          - return a Request object: stops the process_exception() chain
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        # Pass the name as a logging argument so formatting is deferred
        # until the record is actually emitted.
        spider.logger.info("Spider opened: %s", spider.name)

31
apiscraper/pipelines.py Normal file
View File

@ -0,0 +1,31 @@
from itemadapter import ItemAdapter
#import sqlite3
class ApiscraperPipeline:
    """Item pipeline that currently hands every item through untouched.

    The commented-out statements below are a disabled draft that would
    store each item in a SQLite table named ``api``.
    """

    # def __init__(self):
    #     self.con = sqlite3.connect('api.db')
    #     self.cur = self.con.cursor()
    #
    #     self.cur.execute("""
    #         CREATE TABLE IF NOT EXISTS api(
    #             title TEXT,
    #             method TEXT,
    #             endpoint TEXT,
    #             inclusions TEXT
    #         )
    #     """)

    def process_item(self, item, spider):
        """Return ``item`` as received.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item`` object.
        """
        # self.cur.execute("""
        #     INSERT INTO api (title, method, endpoint, inclusions) VALUES (?,?,?,?)
        # """,
        # (
        #     item['title'],
        #     item['method'],
        #     item['endpoint'],
        #     item['inclusions'],
        # ))
        # self.con.commit()
        return item

34
apiscraper/settings.py Normal file
View File

@ -0,0 +1,34 @@
# Project identity and spider discovery.
BOT_NAME = "apiscraper"
NEWSPIDER_MODULE = "apiscraper.spiders"
SPIDER_MODULES = ["apiscraper.spiders"]

# Obey robots.txt rules.
ROBOTSTXT_OBEY = True

# Playwright required settings: both URL schemes use the same download handler.
DOWNLOAD_HANDLERS = dict.fromkeys(
    ("http", "https"),
    "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
)
PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT = 100_000
PLAYWRIGHT_LAUNCH_OPTIONS = {"headless": True}

# Set settings whose default value is deprecated to a future-proof value.
FEED_EXPORT_ENCODING = "utf-8"
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

# Disabled drafts, kept for reference.
#ITEM_PIPELINES = {
#    'apiscraper.pipelines.ApiscraperPipeline': 10000,
#}
#FEEDS = {
#    'northpass_api.json': {
#        'pages': {
#            'method': 'string',
#            'endpoint': 'string',
#            'headers': 'headers',
#        }
#    }
#}

BIN
apiscraper/spiders/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

BIN
apiscraper/spiders/api.db Normal file

Binary file not shown.

View File

@ -0,0 +1,71 @@
from pathlib import Path
import scrapy
from scrapy_playwright.page import PageMethod
from apiscraper.items import EndpointItems
class ApiSpider(scrapy.Spider):
    """Walk the Northpass API reference and emit one record per endpoint page.

    Each page is rendered through Playwright. For every page that contains
    the ``Explorer`` element the spider yields a dict shaped like
    ``{title: {"method": ..., "endpoint": ..., "inclusions": [...]}}`` and
    then follows the page's "next" navigation link, if any.
    """

    name = "api"

    def _playwright_request(self, url):
        """Build the Playwright-rendered request used for every page.

        ``errback`` is passed to the Request itself. Placing it inside
        ``meta`` only stores it as metadata, so it would never be invoked
        on failure.
        """
        return scrapy.Request(
            url=url,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    # Wait for the prev/next navigation before handing the
                    # response to ``parse``.
                    PageMethod('wait_for_selector', '//*[@id="content"]/div[4]/nav'),
                ],
            ),
            errback=self.errback,
        )

    def start_requests(self):
        """Yield the request for the first reference page."""
        urls = [
            "https://developers.northpass.com/reference/get_v2-activities",
        ]
        for url in urls:
            yield self._playwright_request(url)

    async def parse(self, response):
        """Extract the endpoint shown on ``response`` and follow the next link.

        Yields:
            A single-key dict describing the endpoint (only when the page
            has an ``Explorer`` element), followed by a request for the next
            page when a next link exists.
        """
        # The rendered HTML is already in ``response``; release the browser
        # page straight away.
        page = response.meta["playwright_page"]
        await page.close()

        # Only pages with an API explorer describe an endpoint. Pages
        # without one yield nothing instead of an empty record.
        if response.xpath('//*[@id="Explorer"]'):
            title = response.xpath('//*[@id="content"]/header[1]/div[1]/h1/text()').get()
            method = response.xpath('//*[@id="content"]/header[1]/div[2]/span[1]/text()').get()
            endpoint = response.xpath('//*[@id="content"]/header[1]/div[2]/span[2]/text()[2]').get()
            # NOTE(review): ``.get()`` returns only the first matching span
            # although the XPath selects up to 15 of them; ``.getall()`` may
            # be what was intended. Confirm before changing the output.
            inclusion = response.xpath('//*[starts-with(@id,"query")]/div/p/code/span[position() >= 1 and not(position() > 15)]/text()').get()

            northpass_endpoints = {
                title: {
                    'method': method,
                    'endpoint': endpoint,
                    # Keep the list empty rather than storing ``None`` when
                    # the page has no matching query-parameter text.
                    'inclusions': [] if inclusion is None else [inclusion],
                }
            }
            self.log(northpass_endpoints)
            yield northpass_endpoints

        next_page = response.xpath('//*[@id="content"]/div[4]/nav/a[2]/@href').get()
        self.log(f"The next page is {next_page}")
        if next_page is not None:
            yield self._playwright_request(response.urljoin(next_page))

    async def errback(self, failure):
        """Log a failed request and close its Playwright page, if any."""
        self.logger.error(repr(failure))
        # A request can fail before a page was attached to its meta.
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

View File

@ -0,0 +1,202 @@
[
{"List activities": {"method": "get", "endpoint": "/v2/activities"}},
{},
{"Fetch an activity": {"method": "get", "endpoint": "/v2/activities/"}},
{},
{"BambooHR configuration": {"method": "get", "endpoint": "/v2/apps/bamboo_hr"}},
{},
{"Update BambooHR configuration": {"method": "put", "endpoint": "/v2/apps/bamboo_hr"}},
{},
{"Delete BambooHR configuration": {"method": "delete", "endpoint": "/v2/apps/bamboo_hr"}},
{},
{"List assignment submissions": {"method": "get", "endpoint": "/v2/assignments/"}},
{},
{"Fetch an assignment submission": {"method": "get", "endpoint": "/v2/assignments/"}},
{},
{"List all assignments": {"method": "get", "endpoint": "/v2/assignments"}},
{},
{"Bulk add courses to groups": {"method": "post", "endpoint": "/v2/bulk/groups/courses"}},
{},
{"Bulk create groups for a school": {"method": "post", "endpoint": "/v2/bulk/groups"}},
{},
{"Bulk add people to groups": {"method": "post", "endpoint": "/v2/bulk/people/membership"}},
{},
{"Bulk enroll people to courses [deprecated]": {"method": "post", "endpoint": "/v2/bulk/enrollments"}},
{},
{"Bulk enroll people to courses": {"method": "post", "endpoint": "/v2/bulk/people/course"}},
{},
{"Bulk invite people to a school": {"method": "post", "endpoint": "/v2/bulk/people"}},
{},
{"Bulk resend school inviation to people": {"method": "post", "endpoint": "/v2/bulk/people/resend_invitation"}},
{},
{"List categories": {"method": "get", "endpoint": "/v2/categories"}},
{},
{"Create a category": {"method": "post", "endpoint": "/v2/categories"}},
{},
{"Fetch a category": {"method": "get", "endpoint": "/v2/categories/"}},
{},
{"Update a category": {"method": "put", "endpoint": "/v2/categories/"}},
{},
{"Delete a category": {"method": "delete", "endpoint": "/v2/categories/"}},
{},
{"Resend communication": {"method": "post", "endpoint": "/v2/communications/deliveries/"}},
{},
{"List deliveries": {"method": "get", "endpoint": "/v2/communications/deliveries/"}},
{},
{"Update attendance confirmation notification": {"method": "put", "endpoint": "/v2/communications/emails/attendance_confirmation"}},
{},
{"Update courses incomplete notification": {"method": "put", "endpoint": "/v2/communications/emails/courses_incomplete_notification"}},
{},
{"Update new courses notification": {"method": "put", "endpoint": "/v2/communications/emails/new_courses_notification"}},
{},
{"Update training session registration notification": {"method": "put", "endpoint": "/v2/communications/emails/training_session_registration_confirmation"}},
{},
{"List activities for a course": {"method": "get", "endpoint": "/v2/courses/"}},
{},
{"List groups not yet associated with course": {"method": "get", "endpoint": "/v2/courses/"}},
{},
{"List people not yet enrolled with course": {"method": "get", "endpoint": "/v2/courses/"}},
{},
{"List enrollments for a course": {"method": "get", "endpoint": "/v2/courses/"}},
{},
{"Retake a course": {"method": "post", "endpoint": "/v2/courses/"}},
{},
{"List courses": {"method": "get", "endpoint": "/v2/courses"}},
{},
{"List credential achievements": {"method": "get", "endpoint": "/v2/credentials/"}},
{},
{"List credential courses": {"method": "get", "endpoint": "/v2/credentials/"}},
{},
{"Delete course credential": {"method": "delete", "endpoint": "/v2/credentials/"}},
{},
{"List credentials": {"method": "get", "endpoint": "/v2/credentials"}},
{},
{"Delete a credential": {"method": "delete", "endpoint": "/v2/credentials/"}},
{},
{"List custom templates": {"method": "get", "endpoint": "/v2/custom_templates"}},
{},
{"Create a custom template": {"method": "post", "endpoint": "/v2/custom_templates"}},
{},
{"Delete a custom template": {"method": "delete", "endpoint": "/v2/custom_templates/"}},
{},
{"List email senders": {"method": "get", "endpoint": "/v2/email_domains/"}},
{},
{"Update an email sender": {"method": "put", "endpoint": "/v2/email_domains/"}},
{},
{"Delete an email sender": {"method": "delete", "endpoint": "/v2/email_domains/"}},
{},
{"List email domains": {"method": "get", "endpoint": "/v2/email_domains"}},
{},
{"Create an email domain": {"method": "post", "endpoint": "/v2/email_domains"}},
{},
{"Delete an email domain": {"method": "delete", "endpoint": "/v2/email_domains/"}},
{},
{"List events": {"method": "get", "endpoint": "/v2/events"}},
{},
{"List group's courses": {"method": "get", "endpoint": "/v2/groups/"}},
{},
{"List group's memberships": {"method": "get", "endpoint": "/v2/groups/"}},
{},
{"Add courses to a group": {"method": "post", "endpoint": "/v2/groups/"}},
{},
{"Add people to a group": {"method": "post", "endpoint": "/v2/groups/"}},
{},
{"List groups": {"method": "get", "endpoint": "/v2/groups"}},
{},
{"Create a group": {"method": "post", "endpoint": "/v2/groups"}},
{},
{"Fetch a group": {"method": "get", "endpoint": "/v2/groups/"}},
{},
{"Update a group": {"method": "put", "endpoint": "/v2/groups/"}},
{},
{"Delete a group": {"method": "delete", "endpoint": "/v2/groups/"}},
{},
{"Deactivate a person": {"method": "post", "endpoint": "/v2/people/"}},
{},
{"Reactivate a person": {"method": "delete", "endpoint": "/v2/people/"}},
{},
{"Associate a person with a course": {"method": "post", "endpoint": "/v2/people/"}},
{},
{"Remove a person from a course": {"method": "delete", "endpoint": "/v2/people/"}},
{},
{"Add groups to a person": {"method": "post", "endpoint": "/v2/people/"}},
{},
{"Remove a person from a group": {"method": "delete", "endpoint": "/v2/people/"}},
{},
{"List people": {"method": "get", "endpoint": "/v2/people"}},
{},
{"Create a person": {"method": "post", "endpoint": "/v2/people"}},
{},
{"Fetch a person": {"method": "get", "endpoint": "/v2/people/"}},
{},
{"Update a person": {"method": "put", "endpoint": "/v2/people/"}},
{},
{"Delete a person": {"method": "delete", "endpoint": "/v2/people/"}},
{},
{"List question banks": {"method": "get", "endpoint": "/v2/question_banks"}},
{},
{"Create a question bank": {"method": "post", "endpoint": "/v2/question_banks"}},
{},
{"Fetch a question bank": {"method": "get", "endpoint": "/v2/question_banks/"}},
{},
{"List learner quiz answers": {"method": "get", "endpoint": "/v2/quiz_attempts/"}},
{},
{"List quizzes": {"method": "get", "endpoint": "/v2/quizzes"}},
{},
{"List all assignment submissions": {"method": "get", "endpoint": "/v2/submissions"}},
{},
{"List all configured webhook endpoints": {"method": "get", "endpoint": "/v2/webhook_endpoints"}},
{},
{"Creates new webhook endpoint": {"method": "post", "endpoint": "/v2/webhook_endpoints"}},
{},
{"Updates webhook endpoint": {"method": "put", "endpoint": "/v2/webhook_endpoints/"}},
{},
{"Delete a webhook endpoint": {"method": "delete", "endpoint": "/v2/webhook_endpoints/"}},
{},
{"List all sent webhooks": {"method": "get", "endpoint": "/v2/webhooks"}},
{},
{"Bulk fill courses property values": {"method": "post", "endpoint": "/v2/properties/courses/bulk"}},
{},
{"Bulk clear courses property values": {"method": "delete", "endpoint": "/v2/properties/courses/bulk"}},
{},
{"Bulk fill people property values": {"method": "post", "endpoint": "/v2/properties/people/bulk"}},
{},
{"Bulk clear people property values": {"method": "delete", "endpoint": "/v2/properties/people/bulk"}},
{},
{"List properties available for courses": {"method": "get", "endpoint": "/v2/properties/courses/properties"}},
{},
{"Get available course properties with values": {"method": "get", "endpoint": "/v2/properties/courses/"}},
{},
{"List created property definitions": {"method": "get", "endpoint": "/v2/properties/conditional"}},
{},
{"Create conditional property with mapping": {"method": "post", "endpoint": "/v2/properties/conditional"}},
{},
{"Delete conditional property": {"method": "delete", "endpoint": "/v2/properties/conditional/"}},
{},
{"Update conditional property": {"method": "patch", "endpoint": "/v2/properties/conditional/"}},
{},
{"List properties available for people": {"method": "get", "endpoint": "/v2/properties/people/properties"}},
{},
{"Get available person properties with values": {"method": "get", "endpoint": "/v2/properties/people/"}},
{},
{"List unique values for property": {"method": "get", "endpoint": "/v2/properties/property_definitions/"}},
{},
{"List created property definitions": {"method": "get", "endpoint": "/v2/properties/property_definitions"}},
{},
{"Create property definition": {"method": "post", "endpoint": "/v2/properties/property_definitions"}},
{},
{"Get property definition": {"method": "get", "endpoint": "/v2/properties/property_definitions/"}},
{},
{"Delete property definition": {"method": "delete", "endpoint": "/v2/properties/property_definitions/"}},
{},
{"Create property definition": {"method": "patch", "endpoint": "/v2/properties/property_definitions/"}},
{},
{"List properties available for school": {"method": "get", "endpoint": "/v2/properties/school/properties"}},
{},
{"Get available school properties with values": {"method": "get", "endpoint": "/v2/properties/school"}},
{},
{"Clear school properties with values": {"method": "delete", "endpoint": "/v2/properties/school"}},
{},
{"Fill school properties with values": {"method": "patch", "endpoint": "/v2/properties/school"}},
{}
]