@techteamer/scraper
v1.0.4
Published
Interactively scrape websites and save communication data
Downloads
12
Readme
Scraper
Scrape websites with Puppeteer and save communication data about requests and responses. Uses an internal proxy to access requests/responses from a target host.
Capabilities
- Save website contents
- Save certificates during TLS
- Check OCSP responses for TLS certificates
- Interactively navigate on pages using Puppeteer
- Target host IP filter
Usage
Extend the Scraper class
const Scraper = require('@techteamer/scraper')
class MyCustomScraper extends Scraper {
constructor (options = {}) {
super(options)
}
shouldCheckOcsp (requestUrl, proxyReq, req, res) {
// decide whether or not check OCSP for a specific request.
return true
}
async scraper (puppet, browser) {
// user puppeteer to interact with the page
const page = await puppet.setupPage(browser, '/')
}
}
const scraper = new MyCustomScraper({}) // provide scraper config options
scraper.scrape().then(() => {
console.log('Finished scraping')
process.exit()
}).catch((err) => {
console.error('Failed scraping', err)
process.exit(1)
})
Config
{
"port": 8080,
"ipFilter": "142.251.36.78",
"proxy": {
"minVersion": "TLSv1.2",
"ca": null,
"target": {
"protocol": "https:",
"host": "example.com",
"port": 443
}
},
"ocsp": {
"agent": {}
},
"browser": {
"autoClose": false,
"puppeteer": {
"headless": true
}
},
"save": {
"rootDir": "captures/"
},
"loggerOptions": {
"verbosity": {
"headers": false
}
}
}