jul11co-sitespider
v0.0.6
Published
Jul11Co Web Spider
Downloads
10
Readme
jul11co-sitespider
Jul11Co's Website Spider - a tool for crawling websites.
Installation
From npm
npm install -g jul11co-sitespider
Usage
Command line
Usage: sitespider [OPTIONS...] [page_url] <output_dir>
  --download      : Download
  --resume        : Resume
  --update        : Update (and check for incomplete links)
  --add-link      : Add link
  --fix-links     : Fix links
  --verbose       : Verbose
  --images        : Download images (default: false)
  --scripts       : Download scripts (default: false)
  --stylesheets   : Download stylesheets (default: false)
  --max-depth=X   : Specify max depth
API
var Spider = require('jul11co-sitespider');

var spider = new Spider({
  config_file: "PATH_TO_CONFIG_FILE", // optional
  state_file: "PATH_TO_STATE_FILE", // optional
});

spider.on('page_error', function(err, link){});
spider.on('page_done', function(err, page){});
spider.on('error', function(error){});
spider.on('exit', function(err){});

spider.getConfig(key);
spider.setConfig(key, value);

spider.start(start_link, output_dir, options);
spider.resume(output_dir, options);
spider.update(output_dir, options);
Extend with scrapers
Content of scraper script:
// sitespider_scrapers/example.js
module.exports = {
  name: 'Example',
  match: function(url, options) {
    // matching rules
    // return true or false
  },
  scrape: function($, page, options) {
    // scraping here...
  }
}
// example-spider.js
var Spider = require('jul11co-sitespider');

var spider = new Spider();
spider.addScraper(require('./sitespider_scrapers/example'));
...
License
Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)