hcr
v1.4.1
Published
Easy To Use Web Crawler
Downloads
22
Maintainers
Readme
hcr
Hcr helps you scrape data from web pages. It can crawl an entire site recursively, and it supports rate-limiting requests, adding custom headers, and converting HTML into an object of your choosing.
Dependencies
Getting Started
There is an example config that you can modify and use. The callback argument that you pass to the constructor is the default callback for all functions.
Installation
hcr
is available on npm. To install it, type:
$ npm install hcr
Usage
// Rate-limit options: allow up to 100 requests per 60-second window.
// NOTE(review): these look like Bottleneck-style reservoir options — confirm.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};

// `callback` is the default callback passed to the constructor.
var crawler = new Crawler(limiterOpts, callback);

// Fetch each of the listed pages.
crawler.getPage(['site1.com', 'site2.com']);
// Rate-limit options: allow up to 100 requests per 60-second window.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(limiterOpts, callback);

// Extraction schema: each key names an output field; `selector` is the
// CSS selector to match, `func` the method applied to the match, and
// `args` the arguments passed to that method.
var extractionSchema = {
  'Name': {
    selector: '#name',
    func: 'text'
  },
  'Image': {
    selector: '#image',
    func: 'attr',
    args: ['src']
  }
};

// Convert each page's HTML into an object shaped by the schema.
crawler.toObject(['site1.com', 'site2.com'], extractionSchema);
// Rate-limit options: allow up to 100 requests per 60-second window.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(limiterOpts, callback);

// Extraction schema: output field name -> { CSS selector, method to
// apply to the match, optional arguments for that method }.
var extractionSchema = {
  'Name': {
    selector: '#name',
    func: 'text'
  },
  'Image': {
    selector: '#image',
    func: 'attr',
    args: ['src']
  }
};

// Crawl the listed sites recursively, extracting an object per page.
crawler.recursiveToObject(['site1.com', 'site2.com'], extractionSchema);
// Rate-limit options: allow up to 100 requests per 60-second window.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(limiterOpts, callback);

// Schema variant using `prop`: read a property of the matched element
// (here `textContent`) instead of calling a method on it.
var extractionSchema = {
  'Name': {
    selector: '#span',
    prop: 'textContent'
  }
};

// Crawl the listed sites recursively, extracting an object per page.
crawler.recursiveToObject(['site1.com', 'site2.com'], extractionSchema);
// Rate-limit options: allow up to 100 requests per 60-second window.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(limiterOpts, callback);

// Regex applied during the recursive crawl — presumably it filters
// which links/pages are followed; verify against the library docs.
var pattern = /[A-Z]/g;

// Schema using `prop`: read the `textContent` property of the match.
var extractionSchema = {
  'Name': {
    selector: '#span',
    prop: 'textContent'
  }
};

// Crawl recursively with the regex constraint, extracting per page.
crawler.recursiveRegexToObject(['site1.com', 'site2.com'], pattern, extractionSchema);
// Rate-limit options: allow up to 100 requests per 60-second window.
var limiterOpts = {
  reservoir: 100,
  reservoirRefreshInterval: 60 * 1000
};
var crawler = new Crawler(limiterOpts, callback);

// Listener invoked once the crawl finishes.
var onDone = function() {
// crawling done
};

// Subscribe to the crawler's 'completed' event.
crawler.on('completed', onDone);