flowesh
v1.2.4
Published
Flowesh is the non-cluster version of floodesh. It is a lightweight, easy-to-maintain, middleware-based web spider.
Downloads
18
Readme
Flowesh
Flowesh is the non-cluster version of floodesh. It is a lightweight, easy-to-maintain, middleware-based web spider.
Install
npm install flowesh
Usage
/**
 * Example spider definition that gets attached to a Flowesh instance.
 *
 * @constructor
 */
function Spider() {
  this.name = 'MySpider';
}

Spider.prototype = {
  /**
   * Initial tasks. Each entry carries the request options under `opt` and the
   * name of the spider method that handles the response under `next`.
   * The array lives on the prototype, so it is shared by every Spider instance.
   */
  seed: [
    {
      opt: {
        uri: 'http://www.baidu.com',
      },
      next: 'parse',
    },
  ],

  /**
   * Init-stage hook: do whatever you want on the init stage.
   *
   * @param {Function} done - callback to invoke once initialisation is finished
   */
  onInit(done) {
    // NOTE(review): these entries are flat ({uri, priority}) and have no
    // `opt`/`next` wrapper, unlike the initial seed entry and the task pushed
    // in `parse` -- confirm that flowesh accepts this shape.
    this.seed.push({
      uri: 'http://www.hao123.com',
      priority: 3,
    });
    this.seed.push({
      uri: 'http://www.qq.com',
      priority: 1,
    });
    done();
  },

  /**
   * Optional hook; happens after the response goes through all the middlewares.
   *
   * @param {{get: Function}} dataSet - container whose 'data' entry is logged
   */
  onData(dataSet) {
    console.log(dataSet.get('data'));
  },

  /**
   * Optional hook; happens after onData.
   *
   * @param {Object} ctx - context of the finished task; `ctx.request.url` is logged
   */
  onComplete(ctx) {
    console.log('%s complete', ctx.request.url);
  },

  /**
   * Response handler referenced by `next: 'parse'`. Logs the page title when
   * one is present, queues a follow-up task and signals completion.
   *
   * @param {Object} ctx - context exposing `content` (string) and `tasks` (array)
   * @param {Function} done - callback to invoke once parsing is finished
   */
  parse(ctx, done) {
    // The slash of the closing tag must be escaped as `\/`; the previous
    // pattern contained `\t`, which is a TAB, so it could never match.
    const titleMatch = ctx.content.match(/<title>(.*?)<\/title>/);
    // Pages without a <title> must not abort the task before done() is called.
    if (titleMatch !== null) {
      console.log(titleMatch[1]);
    }
    // if you have new tasks generated
    ctx.tasks.push({
      opt: {
        uri: 'http://www.163.com',
        priority: 0,
      },
      next: 'parse',
    });
    done();
  },
};
// Load flowesh itself plus the middleware factories used in this example.
const Flowesh = require('flowesh'),
// response middleware that detects the charset of the response
charsetparser = require('mof-charsetparser'),
// response middleware that converts the response to utf-8 encoding
iconv = require('mof-iconv'),
// response middleware that loads response into a jQuery object which has the same usage as jQuery
cheerio = require('mof-cheerio'),
// request middleware that corrects your queue options, e.g. attribute 'jquery' in your queue option will be changed into 'jQuery'
normalizer = require('mof-normalizer'),
// request middleware that adapts your queue options to meet request(https://github.com/request/request) requirements
reqadapter = require('mof-reqadapter');
// Options handed to the Flowesh constructor: a `schedule` section and a
// `request` section.
const config = {
  schedule: {
    concurrent: 1,
    rate: 5000,
    priorityRange: 10, // default 10
  },
  request: {
    retry: 3,
  },
};
// Create the crawler from the config and attach a spider instance to it.
const flowesh = new Flowesh(config).attach(new Spider());

// middlewares will be executed in order, so the arrays below list them in
// the order they are registered.
for (const createMiddleware of [normalizer, reqadapter]) {
  flowesh.requestmw.use(createMiddleware());
}
for (const createMiddleware of [charsetparser, iconv, cheerio]) {
  flowesh.responsemw.use(createMiddleware());
}

flowesh.start();