new-scraping
v0.0.1
Published
scraping application for Finciero
Downloads
1
Readme
Tasks Structure
'use strict';

// Declare every module-level name up front, in this order: constants,
// classes, dependencies, then the task itself. Under strict mode an
// assignment to an undeclared name throws, so each name assigned below
// must appear in this list.
var CONSTANT1, Class, taskName;

// Module-private helpers are defined before any assignment.
function helperFunction () {
}

// ...

Class = require('class');

// ...

CONSTANT1 = 'foo';

// ...

taskName = Yakuza.task('scraper', 'agent', 'taskName');

taskName.builder(function (job) {
  // ...
});

taskName.main(function (task, http, params) {
  var loginFormOpts;

  // Options shared by the requests below; per-request values are supplied
  // through `.build()` at the call site.
  loginFormOpts = http.optionsTemplate({
    'headers': LOGIN_HEADERS,
    'follow_max': 1
  });

  // Request: Request summary
  // ========================
  // Detailed explanation if necessary.
  // lorem ipsum dolor sit amet
  // ...
  http.get('FormRetrieval', loginFormOpts.build({url: URL_FORM}), function (err, res, body) {
    var a, b, c;

    if (err) {
      task.fail(err, 'Request error');
      // Stop here: without this return the parsing below would still run
      // and `task.success` would be called after the task already failed.
      return;
    }

    // Parsing: Parsing explanation
    // ----------------------------
    try {
      // Parse account name
      superComplexOperation(/[a-b]+/g).match();
    } catch (error) {
      task.fail(error, 'Failed parsing form');
      return;
    }

    task.success(data);
  });
});
Handling Asynchrony
Lodash Loops
Stopping a loop
// DO NOT DO
// Returning false from the callback is used here to break out of `_.each`.
// NOTE(review): presumably discouraged because the code after the loop gets
// no value telling it whether the loop was cut short — confirm with the team.
_.each(collection, function (elem) {
return false;
});
// Do this instead
// `_.every` stops at the first callback that returns false and hands that
// outcome back, so `noErrors` records whether every element was processed.
noErrors = _.every(collection, function (elem) {
try {
// .. Parsing
} catch (error) {
// .. Handle error
// Returning false stops the iteration and makes `noErrors` false.
return false;
}
// Returning true moves on to the next element.
return true;
});
// Bail out of the enclosing function when any element failed.
if (!noErrors) {
return;
}
// Won't reach this line if an error happened
Handling asynchronous requests
var promise1, promise2;
getChecking = Q.defer();
getCredit = Q.defer();
// Request: Get Checking accounts
// ==============================
http.get(..., function (){
// Parsing:
try {
// Parse stuff
getChecking.resolve(result);
catch (error) {
getChecking.reject({error: error, message: 'Something happened'});
return;
}
});
// Request: Get Credit accounts
// ==============================
http.get(..., function (){
// Parsing:
try {
// Parse stuff
getCredit.resolve(result);
catch (error) {
getCredit.reject({error: error, message: 'Something happened'});
return;
}
});
Q.all([getChecking, getCredit]).then(function (results) {
// results[0] => get checking results
// results[1] => get credit results
task.success(...);
}, function (error) {
task.fail(error.error, error.message);
});
Directory structure
- app.js
- banks
- bank1
- login
- login.task.js ...
- bank1.agent.js Readme.md ... banks.scraper.js
- login
- bank1
Component declarations
- Should always be CapitalizedCamelCase: SomeTask, SomeScraper
Documenting scrapers
- Agents should have a Readme.md with details about the bank and important info/links etc.
- Tasks should have explanations for all requests unless they are extremely trivial
- Tasks can have a general explanation, detailing how requests are made by the task.
- Code which is not easily understood on its own should be preceded by a comment that briefly explains it
- Shares should be preceded by a comment explaining why they are necessary and which tasks require the shared value
- Parsing blocks should be explained in a preceding comment block
Example:
// This awesome task handles tasking taskfulness, it is important to keep track of
// the awesomes received and re-send them to the awesome-receiver and bla bla bla
Yakuza.task('Scraper', 'Agent', 'Task').main(function (task, http, params) {
  var awesomeOpts;

  awesomeOpts = http.build({
    url: URL_AWESOME,
    data: {
      important: 'things'
    },
    follow_max: 1
  });

  // Request: send important things
  // ==============================
  // important things need to be sent to set important session cookies
  // NOTE(review): `awesomeOpts` is already the result of `http.build` above,
  // so building it again here may be redundant — confirm against the http API.
  http.post(http.build(awesomeOpts), function (err, res, body) {
    // Parsing: retrieve important things
    // ----------------------------------
    // This parses important things and skips the last two rows so that nothing explodes and stuff
    try {
      // Replaces all letters with 'awesome' because why not
      body.replace(/[a-zA-Z]+/g, 'awesome');
    } catch (error) {
      // ...
    }

    // Share: Used by `FooTask` for its requests
    task.share('importantThings', body);

    task.success('hi');
  });
});