node-html-scraper
v1.0.9
Published
This library will parse html and help you scrap data from it and convert it to json.
Downloads
23
Maintainers
Readme
node-html-scraper
This is a typescript library that will help you parser, search and scrap data from html. with this library it will be very easy to convert the html to json with a very little code.
Installation
npm i node-html-scraper
Usage
interface ITest {
name: string;
image: string;
children: IChild[];
}
interface IChild {
name: any;
}
import Parser from 'node-html-scraper'
var url = "www.google.se";
var fetchedHtml=`<html>
<div class="item">
<p class="title">Item header 1</p>
<img src="/test.png" />
<ul>
<li>item 1</li>
<li>item 2</li>
</ul>
</div>
<div class="item">
<p class="title">Item header 2</p>
<img src="/test2.png" />
<ul>
<li>item 1</li>
<li>item 2</li>
</ul>
</div>
</html>`
// we add the url here is to be able to complete the image src or link href if its a relative path.
var parsedHtml = Parser.parseHtml(fetchedHtml, url);
var items = parsedHtml.find(".item").toObject<ITest>()
.field(x => x.name, a => a.select('.title').text())
.field(x=> x.image, x=> x.select("img").attr("src").url())
.field(x => x.children, x => x.find("ul > li").map(m=> {
return {
name: m.text(),
};
})
).toList();
console.log(item);
Tranlsate a searchable Config
You could also build a config json where you build your object throw it.
note: for path
see the settings below.
var config = [
{
column: 'name',
path: 's=.title&te=false',
},
{
column: 'image',
path: 's=img&at=src&ul',
},
{
column: 'children',
isArray: true,
path: 'f=ul > li',
nested: [
{
column: 'name',
path: 'te=false',
},
],
},
];
var parsedHtml = Parser.parseHtml(fetchedHtml, url);
// see the result below
var items = parsedHtml.find(".item").toObject<ITest>().translate(config).toList();
Result
This is how items will look like when its returned
▼0:
name:"Item header 1"
image:"www.google.se/test.png"
▼children:
►0:{name:"item 1"}
►1:{name:"item 2"}
▼1:
name:"Item header 2"
image:"www.google.se/test2.png"
▼children:
►0:{name:"item 1"}
►1:{name:"item 2"}
IConfig Path selector settings
you could build your path to your html node by using the settings below.
Those path are seperated by &
as queryString.
// example s=.title
s= select
// example f=.title
f= find
// example s=.title&te=false/true
te= text
// example at=src
at= attr
// example ch
ch= cleanHtml
// example in
ih= innerHtml
// example oh
oh= outerHtml
// example pa
pa= parent
// example eq=0
eq= index
// example ft
ft= first
// lt
lt= last
// example ct=.item
ct= closest
// example at=src&ul
ul= url
ISelector
/** find(querySelectorAll) items by its selector. available selector is css5 */
find: (selector: string) => ISelector;
/** find(querySelector) an item by its selector. available selector is css5 */
select: (selector: string) => ISelector;
/** get closest by selector */
closest: (selector: string) => ISelector;
/** get attr of the element, eg .attr("title|alt").text()
* This will try to find "title" if it failed or its empty it will try "alt" instead.
*/
attr: (selector: string) => ISelector;
/** get the full url. if the attr is full url then return it or return the baseurl/attr value
* eg .attr("src").url()
*/
url: () => string;
/** get the last element of the selected items */
last: () => ISelector;
/** get the selected text, this differ from innerhtml as the text will be cleaned */
text: (structuredText?: boolean) => string;
/** selected node value */
nodeValue: () => string | undefined;
/** attr value without changing it */
attValue: () => string | undefined;
/** remove script, style , input etc and clean the html object before returned it */
cleanInnerHTML: () => string;
/** the orginal innerhtml without changing it */
innerHTML: () => string;
/** the orginal outerhtml without changing it */
outerHTML: () => string;
/** return all innertext of the selected items */
textArray: () => string[];
/** the selected attr has value */
hasValue: () => boolean;
/** the selected element exists */
hasElement: () => boolean;
/** the selected elements exists */
hasElements: () => boolean;
/** length of the selected elements */
length:()=> number;
/** remove selected items. available selector is css5 */
remove: (selector: string) => ISelector;
/** loop throw each item */
forEach:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => void)=> void;
/** loop throw each item and do a manual seach */
where:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => boolean) => ISelector;
/** loop throw each item and return the desire extracted result */
map:<T>(func: (jq: ISelector, index?: number, arr?: T[]) => T) => T[];
reduce: <T>(func: (itemType: any, jq: ISelector, index?: number, arr?: T[]) => void,itemType: any) => any;
/** parent of the selected item */
parent:()=> ISelector;
/** find at index of the selected items */
eq: (index: number) => ISelector;
/** children of the selected item */
children: ()=> ISelector;
/** convert the parsed html to object */
toObject:<T extends {}>() => IObjectContructor<T>