nest-crawler
v1.9.0
Published
The easiest crawling and scraping module for NestJS
Downloads
137
Maintainers
Readme
Installation
$ npm install --save nest-crawler
Usage
First, register it in the application module so that Nest can handle dependencies:
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
@Module({
imports: [
NestCrawlerModule,
],
})
export class AppModule {}
Then, just import it and use it:
crawler.module.ts
import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
@Module({
imports: [
NestCrawlerModule,
],
})
export class CrawlerModule {}
crawler.service.ts
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
// scraping the specific page
public async scrape(): Promise<void> {
interface ExampleCom {
title: string;
info: string;
content: string;
}
const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
},
},
});
console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
// }
}
// crawling multiple pages is also supported
public async crawl(): Promise<void> {
interface HackerNewsPage {
title: string;
}
const pages: HackerNewsPage[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});
console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
Recipe
Single Page Scraping
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
public async scrape(): Promise<void> {
interface ExampleCom {
title: string;
info: string;
content: string;
}
const data: ExampleCom = await this.crawler.fetch({
target: 'http://example.com',
fetch: {
title: 'h1',
info: {
selector: 'p > a',
attr: 'href',
},
content: {
selector: '.content',
how: 'html',
}
},
});
console.log(data);
// {
// title: 'Example Domain',
// info: 'http://www.iana.org/domains/example',
// content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
// }
}
}
Multi-Page Crawling
You Already Know the Target URLs
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
public async crawl(): Promise<void> {
interface Site {
title: string;
}
const sites: Site[] = await this.crawler.fetch({
target: [
'https://example1.com',
'https://example2.com',
'https://example3.com',
],
fetch: (data: any, index: number, url: string) => ({
title: 'h1',
}),
});
console.log(sites);
// [
// { title: 'An easiest crawling and scraping module for NestJS' },
// { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
// { title: '[Experimental] React SSR as a view template engine' }
// ]
}
}
You Don't Know the Target URLs and Want to Crawl Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
public async crawl(): Promise<void> {
interface Page {
title: string;
}
const pages: Page[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
// fetch each `https://news.ycombinator.com/${x}` and scrape data
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});
console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}
You Need to Pass Data Dynamically
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
public async crawl(): Promise<void> {
interface Img {
src: string;
}
const images: Img[] = await this.crawler.fetch({
target: {
url: 'https://some.image.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://some.image.com${x}`,
},
fetch: {
imageIds: {
listItem: 'div.image',
data: {
id: {
selector: 'div.image-wrapper',
attr: 'data-image-id',
},
},
},
},
},
// fetch each `https://some.image.com${x}`, pass data and scrape data
fetch: (data: any, index: number, url: string) => ({
src: {
convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`,
},
}),
});
console.log(images);
// [
// { src: 'https://some.image.com/images/1.png' },
// { src: 'https://some.image.com/images/2.png' },
// ...
// ...
// { src: 'https://some.image.com/images/100.png' }
// ]
}
}
Waitable (by using puppeteer)
import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';
@Injectable()
export class CrawlerService {
constructor(
private readonly crawler: NestCrawlerService,
) {}
public async crawl(): Promise<void> {
interface Page {
title: string;
}
const pages: Page[] = await this.crawler.fetch({
target: {
url: 'https://news.ycombinator.com',
iterator: {
selector: 'span.age > a',
convert: (x: string) => `https://news.ycombinator.com/${x}`,
},
},
waitFor: 3 * 1000, // wait for the content loaded! (like single page apps)
fetch: (data: any, index: number, url: string) => ({
title: '.title > a',
}),
});
console.log(pages);
// [
// { title: 'Post Title 1' },
// { title: 'Post Title 2' },
// ...
// ...
// { title: 'Post Title 30' }
// ]
}
}