npm package discovery and stats viewer.

Discover Tips

  • General search

    [free text search, go nuts!]

  • Package details

    pkg:[package-name]

  • User packages

    @[username]

Sponsor

Optimize Toolset

I’ve always been into building performant and accessible sites, but lately I’ve been taking it extremely seriously. So much so that I’ve been building a tool to help me optimize and monitor the sites that I build to make sure that I’m making an attempt to offer the best experience to those who visit them. If you’re into performant, accessible and SEO friendly sites, you might like it too! You can check it out at Optimize Toolset.

About

Hi 👋, I’m Ryan Hefner, and I built this site for me, and you! The goal of this site was to provide an easy way for me to check the stats on my npm packages, both for prioritizing issues and updates, and to give me a little kick in the pants to keep up on stuff.

As I was building it, I realized that I was actually using the tool to build the tool, and figured I might as well put this out there and hopefully others will find it to be a fast and useful way to search and browse npm packages as I have.

If you’re interested in other things I’m working on, follow me on Twitter or check out the open source projects I’ve been publishing on GitHub.

I am also working on a Twitter bot for this site to tweet the most popular, newest, random packages from npm. Please follow that account now and it will start sending out packages soon–ish.

Open Software & Tools

This site wouldn’t be possible without the immense generosity and tireless efforts from the people who make contributions to the world and share their work via open source initiatives. Thank you 🙏

© 2024 – Pkg Stats / Ryan Hefner

almighty-parser

v1.0.6

Published

crawler parser

Downloads

2

Readme

爬虫解析器

针对爬虫项目的指定网站解析器。

npm npm license

API

  • [x] getLinks
  • [x] getContent
  • [x] parse
  • [x] isArticleUrl
  • [x] getIdFromArticleUrl

配置参数

文档

完整示例

'use strict';
/**
 * Zhonghua Meishi (Chinese cuisine)
 * http://www.zhms.cn/
 *
 * Parser configuration for this site: where crawling starts, which URLs count
 * as list pages or content pages, and which fields to extract from a content page.
 */
module.exports = {
    // Domain: the site's domain name; once set, only pages under these domains are processed.
    // NOTE(review): the value is a full URL rather than a bare host name — confirm what the parser expects.
    domains: 'http://www.zhms.cn/',
    // Entry page links; crawling starts from each of these links.
    scanUrls: ['http://www.zhms.cn/'],
    // Regexes for list-page URLs; pages matching any of them are treated as list pages.
    listUrlRegexes: [/http:\/\/www\.zhms\.cn\/\w+\/[0-9_]+[\/]?$/],
    // Regexes for content-page URLs; pages matching any of them are treated as content pages.
    contentUrlRegexes: [/http:\/\/www\.zhms\.cn\/\w+\/\d+\.html$/],
    // Unique identifier.
    // NOTE(review): same pattern as contentUrlRegexes and it has no capture group —
    // confirm how the parser derives an id from it.
    sourceId: [/http:\/\/www\.zhms\.cn\/\w+\/\d+\.html$/],
    // Data to extract from content pages.
    fields: [{
        // Title
        name: 'title',
        selector: ['.zuofa-tit-bar h1'],
        format: 'text'
    }, {
        // Details (article body)
        name: 'content',
        selector: [
            '.zuofa-tit-bar img', // Lead image
            '.zuofa-tit-bar .zf-t3', // Description
            '.yongliao', // Ingredients
            '.zf-step'], // presumably the preparation steps — TODO confirm
        format: 'html'
    }, {
        // Author
        name: 'author',
        selector: ['.user-shuju p'],
        format: 'text'
    }, {
        // Tags
        name: 'tags',
        format: 'default',
        defaultValue: ['美食', '健康']
    }, {
        // Comment count
        name: 'comments',
        format: 'default',
        defaultValue: 0,
        isNumber: true
    }, {
        // View count
        name: 'hits',
        format: 'text',
        selector: ['.zuofa-tit-bar .zf-t4 .s-cf64'],
        defaultValue: 0,
        isNumber: true
    }, {
        // Like count
        name: 'likes',
        format: 'text',
        selector: ['.zuofa-tit-bar .zf-t4 .zanBtn'],
        defaultValue: 0,
        isNumber: true
    }, {
        // Publish date
        name: 'published_at',
        format: 'text',
        selector: ['.zuofa-tit-bar .zf-t2 dd:last-child span'],
        isTimestamp: true
    }, {
        // Page keywords
        name: 'keywords',
        format: 'meta',
        selector: ['meta[name="keywords"]']
    }, {
        // Page description
        name: 'description',
        format: 'meta',
        selector: ['meta[name="description"]']
    }],
    // Whether to simulate a user request.
    userAgent: null,
    // Character encoding; defaults to utf-8.
    charset: null,
    // Page format [html/json/jsonp].
    format: 'html'
};

测试示例

// Load the site-specific parser class exported by ../index.js.
const { ZhmsParser } = require('../index.js');

// Single parser instance shared by every check in this script.
const parser = new ZhmsParser();

// URL passed to every parser call below; the commented-out line is an
// alternative sample URL that can be swapped in.
const url = 'http://www.zhms.cn/userhome/678';
// const url = 'http://www.zhms.cn/zf/499687.html'

// Names of the checks that threw. The array is only ever appended to, so the
// binding itself is declared `const`.
const errorItems = [];

/**
 * Checks `parser.parse`: parses the shared `url` and logs whatever comes back.
 *
 * Any error is reported through `console.error` and the check's name is added
 * to `errorItems`; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function testParseDate () {
    try {
        console.log('获取数据内容为', await parser.parse(url));
    } catch (err) {
        const reason = err.message;
        console.error('[抓取数据出错]', reason);
        errorItems.push('testParseDate');
    }
}
/**
 * Checks `parser.isArticleUrl`: asks the parser whether the shared `url` is a
 * detail (article) page and logs the answer.
 *
 * Any error is reported through `console.error` and the check's name is added
 * to `errorItems`; nothing is rethrown.
 *
 * @returns {void}
 */
function testIsArticleUrl () {
    try {
        console.log('获取数据内容为', parser.isArticleUrl(url));
    } catch (err) {
        const reason = err.message;
        console.error('[抓取数据出错]', reason);
        errorItems.push('testIsArticleUrl');
    }
}
/**
 * Checks `parser.getIdFromArticleUrl`: requests the unique identifier for the
 * shared `url` and logs it.
 *
 * Any error is reported through `console.error` and the check's name is added
 * to `errorItems`; nothing is rethrown.
 *
 * @returns {void}
 */
function testGetIdFromArticleUrl () {
    try {
        console.log('获取数据内容为', parser.getIdFromArticleUrl(url));
    } catch (err) {
        const reason = err.message;
        console.error('[抓取数据出错]', reason);
        errorItems.push('testGetIdFromArticleUrl');
    }
}

/**
 * Checks `parser.getContent`: fetches the detail-page content for the shared
 * `url` and logs it.
 *
 * Any error is reported through `console.error` and the check's name is added
 * to `errorItems`; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function testGetContent () {
    try {
        console.log('获取数据内容为', await parser.getContent(url));
    } catch (err) {
        const reason = err.message;
        console.error('[抓取数据出错]', reason);
        errorItems.push('testGetContent');
    }
}

/**
 * Checks `parser.getLinks`: requests the links for the shared `url` and logs
 * the result.
 *
 * Any error is reported through `console.error` and the check's name is added
 * to `errorItems`; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function testGetLinks () {
    try {
        console.log('获取数据内容为', await parser.getLinks(url));
    } catch (err) {
        const reason = err.message;
        console.error('[抓取数据出错]', reason);
        errorItems.push('testGetLinks');
    }
}

/**
 * Test entry point: runs the five checks one after another, printing a banner
 * before and after each, then prints a summary built from `errorItems`.
 *
 * @returns {Promise<void>}
 */
async function start () {
    const divider = '------';
    // Each entry pairs the banner text with the check to run; order matters.
    const steps = [
        ['测试步骤1 获取内容', testParseDate],
        ['测试步骤2 校验链接是否为详情页', testIsArticleUrl],
        ['测试步骤3 获取页面链接的唯一标示', testGetIdFromArticleUrl],
        ['测试步骤4 获取详情页内容', testGetContent],
        ['测试步骤5 获取列表页内容', testGetLinks]
    ];

    console.log('测试开始');
    console.log(divider);

    for (const [label, runStep] of steps) {
        console.log(label);
        // Awaiting a synchronous check is harmless and keeps the steps strictly sequential.
        await runStep();
        console.log(`${label} 结束`);
        console.log(divider);
    }

    console.log('所有接口均已测试结束');

    // Summarise: list the failed checks, or report that everything passed.
    const summary = errorItems.length
        ? ['测试结果: ', errorItems.join(','), '异常。']
        : ['测试结果: 所有接口都正常。'];
    console.log(...summary);
}
// Kick off the run. `start` is async, so attach a rejection handler instead of
// leaving the promise floating: an unexpected failure is reported and reflected
// in the process exit code.
start().catch((err) => {
    console.error('[测试入口异常]', err);
    process.exitCode = 1;
});