nlcst-parse-japanese
v1.4.0
Published
Parse Japanese text and output as NLCST.
Downloads
9,544
Readme
nlcst-parse-japanese
Parse Japanese text and output as NLCST.
Install
Install with npm:
npm install nlcst-parse-japanese
Tokenized Data
This library uses kuromoji.js via kuromojin.
Each NLCST node's data
property contains the following values.
| Property | Example | Description |
| --------------- | :---------- | ------------------------- |
| word_type | 'KNOWN' | 単語タイプ(辞書に登録されている単語ならKNOWN) |
| surface_form | '黒文字' | 表層形 |
| pos | '名詞' | 品詞 |
| pos_detail_1 | '一般' | 品詞細分類1 |
| pos_detail_2 | '' | 品詞細分類2 |
| pos_detail_3 | '' | 品詞細分類3 |
| conjugated_type | '' | 活用型 |
| conjugated_form | '' | 活用形 |
| basic_form | '黒文字' | 基本形 |
| reading | 'クロモジ' | 読み |
| pronunciation | 'クロモジ' | 発音 |
Usage
import {JapaneseParser} from "nlcst-parse-japanese";
const japaneseParser = new JapaneseParser();
const text = "ようこそ、日本へ。";
japaneseParser.ready().then(() => {
const CST = japaneseParser.parse(text);
console.log(CST)
});
/*
{
"type": "RootNode",
"children": [
{
"type": "ParagraphNode",
"children": [
{
"type": "SentenceNode",
"children": [
{
"type": "WordNode",
"children": [
{
"type": "TextNode",
"value": "ようこそ",
"position": {
"start": {
"line": 1,
"column": 1,
"offset": 0
},
"end": {
"line": 1,
"column": 5,
"offset": 4
}
},
"data": {
"word_id": 34120,
"word_type": "KNOWN",
"word_position": 1,
"surface_form": "ようこそ",
"pos": "感動詞",
"pos_detail_1": "*",
"pos_detail_2": "*",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "ようこそ",
"reading": "ヨウコソ",
"pronunciation": "ヨーコソ"
}
}
],
"position": {
"start": {
"line": 1,
"column": 1,
"offset": 0
},
"end": {
"line": 1,
"column": 5,
"offset": 4
}
},
"data": {
"word_id": 34120,
"word_type": "KNOWN",
"word_position": 1,
"surface_form": "ようこそ",
"pos": "感動詞",
"pos_detail_1": "*",
"pos_detail_2": "*",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "ようこそ",
"reading": "ヨウコソ",
"pronunciation": "ヨーコソ"
}
},
{
"type": "WordNode",
"children": [
{
"type": "TextNode",
"value": "、",
"position": {
"start": {
"line": 1,
"column": 5,
"offset": 4
},
"end": {
"line": 1,
"column": 6,
"offset": 5
}
},
"data": {
"word_id": 51340,
"word_type": "KNOWN",
"word_position": 5,
"surface_form": "、",
"pos": "名詞",
"pos_detail_1": "数",
"pos_detail_2": "*",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "、",
"reading": "、",
"pronunciation": "、"
}
}
],
"position": {
"start": {
"line": 1,
"column": 5,
"offset": 4
},
"end": {
"line": 1,
"column": 6,
"offset": 5
}
},
"data": {
"word_id": 51340,
"word_type": "KNOWN",
"word_position": 5,
"surface_form": "、",
"pos": "名詞",
"pos_detail_1": "数",
"pos_detail_2": "*",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "、",
"reading": "、",
"pronunciation": "、"
}
},
{
"type": "WordNode",
"children": [
{
"type": "TextNode",
"value": "日本",
"position": {
"start": {
"line": 1,
"column": 6,
"offset": 5
},
"end": {
"line": 1,
"column": 8,
"offset": 7
}
},
"data": {
"word_id": 329310,
"word_type": "KNOWN",
"word_position": 6,
"surface_form": "日本",
"pos": "名詞",
"pos_detail_1": "固有名詞",
"pos_detail_2": "地域",
"pos_detail_3": "国",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "日本",
"reading": "ニッポン",
"pronunciation": "ニッポン"
}
}
],
"position": {
"start": {
"line": 1,
"column": 6,
"offset": 5
},
"end": {
"line": 1,
"column": 8,
"offset": 7
}
},
"data": {
"word_id": 329310,
"word_type": "KNOWN",
"word_position": 6,
"surface_form": "日本",
"pos": "名詞",
"pos_detail_1": "固有名詞",
"pos_detail_2": "地域",
"pos_detail_3": "国",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "日本",
"reading": "ニッポン",
"pronunciation": "ニッポン"
}
},
{
"type": "WordNode",
"children": [
{
"type": "TextNode",
"value": "へ",
"position": {
"start": {
"line": 1,
"column": 8,
"offset": 7
},
"end": {
"line": 1,
"column": 9,
"offset": 8
}
},
"data": {
"word_id": 92260,
"word_type": "KNOWN",
"word_position": 8,
"surface_form": "へ",
"pos": "助詞",
"pos_detail_1": "格助詞",
"pos_detail_2": "一般",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "へ",
"reading": "ヘ",
"pronunciation": "エ"
}
}
],
"position": {
"start": {
"line": 1,
"column": 8,
"offset": 7
},
"end": {
"line": 1,
"column": 9,
"offset": 8
}
},
"data": {
"word_id": 92260,
"word_type": "KNOWN",
"word_position": 8,
"surface_form": "へ",
"pos": "助詞",
"pos_detail_1": "格助詞",
"pos_detail_2": "一般",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "へ",
"reading": "ヘ",
"pronunciation": "エ"
}
},
{
"type": "PunctuationNode",
"value": "。",
"position": {
"start": {
"line": 1,
"column": 9,
"offset": 8
},
"end": {
"line": 1,
"column": 10,
"offset": 9
}
},
"data": {
"word_id": 90940,
"word_type": "KNOWN",
"word_position": 9,
"surface_form": "。",
"pos": "記号",
"pos_detail_1": "句点",
"pos_detail_2": "*",
"pos_detail_3": "*",
"conjugated_type": "*",
"conjugated_form": "*",
"basic_form": "。",
"reading": "。",
"pronunciation": "。"
}
}
],
"position": {
"start": {
"line": 1,
"column": 1,
"offset": 0
},
"end": {
"line": 1,
"column": 10,
"offset": 9
}
}
}
],
"position": {
"start": {
"line": 1,
"column": 1,
"offset": 0
},
"end": {
"line": 1,
"column": 10,
"offset": 9
}
}
}
],
"position": {
"start": {
"line": 1,
"column": 1,
"offset": 0
},
"end": {
"line": 1,
"column": 10,
"offset": 9
}
}
}
*/
Options
If you have problems loading dictionaries, you can pass an accessible dicPath
to the options:
const japaneseParser = new JapaneseParser({ dicPath: '/dict' });
Changelog
See Releases page.
Running tests
Install devDependencies and run npm test:
npm i -d && npm test
Contributing
Pull requests and stars are always welcome.
For bugs and feature requests, please create an issue.
- Fork it!
- Create your feature branch:
git checkout -b my-new-feature
- Commit your changes:
git commit -am 'Add some feature'
- Push to the branch:
git push origin my-new-feature
- Submit a pull request :D
Author
License
MIT © azu
Original
The MIT License (MIT) Copyright (c) 2015 Kenichiro Murata https://github.com/muraken720/parse-japanese