khmertokenizer
v0.0.5
Published
Basic Khmer Tokenizer
Downloads
2
Readme
Khmer Tokenizer
A fast Khmer text tokenizer that never drops input: every character of the source string, including spaces, digits, and non-Khmer letters, appears in the output tokens.
import { tokenize } from 'khmertokenizer';
tokenize("ភាសាខ្មែរ១២ 123 ABC")
// => ["ភា","សា","ខ្មែ","រ","១","២"," ","1","2","3"," ","A","B","C"]
Iterator
import { tokenizeAsIterator } from 'khmertokenizer';
for (const c of tokenizeAsIterator("ភាសាខ្មែរ១២ 123 ABC")) {
console.log(c);
}
Grapheme Validation
import { tokenize, isInvalidKhmerGrapheme } from 'khmertokenizer';
const input = "ភាសាខ្មែរ១២ 123 ABC ២ ៗាា"
const output = tokenize(input)
.filter(c => !isInvalidKhmerGrapheme(c)) // remove invalid graphemes
.join("")
//=> "ភាសាខ្មែរ១២ 123 ABC ២ ៗ"