@yoannchb/tokenize
v1.0.1
Published
An advanced tokenizer made with typescript
Downloads
1
Maintainers
Readme
tokenize
An advanced tokenizer made with typescript
Update
See the CHANGELOG file
Installation
$ npm i @yoannchb/tokenize
CDN
<script src="https://unpkg.com/@yoannchb/tokenize@1.0.1/dist/index.js"></script>
Import
import Tokenizer from "@yoannchb/tokenize";
// OR
const Tokenizer = require("@yoannchb/tokenize");
API
See the example
NOTE: With TypeScript, declare your tokens `as const` to get proper typing.
const tokenizer = new Tokenizer({ tokens });
tokenizer.tokenize("my string");
Tokenizer(options)
prioritize: boolean
Prioritize the regexes according to the order of their keys in the tokens object (by default it's false).
const tokenizer = new Tokenizer({
tokens: {
AA: /aa/,
BABA: /baba/,
},
prioritize: true,
});
const result = tokenizer.tokenize("babaaa");
// result[0].type = "UNKNOWN", result[0].value = "bab"
// result[1].type = "AA"
// result[2].type = "UNKNOWN", result[2].value = "a"
const tokenizer = new Tokenizer({
tokens: {
AA: /aa/,
BABA: /baba/,
},
prioritize: false,
});
const result = tokenizer.tokenize("babaaa");
// result[0].type = "BABA"
// result[1].type = "AA"
defaultType: string
Change the type assigned to a token that matches none of the token rules (by default it's UNKNOWN).
callback: (token, previousTokens) => Token | null
Set a callback function that is called every time a new token is matched.
You can also return null if you don't want to keep this token.
concatDefaultType: boolean
Concatenate adjacent tokens that have the default type into a single token (by default it's true).
authorizeAdditionalTokens: boolean
Authorize custom token types returned by the callback function. It also allows you to add additional token types to the TypeScript typing.
const tokenizer = new Tokenizer({
tokens,
callback: (tk, prevTk) => ({ ...tk, type: "SOMETHING NOT IMPLEMENTED" }),
});
Example
Here is a simple example of a lexer for the JSON syntax:
Code
const tokens = {
STRING: Tokenizer.BUILT_IN_RULES.DOUBLE_QUOTE_STRING, // /(")(?<content>(?:\\\1|.)*?)\1/
NUMBER: Tokenizer.BUILT_IN_RULES.NUMBER,
WHITE_SPACE: Tokenizer.BUILT_IN_RULES.WHITE_SPACES,
COMA: /,/,
COLON: /:/,
TRUE_BOOLEAN: /true/,
FALSE_BOOLEAN: /false/,
NULL: /null/,
START_BRACKET: /\[/,
END_BRACKET: /\]/,
START_BRACE: /\{/,
END_BRACE: /\}/,
} as const;
const tokenizer = new Tokenizer({
tokens,
callback: (tk, prevTk) => {
switch (tk.type) {
case "WHITE_SPACE":
return null; // Remove white spaces
case "UNKNOWN":
throw new Error(
`Invalide JSON: "${tk.value}"\nAt line: ${tk.startLine}, column: ${tk.startColumn}\nTo line: ${tk.endLine}, column: ${tk.endColumn}`
);
}
return tk;
},
});
const result = tokenizer.tokenize(
`{ "greeting": "Hello World !", "error": false, "note": 20, "bool": "false" }`
);
Result
[
{
type: "START_BRACE",
value: "{",
startLine: 0,
startColumn: 0,
endLine: 0,
endColumn: 1,
},
{
type: "STRING",
value: '"greeting"',
groups: { content: "greeting" },
startLine: 0,
startColumn: 2,
endLine: 0,
endColumn: 12,
},
{
type: "COLON",
value: ":",
startLine: 0,
startColumn: 12,
endLine: 0,
endColumn: 13,
},
{
type: "STRING",
value: '"Hello World !"',
groups: { content: "Hello World !" },
startLine: 0,
startColumn: 14,
endLine: 0,
endColumn: 29,
},
{
type: "COMA",
value: ",",
startLine: 0,
startColumn: 29,
endLine: 0,
endColumn: 30,
},
{
type: "STRING",
value: '"error"',
groups: { content: "error" },
startLine: 0,
startColumn: 31,
endLine: 0,
endColumn: 38,
},
{
type: "COLON",
value: ":",
startLine: 0,
startColumn: 38,
endLine: 0,
endColumn: 39,
},
{
type: "FALSE_BOOLEAN",
value: "false",
startLine: 0,
startColumn: 40,
endLine: 0,
endColumn: 45,
},
{
type: "COMA",
value: ",",
startLine: 0,
startColumn: 45,
endLine: 0,
endColumn: 46,
},
{
type: "STRING",
value: '"note"',
groups: { content: "note" },
startLine: 0,
startColumn: 47,
endLine: 0,
endColumn: 53,
},
{
type: "COLON",
value: ":",
startLine: 0,
startColumn: 53,
endLine: 0,
endColumn: 54,
},
{
type: "NUMBER",
value: "20",
startLine: 0,
startColumn: 55,
endLine: 0,
endColumn: 57,
},
{
type: "COMA",
value: ",",
startLine: 0,
startColumn: 57,
endLine: 0,
endColumn: 58,
},
{
type: "STRING",
value: '"bool"',
groups: { content: "bool" },
startLine: 0,
startColumn: 59,
endLine: 0,
endColumn: 65,
},
{
type: "COLON",
value: ":",
startLine: 0,
startColumn: 65,
endLine: 0,
endColumn: 66,
},
{
type: "STRING",
value: '"false"',
groups: { content: "false" },
startLine: 0,
startColumn: 67,
endLine: 0,
endColumn: 74,
},
{
type: "END_BRACE",
value: "}",
startLine: 0,
startColumn: 75,
endLine: 0,
endColumn: 76,
},
];