pdfdataextract
v3.2.0
Published
Extract data from a pdf with pure javascript
Downloads
39,024
Maintainers
Readme
pdfdataextract
Extract data from a pdf with pure javascript.
Inspired by https://www.npmjs.com/package/pdf-parse, which is currently unmaintained.
Install
npm install pdfdataextract
Docs
Full documentation is available at the wiki
Usage
import { PdfData, VerbosityLevel } from 'pdfdataextract';
import { readFileSync } from 'fs';
const file_data = readFileSync('some_pdf_file.pdf');
// all options are optional
PdfData.extract(file_data, {
password: '123456', // password of the pdf file
pages: 1, // how many pages should be read at most
sort: true, // sort the text by text coordinates
verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing
get: { // enable or disable data extraction (all are optional and enabled by default)
pages: true, // get number of pages
text: true, // get text of each page
fingerprint: true, // get fingerprint
outline: true, // get outline
metadata: true, // get metadata
info: true, // get info
permissions: true, // get permissions
},
}).then((data) => {
data.pages; // the number of pages
data.text; // an array of text pages
data.fingerprint; // fingerprint of the pdf document
data.outline; // outline data of the pdf document
data.info; // information of the pdf document, such as Author
data.metadata; // metadata of the pdf document
data.permissions; // permissions for the document
});
import { PdfDataExtractor, VerbosityLevel } from 'pdfdataextract';
import { readFileSync } from 'fs';
const file_data = readFileSync('some_pdf_file.pdf');
// all options are optional
PdfDataExtractor.get(file_data, {
password: '123456', // password of the pdf file
verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing
}).then((extractor) => {
extractor.pages; // the number of pages
extractor.fingerprint; // fingerprint of the pdf document
extractor.getText(1, true).then((text) => {
// an array of text pages (only one page and sorted)
});
extractor.getText([2]).then((text) => {
// an array of text pages (only the second page)
});
extractor.getOutline().then((outline) => {
// outline data of the pdf document
});
extractor.getMetadata().then((metadata) => {
// metadata of the pdf document
});
extractor.getPermissions().then((permissions) => {
// permissions for the document
});
extractor.close();
});
Test
npm test
Maybe TODOs
- [ ] try to find the line number of the outline
- [ ] create pdf-dist-es5 builds (separate repo)