@fdocs/pdf
v1.1.3
Published
A simple, lightweight library for extracting plain text from a PDF file.
Downloads
28
Maintainers
Readme
Features
- 🔥 Easy-to-use
- 🔐 Password Protection: Handle password-protected documents.
- 📄 Flexible Page Selection: Process specific pages or the entire document.
- 🚫 Line Skipping: Easily skip unwanted lines from pages.
Install
npm install @fdocs/pdf
Usage
import { pdf } from "@fdocs/pdf";
/**
 * Parse a PDF and print its extracted text lines.
 *
 * @param file - The PDF to parse; this example passes a path such as "foo.pdf".
 */
const extractTextFromPDF = async (file) => {
  // `pages` selects what to parse: "1,3-4", [3, 4] or "all". Default is ["1"].
  const parseOptions = { pages: "1" };
  const parsed = await pdf(file, parseOptions);

  // Other accessors on the parsed result:
  //   parsed.getRaw()   ==> object: coordinates (x, y, etc.)
  //   parsed.getPages() ==> total number of pages
  const textLines = parsed.getText(); // ==> array: text

  // Output the extracted text
  console.log(textLines);
};

extractTextFromPDF("foo.pdf");
API
// Shape of the options object passed as the second argument to pdf().
// Every top-level field is optional.
const options = {
  // Pages to process, e.g. "1,3-4", [3, 4] or "all". Default is ["1"].
  pages?: string | number[] | "all",
  // Password used to open a password-protected document.
  password?: string,
  // Rules for skipping unwanted lines.
  skip?: {
    // Skip lines for all pages
    global?: {
      // Presumably line numbers, in the same string/array form as `pages` — confirm.
      lines?: string | number[],
      // Presumably the number of lines to drop from the end of a page — confirm.
      lastLines?: number,
    },
    // Skip lines for specific pages
    pageSpecific?: {
      page: number,
      lines?: string | number[],
      lastLines?: number,
    }[],
    // Skip lines selected by their text.
    text?: {
      value: string | RegExp,
      match?: "contain" | "startWith" | "regex" | "exact", // Text match type
      // Presumably an additional condition on the line that follows — confirm.
      nextLine?: {
        value: string | RegExp,
        match: "contain" | "startWith" | "regex" | "exact",
      },
    }[],
    // Skip lines between a `start` match and an `end` match.
    // NOTE(review): whether the matching lines themselves are skipped is not stated — confirm.
    ranges?: {
      start: {
        value: string | RegExp,
        match?: "contain" | "startWith" | "regex" | "exact", // Start match type
      },
      end: {
        value: string | RegExp,
        match?: "contain" | "startWith" | "regex" | "exact", // End match type
      },
    }[],
  },
};
Example
/**
 * Example: parse every page of a PDF, skip unwanted lines, and print the
 * remaining text.
 *
 * All skip rules live under `skip`, using the `value` / `match` keys from the
 * API section. The former top-level `skipLinesByText` entries (keys `text` /
 * `type`) are not part of the documented options, so they are expressed here
 * as `skip.text` rules.
 *
 * @param file - The PDF to parse; this example passes a path such as "foo.pdf".
 */
const extractTextFromPDF = async (file) => {
  const content = await pdf(file, {
    pages: "all",
    skip: {
      // Skip by start/end markers.
      // NOTE(review): presumably drops the lines from the start match through the end match — confirm.
      ranges: [
        {
          start: {
            value: "I love you",
            match: "contain",
          },
          end: {
            value: "I hate you",
            match: "contain",
          },
        },
      ],
      // Skip individual lines by their text.
      text: [
        {
          value: "This document is printed/converted directly from the customer account statement printing system",
          match: "contain",
        },
        {
          value: "351",
          match: "exact",
        },
        {
          // Lines that are entirely wrapped in parentheses.
          value: /^\(.*\)$/,
          match: "regex",
        },
      ],
    },
  });

  // get the extracted text lines
  const lines = content.getText(); // ==> array: text
  // Other accessors on the parsed result:
  //   content.getRaw()   ==> object: coordinates (x, y, etc.)
  //   content.getPages() ==> total number of pages
  console.log(lines); // Output the extracted text
};

extractTextFromPDF("foo.pdf");
Changelog
Authors
- Nguyễn Tường Hy (@ngtuonghy)
License
This package is licensed under the MIT License. See the LICENSE file for details.