Skip to content

Get text from one page only

Hey guys, I needed to get the text from one specific page only, so I modified the parser a little bit.

I just added an option parameter, onePageOnly, and the parser returns only that page's text if the parameter is set:

/**
 * Parse a PDF and extract its text, either from a single page or from the
 * first `max` pages.
 *
 * Relies on two names defined outside this function: the module-level
 * `PDFJS` binding (lazily loaded via `require` on first use) and
 * `DEFAULT_OPTIONS`, which supplies fallbacks for missing/invalid options.
 *
 * Note: missing or invalid `pagerender`, `max` and `version` fields are
 * filled in on the `options` object that was passed in.
 *
 * @param {*} dataBuffer - PDF data, handed unchanged to `PDFJS.getDocument`.
 * @param {Object} [options] - Parser options; `DEFAULT_OPTIONS` is used when
 *     omitted or `null`.
 * @param {Function} [options.pagerender] - Called with each page object;
 *     returns (a promise for) that page's text.
 * @param {number} [options.max] - Maximum number of pages to render;
 *     values <= 0 mean "all pages".
 * @param {string} [options.version] - pdf.js build to load; `'default'`
 *     selects `DEFAULT_OPTIONS.version`.
 * @param {number} [options.onePageOnly] - 1-based page number. When it is a
 *     positive number, only that page is rendered and `max` is ignored.
 * @returns {Promise<Object>} Resolves to
 *     `{ numpages, numrender, info, metadata, text, version }`, where
 *     `numrender` is the number of pages a render was attempted for.
 */
async function PDF(dataBuffer, options) {
    const ret = {
        numpages: 0,
        numrender: 0,
        info: null,
        metadata: null,
        text: "",
        version: null
    };

    // `== null` covers both undefined and null; `typeof null` is 'object',
    // so a null `options` would otherwise crash on the first property access.
    if (options == null) options = DEFAULT_OPTIONS;
    if (typeof options.pagerender !== 'function') options.pagerender = DEFAULT_OPTIONS.pagerender;
    if (typeof options.max !== 'number') options.max = DEFAULT_OPTIONS.max;
    if (typeof options.version !== 'string') options.version = DEFAULT_OPTIONS.version;
    if (options.version === 'default') options.version = DEFAULT_OPTIONS.version;

    PDFJS = PDFJS ? PDFJS : require(`./pdf.js/${options.version}/build/pdf.js`);

    ret.version = PDFJS.version;

    // Disable workers to avoid yet another cross-origin issue (workers need
    // the URL of the script to be loaded, and dynamically loading a cross-origin
    // script does not work).
    PDFJS.disableWorker = true;
    const doc = await PDFJS.getDocument(dataBuffer);

    try {
        ret.numpages = doc.numPages;

        // Metadata is optional: a failure to read it must not abort parsing.
        const metaData = await doc.getMetadata().catch(() => null);

        ret.info = metaData ? metaData.info : null;
        ret.metadata = metaData ? metaData.metadata : null;

        // Best effort: a page that cannot be loaded or rendered contributes
        // an empty string instead of failing the whole parse.
        const renderPage = (pageNumber) =>
            doc.getPage(pageNumber)
                .then((pageData) => options.pagerender(pageData))
                .catch(() => "");

        // Same validation style as `max`: anything that is not a positive
        // number disables single-page mode.
        const onePageOnly =
            typeof options.onePageOnly === 'number' && options.onePageOnly > 0
                ? options.onePageOnly
                : 0;

        let firstPage = 1;
        let lastPage = 0;
        if (onePageOnly > 0) {
            firstPage = onePageOnly;
            lastPage = onePageOnly;
            ret.numrender = 1;
        } else {
            const limit = options.max <= 0 ? doc.numPages : options.max;
            lastPage = Math.min(limit, doc.numPages);
            ret.numrender = lastPage;
        }

        for (let pageNumber = firstPage; pageNumber <= lastPage; pageNumber++) {
            const pageText = await renderPage(pageNumber);
            ret.text = `${ret.text}\n\n${pageText}`;
        }
    } finally {
        // Always release the document, including in single-page mode and
        // when something above throws.
        doc.destroy();
    }

    return ret;
}

If you guys like it, let me know and I will create a merge request.

Cheers!

Edited by LeonardoDi Sarli