如何在Node.js中从PDF文件中提取数据

Question

如何在Node.js中从PDF文件中提取数据

7

我已经使用了'pdf.js-extract' npm 模块从 pdf 中获取数据。

// Dump everything pdf.js-extract reads from a sample invoice as JSON.
const PDFExtract = require('pdf.js-extract').PDFExtract;

const pdfExtract = new PDFExtract();
const filename = "/home/aman/Downloads/sample_invoice.pdf";

// extract() is called as (filename, options, callback) in the library's own
// example, so an (empty) options object is passed second; handing the
// callback over in the options slot leaves the real callback unset.
pdfExtract.extract(filename, {}, function (err, data) {
    if (err) return console.log(err);
    console.log(JSON.stringify(data));
});

但是我没有得到所期望的结果。我想要从发票PDF中获取相关信息，如税金、支付的总金额、卖方地址，并将提取的数据保存到MongoDB集合中。

- Ayushi Gupta

它的发票格式（位置）总是相同的吗？您能展示一下 console.log(JSON.stringify(data)); 的返回结果吗？ - Liberateur

实际上，PDF 是由市场（如亚马逊、Flipkart等）生成的，因此其格式可能会有所不同。 - Ayushi Gupta

该模块可以提取数据，但不同发票的数据格式各不相同……因此，要么为每种发票类型编写一个函数（以该类型的数据布局作为基础模板），要么编写一个脚本来识别内容…… - Liberateur

请发一个 console.log(JSON.stringify(data)); 输出结果的示例。 - Liberateur

我得到的响应是{ "pages": [ { "content": [

    {
      "x": 348.41,
      "y": 125.59899999999993,
      "str": "发票号码",
      "dir": "ltr",
      "width": 61.61760000000001,
      "height": 8.8,
      "fontName": "g_d0_f2"
    },
    {
      "x": 451.935,
      "y": 125.59899999999993,
      "str": "INV-3337",
      "dir": "ltr",
      "width": 37.171200000000006,
      "height": 8.8,
      "fontName": "g_d0_f2"
    },

], } - Ayushi Gupta

4个回答

0

请参阅 pdf.js-extract npm 模块的 GitHub 存储库 https://github.com/ffalt/pdf.js-extract。

在上述链接中的 example/example.js 路径下提供了以下文件示例。

// Example: extract ./example.pdf, dump the raw result as JSON, then rebuild
// the text of the first page and write it next to the JSON dump.
const fs = require('fs');
const { PDFExtract } = require('../lib');

const pdfExtract = new PDFExtract();

pdfExtract.extract('./example.pdf', {} /* options */, (err, data) => {
    if (err) {
        return console.log(err);
    }

    // Serialised on demand (tab-indented) at each point it is needed.
    const toPrettyJson = () => JSON.stringify(data, null, '\t');

    fs.writeFileSync('./example-output.json', toPrettyJson());

    // Only the first page is converted to text.
    // NOTE(review): the meaning of the second pageToLines argument (2) is not
    // visible here — presumably a line-grouping tolerance; confirm in the lib.
    const firstPageLines = PDFExtract.utils.pageToLines(data.pages[0], 2);
    const textRows = PDFExtract.utils.extractTextRows(firstPageLines);
    const plainText = textRows.map((cells) => cells.join('')).join('\n');

    fs.writeFileSync('./example-output.txt', plainText);
    console.log(toPrettyJson());
});

希望它能对你有用。

- sanjeev kumar

require('../lib') 指向 lib 文件夹，PDFExtract 模块就是在那里定义的。请查看 GitHub 链接。 - sanjeev kumar

0

文件 readPdf.js

// The extractor used below was never defined in this file, so it is created
// here from the pdf.js-extract package.
const { PDFExtract } = require('pdf.js-extract');

const pdfExtract = new PDFExtract();

/**
 * Extract the contents of a PDF file with pdf.js-extract.
 *
 * @param {string} file - Path of the PDF file to read.
 * @param {number} [timeoutMs=10000] - Give up after this many milliseconds.
 * @returns {Promise<object>} Resolves with the data passed to the extract
 *   callback. Rejects with an Error when the file cannot be read, when the
 *   extractor throws synchronously, or when the timeout elapses first.
 */
const readPdf = (file, timeoutMs = 10000) => new Promise((resolve, reject) => {
  // Started first so it can be cancelled from every exit path; a timer left
  // running would keep the process alive after the promise has settled.
  const timer = setTimeout(() => {
    reject(new Error('Promise timed out after ' + timeoutMs + ' ms'));
  }, timeoutMs);

  try {
    // Options object goes second, callback third.
    pdfExtract.extract(file, {}, (error, data) => {
      clearTimeout(timer);
      if (error) {
        const readError = new Error('El archivo no se pudo leer');
        // Keep the underlying failure available for debugging.
        readError.cause = error;
        reject(readError);
        return;
      }
      resolve(data);
    });
  } catch (error) {
    // A synchronous throw must reject; returning from the executor would
    // leave the promise pending until the timeout.
    clearTimeout(timer);
    reject(error);
  }
});

module.exports = { readPdf };

文件 xxx.js

// A local file must be required with a relative path; a bare 'readPdf' is
// looked up as an installed package instead.
var { readPdf } = require('./readPdf');

// NOTE(review): `files` is not defined in this snippet — presumably the path
// of the PDF to read; it has to be defined before this call.
readPdf(files)
  .then(response => {
    console.log(response); // the extracted data
    return response;
  })
  .catch(err => console.log(err));

- Dario Paez

0

使用pdf-extract npm包（https://www.npmjs.com/package/pdf-extract）可以从pdf中提取文本。

// Extract text from PDF files (with images)
// Installation guide: https://github.com/nisaacson/pdf-extract

var extract = (function() {

  'use strict';

  var fs = require('fs');
  var path = require('path');
  var pdfExtract = require('pdf-extract');

  var defaultOptions = {
    type: 'ocr',
    ocr_flags: [
      '-l eng',
    ]
  };

  // Execute script if not used as a module
  if (!module.parent) {

    init(process.argv[2]);
  }

  function init(filePath, options, callback) {

    callback = callback || function (error, response) {

      if (error) { return console.error(error); }

      return console.log(response);
    };

    options = options || defaultOptions;

    if (!filePath) {

      return callback(new Error('No input file (PDF) specified.'));
    }

    processFile(filePath, ocrLanguage, callback);
  }

  function processFile(filePath, ocrLanguage, callback) {

    var processor = pdfExtract(filePath, options, function (error) {

      if (error) {

        callback(error);
      }
    });

    processor.on('complete', function (data) {

      saveFile(filePath + '.txt', data.text_pages, callback);
    });

    processor.on('error', function (error) {

      callback(error);
    });
  }

  function saveFile(filePath, string, callback) {

    // Normalize file path
    filePath = path.normalize(filePath);

    try {

      callback('Saved file ' + filePath);

      // Save file
      return fs.writeFileSync(filePath, string, 'utf8');
    } catch (error) {

      callback(error);
    }
  }

  module.exports = {

    init: init
  };
}());

- Fairouz Amor

网页内容由 Stack Overflow 提供，点击上面的“原文链接”可以查看英文原文。

- Liberateur · Accepted Answer

您需要为每种发票格式分别编写一个函数（fn company1、fn company2……）。

以下是使用三个不同函数从pdf.js-extract模块的导出数据中检索数据的示例：

// Sample invoice: a single page whose content holds a label item
// ("Invoice Number") followed by its value item ("INV-3337").
const sampleInvoice = {
  pages: [
    {
      content: [
        {
          x: 348.41,
          y: 125.59899999999993,
          str: 'Invoice Number',
          dir: 'ltr',
          width: 61.61760000000001,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
        {
          x: 451.935,
          y: 125.59899999999993,
          str: 'INV-3337',
          dir: 'ltr',
          width: 37.171200000000006,
          height: 8.8,
          fontName: 'g_d0_f2',
        },
      ],
    },
  ],
};


// Browser demo: run each of the three search strategies in turn and show
// its result in an alert box.
[
  () => searchByPosition(sampleInvoice.pages, 450, 125),
  () => searchByPrev(sampleInvoice.pages, 'Invoice Number'),
  () => searchByFormat(sampleInvoice.pages, /INV-\d+$/),
].forEach((runSearch) => alert(runSearch()));


/**
 * Find the first content item located near a given position.
 *
 * An item matches when both its x and its y differ from the requested
 * coordinates by at most `range`. The distance is compared directly instead
 * of comparing floor(value / range) buckets, which missed items that were
 * close to the target but on the other side of a bucket boundary
 * (e.g. 449 vs 451.9 with a range of 10).
 *
 * @param {Array<{content: Array<{x: number, y: number, str: string}>}>} pages
 *   Pages to search, in order.
 * @param {number} x - Expected horizontal position.
 * @param {number} y - Expected vertical position.
 * @param {number} [range=10] - Maximum allowed difference on each axis.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByPosition(pages, x, y, range = 10)
{
    // Loop in all pages
    for (const page of pages) {

        // Loop in all content
        for (const item of page.content) {
            const isNearX = Math.abs(item.x - x) <= range;
            const isNearY = Math.abs(item.y - y) <= range;

            if (isNearX && isNearY) {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}


/**
 * Find the content item that directly follows a given label.
 *
 * The label comparison ignores case. A label that is the last item of its
 * page has no follower and is skipped, so the search carries on.
 * (Returning the item at idx - 1 instead would give a "search by next".)
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to search.
 * @param {string} txt - Label text that precedes the wanted value.
 * @returns {string} The `str` of the item after the label, or 'NotFound'.
 */
function searchByPrev(pages, txt) {
    const wantedLabel = txt.toLowerCase();

    for (const page of pages) {
        const items = page.content;

        for (let idx = 0; idx < items.length; idx += 1) {
            const isLabel = items[idx].str.toLowerCase() === wantedLabel;
            const following = items[idx + 1];

            if (isLabel && following) {
                return following.str;
            }
        }
    }

    return 'NotFound';
}


/**
 * Find the first content item whose text matches a regular expression.
 *
 * @param {Array<{content: Array<{str: string}>}>} pages - Pages to search.
 * @param {RegExp} regex - Pattern to test each item's `str` against.
 * @returns {string} The `str` of the first matching item, or 'NotFound'.
 */
function searchByFormat(pages, regex)
{
    // Loop in all pages
    for (const page of pages) {

        // Loop in all content
        for (const item of page.content) {

            // A regex with the g or y flag resumes from lastIndex, so a
            // leftover position from an earlier match would make test() skip
            // the start of this string. Reset it before and after testing.
            regex.lastIndex = 0;
            const isMatch = regex.test(item.str);
            regex.lastIndex = 0;

            if (isMatch) {
                return item.str;
            }
        }
    }

    // No results found
    return 'NotFound';
}

请试用这里：https://jsfiddle.net/dkhqzg6s/