我使用Tesseract从扫描的PDF中提取文本。其中一些文件还包含图像。有没有办法获取这些图片?
我通过将扫描的pdf转换为tiff文件来为Tesseract做准备。但是,我找不到任何命令行工具来从中提取图像,就像pdfimages对于“文本”pdf所做的那样。
有没有任何工具(或工具组合)可以帮助我完成这项工作呢?
http://www.xpdfreader.com/pdfimages-man.html
你需要先安装 R、RStudio、Xpdf(xPDFreader)以及 pdftools。如果使用 Windows,请确保这些程序所在的目录已加入 PATH 环境变量,这样 R 才能找到并调用它们。下面的 R 代码将 PDF 转换为 PPM("PDF to PPM"):
# Convert pages of every PDF found under `dest` to PPM images with pdftoppm.
# NOTE(review): `dest` is not defined in this snippet; it presumably holds the
# directory containing the PDFs and must be set before running this.
# List entries whose names match the regex "pdf" (full paths), then strip the
# file extension so the same stem can serve as input name and output root.
files <- tools::file_path_sans_ext(list.files(path = dest, pattern =
"pdf", full.names = TRUE))
# For each stem, build and run the command
#   pdftoppm -f 1 -l 10 -r 300 <stem>.pdf <stem>
# Per the pdftoppm man page, -f/-l are the first/last page to convert and -r
# is the resolution in DPI; the last argument is the output file root.
# NOTE(review): the pattern "pdf" is an unanchored regex, so it would also
# match non-PDF names containing "pdf" -- confirm this is acceptable.
lapply(files, function(i){
shell(shQuote(paste0("pdftoppm -f 1 -l 10 -r 300 ", i,".pdf", " ",i)))
})
你也可以直接在命令提示符(CMD)中输入以下命令:
pdftoppm -f 1 -l 10 -r 300 stuff.pdf stuff.ppm
pdfimages mydoc.pdf
./extractImages.py images*
#!/bin/env python
import cv2
import numpy as np
import os
from pathlib import Path
def extractImagesFromFile(inputFilename, outputDirectory, tracing=False, tracingDirectory=""):
    """Crop large connected regions out of a page image and save them as PNGs.

    The page is binarized with an inverted Otsu threshold, external contours
    are detected, and every contour whose bounding box is at least
    100x100 pixels is written to
    ``<outputDirectory>/<input stem>_<n>.png`` (``n`` starts at 1).

    Args:
        inputFilename: Path of the image file to analyse.
        outputDirectory: Directory that receives the cropped regions.
        tracing: When true, also save a copy of the page with a rectangle
            drawn around every extracted region.
        tracingDirectory: Directory that receives ``<input stem>_trace.png``
            when ``tracing`` is enabled.

    Returns:
        None. If the input cannot be decoded as an image, a message is
        printed and nothing is written.
    """
    # Settings:
    minimumWidth = 100
    minimumHeight = 100
    greenColor = (36, 255, 12)
    traceWidth = 2

    # Load image; cv2.imread signals failure by returning None instead of
    # raising, so check explicitly rather than crashing on image.copy().
    image = cv2.imread(inputFilename)
    if image is None:
        print("Could not read image: {}".format(inputFilename))
        return

    # Keep an untouched copy: `image` gets rectangles drawn onto it, while
    # the crops must come from the clean pixels.
    original = image.copy()

    # Grayscale, then inverted binary threshold with Otsu's method
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

    # Find external contours. findContours returns either a 2-tuple or a
    # 3-tuple depending on the OpenCV version; pick the contour list.
    cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Obtain bounding boxes, extract and save each sufficiently large ROI
    stem = Path(inputFilename).stem
    ROI_number = 1
    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)
        if w < minimumWidth or h < minimumHeight:
            # Too small to be a picture (text glyphs, specks, ...)
            continue
        cv2.rectangle(image, (x, y), (x + w, y + h), greenColor, traceWidth)
        ROI = original[y:y+h, x:x+w]
        outImage = os.path.join(outputDirectory, '{}_{}.png'.format(stem, ROI_number))
        cv2.imwrite(outImage, ROI)
        ROI_number += 1

    # Optionally save the annotated page for visual inspection
    if tracing:
        outImage = os.path.join(tracingDirectory, stem + '_trace.png')
        cv2.imwrite(outImage, image)
def main(files):
    """Run image extraction over every path in *files*.

    Creates the ``images`` and ``tracing`` directories under the current
    working directory if they do not exist yet, then calls
    ``extractImagesFromFile`` for each path that is a regular file.
    Paths that are not regular files are reported and skipped.

    Args:
        files: Iterable of path strings to process.
    """
    tracingEnabled = True
    outputDirectory = 'images'
    tracingDirectory = 'tracing'

    # Create the output directories if they do not exist
    (Path.cwd() / outputDirectory).mkdir(exist_ok=True)
    if tracingEnabled:
        (Path.cwd() / tracingDirectory).mkdir(exist_ok=True)

    for f in files:
        # Validate first so that an invalid path is not announced as being
        # processed.
        if not Path(f).is_file():
            print("Invalid file: {}".format(f))
            continue
        print("Processing {}".format(f))
        extractImagesFromFile(f, outputDirectory, tracingEnabled, tracingDirectory)
if __name__ == "__main__":
    import argparse
    from glob import glob

    # Every positional argument is treated as a file name or glob pattern.
    cli = argparse.ArgumentParser()
    cli.add_argument("fileNames", nargs='*')
    parsed = cli.parse_args()

    # Expand the patterns here so the script also works when the shell
    # does not expand wildcards itself; matches are kept in argument order.
    expanded = []
    for pattern in parsed.fileNames:
        expanded.extend(glob(pattern))

    main(expanded)
这个基本算法是由nathancy提供的,作为对这个问题的回答:
pdfimages
。 - Mark Setchell