# https://github.com/jsvine/pdfplumber
import pdfplumber
x0 = 0 # Distance of left side of character from left side of page.
x1 = 0.5 # Distance of right side of character from left side of page.
y0 = 0 # Distance of bottom of character from bottom of page.
y1 = 1 # Distance of top of character from bottom of page.
all_content = []
with pdfplumber.open("file_path") as pdf:
for i, page in enumerate(pdf.pages):
width = page.width
height = page.height
# Crop pages
left_bbox = (x0*float(width), y0*float(height), x1*float(width), y1*float(height))
page_crop = page.crop(bbox=left_bbox)
left_text = page_crop.extract_text()
left_bbox = (0.5*float(width), y0*float(height), 1*float(width), y1*float(height))
page_crop = page.crop(bbox=left_bbox)
right_text = page_crop.extract_text()
page_context = '\n'.join([left_text, right_text])
all_content.append(page_context)
if i < 2: # help you see the merged first two pages
print(page_context)
这是我用于常规PDF解析的代码,它似乎在该图像上运行良好(我下载了一张图片,因此使用了光学字符识别技术,所以与常规OCR一样准确)。请注意,这将对文本进行标记化处理。还请注意,您需要安装Tesseract才能使其正常工作(pytesseract只是让Tesseract从Python中工作)。Tesseract是免费且开源的。
from PIL import Image
import pytesseract
import cv2
import os
def parse(image_path, threshold=False, blur=False):
image = cv2.imread(image_path)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
if threshold:
gray = cv2.threshold(gray, 0, 255, \
cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
if blur: #useful if salt-and-pepper background.
gray = cv2.medianBlur(gray, 3)
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray) #Create a temp file
text = pytesseract.image_to_string(Image.open(filename))
os.remove(filename) #Remove the temp file
text = text.split() #PROCESS HERE.
print(text)
a = parse(image_path, True, False)