我的工作需要我使用pymupdf
从pdf文件中提取表格并导出为csv格式。
Excalibur是camelot的GUI版本。
Installation https://camelot-py.readthedocs.io/en/master/user/install.html
这应该能满足您的需求:它可以直接导出为 CSV。默认输出是 DataFrame,可以再导出到 Excel 或 CSV。
而 pymupdf 只负责把 PDF 内容提取为纯文本,表格的解析和推断需要您自己完成——这是一个相当雄心勃勃的项目。
import fitz
import pandas as pd
import numpy as np
class PDF():
    """Extract ruled tables from a PDF using the page's vector drawings.

    The rectangles returned by ``page.get_drawings()`` are treated as the
    table grid: their top-left corners give the column and row boundaries,
    and the text clipped to each grid cell becomes one table cell.

    The instance can be used as a context manager so the underlying
    document is closed deterministically::

        with PDF("file.pdf") as pdf:
            df = pdf.multipage_table(pages, headers)
    """

    def __init__(self, file):
        """Open *file* with PyMuPDF and keep the document in ``pdf_doc``."""
        self.pdf_doc = fitz.open(file)

    def close(self):
        """Close the underlying document."""
        self.pdf_doc.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never swallow an exception raised inside the ``with`` body.
        return False

    @staticmethod
    def rect_to_svg_line(x1, y1, x2, y2, fill, stroke):
        """Return one SVG ``<rect>`` element (newline-terminated).

        ``(x1, y1)`` and ``(x2, y2)`` are opposite corners; the width and
        height written to the element are their differences.
        """
        width = x2 - x1
        height = y2 - y1
        svg_rect = f'<rect x="{x1}" y="{y1}" width="{width}" height="{height}" fill="{fill}" stroke="{stroke}" />\n'
        return svg_rect

    @staticmethod
    def merge_bins(arr, tol=4):
        """Collapse adjacent pairs of nearly-equal coordinates.

        Walks the sorted sequence *arr* from the left. When two
        neighbouring values differ by less than *tol* they are replaced by
        their mean and both are consumed; otherwise the value is kept.
        Merging is strictly pairwise: a run of three close values yields a
        merged pair followed by the third value.

        Args:
            arr: ascending sequence of numbers.
            tol: maximum distance (exclusive) at which two neighbours are
                merged. Defaults to 4, the previously hard-coded value.

        Returns:
            ``numpy.ndarray`` with the merged coordinates.
        """
        result = []
        n = len(arr)
        i = 0
        while i < n:
            if i + 1 < n and arr[i + 1] - arr[i] < tol:
                result.append((arr[i + 1] + arr[i]) / 2)
                i += 2
            else:
                result.append(arr[i])
                i += 1
        return np.array(result)

    def get_layout(self, page, skip_rects=0, fill="white", stroke="black"):
        """Render the drawings of one page as an SVG string for inspection.

        Args:
            page: page index; converted with ``int()`` before indexing.
            skip_rects: number of leading drawings to ignore.
            fill: fill colour used for every drawn rectangle.
            stroke: stroke colour used for every drawn rectangle.

        Returns:
            SVG markup sized to the page mediabox, with a white background
            rectangle followed by one ``<rect>`` per remaining drawing.
        """
        pdf_page = self.pdf_doc[int(page)]
        mediabox = pdf_page.mediabox
        width = mediabox.x1 - mediabox.x0
        height = mediabox.y1 - mediabox.y0
        # Build all fragments first and join once instead of repeated +=.
        fragments = []
        for drawing in pdf_page.get_drawings()[skip_rects:]:
            bounds = drawing['rect']
            fragments.append(
                self.rect_to_svg_line(
                    bounds[0], bounds[1], bounds[2], bounds[3], fill, stroke
                )
            )
        svg_lines = ''.join(fragments)
        svg_content = f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}" viewBox="0 0 {width} {height}">\n' \
                      f'<rect x="0" y="0" width="{width}" height="{height}" fill="white" />\n' \
                      f'{svg_lines}' \
                      f'</svg>'
        return svg_content

    def multipage_table(self, pages, headers, skip_rows=0, skip_cols=0, skip_rects=0):
        """Extract one table that spans several pages into a DataFrame.

        For every page the unique, sorted top-left x and y coordinates of
        the drawings are merged with ``merge_bins`` into grid lines; each
        pair of consecutive grid lines bounds a cell whose text is read
        with ``get_text("text", clip=...)`` and stripped of newlines.

        Args:
            pages: iterable of page indices (each converted with ``int()``).
            headers: column names, indexed by the absolute grid column, so
                entries for skipped columns must still be present.
            skip_rows: number of leading grid rows to ignore on every page.
            skip_cols: number of leading grid columns to ignore.
            skip_rects: number of leading drawings to ignore on every page.

        Returns:
            ``pandas.DataFrame`` whose row labels count the extracted grid
            rows continuously across pages.
        """
        # Collect whole rows and build the frame once; growing a DataFrame
        # cell by cell through ``df.loc`` reallocates on every new row/column.
        records = []
        row_labels = []
        r_ind = -1
        for p in pages:
            pdf_page = self.pdf_doc[int(p)]
            x1_values = []
            y1_values = []
            for drawing in pdf_page.get_drawings()[skip_rects:]:
                x1_values.append(drawing['rect'][0])
                y1_values.append(drawing['rect'][1])
            x_grid = self.merge_bins(np.sort(list(set(x1_values))))
            y_grid = self.merge_bins(np.sort(list(set(y1_values))))
            for row in range(skip_rows, len(y_grid) - 1):
                r_ind += 1
                y1, y2 = y_grid[row], y_grid[row + 1]
                record = {}
                for col in range(skip_cols, len(x_grid) - 1):
                    x1, x2 = x_grid[col], x_grid[col + 1]
                    # Extract text within cell boundaries
                    cell_text = pdf_page.get_text("text", clip=(x1, y1, x2, y2))
                    record[headers[col]] = cell_text.replace("\n", "")
                # A grid row without any column produces no table row, but
                # still consumes a row label (same numbering as before).
                if record:
                    records.append(record)
                    row_labels.append(r_ind)
        if not records:
            return pd.DataFrame()
        return pd.DataFrame(records, index=row_labels)
这样使用:
from IPython.display import SVG
# Page whose drawings are previewed before extracting anything.
page_number = 39
# Placeholder header names "<1>" ... "<15>", one per table column.
columns = [f"<{number}>" for number in range(1, 16)]
file = "PDF_FILE.PDF"
pdf_file = PDF(file)
# Render the page's rectangles, ignoring the first drawing, to check the grid.
svg_content = pdf_file.get_layout(page_number, skip_rects=1)
SVG(svg_content)
# If the format is good you can proceed to extract tables
# pages = np.linspace(39,56,18)
# df = pdf_file.multipage_table(pages, headers=columns, skip_rows=4, skip_cols=0, skip_rects=1)
# df.to_csv("out.csv", index=False)
我需要使用pymupdf,所以我做了一个定制的解决方案。
如果它能够至少为一个人工作,我会很高兴。
请记住,我是为了解决一个特定问题才写的这段代码,它在您的文件上可能无法正常工作。
我以开放的方式完成了它,因此您将能够将此代码升级到您的特定目的。
如果您有更好的通用解决方案,我将非常乐意使用它,请不要犹豫,发布它!(这个花了我3个小时...)
import fitz # this is PyMuPDF: pip3 install PyMuPDF
# WARNING: this code is fragile; use it knowing it may break easily
# Author: nah, I'm joking, nobody wants to own this shit XD
def get_page_bloc_tuple_2list(pdf_path):
    """Return, for every page of the PDF, the list of its word tuples.

    Args:
        pdf_path: path of the PDF file to open.

    Returns:
        A list with one entry per page; each entry is whatever
        ``page.get_text_words()`` returns for that page.
    """
    # Open inside a ``with`` block so the document is closed even if
    # extraction fails; the extracted lists are built before it closes.
    with fitz.open(pdf_path) as doc:
        # Outdated PyMuPDF releases spell this method ``page.getTextWords()``.
        page_bloc_tuple_2list = [page.get_text_words() for page in doc]
    return page_bloc_tuple_2list
def get_line_dict_list(pdf_path):
    """Group the words of every page into text lines.

    Words are processed in the order returned by
    ``get_page_bloc_tuple_2list``. A new line starts whenever a word's top
    coordinate (``y0``) differs from the top coordinate of the current line.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        A flat list (all pages concatenated) of dicts shaped like
        ``{"y": (y0, y1), "word_dict_list": [{"word": str, "x": (x0, x1)}]}``.
    """
    line_dict_list = []
    for page_block_list in get_page_bloc_tuple_2list(pdf_path):
        if not page_block_list:
            continue
        word_dict_list = []
        y0_temp = page_block_list[0][1]
        y1_temp = page_block_list[0][3]
        for bloc in page_block_list:
            # Only the first five fields are needed; the rest are ignored.
            x0, y0, x1, y1, word = bloc[:5]
            if y0 != y0_temp:
                # The vertical position changed: close the current line.
                line_dict_list.append({
                    "y": (y0_temp, y1_temp),
                    "word_dict_list": word_dict_list,
                })
                word_dict_list = []
                y0_temp = y0
                y1_temp = y1
            word_dict_list.append({
                "word": word,
                "x": (x0, x1),
            })
        # Flush the last line of the page; without this the final line of
        # every page was silently dropped.
        line_dict_list.append({
            "y": (y0_temp, y1_temp),
            "word_dict_list": word_dict_list,
        })
    return line_dict_list
def get_word_list(line_dict):
    """Return the words of *line_dict* in their stored order."""
    words = []
    for entry in line_dict['word_dict_list']:
        words.append(entry['word'])
    return words
def is_title_line(line_dict, title_word_list):
    """Tell whether *line_dict* contains every word of every title.

    Each title may consist of several whitespace-separated words; all of
    them must appear as individual words of the line.
    """
    present = [entry['word'] for entry in line_dict['word_dict_list']]
    return all(
        piece in present
        for title in title_word_list
        for piece in title.split()  # multi-word titles are matched word by word
    )
def get_title_line(line_dict_list, title_word_list):
    """Return the first line accepted by ``is_title_line``, or ``None``."""
    matches = (
        candidate
        for candidate in line_dict_list
        if is_title_line(candidate, title_word_list)
    )
    return next(matches, None)
def get_word(title_x, line_dict):
    """Return the first word whose left edge lies inside *title_x*.

    *title_x* is an ``(x0, x1)`` pair; both bounds are inclusive. ``None``
    is returned when no word of the line starts inside that span.
    """
    left, right = title_x
    for entry in line_dict['word_dict_list']:
        text = entry['word']
        start = entry['x'][0]
        if left <= start <= right:
            return text
    return None
def get_row_list(pdf_path, title_word_list):
    """Extract a table as rows of words, aligned on the header line.

    The header is the first line containing every word of
    *title_word_list*. For each line after it, one cell per header word is
    produced: the first word of the line whose left edge falls inside the
    horizontal span of that header word, or ``None`` when there is none.

    Args:
        pdf_path: path of the PDF file to read.
        title_word_list: column titles used to locate the header line.

    Returns:
        A list of rows; the first row holds the words of the header line.

    Raises:
        ValueError: if no line contains all the title words.
    """
    line_dict_list = get_line_dict_list(pdf_path)
    title_line = get_title_line(line_dict_list, title_word_list)
    if title_line is None:
        # Fail with a clear message instead of a TypeError on None below.
        raise ValueError(
            f"no line in {pdf_path!r} contains all title words: {title_word_list!r}"
        )
    row_list = [get_word_list(title_line)]
    title_index = line_dict_list.index(title_line)
    # The header spans do not change from line to line; compute them once.
    title_spans = [word_dict['x'] for word_dict in title_line['word_dict_list']]
    for line_dict in line_dict_list[title_index + 1:]:
        row_list.append([get_word(span, line_dict) for span in title_spans])
    return row_list
if __name__ == "__main__":
    # Example run: look for a header line holding these three column titles.
    get_row_list("my_filename.pdf", ["name", "surname", "whatever"])