import fitz
# Dump every image referenced by every page of "file.pdf" as a PNG file.
# NOTE(review): this snippet uses camelCase PyMuPDF method names
# (getPageImageList / writePNG); later PyMuPDF releases appear to have
# renamed them -- confirm against the installed version.
doc = fitz.open("file.pdf")
for i in range(len(doc)):
    for img in doc.getPageImageList(i):
        xref = img[0]  # first item of each image entry is used as the xref
        pix = fitz.Pixmap(doc, xref)
        # Subtract the alpha channel before counting components: a CMYK
        # pixmap without alpha has n == 4 and must not be written as PNG
        # directly, which the former `pix.n < 5` test allowed.
        if pix.n - pix.alpha < 4:  # this is GRAY or RGB
            pix.writePNG("p%s-%s.png" % (i, xref))
        else:  # CMYK: convert to RGB first
            pix1 = fitz.Pixmap(fitz.csRGB, pix)
            pix1.writePNG("p%s-%s.png" % (i, xref))
            pix1 = None  # drop the reference so the pixmap can be freed
        pix = None  # drop the reference so the pixmap can be freed
这是适用于fitz 1.19.6的修改版本:
import os
import fitz # pip install --upgrade pip; pip install --upgrade pymupdf
from tqdm import tqdm # pip install tqdm
# Extract every image of every PDF found directly inside `workdir` and save
# each one as "<pdf stem>_p<page index>-<xref>.png" in that same folder.
workdir = "your_folder"
for each_path in os.listdir(workdir):
    # Match on the real extension: a substring test would also accept names
    # such as "notes.pdf.bak", for which stripping 4 characters is wrong.
    stem, ext = os.path.splitext(each_path)
    if ext.lower() != ".pdf":
        continue
    # The context manager closes the document once its pages are processed.
    with fitz.Document(os.path.join(workdir, each_path)) as doc:
        for i in tqdm(range(len(doc)), desc="pages"):
            for img in tqdm(doc.get_page_images(i), desc="page_images"):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # PNG cannot store CMYK; convert anything with four or more
                # colour components (alpha excluded) to RGB first.
                if pix.n - pix.alpha >= 4:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                pix.save(os.path.join(workdir, "%s_p%s-%s.png" % (stem, i, xref)))
print("Done!")
pip install pymupdf
) - Basj
使用pypdf和Pillow库可以非常简单地做到这一点。
from pypdf import PdfReader
# Write every image found on every page of "example.pdf" to a file named
# after the image, in the current working directory.
reader = PdfReader("example.pdf")
# Flatten the page -> image nesting into a single lazy stream of images.
all_images = (image for page in reader.pages for image in page.images)
for image in all_images:
    with open(image.name, "wb") as out_file:
        out_file.write(image.data)
请注意:PyPDF2已不推荐使用。请使用pypdf。
2.12.1
上无错误运行。 - Joe
在PDF文件中,图片通常以原始格式存储。例如,向PDF中插入一张jpg图片后,文件中的某个位置会有一段字节序列,把这些字节提取出来就是一个有效的jpg文件。您可以利用这一点非常简单地从PDF中提取相应的字节范围。我之前曾经写过关于此的文章,并提供了示例代码:从PDF中提取JPG图片。
startmark
?你可以直接搜索 startmark
,因为这是 JPG 的起始标记,不是吗?另外,startfix
变量有什么意义,因为你根本没有改变它。 - user3599803
使用PyPDF2在Python中处理CCITTFaxDecode过滤器:
import PyPDF2
import struct
"""
Links:
PDF format: http://www.adobe.com/content/dam/Adobe/en/devnet/acrobat/pdfs/pdf_reference_1-7.pdf
CCITT Group 4: https://www.itu.int/rec/dologin_pub.asp?lang=e&id=T-REC-T.6-198811-I!!PDF-E&type=items
Extract images from pdf: https://dev59.com/L3E85IYBdhLWcg3wnU0d
Extract images coded with CCITTFaxDecode in .net: https://dev59.com/5HE85IYBdhLWcg3wtV_1
TIFF format and tags: http://www.awaresystems.be/imaging/tiff/faq.html
"""
def tiff_header_for_CCITT(width, height, img_size, CCITT_group=4):
    """Build a minimal little-endian TIFF header for one CCITT-encoded strip.

    The result is meant to be written immediately before the raw image
    data: the StripOffsets entry points at the first byte after the header.

    Args:
        width: Image width in pixels.
        height: Image height in pixels (also used as RowsPerStrip).
        img_size: Length in bytes of the image data that follows.
        CCITT_group: Value stored in the Compression tag (3 or 4).

    Returns:
        The packed header as ``bytes``.
    """
    # Layout: 8-byte file header, 2-byte entry count, eight 12-byte IFD
    # entries, then a 4-byte offset to the next IFD.  All fields are
    # unsigned, and the trailing offset must be a LONG -- packing it as a
    # 2-byte short made the header two bytes too short.
    tiff_header_struct = '<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
    return struct.pack(
        tiff_header_struct,
        b'II',  # Byte order indication: little endian
        42,  # Version number (always 42)
        8,  # Offset to first IFD
        8,  # Number of tags in IFD
        256, 4, 1, width,  # ImageWidth, LONG, 1, width
        257, 4, 1, height,  # ImageLength, LONG, 1, length
        258, 3, 1, 1,  # BitsPerSample, SHORT, 1, 1
        259, 3, 1, CCITT_group,  # Compression, SHORT, 1, 4 = CCITT Group 4 fax encoding
        262, 3, 1, 0,  # PhotometricInterpretation, SHORT, 1, 0 = WhiteIsZero
        273, 4, 1, struct.calcsize(tiff_header_struct),  # StripOffsets, LONG, 1, length of header
        278, 4, 1, height,  # RowsPerStrip, LONG, 1, length
        279, 4, 1, img_size,  # StripByteCounts, LONG, 1, size of image
        0,  # Offset of next IFD: 0 = this is the last IFD
    )
# Save every CCITTFaxDecode image of 'scan.pdf' as "<xobject name>.tiff" by
# prepending a generated TIFF header to the raw (still encoded) stream data.
pdf_filename = 'scan.pdf'
# The context manager closes the file even if a page raises midway.
with open(pdf_filename, 'rb') as pdf_file:
    cond_scan_reader = PyPDF2.PdfFileReader(pdf_file)
    for i in range(0, cond_scan_reader.getNumPages()):
        page = cond_scan_reader.getPage(i)
        try:
            xObject = page['/Resources']['/XObject'].getObject()
        except KeyError:
            # Page without any XObject resources: nothing to extract.
            continue
        for obj in xObject:
            image = xObject[obj]
            if image['/Subtype'] != '/Image':
                continue
            # The CCITTFaxDecode filter decodes image data that has been
            # encoded using either Group 3 or Group 4 CCITT facsimile (fax)
            # encoding.  CCITT encoding is designed to achieve efficient
            # compression of monochrome (1 bit per pixel) image data at
            # relatively low resolutions, and so is useful only for bitmap
            # image data, not for color images, grayscale images, or
            # general data.
            #   K < 0 --- Pure two-dimensional encoding (Group 4)
            #   K = 0 --- Pure one-dimensional encoding (Group 3, 1-D)
            #   K > 0 --- Mixed one- and two-dimensional encoding (Group 3, 2-D)
            if '/Filter' not in image or image['/Filter'] != '/CCITTFaxDecode':
                continue
            # Any negative K selects Group 4, not only -1; a missing K is
            # treated as 0, matching the K = 0 case above.
            k_value = 0
            if '/DecodeParms' in image and '/K' in image['/DecodeParms']:
                k_value = image['/DecodeParms']['/K']
            CCITT_group = 4 if k_value < 0 else 3
            width = image['/Width']
            height = image['/Height']
            data = image._data  # sorry, getData() does not work for CCITTFaxDecode
            img_size = len(data)
            tiff_header = tiff_header_for_CCITT(width, height, img_size, CCITT_group)
            img_name = obj[1:] + '.tiff'
            with open(img_name, 'wb') as img_file:
                img_file.write(tiff_header + data)
            # To load the result with Pillow instead of writing a file, wrap
            # `tiff_header + data` in io.BytesIO and pass it to PIL.Image.open.
convert
和subprocess
调用它,但速度非常慢。感谢分享这个解决方案。 - crldtiff_header_struct
应该为'<' + '2s' + 'H' + 'L' + 'H' + 'HHLL' * 8 + 'L'
。特别要注意结尾处的'L'
。 - Dispenser
Libpoppler提供了一个名为"pdfimages"的工具,可以完成这个任务。
(在Ubuntu系统中,该工具在poppler-utils软件包中)
http://poppler.freedesktop.org/
http://en.wikipedia.org/wiki/Pdfimages
Windows二进制文件:http://blog.alivate.com.au/poppler-windows/
pdfimages
的Python模块。 - user1717828
我更喜欢使用minecart,因为它非常易于使用。以下片段展示了如何从PDF中提取图像:
#pip install minecart
import minecart
# Show the first image of every page of 'Invoices.pdf'.
# The file stays open for the whole loop because pages are read from it
# while iterating; the context manager closes it afterwards.
with open('Invoices.pdf', 'rb') as pdffile:
    doc = minecart.Document(pdffile)
    page = doc.get_page(0)  # getting a single page
    # iterating through all pages
    for page in doc.iter_pages():
        if not page.images:
            # A page without images would raise IndexError on images[0].
            continue
        im = page.images[0].as_pil()  # requires pillow
        # NOTE(review): `display` is not defined in this snippet --
        # presumably the notebook built-in; confirm the runtime provides it.
        display(im)
PikePDF 可以用非常少的代码实现这一点:
from pikepdf import Pdf, PdfImage
# Extract every image of every page; output files are prefixed with the
# source file name plus zero-padded page and image counters.
filename = "sample-in.pdf"
example = Pdf.open(filename)
for i, page in enumerate(example.pages):
    page_images = page.images.items()
    for j, (name, raw_image) in enumerate(page_images):
        image = PdfImage(raw_image)
        # extract_to receives only a prefix; its return value is kept in `out`.
        out = image.extract_to(
            fileprefix=f"{filename}-page{i:03}-img{j:03}",
        )
extract_to 会根据 PDF 中图像的编码方式自动选择文件扩展名。
如果您想要,您还可以在提取图像时输出一些关于它们的详细信息:
# Optional: print info about image
# NOTE(review): relies on `raw_image`, `name`, `image` and `out` already being
# bound -- presumably this belongs inside the extraction loop; confirm.
stream_info = raw_image.stream_dict
w, h = stream_info.Width, stream_info.Height
f = stream_info.Filter  # named `f` so the built-in filter() is not shadowed
size = stream_info.Length
print(f"Wrote {name} {w}x{h} {f} {size:,}B {image.colorspace} to {out}")
这可以打印类似于
Wrote /Im1 150x150 /DCTDecode 5,952B /ICCBased to sample2.pdf-page000-img000.jpg
Wrote /Im10 32x32 /FlateDecode 36B /ICCBased to sample2.pdf-page000-img001.png
...
参见文档,了解更多关于图片的用法,包括如何在PDF文件中替换它们。
虽然通常情况下这个方法很有效,但是需要注意以下几种类型的图片无法使用此方法进行提取:
filter = raw_image.stream_dict.Filter
会报错,因为 filter
是一个函数。当我更改名称时,仍然会出现错误,NotImplementedError: don't know how to __str__ this object
。我还没有弄清楚 .filter
的数据类型是什么。 - Hobbesfilter
重命名为f
,以避免与Python内置的filter()
函数冲突。对我来说,raw_image.stream_dict.Filter
是pikepdf.objects.Object
的一个实例; 它似乎有一个to_json()
方法,如果str()
不能满足您的要求,那么您可以尝试此方法。但是,PDF规范还表明Filter也可能是列表,这可能是您所看到的内容的一部分? 这将特定于您正在尝试使用它的PDF。 您可以尝试print(type(f))
和print(dir(f))
,以查看 f
的类型、属性和方法。 - andrewdotnjbig2dec
(conda install jbig2dec
),它可以很好地工作。上面的代码会直接保存图像数据(如果可能:DCTDecode > jpg,JPXDecode > jp2,CCITTFaxDecode > tif),否则会以无损PNG格式保存(JBIG2Decode,FlateDecode)。我认为这已经做得很好了。 - Matthias Frippcall "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars32.bat"
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.30.30704\bin\Hostx86\x86\nmake.exe" msvc.mak
- Rufat
以下是我2019年的版本,用递归方式从PDF中获取所有图像,并使用PIL读取它们。与Python 2/3兼容。我还发现有时PDF中的图像可能会被zlib压缩,因此我的代码支持解压缩。
#!/usr/bin/env python3
try:
from StringIO import StringIO
except ImportError:
from io import BytesIO as StringIO
from PIL import Image
from PyPDF2 import PdfFileReader, generic
import zlib
def get_color_mode(obj):
    """Translate the '/ColorSpace' entry of *obj* into a mode string.

    Returns "RGB", "CMYK" or "P" for the device colour spaces and for
    '/ICCBased' arrays whose '/N' is 3, 4 or 1 respectively.  Returns None
    when *obj* has no '/ColorSpace' key or the colour space is not one of
    those handled here.
    """
    try:
        cspace = obj['/ColorSpace']
    except KeyError:
        return None

    # Compared with == one by one (not a dict lookup) because the colour
    # space may be an array object rather than a plain name.
    device_modes = (
        ('/DeviceRGB', "RGB"),
        ('/DeviceCMYK', "CMYK"),
        ('/DeviceGray', "P"),
    )
    for device_name, mode in device_modes:
        if cspace == device_name:
            return mode

    if isinstance(cspace, generic.ArrayObject) and cspace[0] == '/ICCBased':
        # The second array element is resolved and its '/N' entry selects
        # the mode.
        color_map = obj['/ColorSpace'][1].getObject()['/N']
        for component_count, mode in ((1, "P"), (3, "RGB"), (4, "CMYK")):
            if color_map == component_count:
                return mode
    return None
def get_object_images(x_obj):
    """Collect the images stored in an XObject dictionary, recursively.

    Entries that carry their own '/Resources' -> '/XObject' dictionary are
    descended into; entries whose '/Subtype' is '/Image' are collected.

    Args:
        x_obj: Mapping of XObject names to XObject stream objects.

    Returns:
        A list of ``(mode, (width, height), data)`` tuples where ``mode``
        comes from ``get_color_mode`` and ``data`` is the stream data,
        zlib-decompressed when the filter mentions '/FlateDecode'.
    """
    images = []
    for obj_name in x_obj:
        sub_obj = x_obj[obj_name]
        if '/Resources' in sub_obj and '/XObject' in sub_obj['/Resources']:
            images += get_object_images(sub_obj['/Resources']['/XObject'].getObject())
        elif '/Subtype' in sub_obj and sub_obj['/Subtype'] == '/Image':
            # Work on a local copy: overwriting sub_obj._data would make a
            # second visit to the same object (e.g. an image shared by
            # several pages) decompress already-decompressed bytes and fail.
            data = sub_obj._data
            if '/FlateDecode' in sub_obj.get('/Filter', ''):
                data = zlib.decompress(data)
            images.append((
                get_color_mode(sub_obj),
                (sub_obj['/Width'], sub_obj['/Height']),
                data,
            ))
    return images
def get_pdf_images(pdf_fp):
    """Return all images found on all pages of the PDF at *pdf_fp*.

    This is best-effort: a file that cannot be opened or parsed yields an
    empty list instead of raising.

    Args:
        pdf_fp: Path of the PDF file to read.

    Returns:
        A list of the tuples produced by ``get_object_images``, in page
        order; empty when the file is unreadable or contains no images.
    """
    images = []
    try:
        pdf_file = open(pdf_fp, "rb")
    except OSError:
        return images
    # Keep the file open while pages are walked, then close it reliably.
    with pdf_file:
        try:
            pdf_in = PdfFileReader(pdf_file)
        except Exception:
            # Unparseable input keeps the "return nothing" behaviour, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return images
        for p_n in range(pdf_in.numPages):
            page = pdf_in.getPage(p_n)
            try:
                page_x_obj = page['/Resources']['/XObject'].getObject()
            except KeyError:
                # Page has no XObject resources.
                continue
            images += get_object_images(page_x_obj)
    return images
if __name__ == "__main__":
    # Try to open every image extracted from "test.pdf" with PIL and report
    # the ones PIL cannot read.
    pdf_fp = "test.pdf"
    for mode, size, data in get_pdf_images(pdf_fp):
        try:
            img = Image.open(StringIO(data))
        except Exception as e:
            print("Failed to read image with PIL: {}".format(e))
            continue
        # Do whatever you want with the image
我已经苦苦挣扎了几周,这里的许多答案都对我有所帮助,但总有些东西缺失,显然没有人在这里遇到过使用jbig2编码图像的问题。
在我要扫描的一组PDF文件中,使用jbig2编码的图像非常普遍。
据我所知,有许多复印/扫描机将纸张扫描并转换为充满jbig2编码图像的PDF文件。
因此,经过多天的测试,我决定采用dkagedal很久以前提出的答案。
以下是我在Linux上的逐步操作(如果您有其他操作系统,建议使用 Linux Docker,这样会更容易):
第一步:
apt-get install poppler-utils
然后我能够运行命令行工具 pdfimages,就像这样:
pdfimages -all myfile.pdf ./images_found/
apt-get install jbig2dec
然后您可以运行:
jbig2dec -t png -145.jb2g -145.jb2e
你将最终能够将提取出的所有图像转换为有用的内容。
祝你好运!
jbig2dec
(可以使用 conda
完成),它也会自动将 jbig2 图像转换为 png。 - Matthias Fripp
我从@sylvain的代码开始。
存在一些缺陷,例如getData函数出现异常NotImplementedError: unsupported filter /DCTDecode
,或者代码在某些页面上无法找到图像,因为它们位于比页面更深的级别。
这是我的代码:
import PyPDF2
from PIL import Image
import sys
from os import path
import warnings
# Suppress every warning for the rest of the script.
warnings.filterwarnings("ignore")
# Running count of images written to disk; recurse() increments it via
# `global number` and the final summary line prints it.
number = 0
def recurse(page, xObject):
    """Save the images reachable from *xObject*, descending into nested XObjects.

    FlateDecode images are rebuilt with PIL and saved as PNG; DCTDecode and
    JPXDecode streams are written out unchanged as .jpg / .jp2.  Images with
    any other filter are skipped.  Each saved file increments the
    module-level ``number`` counter.

    Args:
        page: Page label used in the output file name.
        xObject: Object whose '/Resources' -> '/XObject' dictionary is scanned.
    """
    global number
    if '/Resources' not in xObject or '/XObject' not in xObject['/Resources']:
        # Nothing to scan (e.g. a page without XObject resources).
        return
    xObject = xObject['/Resources']['/XObject'].getObject()
    for obj in xObject:
        sub = xObject[obj]
        if sub['/Subtype'] == '/Image':
            size = (sub['/Width'], sub['/Height'])
            if '/ColorSpace' in sub and sub['/ColorSpace'] == '/DeviceRGB':
                mode = "RGB"
            else:
                mode = "P"
            # Use the `page` argument rather than a loop variable of the
            # calling script, so the function does not depend on it.
            imagename = "%s - p. %s - %s"%(abspath[:-4], page, obj[1:])
            image_filter = sub['/Filter'] if '/Filter' in sub else None
            if image_filter == '/FlateDecode':
                # frombytes needs the decoded pixels; the raw `_data` is
                # still compressed and triggers "not enough image data".
                img = Image.frombytes(mode, size, sub.getData())
                img.save(imagename + ".png")
                number += 1
            elif image_filter == '/DCTDecode':
                # The encoded stream is written as-is; getData() is avoided
                # here because it does not support this filter.
                with open(imagename + ".jpg", "wb") as img:
                    img.write(sub._data)
                number += 1
            elif image_filter == '/JPXDecode':
                with open(imagename + ".jp2", "wb") as img:
                    img.write(sub._data)
                number += 1
        elif '/Resources' in sub and '/XObject' in sub['/Resources']:
            # Only descend into XObjects that really hold nested XObjects.
            recurse(page, sub)
# Command line: PDF_extract_images file.pdf page1 page2 page3 ...
# Page numbers are 1-based on the command line.
try:
    _, filename, *pages = sys.argv
    pages = [int(arg) for arg in pages]
    abspath = path.abspath(filename)
except ValueError:
    # Raised both when the file argument is missing (unpacking fails) and
    # when a page argument is not an integer.  Catching BaseException here
    # would also have hidden KeyboardInterrupt and genuine bugs.
    print('Usage :\nPDF_extract_images file.pdf page1 page2 page3 …')
    sys.exit()
# Keep the PDF open while pages are processed; close it afterwards.
with open(filename, "rb") as pdf_stream:
    file = PyPDF2.PdfFileReader(pdf_stream)
    for p in pages:
        page0 = file.getPage(p-1)  # getPage is 0-based
        recurse(p, page0)
print('%s extracted images'% number)
img = Image.frombytes(mode, size, data) ValueError: not enough image data
。 - GrantD71