我正在编写一个爬虫程序,可以下载HTML页面上的所有图片文件,并将它们保存在指定的文件夹中。所有的图片都是HTML页面的一部分。
这里有一些代码,可以从提供的URL下载所有图片,并将它们保存在指定的输出文件夹中。您可以根据自己的需要进行修改。
"""
dumpimages.py
Downloads all the images on the supplied URL, and saves them to the
specified output file ("/test/" by default)
Usage:
python dumpimages.py http://example.com/ [output]
"""
import os
import sys
from urllib.parse import urljoin, urlparse, urlunparse
from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup as bs
def main(url, out_folder="/test/"):
    """Download every <img> on the page at *url* into *out_folder*.

    Args:
        url: Page to scrape; expected to be an absolute http(s) URL.
        out_folder: Destination directory ("/test/" by default).
    """
    # Pass an explicit parser so behavior doesn't depend on which
    # optional parser bs4 happens to find installed.
    soup = bs(urlopen(url), "html.parser")
    for image in soup.findAll("img"):
        src = image.get("src")
        if not src:
            continue  # skip <img> tags without a src attribute
        print("Image: %s" % src)
        filename = src.split("/")[-1]
        outpath = os.path.join(out_folder, filename)
        # urljoin resolves absolute, root-relative and relative srcs alike,
        # unlike splicing the src into the path slot of the parsed base URL
        # (which mangles absolute and protocol-relative URLs).
        urlretrieve(urljoin(url, src), outpath)
def _usage():
print("usage: python dumpimages.py http://example.com [outpath]")
if __name__ == "__main__":
    # The last argument may be either the URL or the output folder: when it
    # does not look like a URL, treat it as the folder and take the URL from
    # the slot before it.
    args = sys.argv
    destination = "/test/"
    target = args[-1]
    if not target.lower().startswith("http"):
        destination = args[-1]
        target = args[-2]
        if not target.lower().startswith("http"):
            _usage()
            sys.exit(-1)
    main(target, destination)
编辑:现在您可以指定输出文件夹。
`open(...).write(urlopen(...).read())`
可以被替换为 `urllib.request.urlretrieve()`
。 - jfs

Ryan的解决方案很好,但如果图像源URL是绝对URL,或者是其他无法简单拼接到主页面URL上的形式,则会失败。urljoin 能识别绝对和相对URL,因此请用以下内容替换中间的循环:
# Python 3 version of the replacement loop: urljoin resolves both
# absolute and relative image URLs against the page URL.
# (The original used the py2 print statement and the py2 module path
# `urlparse.urljoin`; in py3 it is `urllib.parse.urljoin`.)
for image in soup.findAll("img"):
    print("Image: %(src)s" % image)
    image_url = urljoin(url, image["src"])
    filename = image["src"].split("/")[-1]
    outpath = os.path.join(out_folder, filename)
    urlretrieve(image_url, outpath)
def download_photo(self, img_url, filename):
    """Download *img_url* in 64 KiB chunks into DOWNLOADED_IMAGE_PATH.

    Args:
        img_url: URL of the image to fetch.
        filename: Name appended to DOWNLOADED_IMAGE_PATH to form the
            destination path (note: plain concatenation, no separator added).

    Returns:
        The path of the file that was written.
    """
    # py3: the py2 `urllib.urlopen` and `file()` builtin no longer exist.
    from urllib.request import urlopen

    file_path = "%s%s" % (DOWNLOADED_IMAGE_PATH, filename)
    # Context managers guarantee both handles are closed even if a
    # read/write raises (the original leaked them on error).
    with urlopen(img_url) as image_on_web, \
            open(file_path, "wb") as downloaded_image:
        while True:
            buf = image_on_web.read(65536)
            if not buf:
                break
            downloaded_image.write(buf)
    return file_path
当我修改的是 while
循环本身(而不是其内容)时,它对我来说运行良好! - Ron

您需要下载页面并解析HTML文档,使用正则表达式查找您的图像并下载它。您可以使用urllib2进行下载,并使用Beautiful Soup解析HTML文件。
删除一些代码行,你将只得到图像的 img
标签。
使用Python 3+ Requests、BeautifulSoup和其他标准库。
import os, sys
import requests
from urllib import parse
from bs4 import BeautifulSoup
import re
def savePageImages(url, imagespath='images'):
    """Download every <img> of the page at *url* into folder *imagespath*.

    Files already present on disk are skipped; individual download errors
    are reported on stderr without aborting the rest of the page.

    Args:
        url: Page to scrape.
        imagespath: Destination folder, created if missing ('images' default).
    """
    def soupfindnSave(pagefolder, tag2find='img', inner='src'):
        # makedirs(exist_ok=True) also tolerates nested paths; mkdir did not.
        os.makedirs(pagefolder, exist_ok=True)
        for res in soup.findAll(tag2find):
            if not res.has_attr(inner):  # the inner attribute (file ref) MUST exist
                continue
            try:
                # Split name/extension, then strip special characters from
                # the name so it is a safe filename.
                filename, ext = os.path.splitext(os.path.basename(res[inner]))
                # Raw string: '\W' in a plain literal is an invalid-escape
                # warning (an error in future Pythons).
                filename = re.sub(r'\W+', '', filename) + ext
                fileurl = parse.urljoin(url, res.get(inner))
                filepath = os.path.join(pagefolder, filename)
                if not os.path.isfile(filepath):  # not downloaded yet
                    # Fetch BEFORE opening the file so a failed request does
                    # not leave an empty file behind; raise_for_status keeps
                    # HTTP error pages from being saved as "images".
                    filebin = session.get(fileurl)
                    filebin.raise_for_status()
                    with open(filepath, 'wb') as fh:
                        fh.write(filebin.content)
            except Exception as exc:
                print(exc, file=sys.stderr)

    session = requests.Session()
    # ... whatever other requests config you need here
    response = session.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    soupfindnSave(imagespath, 'img', 'src')
使用以下方式将google.com
页面图像保存在名为google_images
的文件夹中:
savePageImages('https://www.google.com', 'google_images')
如果请求需要授权,请像下面这样传入HTTP基本认证信息:
# Authenticated image download: pass HTTP basic-auth credentials.
r_img = requests.get(img_url, auth=(username, password))
# A context manager guarantees the file is closed even if the write raises
# (the original open/close pair leaked the handle on error).
with open('000000.jpg', 'wb') as f:
    f.write(r_img.content)
import urllib.request as req

# Fetch the bytes at `image_link` and save them to `image_location`.
with req.urlopen(image_link) as response, \
        open(image_location, "wb") as image_object:
    image_object.write(response.read())