如果你只是寻找一个高效的爬虫,你可以使用这个... 这个爬虫可以在一个不错的服务器上在300秒内爬取大约10,000个网页。这个爬虫是用Python编写的,PHP也有类似的curl实现,但需要注意的是PHP不支持多线程,而多线程是考虑高效爬虫时一个重要的方面。
import sys
import pycurl
try:
import signal
from signal import SIGPIPE, SIG_IGN
signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
pass
num_conn = 10
try:
if sys.argv[1] == "-":
urls = sys.stdin.readlines()
else:
urls = open(sys.argv[1]).readlines()
if len(sys.argv) >= 3:
num_conn = int(sys.argv[2])
except:
print "Usage: %s <file with URLs to fetch> [<# of concurrent connections>]" % sys.argv[0]
raise SystemExit
queue = []
for url in urls:
url = url.strip()
if not url or url[0] == "#":
continue
filename = "doc_%03d.dat" % (len(queue) + 1)
queue.append((url, filename))
assert queue, "no URLs given"
num_urls = len(queue)
num_conn = min(num_conn, num_urls)
assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"
m = pycurl.CurlMulti()
m.handles = []
for i in range(num_conn):
c = pycurl.Curl()
c.fp = None
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.setopt(pycurl.CONNECTTIMEOUT, 30)
c.setopt(pycurl.TIMEOUT, 300)
c.setopt(pycurl.NOSIGNAL, 1)
m.handles.append(c)
freelist = m.handles[:]
num_processed = 0
while num_processed < num_urls:
while queue and freelist:
url, filename = queue.pop(0)
c = freelist.pop()
c.fp = open(filename, "wb")
c.setopt(pycurl.URL, url)
c.setopt(pycurl.WRITEDATA, c.fp)
m.add_handle(c)
c.filename = filename
c.url = url
while 1:
ret, num_handles = m.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
while 1:
num_q, ok_list, err_list = m.info_read()
for c in ok_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
freelist.append(c)
for c, errno, errmsg in err_list:
c.fp.close()
c.fp = None
m.remove_handle(c)
print "Failed: ", c.filename, c.url, errno, errmsg
freelist.append(c)
num_processed = num_processed + len(ok_list) + len(err_list)
if num_q == 0:
break
m.select(1.0)
for c in m.handles:
if c.fp is not None:
c.fp.close()
c.fp = None
c.close()
m.close()
如果您正在寻找完整的价格比较系统,实际上您正在寻找一个定制的复杂网络项目。如果您找到了,请在此处分享链接;否则,如果您有兴趣进行自由职业工作,可以与我联系 :)