Method 1:

Scrapy creates a Reactor which cannot be reused after stop, but if you run the Crawler in a separate process, the new process will have to create its own new Reactor.
import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page_range):
    # a fresh process gets a fresh Reactor
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    p = multiprocessing.Process(target=run_crawler, args=(keyword, page_range))
    p.start()
    p.join()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False
If you use threading instead of multiprocessing, it will not work, because threads share variables, so a new thread will use the same Reactor as the previous thread.
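For illustration, a minimal failing sketch (my construction; it assumes the MySpider class from the minimal working code below). The second call fails because start() stops the process-wide Reactor, and a new thread in the same process would reuse exactly that same Reactor:

from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    c = CrawlerProcess()
    c.crawl(MySpider, keyword, page)   # MySpider as defined below
    c.start()   # runs, then stops, the process-wide Reactor

run_crawler('book', '1')   # works
run_crawler('book', '1')   # raises twisted.internet.error.ReactorNotRestartable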
Minimal working code (tested on Linux):
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
import multiprocessing
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'FEEDS': {'output.csv': {'format': 'csv'}},
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        p = multiprocessing.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
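A side note (my assumption; the answer above was only tested on Linux, where fork is the default start method): on Windows and macOS, multiprocessing spawns a fresh interpreter for each child, which the if __name__ == '__main__' guard already supports. The same loop with spawn forced explicitly, so every child starts with a fresh interpreter and therefore a fresh Reactor:

import multiprocessing as mp

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    ctx = mp.get_context('spawn')   # fresh interpreter (and Reactor) per child
    running = True
    while running:
        p = ctx.Process(target=run_crawler, args=(keyword, page))
        p.start()
        p.join()

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False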
Method 2:

I found an article via Google: Restarting a Twisted Reactor. It is an old post which uses del to remove the module twisted from memory and then imports it again.
from scrapy.crawler import CrawlerProcess

keyword = input("enter keyword: ")
page_range = input("enter page range: ")

flag = True
while flag:
    process = CrawlerProcess()
    process.crawl(crawler1, keyword, page_range)
    process.crawl(crawler2, keyword, page_range)
    process.crawl(crawler3, keyword, page_range)
    process.start()

    isContinue = input("Do you want to continue? (y/n): ")
    if isContinue == 'n':
        flag = False

    # drop the stopped Reactor from memory and install a fresh one
    import sys
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()
Minimal working code (tested on Linux):
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
from scrapy.crawler import CrawlerProcess

def run_crawler(keyword, page):
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'FEEDS': {'output.csv': {'format': 'csv'}},
    })
    c.crawl(MySpider, keyword, page)
    c.crawl(MySpider, keyword, int(page)+1)
    c.crawl(MySpider, keyword, int(page)+2)
    c.start()
if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    running = True
    while running:
        run_crawler(keyword, page)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False

        # remove the stopped Reactor from memory and install a new one
        # before the next iteration
        import sys
        del sys.modules['twisted.internet.reactor']
        from twisted.internet import reactor
        from twisted.internet import default
        default.install()
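The reset step can also be wrapped in a small helper; this is just a repackaging of the exact same calls, my arrangement rather than part of the original answer:

import sys

def reset_reactor():
    '''remove the stopped Reactor and install a fresh default one'''
    del sys.modules['twisted.internet.reactor']
    from twisted.internet import reactor
    from twisted.internet import default
    default.install()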
Method 3:

It seems you could use CrawlerRunner instead of CrawlerProcess - but I haven't fully tested it. Based on the last example in the documentation, Running multiple spiders in the same process, I created code that runs the while loop inside the reactor (so the reactor never has to be stopped). It first starts one Spider, then runs the next one, then asks whether to continue, and then runs the first Spider again, followed by the next one. It does not run all the Spiders at the same time, but perhaps that could be changed.
import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, keyword, page, *args, **kwargs):
        '''generate start_urls list'''
        super().__init__(*args, **kwargs)
        self.keyword = keyword
        self.page = int(page)
        self.start_urls = [f'https://books.toscrape.com/catalogue/page-{page}.html']

    def parse(self, response):
        print('[parse] url:', response.url)
        for book in response.css('article.product_pod'):
            title = book.css('h3 a::text').get()
            url = book.css('img::attr(src)').get()
            url = response.urljoin(url)
            yield {'page': self.page, 'keyword': self.keyword, 'title': title, 'image': url}
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        yield runner.crawl(MySpider, keyword, page)
        yield runner.crawl(MySpider, keyword, int(page)+1)
        yield runner.crawl(MySpider, keyword, int(page)+2)

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()

if __name__ == '__main__':
    keyword = input("enter keyword: ")
    page = input("enter page: ")

    configure_logging()
    runner = CrawlerRunner({
        'USER_AGENT': 'Mozilla/5.0',
        'FEEDS': {'output.csv': {'format': 'csv'}},
    })

    run_crawler()
    reactor.run()
EDIT:

Now all the spiders run at the same time: instead of yielding each crawl() separately (which waits for one spider to finish before starting the next), all crawls are scheduled first and the code waits on runner.join(), which fires once every scheduled crawl has finished.
@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        d = runner.join()
        yield d

        answer = input('Repeat [Y/n]? ').strip().lower()
        if answer == 'n':
            running = False
            reactor.stop()
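One caveat with this version (my observation, not part of the original answer): input() blocks the Reactor thread while waiting at the prompt. Twisted's deferToThread can move the prompt to a worker thread so the event loop stays responsive:

from twisted.internet import reactor, defer, threads

@defer.inlineCallbacks
def run_crawler():
    running = True
    while running:
        runner.crawl(MySpider, keyword, page)
        runner.crawl(MySpider, keyword, int(page)+1)
        runner.crawl(MySpider, keyword, int(page)+2)
        yield runner.join()

        # prompt in a worker thread; the Reactor keeps running meanwhile
        answer = yield threads.deferToThread(input, 'Repeat [Y/n]? ')
        if answer.strip().lower() == 'n':
            running = False
            reactor.stop()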
Comments:

"Scrapy runs a special event loop (called the Reactor in the module twisted) which, once stopped, cannot be used again. You would have to check the twisted documentation to see whether the Reactor can be reset." - furas

"There is an old post, Restarting a Twisted Reactor, which uses del to remove the module twisted from memory and then imports it again." - furas