我试图用 Scrapy 爬取子域名中包含下划线的页面,例如:https://taxi-3-extreme-rush_1.en.softonic.com
我查阅了相关规范,发现子域名可以包含下划线。我也尝试过使用 link.encode('idna') 对链接进行编码,但仍然无法正常工作。
我遇到了以下错误:
Traceback (most recent call last):
File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 1297, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File "/usr/lib64/python2.7/site-packages/twisted/python/failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File "/usr/lib64/python2.7/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "/usr/lib64/python2.7/site-packages/scrapy/utils/defer.py", line 45, in mustbe_deferred
result = f(*args, **kw)
File "/usr/lib64/python2.7/site-packages/scrapy/core/downloader/handlers/__init__.py", line 65, in download_request
return handler.download_request(request, spider)
File "/usr/lib64/python2.7/site-packages/scrapy/core/downloader/handlers/http11.py", line 60, in download_request
return agent.download_request(request)
File "/usr/lib64/python2.7/site-packages/scrapy/core/downloader/handlers/http11.py", line 285, in download_request
method, to_bytes(url, encoding='ascii'), headers, bodyproducer)
File "/usr/lib64/python2.7/site-packages/twisted/web/client.py", line 1596, in request
endpoint = self._getEndpoint(parsedURI)
File "/usr/lib64/python2.7/site-packages/twisted/web/client.py", line 1580, in _getEndpoint
return self._endpointFactory.endpointForURI(uri)
File "/usr/lib64/python2.7/site-packages/twisted/web/client.py", line 1456, in endpointForURI
uri.port)
File "/usr/lib64/python2.7/site-packages/scrapy/core/downloader/contextfactory.py", line 59, in creatorForNetloc
return ScrapyClientTLSOptions(hostname.decode("ascii"), self.getContext())
File "/usr/lib64/python2.7/site-packages/twisted/internet/_sslverify.py", line 1201, in __init__
self._hostnameBytes = _idnaBytes(hostname)
File "/usr/lib64/python2.7/site-packages/twisted/internet/_sslverify.py", line 87, in _idnaBytes
return idna.encode(text)
File "/usr/lib/python2.7/site-packages/idna/core.py", line 355, in encode
result.append(alabel(label))
File "/usr/lib/python2.7/site-packages/idna/core.py", line 276, in alabel
check_label(label)
File "/usr/lib/python2.7/site-packages/idna/core.py", line 253, in check_label
raise InvalidCodepoint('Codepoint {0} at position {1} of {2} not allowed'.format(_unot(cp_value), pos+1, repr(label)))
InvalidCodepoint: Codepoint U+005F at position 20 of u'taxi-3-extreme-rush_1' not allowed