我需要爬取Google Shopping的数据,例如这个链接https://www.google.com/?gfe_rd=cr&ei=BtcRWeX_D8aAsAHDgZ2QAw#q=hooker+furniture+5183-75300&tbm=shop。但是从服务器响应中我只收到了文本而没有商品信息。即使在Google Chrome的源代码查看器中,我也无法看到商品详情。请问我该发送什么请求以获取所有商品详情数据?
可以使用 parsel 和 requests 库来实现此功能,因为所有需要的内容都在 HTML 中(不是通过 JavaScript 渲染),所以无需使用 selenium。requests 的默认用户代理是 python-requests,这样谷歌就能识别出这是脚本发送的请求并可能将其阻止,因此请在请求头中传入自定义的用户代理。检查你的用户代理。用户代理列表(如果需要在每个请求中轮换用户代理)。
import requests, json, re
from parsel import Selector
# Query-string parameters for the Google search endpoint.
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "minecraft",
    "hl": "en",     # language
    "gl": "us",     # country of the search, US -> USA
    "tbm": "shop",  # google search shopping tab
}

# Send a browser-like User-Agent instead of the requests default.
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
}

# Fetch the results page once; the functions below read `selector`.
html = requests.get(
    "https://www.google.com/search",
    headers=headers,
    params=params,
    timeout=30,
)
selector = Selector(text=html.text)
def get_original_images():
    """Return the decoded image URLs found for the shopping grid results.

    For every ``.Qlx7of .sh-dgr__grid-result`` element in the module-level
    ``selector``, the element's ``data-pck`` attribute is looked up in the
    page's inline scripts (``var _u='<url>';var _i='<data-pck>';``) and the
    matching ``_u`` value is unicode-unescaped.

    Returns:
        list[str]: one URL per result for which a script entry was found, in
        document order. Results without a ``data-pck`` attribute or without a
        matching script entry contribute nothing to the list.
    """
    # Terminate every script with a newline: `.` does not match "\n", so the
    # lazy `(.*?)` below cannot run from one <script> element into the next.
    all_script_tags = "".join(
        script.replace("</script>", "</script>\n")
        for script in selector.css("script").getall()
    )

    image_urls = []
    for result in selector.css(".Qlx7of .sh-dgr__grid-result"):
        pck = result.attrib.get("data-pck")
        if pck is None:
            # No key to search the scripts with; skip instead of raising KeyError.
            continue

        # https://regex101.com/r/udjFUq/1
        # The attribute value is escaped so it is matched literally even if it
        # contains regex metacharacters.
        match = re.search(
            rf"var\s?_u='(.*?)';var\s?_i='{re.escape(pck)}';",
            all_script_tags,
        )
        if match:
            # The URL is stored with escape sequences (e.g. \x3d); decode them.
            url_decode = bytes(match.group(1), "ascii").decode("unicode-escape")
            image_urls.append(url_decode)

    return image_urls
def _absolute_google_url(href):
    """Prefix *href* with the Google origin; return falsy values unchanged."""
    return "https://www.google.com" + href if href else href


def _first_match(pattern, text):
    """Return the text matched by *pattern* in *text*.

    Falsy *text* is returned unchanged; ``None`` is returned when the pattern
    does not match.
    """
    if not text:
        return text
    match = re.search(pattern, text)
    return match.group() if match else None


def get_suggested_search_data():
    """Extract the shopping results from the fetched page and print them as JSON.

    Iterates the ``.Qlx7of .i0X6df`` elements of the module-level ``selector``,
    pairs each one positionally with a URL from ``get_original_images()``,
    collects the fields into a dict per result and prints the list with
    ``json.dumps``. A field whose element is not present in a result is
    ``None``. Nothing is returned.
    """
    google_shopping_data = []

    # NOTE(review): zip() pairs by position and stops at the shorter input.
    # get_original_images() only yields URLs it could find, so a result with
    # no image may shift later thumbnails and drop trailing results - confirm.
    for result, thumbnail in zip(selector.css(".Qlx7of .i0X6df"), get_original_images()):
        title = result.css(".tAxDx::text").get()
        # Every link goes through the same guard so a missing href yields
        # None instead of raising TypeError on `str + None`.
        product_link = _absolute_google_url(result.css(".Lq5OHe::attr(href)").get())
        product_rating = result.css(".NzUzee .Rsc7Yb::text").get()
        product_reviews = result.css(".NzUzee > div::text").get()
        price = result.css(".a8Pemb::text").get()
        store = result.css(".aULzUe::text").get()
        store_link = _absolute_google_url(result.css(".eaGTj div a::attr(href)").get())
        delivery = result.css(".vEjMR::text").get()

        store_rating_value = result.css(".zLPF4b .XEeQ2 .QIrs8::text").get()
        # https://regex101.com/r/kAr8I5/1
        store_rating = _first_match(r"^\S+", store_rating_value)

        store_reviews_value = result.css(".zLPF4b .XEeQ2 .ugFiYb::text").get()
        # https://regex101.com/r/axCQAX/1
        store_reviews = _first_match(r"^\(?(\S+)", store_reviews_value)

        store_reviews_link = _absolute_google_url(
            result.css(".zLPF4b .XEeQ2 .QhE5Fb::attr(href)").get()
        )
        compare_prices_link = _absolute_google_url(
            result.css(".Ldx8hd .iXEZD::attr(href)").get()
        )

        google_shopping_data.append({
            "title": title,
            "product_link": product_link,
            "product_rating": product_rating,
            "product_reviews": product_reviews,
            "price": price,
            "store": store,
            "thumbnail": thumbnail,
            "store_link": store_link,
            "delivery": delivery,
            "store_rating": store_rating,
            "store_reviews": store_reviews,
            "store_reviews_link": store_reviews_link,
            "compare_prices_link": compare_prices_link,
        })

    print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))
输出的部分:
[
{
"title": "Minecraft Mini Mob 4-Piece Figure Mood Light Set | Battery Operated",
"product_link": "https://www.google.com/shopping/product/15256303704867209410?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wII7xY",
"product_rating": "5.0",
"product_reviews": null,
"price": "$29.99",
"store": "Oriental Trading Company",
"thumbnail": "https://encrypted-tbn1.gstatic.com/shopping?q=tbn:ANd9GcS7Xddy5pF2gPiRFpF0E1YumatHuyBW3HYiltvZrimFoP_r3yAGWWMcYcnhaRrb7prHSAc93lWBEGQEGJ9NUCBkvQuvMCfxFXWXjY6oqrLebAmDtqcwpY6l&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.orientaltrading.com/minecraft-mini-mob-4-piece-figure-mood-light-set-battery-operated-a2-14260956.fltr%3Fsku%3D14260956%26cm_mmc%3DGooglePLA-_-Free-_-Google-_-14260956%26BP%3DPS544&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECPEW&usg=AOvVaw0KxuR61pE4aEt37xEXBI2O",
"delivery": "Delivery by Wed, Dec 7",
"store_rating": "4.7",
"store_reviews": "45",
"store_reviews_link": "https://www.google.com/url?url=https://www.google.com/shopping/ratings/account/metrics%3Fq%3Dorientaltrading.com%26c%3DUS%26v%3D19%26hl%3Den&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ9-wCCPkW&usg=AOvVaw2WL-Mo7EBJ9N8C4NlQEJ_n",
"compare_prices_link": "https://www.google.com/shopping/product/15256303704867209410/offers?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECPoW"
}, # other results
{
"title": "Minecraft Explorer Kit - Build Minecraft in The Real World",
"product_link": "https://www.google.com/shopping/product/10073223339448590299?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wIIzRg",
"product_rating": null,
"product_reviews": null,
"price": "$99.99",
"store": "Make-A-Fort",
"thumbnail": "https://encrypted-tbn2.gstatic.com/shopping?q=tbn:ANd9GcTJ55chkN9FYuwRQbupSWJRdSS70Y8XHKxQEUvOOuwHKbuBaSekHcWo9wndDFA-5_ZMlIdJFpWqMwpyMd9RDmUEiQ_DpaSaigwmPHBceO5rg885VEh_YbacBw&usqp=CAE",
"store_link": "https://www.google.com/url?url=https://www.makeafort.fun/shop/original-fort-kits/1mek%3Futm_source%3Dgoogle-shopping%26utm_medium%3Dcpc&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECM4Y&usg=AOvVaw3ZzxgI8ILnCg0-Nd78JH7F",
"delivery": "Delivery by Thu, Dec 8",
"store_rating": null,
"store_reviews": null,
"store_reviews_link": null,
"compare_prices_link": "https://www.google.com/shopping/product/10073223339448590299/offers?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECNEY"
}
]
或者您可以使用SerpApi的Google购物结果API进行操作:
from serpapi import GoogleSearch
import requests, lxml, os, json
# Search parameters sent to SerpApi.
params = {
    "q": "minecraft",        # search query
    "tbm": "shop",           # shop results
    "location": "Dallas",    # location from where search comes from
    "hl": "en",              # language of the search
    "gl": "us",              # country of the search
    # The key is read from the environment rather than hard-coded.
    # https://docs.python.org/3/library/os.html#os.getenv
    "api_key": os.getenv("API_KEY"),  # your serpapi api
}

# Data extraction happens on the SerpApi backend.
search = GoogleSearch(params)
# JSON -> Python dict
results = search.get_dict()

google_shopping_data = results["shopping_results"]
print(json.dumps(google_shopping_data, ensure_ascii=False, indent=2))
输出的一部分:
[
# other results
{
"position": 80,
"title": "Minecraft Steve Vacuform Mask",
"link": "https://www.fun.com/minecraft-steve-vacuform-mask.html?mpid=191051&srsltid=AYJSbAfU8d_TRhvnvhvi9-U79_BB8bgh_dTHGkD75Dt6mq8nK0apj3hUOjY",
"product_link": "https://www.google.com/shopping/product/15914996745618368243?gl=us",
"product_id": "15914996745618368243",
"serpapi_product_api": "https://serpapi.com/search.json?device=desktop&engine=google_product&gl=us&google_domain=google.com&hl=en&location=Dallas&product_id=15914996745618368243",
"source": "Fun.com",
"price": "$12.99",
"extracted_price": 12.99,
"rating": 4.1,
"reviews": 40,
"extensions": [
"15% OFF"
],
"thumbnail": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQe1LOeSKWgFvhVt_bct6rRohpAvl2023AqbnqE78dxwocrz7Sbre-tQ5s9M26_4q8bp86eRzI9PvfXwaBLmaESZlXwxH5HF9monqhr7jyChYqSLHWo9PcUFmU&usqp=CAE",
"tag": "15% OFF",
"delivery": "$4.99 delivery"
}
]
如果您想更好地理解所示代码的作用,可以查看一篇专门介绍如何使用 Python 爬取 Google 购物选项卡的博客文章:《Scraping Google Shopping Tab with Python》。
声明:我为SerpApi工作。