用Python抓取Google Shopping

3

我会尝试使用Selenium。 - Bill Bell
1个回答

4
你可以使用 parsel 和 requests 这两个库来实现此功能。因为所有需要的内容都在 HTML 中(不是通过 JavaScript 渲染的),所以无需使用 selenium。
请确保在请求头中设置 User-Agent:requests 默认的 User-Agent 是 python-requests,谷歌由此可以识别出请求来自脚本,并可能将其屏蔽。你可以查看自己当前的 User-Agent,也可以参考 User-Agent 列表(如果需要在每次请求时轮换 User-Agent)。
如果难以确定要使用哪些CSS选择器来提取正确的数据,请查看SelectorGadget Chrome扩展程序,它允许您在浏览器中单击所需元素并返回CSS选择器。
下面的代码使用正则表达式从内联 JSON 中提取图片数据,并使用 CSS 选择器提取其余数据。完整示例可在在线 IDE 中查看。
import requests, json, re
from parsel import Selector

# Query-string parameters sent with the search request.
# https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
params = {
    "q": "minecraft",
    "hl": "en",     # language
    "gl": "us",     # country of the search, US -> USA
    "tbm": "shop"   # google search shopping tab
}

# Send a browser-like User-Agent instead of the library default.
# https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page and
# silently producing an empty result list.
html.raise_for_status()
# Parsed document shared by the extraction functions below.
selector = Selector(html.text)

def get_original_images():
    """Return the product image URLs stored in the page's inline scripts.

    For every element matched by ``.Qlx7of .sh-dgr__grid-result`` the value
    of its ``data-pck`` attribute is searched for in the concatenated
    ``<script>`` tags, using the ``var _u='<url>';var _i='<data-pck>';``
    pattern, and the captured URL is unicode-unescaped.

    Returns:
        list[str]: decoded image URLs in result order. A result without a
        ``data-pck`` attribute, or without a matching script entry, adds
        nothing, so the list can be shorter than the number of results.
    """
    # Each script is put on its own line; presumably so that the "." in the
    # pattern below (which does not match newlines) cannot span two scripts.
    all_script_tags = "".join(
        script.replace("</script>", "</script>\n")
        for script in selector.css("script").getall()
    )

    image_urls = []

    for result in selector.css(".Qlx7of .sh-dgr__grid-result"):
        image_id = result.attrib.get("data-pck")
        if image_id is None:
            # No id to look up; skip instead of raising KeyError.
            continue

        # https://regex101.com/r/udjFUq/1
        # The id is escaped so regex metacharacters in it are matched literally.
        match = re.search(
            rf"var\s?_u='(.*?)';var\s?_i='{re.escape(image_id)}';",
            all_script_tags,
        )

        if match:
            # The URL is stored with escape sequences (e.g. \xNN / \uNNNN); decode them.
            url_decode = bytes(match.group(1), "ascii").decode("unicode-escape")
            image_urls.append(url_decode)

    return image_urls

def get_suggested_search_data():
    """Extract the shopping results from the parsed page, print and return them.

    Each element matched by ``.Qlx7of .i0X6df`` is paired by position with an
    image URL from ``get_original_images()`` and turned into a dict of
    product and store fields. Fields whose element is missing are ``None``.

    Returns:
        list[dict]: one dict per paired result; the same list is also
        printed to stdout as indented JSON.
    """
    base_url = "https://www.google.com"

    def absolute(href):
        # hrefs on the page are site-relative; leave a missing href as-is
        # instead of raising TypeError on ``str + None``.
        return base_url + href if href else href

    google_shopping_data = []

    # NOTE: zip() pairs by position and stops at the shorter sequence, so a
    # result without an image shifts or truncates the pairing.
    for result, thumbnail in zip(selector.css(".Qlx7of .i0X6df"), get_original_images()):
        title = result.css(".tAxDx::text").get()
        product_link = absolute(result.css(".Lq5OHe::attr(href)").get())
        product_rating = result.css(".NzUzee .Rsc7Yb::text").get()
        product_reviews = result.css(".NzUzee > div::text").get()
        price = result.css(".a8Pemb::text").get()
        store = result.css(".aULzUe::text").get()
        store_link = absolute(result.css(".eaGTj div a::attr(href)").get())
        delivery = result.css(".vEjMR::text").get()

        store_rating_value = result.css(".zLPF4b .XEeQ2 .QIrs8::text").get()
        # https://regex101.com/r/kAr8I5/1
        # Keep only the leading non-whitespace token of the rating text.
        store_rating = re.search(r"^\S+", store_rating_value).group() if store_rating_value else store_rating_value

        store_reviews_value = result.css(".zLPF4b .XEeQ2 .ugFiYb::text").get()
        # https://regex101.com/r/axCQAX/1
        # group(1) is the captured token without the optional leading "(".
        store_reviews = re.search(r"^\(?(\S+)", store_reviews_value).group(1) if store_reviews_value else store_reviews_value

        store_reviews_link = absolute(result.css(".zLPF4b .XEeQ2 .QhE5Fb::attr(href)").get())
        compare_prices_link = absolute(result.css(".Ldx8hd .iXEZD::attr(href)").get())

        google_shopping_data.append({
            "title": title,
            "product_link": product_link,
            "product_rating": product_rating,
            "product_reviews": product_reviews,
            "price": price,
            "store": store,
            "thumbnail": thumbnail,
            "store_link": store_link,
            "delivery": delivery,
            "store_rating": store_rating,
            "store_reviews": store_reviews,
            "store_reviews_link": store_reviews_link,
            "compare_prices_link": compare_prices_link,
        })

    print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))

    return google_shopping_data

输出的部分:

[
  {
    "title": "Minecraft Mini Mob 4-Piece Figure Mood Light Set | Battery Operated",
    "product_link": "https://www.google.com/shopping/product/15256303704867209410?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wII7xY",
    "product_rating": "5.0",
    "product_reviews": null,
    "price": "$29.99",
    "store": "Oriental Trading Company",
    "thumbnail": "https://encrypted-tbn1.gstatic.com/shopping?q=tbn:ANd9GcS7Xddy5pF2gPiRFpF0E1YumatHuyBW3HYiltvZrimFoP_r3yAGWWMcYcnhaRrb7prHSAc93lWBEGQEGJ9NUCBkvQuvMCfxFXWXjY6oqrLebAmDtqcwpY6l&usqp=CAE",
    "store_link": "https://www.google.com/url?url=https://www.orientaltrading.com/minecraft-mini-mob-4-piece-figure-mood-light-set-battery-operated-a2-14260956.fltr%3Fsku%3D14260956%26cm_mmc%3DGooglePLA-_-Free-_-Google-_-14260956%26BP%3DPS544&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECPEW&usg=AOvVaw0KxuR61pE4aEt37xEXBI2O",
    "delivery": "Delivery by Wed, Dec 7",
    "store_rating": "4.7",
    "store_reviews": "45",
    "store_reviews_link": "https://www.google.com/url?url=https://www.google.com/shopping/ratings/account/metrics%3Fq%3Dorientaltrading.com%26c%3DUS%26v%3D19%26hl%3Den&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ9-wCCPkW&usg=AOvVaw2WL-Mo7EBJ9N8C4NlQEJ_n",
    "compare_prices_link": "https://www.google.com/shopping/product/15256303704867209410/offers?q=minecraft&hl=en&gl=us&prds=eto:1254008264419549404_0,pid:12683928239145059141,rsk:PC_5607977610062065270&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECPoW"
  }, # other results
  {
    "title": "Minecraft Explorer Kit - Build Minecraft in The Real World",
    "product_link": "https://www.google.com/shopping/product/10073223339448590299?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ8wIIzRg",
    "product_rating": null,
    "product_reviews": null,
    "price": "$99.99",
    "store": "Make-A-Fort",
    "thumbnail": "https://encrypted-tbn2.gstatic.com/shopping?q=tbn:ANd9GcTJ55chkN9FYuwRQbupSWJRdSS70Y8XHKxQEUvOOuwHKbuBaSekHcWo9wndDFA-5_ZMlIdJFpWqMwpyMd9RDmUEiQ_DpaSaigwmPHBceO5rg885VEh_YbacBw&usqp=CAE",
    "store_link": "https://www.google.com/url?url=https://www.makeafort.fun/shop/original-fort-kits/1mek%3Futm_source%3Dgoogle-shopping%26utm_medium%3Dcpc&rct=j&q=&esrc=s&sa=U&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQguUECM4Y&usg=AOvVaw3ZzxgI8ILnCg0-Nd78JH7F",
    "delivery": "Delivery by Thu, Dec 8",
    "store_rating": null,
    "store_reviews": null,
    "store_reviews_link": null,
    "compare_prices_link": "https://www.google.com/shopping/product/10073223339448590299/offers?q=minecraft&hl=en&gl=us&prds=eto:6849135307273759460_0,pid:14322876622065709117&sa=X&ved=0ahUKEwjBncnV0dD7AhX3mWoFHdjpDgYQ3q4ECNEY"
  }
]

或者您可以使用SerpApi的Google购物结果API进行操作:

from serpapi import GoogleSearch
import requests, lxml, os, json

# Search parameters handed to SerpApi.
params = dict(
    q="minecraft",                      # search query
    tbm="shop",                         # shop results
    location="Dallas",                  # location the search is issued from
    hl="en",                            # language of the search
    gl="us",                            # country of the search
    # Read the SerpApi key from the API_KEY environment variable.
    # https://docs.python.org/3/library/os.html#os.environ
    api_key=os.environ.get("API_KEY"),
)

# The extraction itself happens on the SerpApi backend.
search = GoogleSearch(params)
# get_dict() turns the JSON response into a Python dict.
results = search.get_dict()

google_shopping_data = results["shopping_results"]

print(json.dumps(google_shopping_data, indent=2, ensure_ascii=False))

输出的一部分:

[
  # other results
  {
    "position": 80,
    "title": "Minecraft Steve Vacuform Mask",
    "link": "https://www.fun.com/minecraft-steve-vacuform-mask.html?mpid=191051&srsltid=AYJSbAfU8d_TRhvnvhvi9-U79_BB8bgh_dTHGkD75Dt6mq8nK0apj3hUOjY",
    "product_link": "https://www.google.com/shopping/product/15914996745618368243?gl=us",
    "product_id": "15914996745618368243",
    "serpapi_product_api": "https://serpapi.com/search.json?device=desktop&engine=google_product&gl=us&google_domain=google.com&hl=en&location=Dallas&product_id=15914996745618368243",
    "source": "Fun.com",
    "price": "$12.99",
    "extracted_price": 12.99,
    "rating": 4.1,
    "reviews": 40,
    "extensions": [
      "15% OFF"
    ],
    "thumbnail": "https://encrypted-tbn0.gstatic.com/shopping?q=tbn:ANd9GcQe1LOeSKWgFvhVt_bct6rRohpAvl2023AqbnqE78dxwocrz7Sbre-tQ5s9M26_4q8bp86eRzI9PvfXwaBLmaESZlXwxH5HF9monqhr7jyChYqSLHWo9PcUFmU&usqp=CAE",
    "tag": "15% OFF",
    "delivery": "$4.99 delivery"
  }
]

如果您想更好地理解上述代码的作用,可以查看一篇专门介绍如何使用 Python 爬取 Google 购物选项卡的博客文章:《Scraping Google Shopping Tab with Python》。

声明:我为SerpApi工作。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接