我是Python的初学者,仅尝试使用requests
和BeautifulSoup
模块来抓取网页。 我正在对这个网站进行请求。
这是我的简单代码:
import requests, time, re, json
from bs4 import BeautifulSoup as BS
url = "https://www.jobstreet.co.id/en/job-search/job-vacancy.php?ojs=6"
def list_jobs():
try:
with requests.session() as s:
st = time.time()
s.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
req = s.get(url)
soup = BS(req.text,'html.parser')
attr = soup.findAll('div',class_='position-title header-text')
pttr = r".?(.*)Rank=\d+"
lists = {"status":200,"result":[]}
for a in attr:
sr = re.search(pttr, a.find("a")["href"])
if sr:
title = a.find('a')['title'].replace("Lihat detil lowongan -","").replace("\r","").replace("\n","")
url = a.find('a')['href']
lists["result"].append({
"title":title,
"url":url,
"detail":detail_jobs(url)
})
print(json.dumps(lists, indent=4))
end = time.time() - st
print(f"\n{end} second")
except:
pass
def detail_jobs(find_url):
try:
with requests.session() as s:
s.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:57.0) Gecko/20100101 Firefox/57.0'}
req = s.get(find_url)
soup = BS(req.text,'html.parser')
position = soup.find('h1',class_='job-position').text
name = soup.find('div',class_='company_name').text.strip("\t")
try:
addrs = soup.find('div',class_='map-col-wraper').find('p',{'id':'address'}).text
except Exception:
addrs = "Unknown"
try:
loct = soup.find('span',{'id':'single_work_location'}).text
except Exception:
loct = soup.find('span',{'id':'multiple_work_location_list'}).find('span',{'class':'show'}).text
dests = soup.findAll('div',attrs={'id':'job_description'})
for select in dests:
txt = select.text if not select.text.startswith("\n") or not select.text.endswith("\n") else select.text.replace("\n","")
result = {
"name":name,
"location":loct,
"position":position,
"description":txt,
"address":addrs
}
return result
except:
pass
它们都能很好地工作,但需要很长时间才能显示结果,时间总是超过13/17秒。
我不知道如何提高请求速度。
我尝试在Stack和Google上搜索,他们说使用asyncio,但这种方式对我来说太难。
如果有人有简单的技巧可以帮助我快速提高速度,我将不胜感激......
对我的英语不好,抱歉。