import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
class GoogleSpider:
    """Scrape Google (google.com.hk) web-search results for a keyword.

    Results are collected as dicts of the form
    ``{"company": <query>, "title": <result title>, "url": <result link>}``.
    Requires the third-party ``requests``, ``bs4`` and ``fake_useragent``
    packages imported at module level.
    """

    def __init__(self, **kwargs):
        # Default keyword used when search() is called without one.
        self.keyword = kwargs.get("keyword")

    def __del__(self):
        pass

    def search(self, **kwargs) -> list:
        """Fetch all result pages for a keyword and return the hits.

        Parameters (keyword-only via **kwargs):
            keyword: search term; falls back to ``self.keyword`` when omitted.

        Returns:
            list of dicts with keys ``company``, ``title`` and ``url``;
            empty list when no keyword is available or on error.
        """
        data = []
        query = kwargs.get("keyword")
        if query is None:
            query = self.keyword
        if query is None:
            # No keyword from either the call or the constructor.
            return []
        query = query.replace(' ', '+')
        page = 0
        while True:
            # BUG FIX: the original used a plain string literal, so the
            # "{query}" / "{page * 10}" placeholders were sent verbatim and
            # neither the keyword nor pagination ever took effect.
            URL = (
                f"https://www.google.com.hk/search?q={query}&newwindow=1"
                f"&ei=pbdXYtL9FNW-0PEPv96DiA0&start={page * 10}&sa=N"
                "&ved=2ahUKEwiS5Nqv75L3AhVVHzQIHT_vANEQ8tMDegQIARA1"
                "&biw=1536&bih=396&dpr=1.25"
            )
            try:
                print("当前正在搜索【" + str(query) + "】,当前第" + str(page) + "页...")
                headers = {
                    # Random UA per request to reduce bot detection; the
                    # cookie carries a pre-accepted GDPR consent so Google
                    # serves results instead of the consent interstitial.
                    'User-Agent': str(UserAgent(path="ua.json").random),
                    "cookie": "CONSENT=YES+srp.gws-20211208-0-RC2.zh-CN+FX+870; "
                              "AEC=AVQQ_LBBv2AdMIJg5Mo-mhbpPvz7Yy6TXL2YDpPEIWPZ2V12AZNvVRj01w; 1P_JAR=2022-04-14-04; "
                              "NID=511"
                              "=WG_TSuY8P75PO_IIAjeBJh4D9Z1peKXWPh22PDAN62GWAajB5gIj6tvOQRCjHX5g9PEJPyM2RDB_ZlT5qS3lSXhUpOA1U9KkBkt3UbLM6uoHIZubQoHzZMzstsr_e_8eMDo9LPs18nMvIJf-4C6F_XC6TvZCYmgER4Dt2YzXRu6DhCoDljBI46qarDZiCTFDKvy2PNp_hzrGTfOUqg; DV=I3h3GDVGyQsvcNiZldgA7vxYqO5jAlg4dyRxmh2zaAEAAAA ",
                }
                resp = requests.get(URL, headers=headers, verify=True)
                # Debug dump of the latest raw response (context manager
                # guarantees the handle is closed).
                with open("1.html", "wb+") as f:
                    f.write(resp.content)
                if resp.status_code != 200:
                    # BUG FIX: the original silently re-requested the same
                    # page forever on a non-200 response.
                    break
                soup = BeautifulSoup(resp.content, "html.parser")
                # Each organic result title/link sits inside div.yuRUbf.
                li_arr = soup.select("div[class='yuRUbf']")
                if not li_arr:
                    # No more results: past the last page (or blocked).
                    break
                for key in li_arr:
                    li_a = key.select("a")
                    a_href = li_a[0].attrs["href"]
                    li_h3 = li_a[0].select("h3")
                    _title = li_h3[0].text.strip().replace("\n", "").replace(" ", "")
                    # BUG FIX: the original built this dict but never stored
                    # it, so search() always returned an empty list.
                    data.append({"company": query, "title": _title, "url": a_href})
                page += 1
            except Exception as e:
                # Best-effort scrape: report and stop rather than crash.
                print(e)
                break
        return data
if __name__ == "__main__":
gs = GoogleSpider()
keyword = "python"
data = gs.search(keyword=keyword)