import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from lxml import html
# Browser-like request headers sent with the crawler's HTTP requests.
Header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3928.4 Safari/537.36'
    ),
}
def getdata(url, timeout=10):
    """Fetch one article page, print three fields, and append them to the CSV.

    The text of ``h1.ph``, ``div.blockquote`` and ``span.time`` is printed
    (in that order) and then appended as one row to ``微信小程序.csv`` in the
    current working directory. A field whose element is not present on the
    page is recorded as an empty string.

    Args:
        url: URL of the article page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Send the shared browser-like headers and bound the wait so a stalled
    # connection cannot hang the crawl indefinitely.
    resp = requests.get(url, headers=Header, timeout=timeout)
    # Let requests guess the charset from the body; the declared/default
    # charset may not match the page's actual (Chinese) text encoding.
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, "lxml")

    row = []
    for tag, css_class in (("h1", "ph"), ("div", "blockquote"), ("span", "time")):
        node = soup.find(tag, class_=css_class)
        # find() returns None when nothing matches; fall back to "" rather
        # than raising AttributeError on a page with a different layout.
        text = node.text if node is not None else ""
        print(text)
        row.append(text)

    # Append the row to the CSV file. An explicit encoding avoids
    # UnicodeEncodeError on platforms whose locale encoding cannot represent
    # the scraped text. The with-block closes the file on exit.
    with open("微信小程序.csv", "a", newline="", encoding="utf-8") as cf:
        csv.writer(cf).writerow(row)
def parse_page(url, timeout=10):
    """Scrape every article linked from one listing page.

    Downloads the listing page, selects the anchors matched by
    ``//*[@id="itemContainer"]/div/div/h3/a`` and calls ``getdata`` on each
    linked article. A completion message is printed once the page's links
    have been processed.

    Args:
        url: URL of the listing page.
        timeout: Seconds to wait for the listing page request. It is not
            forwarded to ``getdata``.

    Raises:
        requests.RequestException: If the listing page request itself fails.
    """
    resp = requests.get(url, headers=Header, timeout=timeout)
    resp.encoding = resp.apparent_encoding
    tree = html.fromstring(resp.text)
    # Collect the article hyperlinks found on this listing page.
    anchors = tree.xpath('//*[@id="itemContainer"]/div/div/h3/a')
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without an href cannot be followed.
            continue
        # urljoin handles relative, root-relative and absolute hrefs, where
        # plain string concatenation would produce a malformed URL.
        article_url = urljoin("http://www.wxapp-union.com/", href)
        try:
            getdata(article_url)
        except requests.RequestException as exc:
            # One unreachable article should not abort the whole crawl.
            print(f"skipped {article_url}: {exc}")
    print('微信小程序全部爬取完成')
def began(first_page=1, last_page=107):
    """Crawl a range of listing pages, one after another.

    Args:
        first_page: Number of the first listing page to crawl (inclusive).
        last_page: Number of the last listing page to crawl (inclusive).
            The default of 107 follows the original author's note that the
            mini-program section has 107 pages in total.
    """
    url = "http://www.wxapp-union.com/portal.php?mod=list&catid=1&page={}"
    for page in range(first_page, last_page + 1):
        parse_page(url.format(page))
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    began()