爬取站长素材-bs4、xpath
"""Download free PPT templates from sc.chinaz.com — xpath (lxml) version.

Walks result pages 11..25 of the chinaz template search, follows each
result's detail page, and saves the .rar download into ./zhanzhangsucai1.
"""
import os

import requests
from lxml import etree

if __name__ == "__main__":
    url = "https://aspx.sc.chinaz.com/query.aspx"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3868.400 QQBrowser/10.8.4394.400"
    }
    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair and is
    # race-free if the directory appears between check and creation.
    os.makedirs('./zhanzhangsucai1', exist_ok=True)
    # One Session reuses the TCP connection pool across all requests
    # instead of re-handshaking per page/file.
    session = requests.Session()
    session.headers.update(headers)
    for page in range(11, 26):
        page = str(page)
        param = {
            "keyword": "免费",
            "issale": "",
            "classID": "864",
            "page": page,
        }
        # timeout keeps a stalled server from hanging the whole crawl
        page_text = session.get(url=url, params=param, timeout=30).text
        tree = etree.HTML(page_text)
        div_list = tree.xpath('//div[@class="box col3 ws_block"]')
        print('第' + page + '页下载中')
        for div in div_list:
            try:
                detail_url = 'https:' + div.xpath('./a/@href')[0]
                detail_page_text = session.get(url=detail_url, timeout=30).text
                # Server mislabels the charset as iso-8859-1; re-decode
                # the raw bytes as UTF-8 so Chinese filenames survive.
                detail_page_text = detail_page_text.encode('iso-8859-1').decode('utf-8')
                detail_tree = etree.HTML(detail_page_text)
                href_li = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li')[0]
                ppt_url = href_li.xpath('./a/@href')[0]
                ppt_name = div.xpath('./a/img/@alt')[0] + '.rar'
                file_data = session.get(url=ppt_url, timeout=30).content
                ppt_path = os.path.join('zhanzhangsucai1', ppt_name)
                with open(ppt_path, 'wb') as fp:
                    fp.write(file_data)
                print(ppt_name, '下载成功!!!')
            except (requests.RequestException, IndexError, UnicodeDecodeError, OSError) as exc:
                # One malformed item / failed download must not abort
                # the remaining pages — log and continue.
                print('下载失败:', exc)
        print('第' + page + '页下载完成')
上: xpath
下: bs4
"""Download free PPT templates from sc.chinaz.com — BeautifulSoup version.

Same crawl as the xpath variant: result pages 11..25, follow each detail
page, save the .rar download into ./zhanzhangsucai.
"""
import os

import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    url = "https://aspx.sc.chinaz.com/query.aspx"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3868.400 QQBrowser/10.8.4394.400"
    }
    # makedirs(exist_ok=True) replaces the exists()+mkdir() pair and is
    # race-free if the directory appears between check and creation.
    os.makedirs('./zhanzhangsucai', exist_ok=True)
    # One Session reuses the TCP connection pool across all requests.
    session = requests.Session()
    session.headers.update(headers)
    for page in range(11, 26):
        page = str(page)
        param = {
            "keyword": "免费",
            "issale": "",
            "classID": "864",
            "page": page,
        }
        # timeout keeps a stalled server from hanging the whole crawl
        page_text = session.get(url=url, params=param, timeout=30).text
        soup = BeautifulSoup(page_text, 'lxml')
        div_list = soup.find_all('div', class_='box col3 ws_block')
        print('第' + page + '页下载中')
        for item in div_list:
            try:
                detail_url = 'https:' + item.a['href']
                # Server mislabels the charset; hand BeautifulSoup the
                # raw bytes so it sniffs the real encoding itself.
                detail_bytes = session.get(url=detail_url, timeout=30).text.encode("ISO-8859-1")
                detail_soup = BeautifulSoup(detail_bytes, 'lxml')
                href_li = detail_soup.find('div', class_="clearfix mt20 downlist").ul.li
                ppt_url = href_li.a['href']
                ppt_name = item.a.img['alt'] + '.rar'
                file_data = session.get(url=ppt_url, timeout=30).content
                ppt_path = os.path.join('zhanzhangsucai', ppt_name)
                with open(ppt_path, 'wb') as fp:
                    fp.write(file_data)
                print(ppt_name, '下载成功!!!')
            except (requests.RequestException, AttributeError, KeyError, TypeError, OSError) as exc:
                # A detail page missing the download list (find() -> None)
                # or a failed request must not abort the remaining pages.
                print('下载失败:', exc)
        print('第' + page + '页下载完成')
                    
                
                
            
        
浙公网安备 33010602011771号