【python爬虫】游侠网部分新闻爬取
最后效果展示
【用网页表格的形式展示爬取的数据】
用pyinstaller打包成exe了
【python3.6 可运行的环境多一些】
aaaaa出错了!why什么?放到win7虚拟机运行 中文解码错误了~~~
代码
#游侠网 新闻news-link-ul https://www.ali213.net/ li a标签 import os from re import I import time import requests from lxml import etree import pandas as pd from requests.packages.urllib3.exceptions import InsecureRequestWarning requests.packages.urllib3.disable_warnings(InsecureRequestWarning) import traceback url_list = [] text_list = [] def get_url(url): header = headers = { "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"} response = requests.get(url,verify=False) rightcode = response.encoding#得到正确编码 s = response.content s.decode(rightcode)#对内容进行正确解码 return s for page in range(2,3): s = get_url("https://www.ali213.net/") selector = etree.HTML(s)#与下面的方法是同一样的效果 imgEle = selector.xpath('//ul[@class="news-link-ul"]/li[1]/a[1]') imgEleText = selector.xpath('//ul[@class="news-link-ul"]/li[1]/a[1]/text()') label = 'tu%s'%page for index,i in enumerate(imgEle): imgUrl = i.xpath('@href')[0]#连接已得到 url_list.append(imgUrl) text_list.append(imgEleText[index]) #print(imgUrl) #print(imgEleText[index]) def main1(): j = [] for k in url_list: j.append('http://'+k.split("/",2)[2])#给网页链接中//去除掉 然后添加 http dic1 = { "text":[i for i in j], "url":[f for f in text_list] # j replace url_list #"text":["https://www.ali213.net/"], #"url":["游侠网"] } #构建字典 利用字典创建表格数据 df1 = pd.DataFrame(dic1) df1['url'] = '<a href=' + df1['text'] + '><div>' + df1['url'] + '</div></a>' nowtime = time.strftime("%d-%m-%Y") df1 = df1.to_html(nowtime+'游侠网新闻.html',escape=False) #escape = False 与上面一条语句 ,可以将df1[text]中变成url链接 #html_table = df1.to_html('游侠网新闻.html') #print(df1.to_html()) #可以打印出html 字符串 if __name__ == '__main__': try: main1() except Exception as e: t=traceback.format_exc() with open(os.getcwd()+"/error-pa6.txt",'w') as f: f.write(t)
程序
链接:https://pan.baidu.com/s/1qO7z7UuR1Rlx9o-rmPNpAA
提取码:8888
--来自百度网盘超级会员V3的分享