Python Web Crawler, Day 1
1 Installation:
(1) Install requests
(2) Install BeautifulSoup4
(3) Install jupyter
Open Jupyter and select Python 3.
The following code is executed in Jupyter.
import requests

# fetch the Sina China news list page
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = "utf-8"
#print(res.text)
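The cell above only downloads the list page. As a minimal sketch of what could be done with it, the snippet below parses res.text and prints each headline with its link; the ".news-item h2 a" selector is an assumption about Sina's markup and may need to be adjusted against the live page.

from bs4 import BeautifulSoup

soup = BeautifulSoup(res.text, "html.parser")
# ".news-item h2 a" is a guessed selector for the headline anchors;
# inspect the page and change it if it matches nothing
for a in soup.select(".news-item h2 a"):
    print(a.text, a["href"])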
Fetching the content of an article page
from bs4 import BeautifulSoup

res3 = requests.get('http://news.sina.com.cn/c/nd/2017-03-16/doc-ifycnpit2047114.shtml')
res3.encoding = "utf-8"
html3 = res3.text
soup3 = BeautifulSoup(html3, "html.parser")
# the first child of the .time-source node is the publication time text
timesource = soup3.select(".time-source")[0].contents[0].strip()
print(timesource)
Handling the time
from datetime import datetime

# string -> datetime
dt = datetime.strptime(timesource, "%Y年%m月%d日%H:%M")
# datetime -> string
#dt = dt.strftime("%Y-%m-%d")
print(dt)
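For the reverse direction hinted at by the commented-out line, strftime formats the datetime back into a string. The "%Y-%m-%d %H:%M" pattern below is only an illustrative choice.

# format the parsed datetime back into a plain string
dt_str = dt.strftime("%Y-%m-%d %H:%M")
print(dt_str)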
The link
alink = soup3.select(".time-source span span a")[0]
print(alink)
print(alink["href"])
print(alink.text)
Processing the article body
pss = soup3.select("#artibody p")[:-1]   # [:-1] skips the last <p>, which is not body text
article = []
for p in pss:
    article.append(p.text.strip())
print(' '.join(article))
Simplified version
' '.join([p.text.strip() for p in soup3.select("#artibody p")[:-1]])
Editor's name
# lstrip removes the leading "责任编辑:" characters (it strips a character set, not a prefix)
editor = soup3.select(".article-editor")[0].text.lstrip("责任编辑:")
print(editor)
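To reuse these steps on other article URLs, the whole detail-page flow above can be wrapped into one function. This is only a sketch that stitches the previous cells together; getNewsDetail is a made-up helper name, and the selectors are the same ones used above.

import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(url):   # hypothetical helper, not part of the original notes
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    result = {}
    # publication time, parsed into a datetime
    result["time"] = datetime.strptime(
        soup.select(".time-source")[0].contents[0].strip(),
        "%Y年%m月%d日%H:%M")
    # body text, joined from every <p> except the last one
    result["article"] = ' '.join(
        p.text.strip() for p in soup.select("#artibody p")[:-1])
    # editor name with the leading "责任编辑:" characters stripped
    result["editor"] = soup.select(".article-editor")[0].text.lstrip("责任编辑:")
    return result

print(getNewsDetail('http://news.sina.com.cn/c/nd/2017-03-16/doc-ifycnpit2047114.shtml'))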