Python Web Crawler, Day 1
1 Installation:
(1) Install requests
(2) Install BeautifulSoup4
(3) Install Jupyter
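All three can be installed from the command line, for example with pip (assuming pip for Python 3 is available):
pip install requests beautifulsoup4 jupyter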
Open Jupyter and start a Python 3 notebook.
The code below was run in Jupyter.
import requests
from bs4 import BeautifulSoup   # needed for the parsing steps below

res = requests.get('http://news.sina.com.cn/china/')
res.encoding = "utf-8"          # set the encoding explicitly so the Chinese text is not garbled
#print(res.text)                # uncomment to inspect the raw HTML
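Before parsing, it can help to confirm the request actually succeeded. A minimal check using standard requests attributes:
print(res.status_code)   # 200 means the index page was fetched successfully
res.raise_for_status()   # raises an HTTPError for 4xx/5xx responses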
Fetching the content of a single article page
res3 = requests.get('http://news.sina.com.cn/c/nd/2017-03-16/doc-ifycnpit2047114.shtml')
res3.encoding = "utf-8"
html3 = res3.text
soup3 = BeautifulSoup(html3, "html.parser")
# the first child node of .time-source is the bare publication-time string
timesource = soup3.select(".time-source")[0].contents[0].strip()
print(timesource)
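To make the select()/contents chain above concrete, here is the same pattern on a small hand-written snippet (illustrative only, not the real Sina markup):
from bs4 import BeautifulSoup
demo_html = '<span class="time-source">2017年03月16日10:27 <span><span><a href="http://example.com">新华社</a></span></span></span>'
demo = BeautifulSoup(demo_html, "html.parser")
print(demo.select(".time-source")[0].contents[0].strip())   # first child node: the raw date string
print(demo.select(".time-source span span a")[0].text)      # the nested link text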
Handling the timestamp
from datetime import datetime
# string -> datetime
dt = datetime.strptime(timesource, "%Y年%m月%d日%H:%M")
# datetime -> string
#dt = dt.strftime("%Y-%m-%d")
print(dt)
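Going the other direction, strftime (as in the commented line above) turns the datetime back into a string:
print(dt.strftime("%Y-%m-%d"))   # e.g. 2017-03-16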
The source link
# the <a> nested inside .time-source
alink = soup3.select(".time-source span span a")[0]
print(alink)           # the whole tag
print(alink["href"])   # the link target
print(alink.text)      # the link text (the source name)
Processing the article body
pss = soup3.select("#artibody p")[:-1]   # [:-1] skips the last <p>, which is not part of the body text
article = []
for p in pss:
    article.append(p.text.strip())
article_text = ' '.join(article)
print(article_text)
Simplified version
' '.join([p.text.strip() for p in soup3.select("#artibody p")[:-1]])
The editor's name
# note: lstrip removes a *set* of leading characters, which happens to work for this exact prefix
editor = soup3.select(".article-editor")[0].text.lstrip("责任编辑:")
print(editor)
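Putting the pieces together, the steps above can be wrapped in one function. This is only a sketch built from the selectors used in these notes (getNewsDetail is a name made up here), and the page layout may have changed since:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(url):
    # fetch one Sina news article and pull out the fields handled above
    res = requests.get(url)
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    result = {}
    timesource = soup.select(".time-source")[0].contents[0].strip()
    result["dt"] = datetime.strptime(timesource, "%Y年%m月%d日%H:%M")
    result["source"] = soup.select(".time-source span span a")[0].text
    result["article"] = ' '.join([p.text.strip() for p in soup.select("#artibody p")[:-1]])
    result["editor"] = soup.select(".article-editor")[0].text.lstrip("责任编辑:")
    return result

# usage: getNewsDetail('http://news.sina.com.cn/c/nd/2017-03-16/doc-ifycnpit2047114.shtml')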
