Python 简单爬虫
爬虫文字(1)
# requests库
## requests.get(url) 模拟浏览器打开网页
# re库
import requests
import re
def pair_titles_with_contents(html, first=10, last=60):
    """Map each entry title found in *html* to its content text.

    Titles are the link texts of ``<a href="/subject/...">`` anchors and
    contents are the bodies of ``<div class="content">`` elements.  Only the
    titles at positions ``first`` (inclusive) to ``last`` (exclusive) are
    used; the n-th kept title is paired with the n-th content block.

    Args:
        html: Page source to scan.
        first: Index of the first title anchor to keep.  The default of 10
            is the value the original script used (presumably the earlier
            anchors are not entry titles -- verify against the live page).
        last: Index one past the last title anchor to keep.

    Returns:
        A dict ``{title: content}``.  If a title occurs more than once the
        later content wins.
    """
    # '.' matches any character except a newline; '*?' repeats it lazily.
    contents = re.findall(r'<div class="content">(.*?)</div>', html)
    titles = re.findall(r'<a href="/subject/.*?">(.*?)</a>', html)[first:last]
    # zip() stops at the shorter sequence, so a page with fewer content
    # blocks than kept titles no longer raises IndexError.
    return dict(zip(titles, contents))


if __name__ == '__main__':
    # requests.get() fetches the page the way a browser would.  A timeout is
    # passed because requests otherwise waits indefinitely on a stalled server.
    response = requests.get('http://ishuo.cn/', timeout=10)
    # response.status_code is 200 on success (404 means the page is missing);
    # response.encoding reports the charset used to decode response.text.
    data = response.text
    title_content_dic = pair_titles_with_contents(data)
    for title, content in title_content_dic.items():
        print(f'{title:<40} | {content}')
爬虫文字(2)
import requests
import re
def parse_entries(html):
    """Extract ``{title: (content, desc)}`` from every list item in *html*.

    Each ``<li class="list_li">`` element is searched for three fields:
    the body of ``<div class="content">``, the link text of the
    ``/subject/`` anchor, and the text that follows a closing ``</a>`` and
    starts with a month marker such as ``04月``.

    Items missing any of the three fields are skipped instead of raising
    IndexError.  The month is matched generically (one or two digits followed
    by ``月``) rather than being tied to April.

    Args:
        html: Page source to scan.

    Returns:
        A dict mapping each title to a ``(content, desc)`` tuple.  If a title
        occurs more than once the later entry wins.
    """
    entries = {}
    for item in re.findall(r'<li class="list_li">(.*?)</li>', html):
        content = re.search(r'<div class="content">(.*?)</div>', item)
        title = re.search(r'<a href="/subject/.*?">(.*?)</a>', item)
        desc = re.search(r'</a>(\d{1,2}月.*?)</div>', item)
        if content is None or title is None or desc is None:
            # Incomplete item: nothing sensible to record for it.
            continue
        entries[title.group(1)] = (content.group(1), desc.group(1))
    return entries


if __name__ == '__main__':
    # Fetch the page the way a browser would; the timeout keeps a stalled
    # server from hanging the script forever.
    response = requests.get('http://ishuo.cn/', timeout=10)
    data = response.text
    title_content_desc_dic = parse_entries(data)
    for title, detail in title_content_desc_dic.items():
        print(f'{title:<40} | {detail}')
爬虫图片
import requests
import re
def extract_image_urls(html):
    """Return the value of every ``data-src="..."`` attribute in *html*, in page order."""
    return re.findall(r'data-src="(.*?)"', html)


def image_filename(url):
    """Return the last ``/``-separated segment of *url*, used as the local file name.

    The result is an empty string when *url* ends with ``/``.
    """
    return url.split('/')[-1]


if __name__ == '__main__':
    # The timeout keeps a stalled server from hanging the script forever.
    response = requests.get(
        'http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg',
        timeout=10,
    )
    data = response.text
    for img_url in extract_image_urls(data):
        img_name = image_filename(img_url)
        if not img_name:
            # No usable file name (URL ends with '/'); open('') would fail.
            continue
        img_response = requests.get(img_url, timeout=10)
        # The with-block closes (and therefore flushes) each file as soon as
        # it is written, instead of leaking one open handle per image.
        with open(img_name, 'wb') as f:
            f.write(img_response.content)
爬虫视频
import requests
import re
def video_page_urls(index_html, base_url='http://www.mod.gov.cn/v/'):
    """Return the absolute URLs of the ``.htm`` pages linked from *index_html*.

    Anchors look like::

        <a href="2019-07/20/content_4846213.htm" class="img"><img src="..."></a>

    so the first regex may capture trailing attributes along with the href;
    the second regex trims the capture back to the part ending in ``htm``.

    Args:
        index_html: Source of the index page.
        base_url: Prefix prepended to relative hrefs.

    Returns:
        A list of page URLs in first-seen order, without duplicates.  Anchors
        whose href contains no ``htm`` are ignored; hrefs that are already
        absolute are kept as they are.
    """
    pages = []
    seen = set()
    for href in re.findall(r'<a href="(.*?)">', index_html):
        hit = re.search(r'(.*?htm)', href)
        if hit is None:
            # Not a link to an article page (e.g. a javascript: anchor).
            continue
        page = hit.group(1)
        if not page.startswith(('http://', 'https://')):
            page = base_url + page
        if page not in seen:
            # The same article is often linked twice (thumbnail and title);
            # fetch it only once.
            seen.add(page)
            pages.append(page)
    return pages


def find_video_url(page_html):
    """Return the ``.mp4`` URL that follows a ``//Video `` marker, or None if absent.

    Example of a URL found this way:
    http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    """
    # The dot is escaped so that only a literal ".mp4" suffix matches.
    hit = re.search(r'//Video (.*?\.mp4)', page_html)
    return hit.group(1) if hit else None


if __name__ == '__main__':
    # Timeouts keep a stalled server from hanging the script forever.
    response = requests.get('http://www.mod.gov.cn/v/index.htm', timeout=10)
    data = response.text
    for page_url in video_page_urls(data):
        page_response = requests.get(page_url, timeout=10)
        url_res = find_video_url(page_response.text)
        if url_res is None:
            # Page carries no video; skip it instead of crashing.
            continue
        mp4_response = requests.get(url_res, timeout=60)
        # Name each file after the video itself; writing everything to a
        # single 'test.mp4' made every download overwrite the previous one.
        with open(url_res.split('/')[-1], 'wb') as f:
            f.write(mp4_response.content)