Python Web Scraping #011: Scraping Examples
Previous posts covered the basics of web scraping; below are some worked examples based on what we have learned. Tip: websites are constantly updated, so a site's tags and content may change and these programs may raise errors; they are provided only to illustrate the approach.
!!! Note: change the file save paths to ones that exist on your machine, or the programs will raise errors !!!
1. Scraping dushu.com with BeautifulSoup
import requests
from bs4 import BeautifulSoup

urln = 'https://www.dushu.com/book/1163_%d.html'
f = open(r'C:\Users\Administrator\Desktop\my.txt', mode='a', encoding='utf-8')
for i in range(1, 101):
    url = urln % i
    response = requests.get(url)
    if response.status_code == 200:
        data = response.text
        soup = BeautifulSoup(data, 'lxml')
        book_list = soup.find('div', attrs={'class': 'bookslist'})
        books = book_list.find('ul').find_all('li')
        for book in books:
            # If anything inside try raises, skip this book and continue with the next one
            try:
                title = book.find('h3').find('a').get_text()
                author = book.find('p').get_text()
                # find() returns the tag directly:
                # brief = book.find('p', class_='disc eps').get_text()
                # select() returns a list of tags, so take the first element:
                brief = book.select('p[class="disc eps"]')[0].get_text()
                f.write('《' + title + '》' + '\n' + author + '\n' + brief + '\n' + '\n')
            except Exception:
                continue
        print('Page %d written successfully' % i)
    else:
        print('Unexpected response from the server')
f.close()
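The comments above contrast two ways of grabbing the brief: find() returns the first matching tag directly, while select() takes a CSS selector and returns a list that you index into. A minimal sketch of the difference, run against a hand-written snippet of markup (the HTML below is made up for illustration, not copied from dushu.com):

from bs4 import BeautifulSoup

# Hypothetical markup mimicking one <li> of the book list
html = '<li><h3><a>Some Title</a></h3><p class="disc eps">A short brief.</p></li>'
soup = BeautifulSoup(html, 'lxml')

tag = soup.find('p', class_='disc eps')        # first matching tag, or None if absent
tag_list = soup.select('p[class="disc eps"]')  # list of matching tags, possibly empty
print(tag.get_text())                          # A short brief.
print(tag_list[0].get_text())                  # same text, taken out of the list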
2. Scraping dushu.com with xpath
import requests
from lxml import etree

class DuShuSpider(object):
    # Initialization
    def __init__(self, base_url):
        self.base_url = base_url
        self.books_fp = open(r"C:\Users\Administrator\Desktop\books.txt", mode="w", encoding="utf-8")

    # Request a page (a random request header or an IP proxy pool could be set here)
    def Ask_Url(self, url):
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3756.400 QQBrowser/10.5.4039.400'
        }
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            return etree.HTML(response.text)
        print("Unexpected response from the server")
        return None

    # Parse the pages
    def Get_Content(self):
        for i in range(1, 89):
            print("Page %d" % i)
            url = self.base_url % i
            html = self.Ask_Url(url)
            if html is None:
                continue
            books = html.xpath('//div[@class="bookslist"]/ul/li')
            for book in books:
                # An empty list stores one book's data (title, author, brief);
                # it is re-created on every iteration
                data = []
                try:
                    title = book.xpath('./div/h3/a/text()')[0]
                    author = book.xpath('./div/p[1]/a/text()')[0]
                    content = book.xpath('./div/p[2]/text()')[0]
                    data.append(title)
                    data.append(author)
                    data.append(content)
                except Exception as result:
                    print(result)
                    continue
                # Each book is one list; pass it to the save method
                self.Save_TexData(data)

    # Save the data as text
    def Save_TexData(self, data):
        # One data list holds all fields of one book
        for i in data:
            self.books_fp.write(i + "\n")

if __name__ == "__main__":
    base_url = "https://www.dushu.com/book/1617_%d.html"
    dushu = DuShuSpider(base_url)
    dushu.Get_Content()
    dushu.books_fp.close()
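The comment on Ask_Url mentions rotating request headers or using an IP proxy pool to avoid being blocked. A minimal sketch of both ideas with requests; the User-Agent strings are trimmed placeholders and PROXIES is a hypothetical value, so substitute real ones before relying on this:

import random
import requests

# A small pool of User-Agent strings to rotate through (placeholders)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/70.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) Chrome/71.0 Safari/537.36',
]
# A hypothetical proxy mapping; None means connect directly
PROXIES = None  # e.g. {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}

def ask_url(url):
    # Pick a different User-Agent on every request
    headers = {'user-agent': random.choice(USER_AGENTS)}
    response = requests.get(url, headers=headers, proxies=PROXIES, timeout=10)
    if response.status_code == 200:
        return response.text
    return None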
3. Scraping news articles with jsonpath
You need to find the URL of the site's data API; otherwise you can only scrape the static page content.
import requests
import json
import jsonpath

# Find the API endpoint and work out how its URL changes from page to page
# url = 'https://www.xfz.cn/api/website/articles/?p=1&n=20&type='
urln = 'https://www.xfz.cn/api/website/articles/?p=%d&n=20&type='
# Open the output file once, before the loop, rather than re-opening it on every page
f = open(r'C:\Users\Administrator\Desktop\news.txt', mode='a', encoding='utf-8')
# In practice the site only has 37 pages of data; later pages are empty
for i in range(1, 38):
    url = urln % i
    response = requests.get(url)
    data = response.text
    if response.status_code == 200:
        json_obj = json.loads(data)
        titles = jsonpath.jsonpath(json_obj, '$..[title]')
        author_ids = jsonpath.jsonpath(json_obj, '$..[author_id]')
        names = jsonpath.jsonpath(json_obj, '$..[name]')
        try:
            # One record = title + author_id + name (the three lists have the same length)
            for j in range(len(titles)):
                f.write(str(author_ids[j]) + names[j] + '\n' + titles[j] + '\n' + '\n')
        except Exception as result:
            print('Failed on page %d' % i, 'because:', result)
            continue
    else:
        print('Unexpected response from the server')
    print('page%d' % i)
f.close()
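To see what the $.. descendant operator actually extracts, here is a minimal run of jsonpath.jsonpath against a hand-made dict; the structure below is assumed for illustration and is not the real xfz.cn payload (the unbracketed $..title form of the selector is used here):

import jsonpath

# Assumed shape, loosely mimicking an article-list API response
json_obj = {
    'data': [
        {'title': 'News A', 'author_id': 1, 'author': {'name': 'Alice'}},
        {'title': 'News B', 'author_id': 2, 'author': {'name': 'Bob'}},
    ]
}
# '$..field' collects every matching field at any depth;
# jsonpath() returns a list of matches, or False if nothing matches
print(jsonpath.jsonpath(json_obj, '$..title'))      # ['News A', 'News B']
print(jsonpath.jsonpath(json_obj, '$..author_id'))  # [1, 2]
print(jsonpath.jsonpath(json_obj, '$..name'))       # ['Alice', 'Bob']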
'''
tip: the for-loop pattern used inside try
a and b are lists of equal length
a = [1, 2, 3, 4, 5, 6]
b = [1, 2, 3, 4, 5, 6]
for i in range(len(a)):
    print(a[i], b[i])
1 1
2 2
3 3
4 4
5 5
6 6
'''
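The index-based loop in the tip can also be written with zip, which pairs up the elements of two equal-length lists without manual indexing:

a = [1, 2, 3, 4, 5, 6]
b = [1, 2, 3, 4, 5, 6]
# zip stops at the shorter list, so equal lengths yield one pair per index
for x, y in zip(a, b):
    print(x, y)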