A Few Simple Examples to Consolidate XPath Syntax - Python

There is an XPath tutorial on w3cschool. XPath uses path expressions to select nodes in XML or HTML documents. The most useful path expressions are listed below:

Expression    Description
nodename      Selects all child nodes of this node.
/             Selects from the root node.
//            Selects nodes in the document from the current node that match the selection, regardless of their position.
.             Selects the current node.
..            Selects the parent of the current node.
@             Selects attributes.
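
To make the table concrete, here is a minimal sketch with lxml; the HTML snippet is invented purely for illustration:

from lxml import etree

# an invented snippet, just to exercise the expressions in the table
html = etree.HTML('<div id="a"><ul><li class="x">one</li><li>two</li></ul></div>')

print(html.xpath('//li/text()'))                     # //  selects li nodes anywhere -> ['one', 'two']
print(html.xpath('/html/body/div/ul/li[1]/text()'))  # /   steps down from the root (etree.HTML adds html/body)
ul = html.xpath('//ul')[0]                           # use the ul element as the "current node"
print(ul.xpath('./li/text()'))                       # .   plus the nodename li: children of the current node
print(ul.xpath('..')[0].get('id'))                   # ..  the parent node; prints 'a'
print(html.xpath('//li[@class="x"]/text()'))         # @   selects/filters by attribute -> ['one']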

Keep this table handy so there is somewhere to look things up when you forget. You can also install a browser extension called xpath-helper to help with writing XPath.

All of the examples in this article use both the etree (lxml) and parsel libraries, so their XPath usage can be compared side by side. Beautiful Soup (bs4) can parse HTML too, though it uses its own API rather than XPath, and it reportedly went years without updates, so we will set it aside for now.
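
As a quick taste of the difference before the full examples, here is a minimal sketch (the snippet is invented): etree's xpath() returns a plain list you index into, while parsel wraps the results and exposes .get() / .getall():

from lxml import etree
import parsel

html = '<ul><li>one</li><li>two</li></ul>'  # invented snippet

# lxml.etree: xpath() returns a plain Python list
print(etree.HTML(html).xpath('//li/text()'))     # ['one', 'two']
print(etree.HTML(html).xpath('//li/text()')[0])  # 'one'

# parsel: text is pulled out with get() / getall()
sel = parsel.Selector(html)
print(sel.xpath('//li/text()').getall())         # ['one', 'two']
print(sel.xpath('//li/text()').get())            # 'one'
print(sel.xpath('//p/text()').get())             # None instead of an IndexError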

The first example: scraping second-hand housing listings from 58.com (58同城).

"""
    A few simple spiders to consolidate XPath.
"""
import os.path
from lxml import etree
import requests
import parsel
import csv

class xpathSpider():
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = 'https://xm.58.com/ershoufang/'

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        response = requests.get(url=self.url, headers=self.headers)
        # return the raw html text
        return response.text

    def get_html_by_xpath_through_etree(self):
        """
        Parse the html with lxml's etree.
        """
        html_data = etree.HTML(self.get_html())
        # now parse with xpath
        results = html_data.xpath('//section[@class="list"]/div')  # select all the listing nodes
        for item in results:
            # title; the leading dot means "select from the current node", i.e. this item,
            # which is a child of the nodes in results
            title = item.xpath('.//div[@class="property-content-title"]/h3/text()')[0].strip()
            # neighbourhood, or rather the developer
            kaifashang = item.xpath('.//div[@class="property-content-info property-content-info-comm"]/p[1]/text()')[0].strip()
            # layout (rooms/halls); the p tag has many span children, so * collects all their text
            room = ''.join(item.xpath('.//p[@class="property-content-info-text property-content-info-attribute"]/*/text()'))
            # area; this p[2] is a sibling of the p[1] used for room
            area = item.xpath('.//section/div/p[2]/text()')[0].strip()
            # total price
            totalPrice = ''.join(item.xpath('.//div[@class="property-price"]/p[1]/*/text()'))
            # unit price
            perPrice = item.xpath('.//div[@class="property-price"]/p[2]/text()')[0].strip()
            # address
            address = '-'.join(item.xpath('.//section/div[@class="property-content-info property-content-info-comm"]/p/*/text()'))
            self.saveToCsv(title, kaifashang, room, area, totalPrice, perPrice, address)

    def get_html_by_xpath_through_parsel(self):
        """
        There is more than one way: parse again with parsel for comparison.
        Unlike etree, parsel extracts text directly via .get() / .getall().
        """
        html_data = self.get_html()
        # start parsing
        selector = parsel.Selector(html_data)
        # grab the results
        results = selector.xpath('//section[@class="list"]/div')
        for item in results:
            # title; the leading dot again means "select from the current item node"
            title = item.xpath('.//div[@class="property-content-title"]/h3/text()').get().strip()
            # neighbourhood, or rather the developer
            kaifashang = item.xpath('.//div[@class="property-content-info property-content-info-comm"]/p[1]/text()').get()
            # layout; the p tag has many span children, so * collects all their text
            room = ''.join(item.xpath('.//p[@class="property-content-info-text property-content-info-attribute"]/*/text()').getall())
            # area; this p[2] is a sibling of the p[1] used for room
            area = item.xpath('.//section/div/p[2]/text()').get().strip()
            # total price
            totalPrice = ''.join(item.xpath('.//div[@class="property-price"]/p[1]/*/text()').getall())
            # unit price
            perPrice = item.xpath('.//div[@class="property-price"]/p[2]/text()').get().strip()
            # address; note getall() here, a bare get() would return one string and
            # '-'.join() would then put a dash between every character
            address = '-'.join(
                item.xpath('.//section/div[@class="property-content-info property-content-info-comm"]/p/*/text()').getall())
            print(title, kaifashang, room, area, totalPrice, perPrice, address, sep=" | ")

    def create_csv(self):
        """
        Create the csv file.
        """
        csvFile = open('58同城二手房etree-xpath.csv', mode='w+', encoding='utf-8-sig', newline='')
        csvWriter = csv.writer(csvFile)
        csvWriter.writerow(['标题', '开发商', '厅室', '面积', '总价', '单价', '地址'])  # the header only needs writing once
        csvFile.close()

    def saveToCsv(self, title, kaifashang, room, area, totalPrice, perPrice, address):
        # a+ (append) mode is required, otherwise paginated crawling would overwrite earlier pages
        with open('58同城二手房etree-xpath.csv', mode='a+', encoding='utf-8-sig', newline='') as f:
            csvWriter = csv.writer(f)
            csvWriter.writerow([title, kaifashang, room, area, totalPrice, perPrice, address])

    def run(self):
        self.get_html_by_xpath_through_etree()  # saving is wired into this method, so just call it

if __name__ == "__main__":
    for page in range(1, 10):
        url = f'https://xm.58.com/ershoufang/p{page}/'
        app = xpathSpider(url=url)
        # only create the file if it doesn't exist yet; calling create_csv on every page
        # would recreate the file each time, which is not what we want
        if not os.path.exists('58同城二手房etree-xpath.csv'):
            app.create_csv()
        app.run()
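
One caveat on the etree version: every [0] assumes the node exists, so a single listing with a missing field raises IndexError and aborts the whole page. A minimal hedged sketch of a guard (first is a helper invented here, not part of lxml):

def first(nodes, default=''):
    # lxml's xpath() returns [] when nothing matches; avoid blind [0] indexing
    return nodes[0].strip() if nodes else default

# e.g. inside the etree loop:
# title = first(item.xpath('.//div[@class="property-content-title"]/h3/text()'))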

The second example: scraping images and saving them to disk.

 1 """
 2     第二个图片案例
 3 """
 4 import os.path
 5 import time
 6 import requests
 7 from lxml import etree
 8 import parsel
 9 
10 class picXpathSpider():
11     headers = {
12         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
13     }
14     def __init__(self, url, headers=headers):
15         self.url = url
16         self.headers = headers
17 
18     def get_html(self):
19         response = requests.get(url=self.url, headers=self.headers)
20         response.encoding = response.apparent_encoding
21         response.encoding = 'gbk'
22         return response.text
23 
24     def get_content_by_xpath(self):
25         picnames = []
26         picurls = []
27         html_data = self.get_html()
28         selector = etree.HTML(html_data)
29         pictures = selector.xpath('//div[@class="slist"]/ul/li')
30         for item in pictures:
31             picname = item.xpath('.//a/b/text()')[0] # 取第一项
32             picnames.append(picname)
33             imgurl = item.xpath('.//a/img/@src')[0] # 取第一项
34             imgurl = 'https://pic.netbian.com/' + str(imgurl)
35             picurls.append(imgurl)
36             # print(picname, imgurl, sep=' | ')
37         return zip(picnames, picurls)
38 
39     def get_content_by_parsel(self):
40         picnames = []
41         picurls = []
42         html_data = self.get_html()
43         selector = parsel.Selector(html_data)
44         results = selector.xpath('//div[@class="slist"]/ul/li') # 选中需要的结果
45         for item in results:
46             picname = item.xpath('.//a/b/text()').get()  # 取第一项
47             picnames.append(picname)
48             imgurl = item.xpath('.//a/img/@src').get()  # 取第一项
49             imgurl = 'https://pic.netbian.com/' + str(imgurl)
50             picurls.append(imgurl)
51             print(picname, imgurl, sep=' | ')
52         return zip(picnames, picurls)
53 
54     def saveImg(self):
55         imginfo = self.get_content_by_xpath()
56         for picname, picurl in imginfo:
57             with open('images\\' + picname + '.' + picurl.split('.')[-1], mode='wb') as f:
58                 f.write(requests.get(url=picurl, headers=self.headers).content)
59                 print(picname + f'在-----{time.strftime("%H:%M:%S", time.localtime())}---------保存完毕!')
60 
61 if __name__ == "__main__":
62     if not os.path.exists('images\\'):
63         os.mkdir('images\\')
64     for page in range(1, 10+1):
65         if page == 1:
66             url = 'https://pic.netbian.com/4kmeinv/'
67         else:
68             url = f'https://pic.netbian.com/4kmeinv/index_{page}.html'
69         print(f'******************正在爬取----第{page}页数据----!******************')
70         app = picXpathSpider(url=url)
71         app.saveImg()
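
One thing worth hedging against here: picname comes straight from the page, and Windows rejects file names containing characters such as \ / : * ? " < > |. A small sketch of a sanitizer (safe_name is a hypothetical helper, the same idea as change_title in the fourth example below):

import re

def safe_name(name):
    # strip characters Windows does not allow in file names
    return re.sub(r'[\\/:*?"<>|]', '', name)

# e.g. in saveImg:
# with open('images\\' + safe_name(picname) + '.' + picurl.split('.')[-1], mode='wb') as f: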

Screenshot of the program running:

Screenshot of the images saved after the run:

The third example: scraping all the city names from a website.

A screenshot of the page to scrape:

That is, we want the city names under both "hot cities" and "all cities". They sit in different div tags, so they can be scraped separately or all at once using XPath's union operator "|": for example, //book | //cd returns the set of all book elements and all cd elements, as shown in the sketch below.
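
A minimal sketch of the union operator on invented XML, mirroring the //book | //cd example:

from lxml import etree

doc = etree.XML('<shop><book>B1</book><cd>C1</cd><book>B2</book></shop>')
# one pass over both node sets with |
print(doc.xpath('//book/text() | //cd/text()'))  # ['B1', 'C1', 'B2'], in document order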

"""
    The third example: scraping a site's hot cities and all cities.
"""
import parsel
import requests
from lxml import etree

class cityNameSpider():
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        response = requests.get(url=self.url, headers=self.headers)
        if response.status_code == 200:
            return response.text
        else:
            print('Request failed!')

    # first approach: parse with lxml's etree
    def parse_html_by_etree1(self):
        # grab everything in one pass
        cityNamesAll = []
        html_text = self.get_html()
        selector = etree.HTML(html_text)
        # hot cities:  //div[@class="bottom"]/ul/li
        # all cities:  //div[@class="bottom"]/ul/div[2]/li
        lis = selector.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
        for li in lis:
            cityName = li.xpath('.//a/text()')[0]
            cityNamesAll.append(cityName)
            # print(cityName)
        # print(len(cityNamesAll)) # 399
        return cityNamesAll

    def parse_html_by_etree2(self):
        # scrape the hot city names and all city names separately
        cityNamesSep = []
        html_text = self.get_html()
        selector = etree.HTML(html_text)
        hotcitylis = selector.xpath('//div[@class="bottom"]/ul/li')
        for hotcity in hotcitylis:
            cityName = hotcity.xpath('.//a/text()')[0]
            cityNamesSep.append(cityName)
        allcities = selector.xpath('//div[@class="bottom"]/ul/div[2]/li')
        for othercity in allcities:
            otherCityName = othercity.xpath('.//a/text()')[0]
            cityNamesSep.append(otherCityName)
        # print(len(cityNamesSep)) # 399
        return cityNamesSep

    # second approach: parse with parsel
    def parse_html_by_parsel1(self):
        cityNamesAll = []
        html_text = self.get_html()
        selector = parsel.Selector(html_text)
        lis = selector.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul/div[2]/li')
        for li in lis:
            cityName = li.xpath('.//a/text()').get()
            cityNamesAll.append(cityName)
        # print(cityNamesAll)
        return cityNamesAll

    def parse_html_by_parsel2(self):
        cityNamesSep = []
        html_text = self.get_html()
        selector = parsel.Selector(html_text)
        hotCitylis = selector.xpath('//div[@class="bottom"]/ul/li')
        for hotcity in hotCitylis:
            hotCity = hotcity.xpath('.//a/text()').get()  # extract the text
            cityNamesSep.append(hotCity)
        otherCityLis = selector.xpath('//div[@class="bottom"]/ul/div[2]/li')
        for othercity in otherCityLis:
            otherCity = othercity.xpath('.//a/text()').get()
            cityNamesSep.append(otherCity)
        # print(cityNamesSep)
        return cityNamesSep

    def saveTotxt(self):
        # save to a txt file; any of the methods above will do
        cityNames = self.parse_html_by_etree1()
        count = 0
        with open('城市名.txt', mode='w+', encoding='utf-8') as f:
            for cityname in cityNames:
                f.write(cityname + ', ')
                count += 1
                if count == 10:
                    count = 0  # restart the count
                    f.write('\n')  # a newline after every 10 names
        print('Done saving!')

    def run(self):
        self.saveTotxt()

if __name__ == "__main__":
    url = 'https://www.aqistudy.cn/historydata/'
    app = cityNameSpider(url=url)
    # print(app.parse_html_by_etree1() == app.parse_html_by_etree2()) # True
    # print(app.parse_html_by_parsel1() == app.parse_html_by_parsel2()) # True
    app.saveTotxt()

Screenshot after the program finishes:

The fourth example: scraping dance videos from Huya (虎牙).

"""
    Scraping Huya dance videos.
"""
import os.path
import random
import re
import time
import requests
from lxml import etree
import parsel

def change_title(title):
    # a helper that strips out characters Windows forbids in file names
    pattern = re.compile(r'[\/\\\:\*\?\"\<\>\|]')
    new_title = re.sub(pattern, '', title)
    return new_title

class huyaVideoSpider():
    headers = {
        'cookie': '',
        'referer': 'https://v.huya.com/g/Dance?set_id=31',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        response = requests.get(url=self.url, headers=self.headers)  # make the request
        # print(response.text)
        return response.text

    def parse_html_by_etree(self):
        videoIds = []
        html_text = self.get_html()
        selector = etree.HTML(html_text)
        results = selector.xpath('//section/ul[2]/li')
        # print(results)
        for item in results:
            videoPlayUrl = item.xpath('./a/@href')[0]  # take the first match
            videoId = re.findall(r'\d+', videoPlayUrl)[0]  # take the first match
            videoIds.append(videoId)
        return videoIds

    def parse_html_by_parsel(self):
        videoIds = []
        html_text = self.get_html()
        selector = parsel.Selector(html_text)
        results = selector.xpath('//section/ul[2]/li')
        for item in results:
            # only the id needs extracting here
            videoPlayUrl = item.xpath('.//a/@href').get()
            videoId = re.findall(r'\d+', videoPlayUrl)[0]  # pull the id out of the play url
            videoIds.append(videoId)
        return videoIds

    def request_content_url(self):
        videoTitles = []
        videoQualities = []
        downLoadUrls = []
        videoIds = self.parse_html_by_etree()
        for videoId in videoIds:
            json_url = f'https://liveapi.huya.com/moment/getMomentContent?videoId={videoId}&_=1641628611850'
            # request json_url
            res = requests.get(url=json_url, headers=self.headers)
            needResults = res.json()['data']['moment']['videoInfo']['definitions']  # a list
            title = res.json()['data']['moment']['title']  # video title
            videoTitles.append(title)  # collect the title
            downLoadUrl = needResults[0]['url']
            downLoadUrls.append(downLoadUrl)  # collect the download url
            videoQuality = needResults[0]['defName']  # quality label
            videoQualities.append(videoQuality)
        return zip(videoTitles, videoQualities, downLoadUrls)

    def download_video(self):
        # download the videos
        videoInfos = self.request_content_url()
        for videoTitle, videoQuality, videoUrl in videoInfos:
            videoTitle = change_title(videoTitle)  # just in case: some characters cannot appear in Windows file names
            with open('huyaVideo\\' + videoTitle + '--' + videoQuality + '.mp4', mode='wb') as f:
                f.write(requests.get(url=videoUrl, headers=self.headers).content)
            print(videoTitle + f' ----- saved at {time.strftime("%H:%M:%S", time.localtime())}!')

    def run(self):
        self.download_video()

if __name__ == "__main__":
    filePath = 'huyaVideo\\'
    if not os.path.exists(filePath):
        os.mkdir(filePath)
    for page in range(1, 10 + 1):
        # just 10 pages; without multithreading, videos download slowly
        if page == 1:
            url = 'https://v.huya.com/g/Dance_most_31'
        else:
            url = f'https://v.huya.com/g/Dance?set_id=31&order=mostplay&page={page}'
        print(f'---------------------- scraping page {page}, please wait... ----------------------')
        time.sleep(random.uniform(2, 5))
        app = huyaVideoSpider(url=url)
        app.run()
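
One caveat on download_video: requests.get(...).content buffers the entire video in memory before writing it out. For large files, streaming in chunks is gentler; a minimal sketch (download_stream is a helper invented here, and the chunk size is an arbitrary choice):

import requests

def download_stream(url, path, headers):
    # stream=True defers the body; iter_content writes it out chunk by chunk
    with requests.get(url, headers=headers, stream=True) as resp:
        resp.raise_for_status()
        with open(path, mode='wb') as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):  # 1 MiB per chunk
                f.write(chunk)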

Screenshot of the program running:

Screenshot of the saved videos:

One observation after the crawl finished: an awful lot of this content turns out to be heavily filtered, suggestive clickbait. I'd suggest keeping kids well away from these live streams.

The fifth example: scraping KuGou's charts and the songs on each chart.

"""
    KuGou music.
"""
import requests
from lxml import etree
import parsel


class kugouMusicSpider():
    """
        One class, XPath for two pages: the chart list and a chart's song list.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    }

    def __init__(self, url, headers=headers):
        self.url = url
        self.headers = headers

    def get_html(self):
        response = requests.get(url=self.url, headers=self.headers)
        response.raise_for_status()
        response.encoding = 'utf-8'
        return response.text

    # parse the chart-list page
    def parse_rank_list_by_etree(self):
        # get chart names and chart urls with etree
        rankListTitles = []
        rankListUrls = []
        html_data = self.get_html()
        selector = etree.HTML(html_data)
        # the div's class attribute is class="pc_rank_sidebar pc_rank_sidebar_first",
        # so match on part of it with contains()
        rankList = selector.xpath('//div[contains(@class, "pc_rank_sidebar")]/ul/li')
        # the negated form would be //div[not(contains(@class, "pc_rank_sidebar"))]
        for item in rankList:
            rankListTitle = item.xpath('.//a/@title')[0]  # take the first match
            rankListTitles.append(rankListTitle)
            rankListUrl = item.xpath('.//a/@href')[0]  # this url is used to scrape the songs
            rankListUrls.append(rankListUrl)
            # print(rankListTitle)
        return zip(rankListTitles, rankListUrls)

    def parse_rank_list_by_parsel(self):
        # get chart names and chart urls with parsel
        rankListTitles = []
        rankListUrls = []
        html_data = self.get_html()
        selector = parsel.Selector(html_data)
        rankList = selector.xpath('//div[contains(@class, "pc_rank_sidebar")]/ul/li')
        for item in rankList:
            rankListTitle = item.xpath('./a/@title').get()  # chart title
            rankListUrl = item.xpath('./a/@href').get()  # chart url
            rankListTitles.append(rankListTitle)
            rankListUrls.append(rankListUrl)
        return zip(rankListTitles, rankListUrls)

    # parse a song page: song names and play-page urls
    def parse_music_page_by_etree(self):
        # get song titles and play urls with etree
        info = self.parse_rank_list_by_etree()
        for rankListTitle, rankListUrl in info:
            response = requests.get(url=rankListUrl, headers=self.headers)
            selector = etree.HTML(response.text)
            musicLis = selector.xpath('//div[contains(@class, "pc_temp_songlist")]/ul/li')
            for item in musicLis:
                musicName = item.xpath('.//a/@title')[0]
                musicUrl = item.xpath('.//a/@href')[0]
                print(musicName, musicUrl, sep=" | ")
            break  # demo only the first chart

    def parse_music_page_by_parsel(self):
        # get song titles and play urls with parsel
        info = self.parse_rank_list_by_parsel()
        for rankListTitle, rankListUrl in info:
            response = requests.get(url=rankListUrl, headers=self.headers)
            selector = parsel.Selector(response.text)
            musicLis = selector.xpath('//div[contains(@class, "pc_temp_songlist")]/ul/li')
            for item in musicLis:
                musicName = item.xpath('.//a/@title').get()
                musicUrl = item.xpath('.//a/@href').get()
                print(musicName, musicUrl, sep=' | ')

if __name__ == "__main__":
    url = 'https://www.kugou.com/yy/html/rank.html'
    app = kugouMusicSpider(url=url)
    app.parse_music_page_by_parsel()
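
The contains() trick used above (matching only part of a class attribute) generalizes; a minimal sketch on an invented snippet:

from lxml import etree

html = etree.HTML('<div class="pc_rank_sidebar pc_rank_sidebar_first"><a>chart</a></div>'
                  '<div class="other"><a>not a chart</a></div>')

# match any div whose class attribute contains the substring
print(html.xpath('//div[contains(@class, "pc_rank_sidebar")]/a/text()'))       # ['chart']
# and the negation with not()
print(html.xpath('//div[not(contains(@class, "pc_rank_sidebar"))]/a/text()'))  # ['not a chart']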


posted @ 2022-01-07 16:33 、一叶孤城