1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 # Author:woshinidaye
4
# XPath is a query language for selecting content in XML documents; lxml also lets it be applied to parsed HTML.
# Install the lxml module with: pip install lxml
7
8
9 from lxml import html
10 etree = html.etree
# Sample XML document used by the XPath examples below.  <nick> elements sit
# at three depths under <author>: direct children (id 1-3), one level down
# inside <div> and <span> (id 4-5), and two levels down inside <span>/<pan>
# (id 6), so each query style below selects a different subset.
a = '''
<note>
<to>George</to>
<from>John</from>
<author>
<nick id='1'>abc</nick>
<nick id='2'>def</nick>
<nick id='3'>ghi</nick>
<div>
<nick id='4'>qwe</nick>
</div>
<span>
<nick id='5'>qwe12313</nick>
<pan>
<nick id='6'>6666</nick>
</pan>
</span>
</author>
<heading>Reminder</heading>
<body>Don't forget the meeting!</body>
</note>
'''

# tree = etree.parse('b.html')  # alternative: build the tree from a file
tree = etree.XML(a)

# XPath cheat sheet -- swap the active `res = ...` line for one of these to
# try a different query.
# res = tree.xpath('/note')  # "/" separates levels; a leading "/" starts at the root
# res = tree.xpath('/note/from')  # step down one level per "/"
# res = tree.xpath('/note/from/text()')  # text() returns the text inside <from>
# res = tree.xpath('/note/author/nick/text()')  # direct children only: misses the nick under <div>
# res = tree.xpath('/note/author/*/nick/text()')  # "*" is a wildcard for any single child of <author>
# res = tree.xpath('/note/author//nick/text()')  # "//" matches every nick anywhere below <author>
# res = tree.xpath('/note/author/nick[1]/text()')  # [] is a positional index and it starts at 1, not 0
# res = tree.xpath(r'/note/author/nick[@id="2"]/text()')  # [@id="..."] filters by attribute, like picking a tag in BeautifulSoup
res = tree.xpath('/note/author/nick/@id')  # @name returns the attribute values themselves

# Print one result per line.
for i in res:
    print(i)
47
48
# Disabled example: fetch a page's source, then extract data from it with XPath.
# The code is kept inside a bare string literal on purpose so that running this
# file performs no network request and writes no file; copy it into its own
# script to try it.
'''
from lxml import html
import requests,csv
etree = html.etree
url = 'https://chengdu.zbj.com/search/f/?kw=SAAS'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36"
}
rep = requests.get(url=url,headers=header)
# print(rep.text)
page = etree.HTML(rep.text)  # parse the response (named `page` so the imported `html` module is not rebound)
divs = page.xpath('/html/body/div[6]/div/div/div[2]/div[5]/div[1]/div')
with open('1128.txt','a+',encoding='utf-8',newline='') as f:
    csvwriter = csv.writer(f)  # one writer for the whole file, not one per row
    for div in divs:  # one div per service provider
        service_price = div.xpath('./div/div/a[2]/div[2]/div[1]/span/text()')[0]
        # Re-join the title fragments with the keyword between them
        # (presumably the page markup splits the title around "SAAS" -- confirm).
        service_title = 'SAAS'.join(div.xpath('./div/div/a[2]/div[2]/div[2]/p/text()'))
        company = div.xpath('./div/div/a[1]/div[1]/p/text()')[1].strip()
        csvwriter.writerow([company,service_title,service_price])
print('done!!')
'''