from lxml import etree
'''
需求:
1、获取所有的tr标签
2、获取第二个tr标签
3、获取所有class等于even的标签
4、获取所有a标签的href属性
5、获取所有的职位信息(纯文本)
'''
def parse_tengxun(path="tengxun.html"):
    """Extract job postings from a saved recruitment HTML page.

    The page is parsed with lxml's HTML parser (UTF-8), and every ``<a>``
    element that is a direct child of ``<div class="recruit-list">`` is
    treated as one posting.

    Args:
        path: Location of the HTML file to parse. Defaults to
            ``"tengxun.html"`` in the current working directory.

    Returns:
        A list of dicts, one per posting, with the keys ``'职位'`` (title),
        ``'地点'`` (location), ``'职位类型'`` (category), ``'发布日期'``
        (publish date) and ``'职位描述'`` (description). A field that is
        absent from the markup is returned as an empty string.

    Raises:
        OSError: If ``path`` cannot be read (raised by ``etree.parse``).
    """
    parser = etree.HTMLParser(encoding='utf-8')
    html = etree.parse(path, parser=parser)

    # This XPath yields <a> *elements* (not strings); the fields are pulled
    # out of each one with relative XPath queries below.
    anchors = html.xpath(r'//*/div[@class="recruit-list"]/a')

    positions = []
    for anchor in anchors:
        titles = anchor.xpath("./h4/text()")
        if not titles:
            # No <h4> title: this anchor is not a posting card, so skip it
            # instead of failing on titles[0].
            continue

        # Titles are presumably "<id-or-group>-<job name>" (verify against
        # the page). Drop only the leading prefix so that job names which
        # themselves contain "-" are kept whole; a title without any dash
        # is used unchanged.
        raw_title = titles[0]
        _prefix, dash, remainder = raw_title.partition("-")
        zhiwei = remainder if dash else raw_title

        # Text of every <span> under the posting's <p> elements. Index 0 is
        # not used; indices 1-3 are read as location, category and date.
        # Pad with empty strings so a short list cannot raise IndexError.
        qita = anchor.xpath("./p//span//text()")
        didian, leixing, date = (list(qita[1:4]) + ["", "", ""])[:3]

        # normalize-space() returns a plain string with whitespace runs
        # collapsed ("" when the recruit-text paragraph is missing).
        miaoshu = anchor.xpath(
            "normalize-space(./p[@class='recruit-text']/text())"
        )

        positions.append({
            '职位': zhiwei,
            '地点': didian,
            '职位类型': leixing,
            '发布日期': date,
            '职位描述': miaoshu,
        })
    return positions
if __name__ == '__main__':
    # Script entry point: parse the default page and print one posting
    # dict per line.
    for position in parse_tengxun():
        print(position)