from lxml import etree
# One HTML parser, configured to read the input as UTF-8, shared by both
# parse calls below.
parser = etree.HTMLParser(encoding="utf-8")

# `html` is the tree queried by the extraction code further down; `html2` is
# only referenced by the commented-out href example.
html = etree.parse("test.html", parser=parser)
html2 = etree.parse("lagou.html", parser=parser)
# html.xpath() returns a list even when only one element matches, so index it ([0]) to get a single element.
# 1. Extract every <tr> element.
# trs=html.xpath("//tr")
# for tr in trs:
#     print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
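# 2. Extract a single <tr> by position. XPath positions are 1-based: //tr[1] is the first <tr>, //tr[2] would be the second.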
# tr=html.xpath("//tr[1]")[0]
# print(etree.tostring(tr,encoding="utf-8").decode("utf-8"))
# 3. Extract the <table> whose border attribute equals a given value ('2px' in the example below).
# border=html.xpath("//table[@border='2px']")[0]
# print(etree.tostring(border,encoding="utf-8").decode("utf-8"))
# 4. Get the href attribute value of every <a> element.
# aList=html2.xpath("//a/@href")
# for a in aList:
#     print(a)
# 5. Collect the link and the cell text of every row after the first.
# position()>1 skips the first <tr> among its siblings (presumably the
# table's header row -- confirm against test.html).
trs = html.xpath("//tr[position()>1]")

# Output key -> XPath evaluated relative to one <tr>.
# A leading "." keeps the query inside the current row; a bare "//" would
# search the whole document again.
# NOTE: the key "herf" (sic, for "href") is kept as-is because it is part of
# the printed output.
_field_xpaths = {
    "herf": ".//a/@href",
    "text": "./td[1]//text()",      # text inside the row's first <td>
    "language": "./td[2]//text()",
    "price": "./td[3]//text()",
    "num": "./td[4]//text()",
    "name": "./td[5]//text()",
}

positions = []
for tr in trs:
    # Each value is the list returned by xpath() for that field.
    position = {key: tr.xpath(path) for key, path in _field_xpaths.items()}
    # Keep the row only if at least one field matched something; a row
    # whose lists are all empty carries no data.
    if any(position.values()):
        positions.append(position)

print(positions)