使用lxml通过文字获取class值--python实现
- 直奔主题
# author: liuada6666@163.com
# version:1.0
# time: 2022-4-12
# 转发请注明原作者,禁止通过该程序直接进行宣传或获利
import html as HTML  # standard-library html module (used for html.unescape)
import re  # standard-library regular expressions

import lxml  # lxml package
from lxml import etree  # lxml ElementTree API
from selenium import webdriver  # selenium package
def _xpath_literal(value):
    """Return *value* quoted as an XPath 1.0 string literal."""
    if '"' not in value:
        return '"{}"'.format(value)
    if "'" not in value:
        return "'{}'".format(value)
    # XPath 1.0 has no escape syntax, so a value holding both quote kinds is
    # stitched together with concat().
    pieces = ('"{}"'.format(piece) for piece in value.split('"'))
    return "concat({})".format(", '\"', ".join(pieces))


def _outer_html(node):
    """Serialize an lxml element to a str (UTF-8 bytes decoded back to text)."""
    return etree.tostring(node, encoding="utf-8").decode("utf-8")


def _enclosing_div_html(tree, text_literal):
    """Climb parents of the text matches until a ``<div`` is serialized.

    Returns the serialized markup of that element, or "" when the climb runs
    out of ancestors before one is found.
    """
    hierarchy = ""
    while True:
        hierarchy += "/.."
        ancestors = tree.xpath(
            "//*[contains(text(), {})]{}".format(text_literal, hierarchy)
        )
        if not ancestors:
            # No <div> above the match: report "none" instead of raising
            # IndexError on the empty node set.
            return ""
        markup = _outer_html(ancestors[0])
        if markup[0:4] == "<div":
            return markup


def Elements_Text_CLASS(browser, text):
    """Find the CSS class (and an index) of the element showing *text*.

    The page source is fetched through ``browser.execute_script`` and parsed
    with lxml. Elements whose text contains *text* are searched first; if
    there are none, ``<input>`` elements whose ``placeholder`` contains
    *text* are searched instead.

    Args:
        browser: object exposing ``execute_script(js)`` that returns the
            page's outer HTML (presumably a selenium WebDriver -- confirm
            against the caller).
        text: the visible text (or placeholder text) to look for.

    Returns:
        * ``[0]`` when *text* is the empty string.
        * ``['<text>不存在']`` when *text* is not found.
        * ``[class_name, index]`` otherwise, where ``class_name`` is the first
          token of the first ``class`` attribute found and ``index`` is either
          the position of the equal ``placeholder`` among all placeholder
          attributes, or the position (among elements carrying that class) of
          the element needing the fewest parent hops to reach *text*.
        * ``[index]`` alone when no candidate carries a ``class`` attribute.

    NOTE(review): the published listing had lost all indentation; the nesting
    used here is reconstructed from the data flow and should be confirmed
    against the author's original.
    """
    if text == "":
        return [0]

    tree = etree.HTML(
        browser.execute_script("return document.documentElement.outerHTML")
    )
    literal = _xpath_literal(text)
    text_xpath = "//*[contains(text(), {})]".format(literal)

    # Every element whose own text contains the search text.
    matched = [_outer_html(node) for node in tree.xpath(text_xpath)]
    if not matched:
        # Nothing visible: maybe the text only lives in a placeholder attribute.
        placeholder_xpath = "//input[contains(@placeholder,{})]".format(literal)
        matched = [_outer_html(node) for node in tree.xpath(placeholder_xpath)]

    # `candidates` starts as the matched markup and, as in the original, stays
    # rebound to the <input>/<textarea> attribute strings once an enclosing
    # <div> provides them.
    candidates = matched
    results, hits = [], 0
    for snippet in matched:
        enclosing = ""
        if "<input" not in snippet and "<textarea" not in snippet:
            # A label-like element: look for a form control in the nearest div.
            enclosing = _enclosing_div_html(tree, literal)
            if "<input" in enclosing:
                candidates = re.findall(r"<input(.*?)>", enclosing, re.S)
            elif "<textarea" in enclosing:
                candidates = re.findall(r"<textarea(.*?)>", enclosing, re.S)
        for candidate in candidates:
            if "class" not in candidate:
                # No class on the candidate itself: fall back to the parents
                # of the text matches.
                results.extend(
                    _outer_html(parent) for parent in tree.xpath(text_xpath + "/..")
                )
            else:
                results.append(candidate)
            if "placeholder" in candidate:
                # Compare against the decoded placeholder value; the word may
                # also appear elsewhere (e.g. in a class name), so guard the match.
                found = re.search(r'placeholder="(.*?)"', candidate, re.S)
                if found:
                    candidate = HTML.unescape(found.group(1))
            if text in candidate or text in enclosing:
                hits += 1

    if hits == 0:  # the text is not present in any candidate
        return ['{}不存在'.format(text)]

    # An exact placeholder match fixes the index directly.
    text_index, index_known = 0, False
    for index, value in enumerate(tree.xpath("//@placeholder")):
        if value == text:
            text_index, index_known = index, True

    output = []
    for candidate in results:
        class_attrs = re.findall(r'class="(.*?)"', candidate, re.S)
        if not class_attrs:
            # Candidate without a class attribute: try the next one instead of
            # failing on an empty match list.
            continue
        class_name = class_attrs[0].split(" ")[0]  # keep only the first class token
        output.append(class_name)
        if not index_known:
            class_xpath = "//*[contains(@class, {})]".format(_xpath_literal(class_name))
            serialized = [_outer_html(node) for node in tree.xpath(class_xpath)]
            distances = []
            for position, markup in enumerate(serialized):
                hierarchy, distance = "", 0
                while text not in markup:
                    hierarchy += "/.."
                    distance += 1
                    ancestors = tree.xpath(class_xpath + hierarchy)
                    if position >= len(ancestors):
                        # Ran out of ancestors at this position; keep the hop
                        # count reached so far (best-effort, as before).
                        break
                    markup = _outer_html(ancestors[position])
                distances.append(distance)
            if distances:
                # The element closest to the text wins.
                text_index = distances.index(min(distances))
        # Only the first usable candidate is reported; drop this break to collect all.
        break
    output.append(text_index)
    return output
只需要传入浏览器对象以及文字内容,就可以返回对应的class值以及索引
该函数可以利用到UI自动化测试中去,免去了很多繁琐的找定位的步骤
目前个人测试的准确率在**90%**以上,但是要注意一点的是:
如果输入的文字在网页中存在多个,比如说确定按钮,仅返回第一个匹配的值,如果想要多个值,可以将倒数第三行的break去掉
该函数使用的库有:selenium、lxml、html(Python标准库,代码中以 HTML 为别名导入,用于 html.unescape)、re(Python标准库)
由于本人对html一知半解,所以还有很多标签类型是我不知道的,对一些比较复杂的web界面处理起来相对时间会增加、准确率也会降低,望理解~~~
浙公网安备 33010602011771号