使用lxml通过文字获取class值--python实现

  • 直奔主题
# author: liuada6666@163.com
# version:1.0
# time: 2022-4-12
# 转发请注明原作者,禁止通过该程序直接进行宣传或获利
import html as HTML  # standard-library html module (used for unescape)
import re  # regular expressions applied to serialized markup

import lxml  # lxml package
from lxml import etree  # lxml HTML parser / serializer
from selenium import webdriver  # selenium browser driver

# Patterns applied to serialized markup; compiled once instead of on every call.
_INPUT_ATTRS_RE = re.compile('<input(.*?)>', re.S)
_TEXTAREA_ATTRS_RE = re.compile('<textarea(.*?)>', re.S)
_PLACEHOLDER_RE = re.compile('placeholder="(.*?)"', re.S)
_CLASS_ATTR_RE = re.compile('class="(.*?)"', re.S)


def _element_markup(node):
    """Serialize an lxml element to a str of markup."""
    return etree.tostring(node, encoding="utf-8").decode("utf-8")


def _nearest_div_markup(node):
    """Return the markup of the closest ancestor serialized as "<div...", or "" if none."""
    parent = node.getparent()
    while parent is not None:
        markup = _element_markup(parent)
        if markup.startswith("<div"):
            return markup
        parent = parent.getparent()
    return ""


def _steps_to_text(node, text):
    """Count the parent hops needed from *node* until the serialized markup contains *text*."""
    steps = 0
    markup = _element_markup(node)
    while text not in markup:
        steps += 1
        node = node.getparent()
        if node is None:  # walked past the root without finding the text
            break
        markup = _element_markup(node)
    return steps


def Elements_Text_CLASS(browser, text):
    """Look up the CSS class (and an index) of the element associated with *text*.

    The page source is read from *browser* via
    ``execute_script("return document.documentElement.outerHTML")`` and parsed
    with lxml. Elements whose text contains *text* are searched first; if none
    exist, ``<input>`` elements whose ``placeholder`` contains *text* are used.

    Args:
        browser: Object exposing ``execute_script`` (e.g. a selenium WebDriver).
        text: Visible text or placeholder text to search for.

    Returns:
        ``[0]`` when *text* is the empty string;
        ``['<text>不存在']`` when *text* is not found in the matched markup;
        otherwise ``[class_name, index]`` where ``class_name`` is the first
        token of the first ``class`` attribute found and ``index`` is either
        the position of the matching placeholder among all ``placeholder``
        attributes, or the position (among elements whose class contains
        ``class_name``) of the element closest to *text*. If no candidate
        carries a ``class`` attribute the result is ``[index]`` only.
    """
    if text == "":
        return [0]
    html = etree.HTML(browser.execute_script("return document.documentElement.outerHTML"))
    results, text_index, t, value_list, num = [], 0, False, [], 0

    # The search text is passed as an XPath variable so quotes inside it
    # cannot break the expression.
    text_nodes = html.xpath('//*[contains(text(), $needle)]', needle=text)
    matched_markup = [_element_markup(node) for node in text_nodes]
    if not matched_markup:
        # Nothing matched by text: the text may live in a placeholder attribute.
        placeholder_nodes = html.xpath('//input[contains(@placeholder, $needle)]', needle=text)
        matched_markup = [_element_markup(node) for node in placeholder_nodes]

    result = matched_markup
    end = ""
    for markup in matched_markup:
        end = ""
        if '<input' in markup or '<textarea' in markup:
            continue
        # Markup without an input/textarea can only come from the text search,
        # so text_nodes is non-empty here. Look at the enclosing <div> of the
        # first text match for a form control the text is labelling.
        end = _nearest_div_markup(text_nodes[0])
        if '<input' in end:
            result = _INPUT_ATTRS_RE.findall(end)
        elif '<textarea' in end:
            result = _TEXTAREA_ATTRS_RE.findall(end)

    for i in result:
        if "class" not in i:
            # No class on the match itself: fall back to the parents of the text matches.
            parents = html.xpath('//*[contains(text(), $needle)]/..', needle=text)
            results.extend(_element_markup(parent) for parent in parents)
        else:
            results.append(i)
        if 'placeholder' in i:
            placeholder = _PLACEHOLDER_RE.search(i)
            if placeholder is not None:
                i = HTML.unescape(placeholder.group(1))
        if text in i or text in end:
            num += 1
    if num == 0:  # the requested text does not occur in any candidate
        return ['{}不存在'.format(text)]

    # When a placeholder equals the text exactly, its position among all
    # placeholder attributes is used directly as the index.
    for index, value in enumerate(html.xpath('//@placeholder')):
        if value == text:
            text_index, t = index, True

    for value in results:
        class_match = _CLASS_ATTR_RE.search(value)
        if class_match is None:
            continue  # candidate has no class attribute; try the next one
        class_name = class_match.group(1).split(" ")[0]  # first class token only
        value_list.append(class_name)
        if not t:
            # Choose the element carrying this class that needs the fewest
            # parent hops before its markup contains the text.
            candidates = html.xpath('//*[contains(@class, $wanted)]', wanted=class_name)
            distances = [_steps_to_text(node, text) for node in candidates]
            if distances:
                text_index = distances.index(min(distances))
        break
    value_list.append(text_index)
    return value_list

只需要传入浏览器对象以及文字内容,就可以返回对应的class值以及索引
该函数可以应用到UI自动化测试中,免去很多繁琐的元素定位步骤
目前个人测试的准确率在**90%**以上,但需要注意的是:

如果输入的文字在网页中存在多个,比如说确定按钮,仅返回第一个匹配的值,如果想要多个值,可以将倒数第三行的break去掉

该函数使用的库有:selenium、lxml,以及Python标准库中的 html 和 re

由于本人对html一知半解,所以还有很多标签类型是我不知道的,对一些比较复杂的web界面处理起来相对时间会增加、准确率也会降低,望理解~~~

posted @ 2022-04-12 10:18  liuada  阅读(48)  评论(0)    收藏  举报  来源