html_parser

import json
from lxml import etree


class HtmlParser(object):
    """Parse an HTML page and list the external links found in it.

    ``_get_new_urls`` and ``_get_new_data`` are placeholders that are not
    implemented yet, so ``parser`` currently only prints the links it
    finds and returns a pair of ``None`` values.
    """

    def _get_new_urls(self):
        """Extract new URLs (placeholder: not implemented, returns ``None``)."""
        return None

    def _get_new_data(self):
        """Extract page content (placeholder: not implemented, returns ``None``)."""
        return None

    def parser(self, page_url, html_cont_str):
        """Parse ``html_cont_str`` and print every matching link.

        A link matches when it is an ``<a>`` element whose ``href`` starts
        with ``http`` or with ``//``. The number of matches is printed
        first, followed by one numbered line per link.

        Args:
            page_url: URL of the page; only checked for ``None`` here.
            html_cont_str: HTML content of the page.

        Returns:
            ``None`` when ``page_url`` is ``None`` or ``html_cont_str`` is
            ``None`` or empty; otherwise the tuple ``(new_urls, new_data)``
            produced by the two placeholder helpers.
        """
        # Empty content is treated like missing content: there is nothing
        # to build an element tree from.
        if page_url is None or not html_cont_str:
            return None

        html_etree = etree.HTML(html_cont_str)  # root element of the parsed page
        node_list = html_etree.xpath(
            "//a[starts-with(@href,'http')]|//a[starts-with(@href,'//')]"
        )
        print(len(node_list))

        # Number the links starting from 1.
        for index, node in enumerate(node_list, 1):
            a_href = node.xpath("./@href")[0]
            print('No.%3s: %s' % (index, a_href))

        new_urls = self._get_new_urls()
        new_data = self._get_new_data()
        return new_urls, new_data

  

posted @ 2017-12-18 23:43  安迪9468  阅读(200)  评论(0)  编辑  收藏  举报