提取网页正文,过滤掉 script、style、meta 标签中的内容
from lxml import etree

# Extract the visible text of an HTML page, skipping the contents of
# <script>, <style> and <meta> elements.

# Placeholder input: assign the HTML markup to extract text from.
text = ''
html = etree.HTML(text, parser=etree.HTMLParser(encoding="utf-8"))

if html is None:
    # NOTE(review): etree.HTML appears to yield None when the markup has no
    # root element (e.g. nothing parseable) -- confirm against the lxml
    # version in use. Fall back to an empty result instead of failing on
    # `None.xpath`.
    text = ''
else:
    # `self::` is required here: a bare `script` inside the predicate tests
    # for a *child* element named script, which would let script/style
    # contents through and instead drop the direct text of any element that
    # merely contains such a child (typically <body> or <head>).
    text = '\n'.join(
        html.xpath(
            '//*[not(self::script or self::style or self::meta)]/text()'
        )
    )