爬虫之 scrapy xpath:如何处理本地 html 文件、如何获取 head 标签中 style 样式的内容,以及如何处理 [<Element style at 0x1012a6f48>] 这类对象的方法

 

 

 

from lxml import etree
html ="""<!DOCTYPE html>

<!--[if IE 8]><html class="ie8"><![endif]-->
<!--[if IE 9]><html class="ie9"><![endif]-->
<!--[if gt IE 9]><!--><html><!--<![endif]-->
<head>
  <title>青春光线电影院 - 猫眼电影 - 一网打尽好电影</title>

  <link rel="dns-prefetch" href="//p0.meituan.net"  />
  <link rel="dns-prefetch" href="//p1.meituan.net"  />
  <link rel="dns-prefetch" href="//ms0.meituan.net" />
  <link rel="dns-prefetch" href="//ms1.meituan.net" />
  <link rel="dns-prefetch" href="//analytics.meituan.com" />
  <link rel="dns-prefetch" href="//report.meituan.com" />
  <link rel="dns-prefetch" href="//frep.meituan.com" />


  <meta charset="utf-8">
  <meta name="keywords" content="">
  <meta name="description" content="">
  <meta http-equiv="cleartype" content="yes" />
  <meta http-equiv="X-UA-Compatible" content="IE=edge" />
  <meta name="renderer" content="webkit" />

  <meta name="HandheldFriendly" content="true" />
  <meta name="format-detection" content="email=no" />
  <meta name="format-detection" content="telephone=no" />
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <script>
  cid = "c_93ziierd";
  ci = 1;
    window.system = {"cinemaImgs":[{"imgid":288774842,"imgDesc":"首图","url":"http://p0.meituan.net/poi/ec9127ac73d519016c1f55a5cbfe4014566042.png"},{"imgid":288774842,"imgDesc":"","url":"http://p0.meituan.net/poi/ec9127ac73d519016c1f55a5cbfe4014566042.png"}]};

  window.openPlatform = '';
  window.openPlatformSub = '';

  </script>
  <link rel="stylesheet" href="//ms0.meituan.net/mywww/common.4b838ec3.css"/>
<link rel="stylesheet" href="//ms0.meituan.net/mywww/cinemas-cinema.c339c8d8.css"/>
  <script src="//ms0.meituan.net/mywww/stat.74891044.js"></script>
  <script>if(window.devicePixelRatio >= 2) { document.write('<link rel="stylesheet" href="//ms0.meituan.net/mywww/image-2x.8ba7074d.css"/>') }</script>
  <style>
    @font-face {
      font-family: stonefont;
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot');
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot?#iefix') format('embedded-opentype'),
           url('//vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff') format('woff');
    }

    .stonefont {
      font-family: stonefont;
    }
  </style>
</head>
<body>
<script src="//ms0.meituan.net/mywww/cinemas-cinema.e0024071.js"></script>
</body>
</html>
"""

# Parse the embedded page from its UTF-8 bytes and select every <style>
# element in the document with XPath.
source = etree.HTML(html.encode('utf-8'))
links = source.xpath("//style")
print(links)  # e.g. [<Element style at 0x1012a6f48>]

# Each XPath hit is an element object rather than a string, so the useful
# data has to be read from its attributes.
for style_el in links:
    print(style_el)         # e.g. <Element style at 0x1012a6f48>
    print(type(style_el))   # <class 'lxml.etree._Element'>
    print(style_el.tag)     # tag name: style
    print(style_el.attrib)  # attributes of the <style> tag (it has none in this page)
    print(style_el.text)    # text between <style> and </style>, i.e. the CSS

# What the last call, print(style_el.text), outputs for this page
# (whitespace collapsed):
# ''' @font-face { font-family: stonefont;
#     src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot');
#     src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot?#iefix') format('embedded-opentype'),
#          url('//vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff') format('woff'); }
#     .stonefont { font-family: stonefont; }'''

 

 

from lxml import etree
html ="""<!DOCTYPE html>

<!--[if IE 8]><html class="ie8"><![endif]-->
<!--[if IE 9]><html class="ie9"><![endif]-->
<!--[if gt IE 9]><!--><html><!--<![endif]-->
<head>
  <title>青春光线电影院 - 猫眼电影 - 一网打尽好电影</title>

  <link rel="dns-prefetch" href="//p0.meituan.net"  />
  <link rel="dns-prefetch" href="//p1.meituan.net"  />
  <link rel="dns-prefetch" href="//ms0.meituan.net" />
  <link rel="dns-prefetch" href="//ms1.meituan.net" />
  <link rel="dns-prefetch" href="//analytics.meituan.com" />
  <link rel="dns-prefetch" href="//report.meituan.com" />
  <link rel="dns-prefetch" href="//frep.meituan.com" />


  <meta charset="utf-8">
  <meta name="keywords" content="">
  <meta name="description" content="">
  <meta http-equiv="cleartype" content="yes" />
  <meta http-equiv="X-UA-Compatible" content="IE=edge" />
  <meta name="renderer" content="webkit" />

  <meta name="HandheldFriendly" content="true" />
  <meta name="format-detection" content="email=no" />
  <meta name="format-detection" content="telephone=no" />
  <meta name="viewport" content="width=device-width, initial-scale=1">


  <script>
  cid = "c_93ziierd";
  ci = 1;
    window.system = {"cinemaImgs":[{"imgid":288774842,"imgDesc":"首图","url":"http://p0.meituan.net/poi/ec9127ac73d519016c1f55a5cbfe4014566042.png"},{"imgid":288774842,"imgDesc":"","url":"http://p0.meituan.net/poi/ec9127ac73d519016c1f55a5cbfe4014566042.png"}]};

  window.openPlatform = '';
  window.openPlatformSub = '';

  </script>
  <link rel="stylesheet" href="//ms0.meituan.net/mywww/common.4b838ec3.css"/>
<link rel="stylesheet" href="//ms0.meituan.net/mywww/cinemas-cinema.c339c8d8.css"/>
  <script src="//ms0.meituan.net/mywww/stat.74891044.js"></script>
  <script>if(window.devicePixelRatio >= 2) { document.write('<link rel="stylesheet" href="//ms0.meituan.net/mywww/image-2x.8ba7074d.css"/>') }</script>
  <style>
    @font-face {
      font-family: stonefont;
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot');
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot?#iefix') format('embedded-opentype'),
           url('//vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff') format('woff');
    }

    .stonefont {
      font-family: stonefont;
    }
  </style>
</head>
<body>
<script src="//ms0.meituan.net/mywww/cinemas-cinema.e0024071.js"></script>
</body>
</html>
"""
import re

# Parse the embedded page from its UTF-8 bytes and select every <style>
# element in the document with XPath.
source = etree.HTML(html.encode('utf-8'))
links = source.xpath("//style")
print(links)  # e.g. [<Element style at 0x1012a6f48>]

for style_el in links:
    print(style_el)         # e.g. <Element style at 0x1012a6f48>
    print(type(style_el))   # <class 'lxml.etree._Element'>
    print(style_el.tag)     # tag name: style
    print(style_el.attrib)  # attributes of the <style> tag (it has none in this page)

    # .text is the CSS between <style> and </style>; it is None for an empty
    # element, so fall back to '' before running the regex.
    css = style_el.text
    # Raw string so "\." is a regex escape, not an (invalid) string escape.
    # The character class stops at whitespace or ")" so the match cannot run
    # past the end of a single url(...) token.
    woff_paths = re.findall(r'//vfile[^\s)]+\.woff', css or '')
    # The CSS uses protocol-relative URLs ("//host/path"); the scheme must be
    # joined with a colon ("https:" + "//host/path"), otherwise the result is
    # the invalid "https//host/path".  None is printed when no .woff is found.
    woff_url = 'https:' + woff_paths[0] if woff_paths else None
    print(css, type(css), woff_url)

# What print(css, ...) shows as the CSS text for this page:
#      @font-face {
#       font-family: stonefont;
#       src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot');
#       src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot?#iefix') format('embedded-opentype'),
#            url('//vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff') format('woff');
#     }
#
#     .stonefont {
#       font-family: stonefont;
#      }
# followed by <class 'str'> and the font URL
# https://vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff



# Standalone check of the font-URL regex against a copy of the CSS text that
# the page's <style> element contains.
a=''' @font-face {
      font-family: stonefont;
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot');
      src: url('//vfile.meituan.net/colorstone/447a7d378623b41e92cdd55cd56adc3c3168.eot?#iefix') format('embedded-opentype'),
           url('//vfile.meituan.net/colorstone/e4004b2572743d76c471e9b1e2e8fac82084.woff') format('woff');
    }

    .stonefont {
      font-family: stonefont;
     }'''
# The match must end at a literal ".woff" and may not cross whitespace or ")".
# A greedy '.+woff' would instead run on to the trailing format('woff') on the
# same line and return "...woff') format('woff" rather than just the URL.
b = re.findall(r'//vfile[^\s)]+\.woff', a)
print(len(b))  # 1: only the .woff URL matches, the .eot URLs do not
View Code

 

posted on 2018-11-09 09:51  王大拿  阅读(927)  评论(0)    收藏  举报

导航