Crawlers (爬虫)

A crawler is a program that sends requests to a website, fetches the returned resources, then parses them and extracts the useful data.

Workflow

  1. Send the request: use an HTTP library to send a Request to the target site; the Request carries the request headers, request body, and so on.

  2. Get the response: if the server responds normally you get a Response back, which may contain HTML, JSON, images, video, etc.

  3. Parse the content: HTML is parsed with regular expressions or third-party libraries such as BeautifulSoup or pyquery; JSON with the json module; binary data is written straight to a file opened in binary ('b') mode.

  4. Save the data: to a database or to files.

Tools

  Request libraries: requests, selenium

  Parsing libraries: re (regular expressions), BeautifulSoup, pyquery

  Storage: files, MySQL, MongoDB, Redis

Framework: scrapy

Install: pip3 install requests

Basic request

import requests
res = requests.get('https://www.baidu.com')
res.encoding = 'utf-8'
print(res.text)
with open('a.html', 'w') as f:
    f.write(res.text)

GET request with parameters -> params

import requests
res = requests.get('https://www.baidu.com/s',
                   params={'wd':'图片'},
                   headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
                            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
                            'Accept-Encoding': 'gzip, deflate, br',
                            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
                            'Cache-Control': 'no-cache',
                            'Connection': 'keep-alive',
                            'Cookie': 'BD_UPN=12314753; PSTM=1572350125; BAIDUID=79D0925D8720B930D1F1E5BFF612720F:FG=1; BIDUPSID=AA6E74403EED680B571512C161DCBEA9; BDUSS=EyeXBkQXJNZ1Q0QXk0dzhoTlh1ODFzUzNwa0lySWJwMFBrOVJHMS1SNn5ILTFkRVFBQUFBJCQAAAAAAAAAAAEAAACxNoeFsM3A6GZlbGzIyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL-SxV2~ksVdRE; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ispeed_lsm=2; BD_HOME=1; H_PS_PSSID=1449_21086_18560_20698_29567_29220_26350; delPer=0; BD_CK_SAM=1; PSINO=3; H_PS_645EC=2d24IwpbvK2eVobcmeLgWHGcv8LmvTpWTYgrzRwRetwbEpdCPi08ahOlrNs; COOKIE_SESSION=15438_1_7_5_14_10_0_1_3_5_39_3_72210_0_0_0_1574650244_1574491787_1574665633%7C9%233409_3_1574491763%7C2',
                            'Host': 'www.baidu.com',
                            'Pragma': 'no-cache',
                            'Sec-Fetch-Mode': 'navigate',
                            'Sec-Fetch-Site': 'none',
                            'Sec-Fetch-User': '?1',
                            'Upgrade-Insecure-Requests': '1'
                            })
res.encoding = 'utf-8'   # Baidu's search page is utf-8; decoding it as gbk would garble the text
print(res.text)

with open('a.html', 'w') as f:
    f.write(res.text)

Request headers and cookies -> 华华手机商城 (aa7a.cn) login

import requests
headers = {'Referer': 'http://www.aa7a.cn/user.php?&ref=http%3A%2F%2Fwww.aa7a.cn%2Fuser.php%3Fact%3Dlogout',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
# the POST the site sends when you log in
res = requests.post('http://www.aa7a.cn/user.php',
                    headers=headers,
                    data={
                        'username': '2960113637@qq.com',
                        'password':'zrh960906*',
                        'captcha': 'GC3T',
                        'remember': 1,
                        'ref': 'http://www.aa7a.cn/',
                        'act': 'act_login'
                    })
cookie=res.cookies.get_dict()  # login succeeded; grab the cookies as a dict
res=requests.get('http://www.aa7a.cn/',headers=headers,
                 cookies=cookie,
                 )
if '2960113637@qq.com' in res.text:
    print("logged in")
else:
    print("not logged in")

Pearvideo (梨视频)

import requests
import re
# the URL the page requests when the category list refreshes
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')
reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'   # link pattern in the HTML
obj = re.findall(reg_text, res.text)
# print(obj)
for url in obj:
    url = 'https://www.pearvideo.com/' + url  # build the absolute video-page URL
    res1 = requests.get(url)
    obj1 = re.findall('srcUrl="(.*?)"', res1.text)
    print(obj1[0])
    name = obj1[0].rsplit('/', 1)[1]          # use the last path segment as the file name
    res2 = requests.get(obj1[0])
    with open(name, 'wb') as f:
        for line in res2.iter_content():
            f.write(line)
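
For bigger videos it is safer to stream the download instead of holding the whole body in memory. A sketch with the same requests API (the URL below is a made-up placeholder; inside the loop above it would be obj1[0]):

import requests

video_url = 'https://video.pearvideo.com/mp4/example.mp4'   # placeholder direct video URL
name = video_url.rsplit('/', 1)[1]

res = requests.get(video_url, stream=True)           # stream=True: the body is fetched lazily
with open(name, 'wb') as f:
    for chunk in res.iter_content(chunk_size=8192):  # read 8 KB at a time
        if chunk:
            f.write(chunk)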

SSL (https = http + SSL)

import requests
respone=requests.get('https://www.12306.cn',
                     cert=('/path/server.crt',
                           '/path/key'))
print(respone.status_code)
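
If no client certificate is available, certificate verification can simply be switched off; requests then warns about the insecure connection, and the warning can be silenced. A small sketch:

import requests
import urllib3

urllib3.disable_warnings()   # silence the InsecureRequestWarning

# verify=False skips certificate verification (fine for testing, not for production)
respone = requests.get('https://www.12306.cn', verify=False)
print(respone.status_code)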

Paid IP proxies (to verify the proxy is actually used, access your own service through it and check the client IP on the server side)

import requests
# note: a dict keeps only one value per key, so the duplicate 'http' entries in the
# original would silently overwrite each other; keep a single mapping per scheme
proxies={
    'http':'http://124.205.155.148:9090',
    # 'http':'http://egon:123@localhost:9743',  # proxy with auth: user:password before the @
    'https':'https://localhost:9743',
}
respone=requests.get('https://www.12306.cn',
                     proxies=proxies)

print(respone.status_code)
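
To check that the proxy is really used, hit a service that echoes the caller's IP, for example httpbin.org/ip; the "origin" field should show the proxy address rather than your own. A sketch (the proxy address is a placeholder):

import requests

proxies = {'http': 'http://124.205.155.148:9090'}   # placeholder proxy
res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
print(res.json())   # e.g. {'origin': '124.205.155.148'}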

Timeout

import requests
# a deliberately tiny timeout: this request will raise requests.exceptions.ConnectTimeout
respone=requests.get('https://www.baidu.com',
                     timeout=0.0001)
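
A timeout that small is guaranteed to fail, so in practice the exception is caught; a minimal sketch:

import requests

try:
    respone = requests.get('https://www.baidu.com', timeout=0.0001)
    print(respone.status_code)
except requests.exceptions.Timeout as e:
    print('request timed out:', e)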

Uploading files

import requests
files={'file':open('a.jpg','rb')}
respone=requests.post('http://httpbin.org/post',files=files)
print(respone.status_code)
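
The value in the files dict can also be a (filename, file object, content type) tuple when the name or MIME type matters; a sketch against the same httpbin endpoint:

import requests

files = {'file': ('a.jpg', open('a.jpg', 'rb'), 'image/jpeg')}
respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)
print(respone.json()['files'].keys())   # httpbin echoes the uploaded files back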

 

Install: pip3 install lxml beautifulsoup4   (the original easy_install lxml also works, but easy_install is deprecated)

Scraping a building-materials shop (jc001.cn)

import requests
from bs4 import BeautifulSoup

for i in range(5):
    response = requests.get('http://shop.jc001.cn/1373528/goods/?p=%s' % i)
    response.encoding = 'gbk'
    # print(response.text)
    soup = BeautifulSoup(response.text, 'lxml')
    # res = soup.prettify()
    # print(res)

    # selector copied from the browser's "Copy selector":
    # body > div:nth-child(3) > div > div.col-md-9.col-md-push-3 > div > div.cnt.clearfix.line > ul > li:nth-child(1) > a
    res = soup.select('body > div:nth-child(3) > div > div.col-md-9.col-md-push-3 > div > div.cnt.clearfix.line > ul > li > a')
    # print(res)
    for i, v in enumerate(res):
        print(v.attrs['href'])

Fetching the phone inventory (internal device-provider API)

import requests, json
url = 'http://10.23.255.15/v1/api/provider'
# request headers (the internal API needs the session cookie)
headers = {"Cookie":"csrftoken=pAJN4t4EcLs9UH0nCzoevqn7dd2HzYIxLKA873Hm1p6EZd7PPAgukvM9UKM9N7qu; sessionid=7kabh663t34qt4he03ittndf48ikjdni", "User-Agent" : "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
data = requests.request('get', url, headers=headers)
data = json.loads(data.text)   # the full JSON payload returned by the API
print(data)

# note: `list` and `dict` below shadow the built-in names; plain variable names would be better
list = []
list2 = []
for i in range(len(data)):
    for j in range(len(data[i]['device_details'])):
        # collect every phone's manufacturer
        list.append(data[i]["device_details"][j]["manufacturer"])
# de-duplicate
list1 = set(list)
for i in list1:
    list2.append(i)
# store the devices in a dict keyed by manufacturer
dict = {}
for i in range(len(list2)):
    dict[list2[i]] = []
for i in range(len(data)):
    for j in range(len(data[i]['device_details'])):
        dict[data[i]["device_details"][j]["manufacturer"]].append('Android://' + data[i]["device_details"][j]["provider_ip"] + ':5039/' +data[i]["device_details"][j]["serialno"])
print(dict)

# write the devices into an Excel sheet
l = 0
import xlwt
# create a workbook and set its encoding
workbook = xlwt.Workbook(encoding = 'utf-8')
# create a worksheet
worksheet = workbook.add_sheet('My Worksheet')
# write the header row
worksheet.write(0, 0, label='brand')
worksheet.write(0, 1, label='count')
worksheet.write(0, 2, label='device info')
for i in range(len(list2)):
    for j in range(len(dict[list2[i]])):
        worksheet.write(2+j+l, 2, label=dict[list2[i]][j])
    worksheet.write(1+l, 0, label=list2[i])
    worksheet.write(1+l, 1, label=len(dict[list2[i]]))
    worksheet.write(1+l, 2, label=','.join(dict[list2[i]]))
    l = l+len(dict[list2[i]])+1
workbook.save('test.xls')
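
The grouping-by-manufacturer part can be written more compactly with collections.defaultdict; a sketch over the same `data` structure loaded above (field names taken from that API response):

from collections import defaultdict

devices = defaultdict(list)   # manufacturer -> list of device URLs
for provider in data:
    for d in provider['device_details']:
        devices[d['manufacturer']].append(
            'Android://' + d['provider_ip'] + ':5039/' + d['serialno'])
print(dict(devices))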

 

find:
  - name="tag name": match by tag name
  - id, class_, attribute="value": match by attribute and pull that tag out
  - tag.text: the tag's text content
  - tag.get(attribute): the value of one of the tag's attributes
  (a small sketch follows below)
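
A small sketch of those four operations on a toy document (the HTML string is made up for illustration):

from bs4 import BeautifulSoup

html = '<div id="box"><a class="link" href="http://example.com">hello</a></div>'
soup = BeautifulSoup(html, 'lxml')

a = soup.find(name='a', class_='link')   # find by tag name and class
print(a.text)                            # text content -> hello
print(a.get('href'))                     # attribute value -> http://example.com
print(soup.find(id='box').name)          # find by id -> div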

find_all

import requests
from bs4 import BeautifulSoup
url='https://www.autohome.com.cn/news/1/#liststart'
res=requests.get(url)
# build a BeautifulSoup object from the page
soup=BeautifulSoup(res.text,'lxml')
div=soup.find(id='auto-channel-lazyload-article')
# div is a Tag object
# print(type(div))

ul=div.find(name='ul')   # find only the first ul tag
# ul_list=div.find_all(class_="article")   # find every tag below with class "article"
# print(len(ul_list))
li_list=ul.find_all(name='li')
# print(len(li_list))
for li in li_list:
    h3=li.find(name='h3')
    if h3:
        title=h3.text  # text content of the h3 tag
        print(title)
    a=li.find(name='a')
    if a:
        article_url=a.get('href')  # href attribute of the a tag
        print(article_url)

    img=li.find(name='img')
    if img:
        img_url=img.get('src')
        print(img_url)
    p=li.find(name='p')
    if p:
        content=p.text
        print(content)

Searching the document tree      five kinds of filters: string, regex, boolean, method, list

from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="title" id="bbaa"><b name="xx" age="18">The Dormouse's story</b><b>xxxx</b></p>
<p class="xxx" a="xxx">asdfasdf</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup=BeautifulSoup(html_doc,'lxml')
# ress=soup.prettify()   # pretty-print the document
# soup=BeautifulSoup(ress,'lxml')
# print(ress)

# walking the document tree
# print(soup.p.name)
# print(soup.p.attrs)
# print(soup.p.string)
# print(list(soup.p.strings))
# print(soup.p.text)

import re
print(soup.find_all(name='b'))  # string

print(soup.find_all(name=re.compile('^b')))  # regex
print(soup.find_all(id=re.compile('^b')))

print(soup.find_all(name=['a','b']))  # list

print(soup.find_all(name=True))  # boolean

def has_class_but_no_id(tag):  # method
    return tag.has_attr('class') and not tag.has_attr('id')
print(soup.find_all(name=has_class_but_no_id))

CSS selection and sibling navigation

sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>",'lxml')
print(sibling_soup.b.next_sibling)
print(sibling_soup.c.previous_sibling )
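
For actual CSS selection BeautifulSoup offers select() and select_one(), which take ordinary CSS selectors; a sketch run against the html_doc defined above:

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.select('p.story > a.sister'))              # child combinator plus classes
print(soup.select_one('#link2').text)                 # by id -> Lacie
print(soup.select('a[href^="http://example.com"]'))   # attribute prefix match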

Open Baidu automatically with selenium

from selenium import webdriver
from selenium.webdriver.common.keys import Keys # keyboard key constants
import time
bro=webdriver.Chrome()
bro.get('https://www.baidu.com')
# grab the search box
inp=bro.find_element_by_id('kw')
# type into it
inp.send_keys("图片")
inp.send_keys(Keys.ENTER) # press Enter
time.sleep(3)
bro.close()
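
Instead of a fixed time.sleep, an explicit wait blocks only until the element is actually there; a minimal sketch with WebDriverWait and expected_conditions (same Baidu search box):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

bro = webdriver.Chrome()
bro.get('https://www.baidu.com')
# wait up to 10 seconds for the element with id="kw" to appear
inp = WebDriverWait(bro, 10).until(EC.presence_of_element_located((By.ID, 'kw')))
inp.send_keys('图片')
bro.close()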

Automated Baidu login

from selenium import webdriver
import time
bro = webdriver.Chrome()
bro.get("https://www.baidu.com")
bro.implicitly_wait(10)
dl_button=bro.find_element_by_link_text("登录")
dl_button.click()
user_login=bro.find_element_by_id('TANGRAM__PSP_10__footerULoginBtn')
user_login.click()
time.sleep(1)
input_name=bro.find_element_by_name('userName')
input_name.send_keys("2960113637@qq.com")
input_password=bro.find_element_by_id("TANGRAM__PSP_10__password")
input_password.send_keys("zrh960906")
submit_button=bro.find_element_by_id('TANGRAM__PSP_10__submit')
time.sleep(1)
submit_button.click()
time.sleep(10)
print(bro.get_cookies())
bro.close()

JD (jd.com) product search

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
bro = webdriver.Chrome()
bro.get("https://www.jd.com")
bro.implicitly_wait(5)

def get_goods(bro):
    print("------------------------------------")
    goods_li = bro.find_elements_by_class_name('gl-item')
    for good in goods_li:
        img_url = good.find_element_by_css_selector('.p-img a img').get_attribute('src')
        if not img_url:
            img_url = 'https:' + good.find_element_by_css_selector('.p-img a img').get_attribute('data-lazy-img')
        url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        price = good.find_element_by_css_selector('.p-price i').text
        name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '')
        commit = good.find_element_by_css_selector('.p-commit a').text
        print('''
                product link: %s
                product image: %s
                product name: %s
                product price: %s
                number of comments: %s

                ''' % (url, img_url, name, price, commit))
    next_page = bro.find_element_by_partial_link_text("下一页")
    time.sleep(1)
    next_page.click()
    time.sleep(1)
    get_goods(bro)

input_search=bro.find_element_by_id('key')
input_search.send_keys("大裤衩")
input_search.send_keys(Keys.ENTER)

try:
    get_goods(bro)
except Exception as e:
    print('done')
finally:
    bro.close()
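
JD lazy-loads the product images as the page scrolls, which is why some src attributes come back empty and the code above falls back to data-lazy-img. Scrolling to the bottom before reading the list forces the real images to load; a sketch of a helper that could be called at the start of get_goods (it uses the bro and time already in scope above):

def scroll_to_bottom(bro, times=5):
    # scroll down several times so lazily loaded images get a real src
    for _ in range(times):
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(0.5)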

Selenium odds and ends: attributes, navigation, cookies, JS, tabs, action chains

# get an attribute:
# tag.get_attribute('src')
# get the text content
# tag.text
# get the tag's id, location, name and size (rarely needed)
# print(tag.id)
# print(tag.location)
# print(tag.tag_name)
# print(tag.size)

# simulate the browser's back/forward buttons
# browser.back()
# time.sleep(10)
# browser.forward()

# cookie management
# print(browser.get_cookies())  # get cookies
# browser.add_cookie({'k1':'xxx','k2':'yyy'})  # set a cookie
# print(browser.get_cookies())

# run JavaScript
# from selenium import webdriver
# import time
#
# bro=webdriver.Chrome()
# bro.get("http://www.baidu.com")
# bro.execute_script('alert("hello world")') # pop up an alert box
# time.sleep(5)

# tab management
# import time
# from selenium import webdriver
#
# browser=webdriver.Chrome()
# browser.get('https://www.baidu.com')
# browser.execute_script('window.open()')
#
# print(browser.window_handles) # list of all open tabs
# browser.switch_to_window(browser.window_handles[1])
# browser.get('https://www.taobao.com')
# time.sleep(3)
# browser.switch_to_window(browser.window_handles[0])
# browser.get('https://www.sina.com.cn')
# browser.close()

# action chains (drag and drop)
# from selenium import webdriver
# from selenium.webdriver import ActionChains
#
# from selenium.webdriver.support.wait import WebDriverWait  # wait for elements to load
# import time
#
# driver = webdriver.Chrome()
# driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
# wait=WebDriverWait(driver,3)
# # driver.implicitly_wait(3)  # implicit wait
#
# try:
#     driver.switch_to.frame('iframeResult')  # switch into the iframeResult frame
#     sourse=driver.find_element_by_id('draggable')
#     target=driver.find_element_by_id('droppable')
#
#
# # option 1: queue the actions on one chain and run them serially
# # actions=ActionChains(driver)  # get an action-chain object
# # actions.drag_and_drop(sourse,target)  # queue the drag-and-drop action
# # actions.perform()
#
# # option 2: separate chains, moving a small offset each time
#
#
#     ActionChains(driver).click_and_hold(sourse).perform()
#     distance=target.location['x']-sourse.location['x']
#
#
#     track=0
#     while track < distance:
#         ActionChains(driver).move_by_offset(xoffset=2,yoffset=0).perform()
#         track+=2
#
#     ActionChains(driver).release().perform()
#
#     time.sleep(10)
#
#
# finally:
#     driver.close()

Getting cookies (log in by hand in the selenium browser, then save them as JSON)

# import time
# from selenium import webdriver
# import json
# browser = webdriver.Chrome()
# browser.get('https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F')
#
# time.sleep(50)
# cookie=browser.get_cookies()
# print(cookie)
# with open('cookie.json','w')as f:
#     json.dump(cookie,f)



import requests
import json
with open('cookie.json','r')as f:
    di=json.load(f)

cookies = {}
# take each cookie's name and value and turn them into the {name: value} mapping
# requests expects (the selenium dicts also carry domain, path, etc., which we drop)
for cookie in di:
    cookies[cookie['name']] = cookie['value']

print(cookies)
res=requests.get('https://i-beta.cnblogs.com/api/user',
             cookies=cookies)
print(res.text)

Getting past the login captcha (log in manually in selenium, then reuse the cookies with requests)

import requests
from selenium import webdriver
import time
import json
url = 'https://account.cnblogs.com/signin?returnUrl=https%3A%2F%2Fwww.cnblogs.com%2F'
driver = webdriver.Chrome()
driver.get(url=url)
time.sleep(50)
driver.refresh()
c = driver.get_cookies()
print(c)
with open('xxx.txt','w') as f:
    json.dump(c,f)

time.sleep(3)
with open('xxx.txt', 'r') as f:
    di = json.load(f)
cookies = {}
for cookie in di:
    cookies[cookie['name']] = cookie['value']
print(cookies)

headers = {
    # 'authority': 'www.jd.com',
    # 'method': 'GET',
    # 'path': '/',
    # 'scheme': 'https',
    # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    # 'accept-encoding': 'gzip, deflate, br',
    # 'accept-language': 'zh-CN,zh;q=0.9',
    # 'cache-control': 'max-age=0',
    # 'upgrade-insecure-requests': '1',
    'authority': 'i-beta.cnblogs.com',
    'method': 'GET',
    'path': '/',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'if-modified-since': 'Sun, 24 Nov 2019 06:14:53 GMT',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

}
# make the request with the saved cookies
response = requests.get(url='https://i-beta.cnblogs.com/api/user', headers=headers, cookies=cookies)
print('xxx')
response.encoding = response.apparent_encoding
print(response.text)

URL encoding and decoding (inspecting the Zhihu login form body)

from urllib.parse import unquote_plus  # percent-encoded -> readable text
from urllib.parse import urlencode     # dict -> percent-encoded string
msg = '''
"client_id=c3cef7c66a1843f8b3a9e6a1e3160e20&grant_type=password&timestamp=1574838172749&source=com.zhihu.web&signature=d9ca5ecd24ebcfd42360eabd392d860e837005d8&username=%2B8618953675221&password=lqz12345&captcha=&lang=cn&utm_source=&ref_source=other_https%3A%2F%2Fwww.zhihu.com%2Fsignin%3Fnext%3D%252F"
'''
print(unquote_plus(msg))
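
urlencode goes the other way, turning a dict into that kind of percent-encoded string; a small sketch:

from urllib.parse import urlencode

print(urlencode({'username': '+8618953675221', 'lang': 'cn', 'captcha': ''}))
# -> username=%2B8618953675221&lang=cn&captcha=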

Logging in to Zhihu

from requests_html import HTMLSession     # request + parsing library      pip install requests-html
import base64                             # base64 encode/decode
from PIL import Image                     # image handling
import hmac                               # HMAC signing
from hashlib import sha1                  # SHA-1 hash
import time
from urllib.parse import urlencode        # URL encoding
import execjs                             # call node.js from Python       pip install PyExecJS
from http import cookiejar

class Spider():
    def __init__(self):
        self.session = HTMLSession()
        self.session.cookies = cookiejar.LWPCookieJar()    # so the cookie jar supports save() and load()
        self.login_page_url = 'https://www.zhihu.com/signin?next=%2F'
        self.login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
        self.captcha_api = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=en'
        self.headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
        }

        self.captcha =''         # holds the captcha text
        self.signature = ''    # holds the request signature

    # first request: just pick up the base cookies
    def get_base_cookie(self):
        self.session.get(url=self.login_page_url, headers=self.headers)

    # handle the captcha
    def deal_captcha(self):
        r = self.session.get(url=self.captcha_api, headers=self.headers)
        r = r.json()
        if r.get('show_captcha'):
            while True:
                r = self.session.put(url=self.captcha_api, headers=self.headers)
                img_base64 = r.json().get('img_base64')
                with open('captcha.png', 'wb') as f:
                    f.write(base64.b64decode(img_base64))
                captcha_img = Image.open('captcha.png')
                captcha_img.show()
                self.captcha = input('enter the captcha: ')
                r = self.session.post(url=self.captcha_api, data={'input_text': self.captcha},
                                      headers=self.headers)
                if r.json().get('success'):
                    break

    def get_signature(self):
        # build the HMAC-SHA1 signature the login API expects
        a = hmac.new(b'd1b964811afb40118a12068ff74a12f4', digestmod=sha1)
        a.update(b'password')
        a.update(b'c3cef7c66a1843f8b3a9e6a1e3160e20')
        a.update(b'com.zhihu.web')
        a.update(str(int(time.time() * 1000)).encode('utf-8'))
        self.signature = a.hexdigest()

    def post_login_data(self):
        data = {
            'client_id': 'c3cef7c66a1843f8b3a9e6a1e3160e20',
            'grant_type': 'password',
            'timestamp': str(int(time.time() * 1000)),
            'source': 'com.zhihu.web',
            'signature': self.signature,
            'username': '+8618217210664',
            'password': 'zrh960906*',
            'captcha': self.captcha,
            'lang': 'en',
            'utm_source': '',
            'ref_source': 'other_https://www.zhihu.com/signin?next=%2F',
        }

        headers = {
            'x-zse-83': '3_2.0',
            'content-type': 'application/x-www-form-urlencoded',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
        }

        data = urlencode(data)
        with open('ttt.js', 'rt', encoding='utf-8') as f:
            js = execjs.compile(f.read())
        data = js.call('b', data)
        print(data)

        r = self.session.post(url=self.login_api, headers=headers, data=data)
        if r.status_code == 201:
            self.session.cookies.save('mycookie')
            print('login succeeded')
        else:
            print('login failed')
    def login(self):
        self.get_base_cookie()
        self.deal_captcha()
        self.get_signature()
        self.post_login_data()

if __name__ == '__main__':
    zhihu_spider = Spider()
    zhihu_spider.login()

XPath selection

doc='''
<html>
 <head>
  <base href='http://example.com/' />
  <title>Example website</title>
 </head>
 <body>
  <div id='images'>
   <a href='image1.html' a="xxx">Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
   <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
   <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
   <a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
   <a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
   <a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
  </div>
 </body>
</html>
'''
from lxml import etree

html=etree.HTML(doc)
# html=etree.parse('search.html',etree.HTMLParser())
# 1  all nodes
a=html.xpath('//*')    # match every tag
# 2  a specific node (the result is a list)
# a=html.xpath('//head')
# 3  children and descendants
a=html.xpath('//div/a')
a=html.xpath('//body/a') # no match: a is not a direct child of body
a=html.xpath('//body//a')
# 4  parent node
# a=html.xpath('//body//a[@href="image1.html"]/..')
a=html.xpath('//body//a[1]/..')  # indexing starts at 1
# or with an axis
a=html.xpath('//body//a[1]/parent::*')
# 5  attribute match
a=html.xpath('//body//a[@href="image1.html"]')

# 6  text content
a=html.xpath('//body//a[@href="image1.html"]/text()')
a=html.xpath('//body//a/text()')

# 7  attribute values
# a=html.xpath('//body//a/@href')
# note that indexing starts at 1 (not 0)
a=html.xpath('//body//a[2]/@href')
# 8  matching one value of a multi-valued attribute
# the a tags carry several classes, so an exact match fails; use contains
# a=html.xpath('//body//a[@class="li"]')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9  matching on several attributes
a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10  selecting by position
a=html.xpath('//a[2]/text()')
a=html.xpath('//a[2]/@href')
# the last one
a=html.xpath('//a[last()]/@href')
# positions less than 3
a=html.xpath('//a[position()<3]/@href')
# the third from the end (last()-1 would be the second-to-last)
a=html.xpath('//a[last()-2]/@href')
# 11  node axes
# ancestor: ancestor nodes
# * selects every ancestor
a=html.xpath('//a/ancestor::*')
# only the div ancestors
a=html.xpath('//a/ancestor::div')
# attribute: attribute values
a=html.xpath('//a[1]/attribute::*')
# child: direct children
a=html.xpath('//a[1]/child::*')
# descendant: all descendants
a=html.xpath('//a[6]/descendant::*')
# following: every node after the current one
a=html.xpath('//a[1]/following::*')
a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling: siblings after the current node
a=html.xpath('//a[1]/following-sibling::*')
a=html.xpath('//a[1]/following-sibling::a')
a=html.xpath('//a[1]/following-sibling::*[2]/text()')
a=html.xpath('//a[1]/following-sibling::*[2]/@href')

print(a)