02-正则和xpath

一、正则基本回顾

1.常用指令

# Basic regex refresher. Each commented-out example pairs a target string with
# the pattern that extracts the desired piece. Patterns are written as raw
# strings (r'...') so backslash escapes like \. and \d do not trigger the
# invalid-escape DeprecationWarning.
import re

# Extract "python"
# key = "javapythonc++php"
# re.findall(r'python', key)[0]

# Extract "hello world"
# key = "<html><h1>hello world<h1></html>"
# re.findall(r'<h1>(.*?)<h1>', key)[0]

# Extract "170"
# string = '我喜欢身高为170的女孩'
# re.findall(r'\d+', string)[0]

# Extract "http://" and "https://" (the ? makes the "s" optional)
# key = 'http://www.baidu.com and https://boob.com'
# re.findall(r'https?://', key)

# Extract "hello"
# key = 'lalala<hTml>hello</HtMl>hahah'  # expected match: <hTml>hello</HtMl>

# Extract "hit." — non-greedy .*? stops at the first literal dot.
key = 'bobo@hit.edu.com'  # we want to match "hit."
re.findall(r'h.*?\.', key)

# Match "sas" and "saas"
# key = 'saas and sas and saaas'

# Match every line starting with "i" (re.M = multiline, ^ anchors each line)
# string = '''fall in love with you
# i love you very much
# i love she
# i love her'''
# re.findall(r'^i.*', string, re.M)

# Match across all lines (re.S lets '.' also match newlines)
string1 = """<div>细思极恐
你的队友在看书
你的闺蜜在减肥
你的敌人在磨刀
隔壁老王在炼药
</div>"""
re.findall(r'.*', string1, re.S)

2.示例

# Scrape all images under the Qiushibaike "qiutu" (funny pictures) section
# by running a regex over the raw listing HTML and saving each image to qiutu/.
import re
import requests
from urllib import request
import os



# 1. Verify the page data is NOT dynamically loaded (so the raw HTML contains
#    the img tags and can be parsed directly).
# 2. Fetch the page source.
if not os.path.exists('qiutu'):
    os.mkdir('qiutu')
    
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url,headers=headers).text
# 3. Extract each img tag's src attribute; re.S lets .*? span newlines
#    between the opening div and the img tag.
ex = '<div class="thumb">.*?<img src="(.*?)" alt=.*?</div>'
img_url_list = re.findall(ex,page_text,re.S)
for img_url in img_url_list:
    # Presumably the src values are protocol-relative ("//..."), hence the
    # scheme prefix — TODO confirm against the live page.
    img_url = 'https:'+img_url
    # Use the last path segment of the URL as the local file name.
    imgPath = 'qiutu/'+img_url.split('/')[-1]
    # 4. Request each image URL.
    # 5. Persist it to disk (urlretrieve downloads straight to a file).
    request.urlretrieve(url=img_url,filename=imgPath)
    print(imgPath+'下载成功!!!')

  

二、xpath

1.环境安装:

pip install lxml

2.xpath解析原理:

  • 实例化一个etree对象，并将页面源码加载到该对象中
  • 使用xpath表达式进行标签的定位
  • xpath表达式必须作用在xpath函数中
  • xpath函数是被封装在etree对象中的

3.示例

1.简历模板爬取
# Crawl the first page of free resume templates from sc.chinaz.com:
# for each listing card, open its detail page, pick a random download
# mirror, and save the archive into jianli/.
import requests
import os
from lxml import etree
import random
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
url = 'http://sc.chinaz.com/jianli/free.html'
response = requests.get(url=url,headers=headers)
# Force UTF-8 so the Chinese template names decode correctly
# (requests may otherwise guess the wrong encoding).
response.encoding = 'utf-8'
page_text = response.text

if not os.path.exists('jianli'):
    os.mkdir('jianli')
tree = etree.HTML(page_text)
# One div per template card inside the container.
div_list = tree.xpath('//div[@id="container"]/div')
for div in div_list:
    # Relative xpath (./) is evaluated against the current card element.
    detail_url = div.xpath('./a/@href')[0]
    name = div.xpath('./a/img/@alt')[0]
    
    detail_page_text = requests.get(url=detail_url,headers=headers).text
    # NOTE(review): this rebinds the listing-page `tree`; it happens to be
    # safe only because div_list elements were already extracted above.
    tree = etree.HTML(detail_page_text)
    download_url_list = tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
    # Pick one of the download mirrors at random.
    download_url = random.choice(download_url_list)
    
    jianli_data = requests.get(url=download_url,headers=headers).content
    
    file_path = 'jianli/'+name+'.rar'
    with open(file_path,'wb') as fp:
        fp.write(jianli_data)
    print(file_path+'下载成功')




###### Multi-page version: crawl pages start_page..end_page of the free
###### resume templates and download one random mirror per template.
import requests
import os
from lxml import etree
import random
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
    # Close each connection so the pool is not exhausted across many requests.
    'Connection':'close'
}
start_page = 1
end_page = 5

if not os.path.exists('jianli'):
    os.mkdir('jianli')

# Page 1 has its own URL; later pages follow the free_%d.html pattern.
url = 'http://sc.chinaz.com/jianli/free_%d.html'

for page in range(start_page, end_page + 1):
    if page == 1:
        new_url = 'http://sc.chinaz.com/jianli/free.html'
    else:
        # Plain %-interpolation; the original wrapped this in a no-op format() call.
        new_url = url % page

    response = requests.get(url=new_url, headers=headers)
    # Force UTF-8 so the Chinese template names decode correctly.
    response.encoding = 'utf-8'
    page_text = response.text

    tree = etree.HTML(page_text)
    # One div per template card inside the container.
    div_list = tree.xpath('//div[@id="container"]/div')
    for div in div_list:
        # Relative xpath (./) is evaluated against the current card element.
        detail_url = div.xpath('./a/@href')[0]
        name = div.xpath('./a/img/@alt')[0]

        detail_page_text = requests.get(url=detail_url, headers=headers).text
        # Distinct name so the listing-page `tree` is not shadowed.
        detail_tree = etree.HTML(detail_page_text)
        download_url_list = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li/a/@href')
        # Pick one of the download mirrors at random.
        download_url = random.choice(download_url_list)

        jianli_data = requests.get(url=download_url, headers=headers).content

        file_path = 'jianli/' + name + '.rar'
        with open(file_path, 'wb') as fp:
            fp.write(jianli_data)
        print(file_path + '下载成功')

  

posted @ 2020-10-04 17:25  断浪狂刀忆年少  阅读(219)  评论(0编辑  收藏  举报