 
                    
                
         
    
    
    
	
	
		
    
    
        
            
            
            
                
常用正则表达式回顾
![]()
正则练习
- import re
- #提取出python
- key="javapythonc++php"
- re.findall('python',key)[0]
- #####################################################################
- #提取出hello world
- key="<html><h1>hello world<h1></html>"
- 
re.findall('<h1>(.*)<h1>',key)[0]
- 
#####################################################################
- 
#提取170
- 
string = '我喜欢身高为170的女孩'
- 
re.findall('\d+',string)
- 
#####################################################################
- 
#提取出http://和https://
- 
key='http://www.baidu.com and https://boob.com'
- 
re.findall('https?://',key)
- 
#####################################################################
- 
#提取出hello
- 
key='lalala<hTml>hello</HtMl>hahah' #输出hello
- 
re.findall('<[Hh][Tt][mM][lL]>(.*)</[Hh][Tt][mM][lL]>',key)
- 
#####################################################################
- 
#提取出hit. 
- 
key='bobo@hit.edu.com'#想要匹配到hit.
- 
re.findall('h.*?\.',key)
- 
#####################################################################
- 
#匹配sas和saas
- 
key='saas and sas and saaas'
- 
re.findall('sa{1,2}s',key)
项目需求:爬取糗事百科指定页面的糗图,并将其保存到指定文件夹中
![]()
- #!/usr/bin/env python
- # -*- coding:utf-8 -*-
- import requests
- import re
- import os
- if __name__ == "__main__":
-  url = 'https://www.qiushibaike.com/pic/%s/'
-  headers={
-  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
-  }
-  #指定起始和结束页码
-  page_start = int(input('enter start page:'))
-  page_end = int(input('enter end page:'))
-  #创建文件夹
-  if not os.path.exists('images'):
-  os.mkdir('images')
-  #循环解析且下载指定页码中的图片数据
-  for page in range(page_start,page_end+1):
-  print('正在下载第%d页图片'%page)
-  new_url = format(url % page)
-  response = requests.get(url=new_url,headers=headers)
-  #解析response中的图片链接
-  e = '<div class="thumb">.*?<img src="(.*?)".*?>.*?</div>'
-  pa = re.compile(e,re.S)
-  image_urls = pa.findall(response.text)
-  #循环下载该页码下所有的图片数据
-  for image_url in image_urls:
-  image_url = 'https:' + image_url
-  image_name = image_url.split('/')[-1]
-  image_path = 'images/'+image_name
-  image_data = requests.get(url=image_url,headers=headers).content
-  with open(image_path,'wb') as fp:
-  fp.write(image_data)
 
             
            posted @ 
2021-03-22 13:44 
好吗,好 
阅读(
81) 
评论() 
 
收藏 
举报