Fetching every file from a website

import re
import os
from urllib.parse import urljoin

import requests

# Browser-like request headers so the target site is less likely to reject us
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Send a plain GET request; timeout is in seconds
def getHTMLText(url):
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()  # raise an exception if the status code is not 200
        r.encoding = r.apparent_encoding  # guess the real encoding from the body
        return r.text
    except requests.RequestException:
        return ""  # empty string on failure keeps the regex code below safe


def getHTMLBuffer(url):
    try:
        r = requests.get(url, headers=headers, timeout=160)
        r.raise_for_status()  # raise an exception if the status code is not 200
        return r.content  # raw bytes, suitable for binary files such as images
    except requests.RequestException:
        return None
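
A quick way to see the difference between the two helpers (the image URL below is only an illustrative guess, not taken from the original post): getHTMLText returns decoded text for regex parsing, while getHTMLBuffer returns raw bytes suitable for writing to disk.

    page = getHTMLText('http://www.baidu.com')                      # str
    data = getHTMLBuffer('http://www.baidu.com/img/bd_logo1.png')   # bytes or None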
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL, following links matched by link_regex"""
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        html = getHTMLText(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # resolve relative links against the seed URL
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)


def get_links(html):
"""Return a list of links from html"""
# urlparse.urljoin(seed_url, link)
#[^>]* 0个以上除了>以外的任意字符
'''
. 匹配除“\r\n”之外的任何单个字符。要匹配包括“\r\n”在内的任何字符,请使用像“[\s\S]”的模式。
? 匹配前面的子表达式0到1次
* 匹配前面的子表达式任意次
+ 匹配前面的子表达式一次或多次(大于等于1次)
[]是定义匹配的字符范围。
^ 只要是”^”这个字符是在中括号”[]”中被使用的话就是表示字符类的否定,如果不是的话就是表示限定开头。我这里说的是直接在”[]”中使用,不包括嵌套使用。
其实也就是说”[]”代表的是一个字符集,”^”只有在字符集中才是反向字符集的意思。
[^>]不是>的
'''
    # pattern1 / pattern2 capture quoted href / src values separately;
    # pattern3 combines the two and is the one actually used below
    pattern1 = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
    pattern2 = r"(?<=src=\").+?(?=\")|(?<=src=\').+?(?=\')"
    pattern3 = pattern1 + "|" + pattern2
    webpage_regex = re.compile(pattern3, re.IGNORECASE)
    # return the list of all links found in the page
    return webpage_regex.findall(html)
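
As a quick sanity check of pattern3 (the HTML snippet is made up for illustration), both href and src values are extracted, regardless of quote style:

    sample = '<a href="/page.html">home</a> <img src=\'/img/logo.png\'>'
    print(get_links(sample))  # -> ['/page.html', '/img/logo.png']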
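link_crawler is not exercised by the script below; a usage sketch (the seed URL and regex are only examples, not from the original post) would look like:

    # hypothetical: crawl pages whose links start with /index
    # link_crawler('http://example.webscraping.com', r'/index')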



if __name__ == "__main__":
    dominurl = "http://www.baidu.com"
    html = getHTMLText(dominurl)
    urllist = get_links(html)

    for eachurl in urllist:
        # resolve each (possibly relative) link against the domain
        newurl = urljoin(dominurl, eachurl)
        buffer = getHTMLBuffer(newurl)
        # use the last path segment as the file name and replace
        # characters that are illegal in Windows file names
        filename = newurl.split('/')[-1]
        newfilename = re.sub(r'[\\/:*?"<>|]', '-', filename)
        if len(newfilename) > 3 and buffer is not None:
            newfilename2 = os.path.join(r"D:\Review\image", newfilename)
            with open(newfilename2, "wb") as fs:
                fs.write(buffer)
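
For reference, urljoin is what turns the relative links extracted by get_links into absolute URLs (the path here is only an example):

    >>> urljoin('http://www.baidu.com', '/img/bd_logo1.png')
    'http://www.baidu.com/img/bd_logo1.png'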
