requests获取所有状态码

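requests.get默认会自动跟随重定向，因此response.status_code拿到的是跳转后最终响应的状态码（例如200），而不是301/302本身（中间的跳转响应保存在response.history中）。设置allow_redirects=False后不再自动跳转，这样就可以直接获取包括301/302在内的所有状态码了。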

import requests

# Demo: fetch one URL without following redirects and print what came back.
#
# Target URL. Swap in one of the commented alternatives to observe other outcomes.
# url = 'http://www.freebuf.com/news/157100.html'  # plain 200
url = 'http://www.freebuf.com/fevents/133225.html'  # answers 302; requests reports 200 after following it unless allow_redirects=False
# url = 'http://www.freebuf.com/articles/database/151839.html'  # 403
# url = 'http://www.freebuf.com/articles/database/1518391.html'  # missing page on an existing domain: 404
# url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # nonexistent domain: requests raises
# url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the company network: raises, same effect as a network outage
# url = 'http://10.1.75.241'  # bare IP address (the http:// scheme is mandatory, otherwise requests raises)

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
try:
    # allow_redirects=False exposes the 301/302 itself instead of the final page.
    # The timeout keeps the script from hanging forever on an unresponsive host;
    # requests applies no timeout unless one is given.
    response = requests.get(url, headers=headers, allow_redirects=False, timeout=10)
    # Inspect the outcome.
    print('    give url:', url)
    print(' request.url:', response.request.url)
    print('response.url:', response.url)
    print(response.content)
    print(response.status_code)
except Exception as e:
    # Top-level boundary of a demo script: report the failure (DNS error,
    # refused/blocked connection, timeout, ...) instead of crashing.
    print(e)

封装一个获取所有状态码的函数，同时实现验证返回值的方法

import requests


def get_statecode_or_errinfo(url='', timeout=10):
    """Return the HTTP status code for ``url``, or the error that prevented a response.

    Redirects are not followed, so a 301/302 is reported as such rather than
    as the status of the page it points to.

    :param url: URL to request (must include the scheme, e.g. ``http://``).
    :param timeout: seconds to wait for the server before giving up; passed
        straight to ``requests.get``. Without it an unresponsive host would
        block the caller indefinitely.
    :return: the integer status code on success; the caught
        ``requests.RequestException`` instance when no response was obtained
        (DNS failure, connection error, timeout, malformed URL, ...); or a
        usage message string when ``url`` is empty or ``None``.
    """
    # Reject a missing URL (empty string or None) before touching the network.
    if not url:
        return '请输入一个url作为get_statecode_or_errinfo的参数'

    # Browser-like User-Agent sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        response = requests.get(
            url,
            headers=headers,
            allow_redirects=False,
            timeout=timeout,
        )
    except requests.RequestException as e:
        # Base class of every error requests raises; hand it back to the
        # caller instead of propagating, as the function's contract promises.
        return e
    return response.status_code


if __name__ == '__main__':
    # Demo: fetch one page (following redirects) and print the different
    # pieces of the exchange that can be used to validate the result.
    #
    # Alternative URLs worth trying:
    # url = 'http://www.freebuf.com/news/157100.html'  # plain 200
    # url = 'http://www.freebuf.com/fevents/133225.html'  # answers 302; reported as 200 unless allow_redirects=False
    # url = 'http://www.freebuf.com/articles/database/151839.html'  # 403
    # url = 'http://www.freebuf.com/articles/database/1518391.html'  # missing page on an existing domain: 404
    # url = 'http://www.freebudfsf.com/articles/database/1518391.html'  # nonexistent domain: raises; behind an Nginx proxy it came back as 200
    # url = 'http://dsfs'  # nonexistent host with allow_redirects=False: behind an Nginx proxy it came back as 200
    # url = 'https://www.douban.com/group/topic/49606658/'  # existing domain blocked by the company network: raises, same effect as a network outage
    # url = 'http://10.1.75.241'  # bare IP address, 200 (the http:// scheme is mandatory, otherwise requests raises)
    url = 'http://www.freebuf.com/news/171238.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Redirects are followed here; add allow_redirects=False to see the raw 3xx.
    # The timeout prevents an unresponsive host from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=10)

    # Status code of the final response.
    print(response.status_code)

    # Requested URL versus the URL actually served (they differ after a redirect).
    print(url)
    print(response.url)

    # Headers that were sent.
    print(response.request.headers)

    # Headers that were received.
    print(response.headers)

    # Body, if needed:
    # print(response.content)
    # print(response.content.decode())
    # print(response.text)
    #
    # response.encoding = 'utf-8'
    # print(response.text)

    # Encoding requests will use to decode response.text.
    print(response.encoding)

    # Size of the raw body in bytes.
    print(len(response.content))

说明：

反爬：
总结多种使用requests验证返回值的方式。
比如：检查状态码、检查url（有可能发生了跳转）、检查请求头、检查响应头、检查源码、检查源码字符串长度。
检查状态码
print (response.status_code)
检查url
print (response.url)
检查请求头
print (response.request.headers)
检查响应头
print (response.headers)
检查源码字符串长度
print (len(response.content))
检查源码
print (response.content)
print (response.content.decode())
response.encoding='utf-8'
print (response.text)
print (response.encoding)

scrapy爬虫的响应规则：

# 1、被过滤掉，不发出请求：不在允许的域名范围内
# temp['title_url'] = "https://www.baidu.com/"  # 跨域。请求发出前，url直接被过滤掉。
# temp['title_url'] = "http://open.freebuf.com/live?id=1021"  # 跨域。请求发出前，url直接被过滤掉。
# temp['title_url'] = "http://10.1.75.241"  # 请求ip地址，请求发出前，url直接被过滤掉。如果设置为允许ip网站，没有被过滤，就返回200

# 2、禁止访问
# temp['title_url'] = "http://www.freebuf.com/articles/database/151839.html"#禁止访问403，资源存在，不让访问。Ignoring non-200 response
# temp['title_url'] = "http://www.freebuf.com/articles/database/1518391.html"#禁止访问404，资源本身不存在。Ignoring non-200 response

# 3、重定向后的作为新请求
# temp['title_url'] = "http://www.freebuf.com/news/156654.html"  # 重定向301、302。会返回重定向后200的状态码

# 4、断网
# temp['title_url'] = "https://www.douban.com/group/topic/49606658/"  # 公司限制访问。[<twisted.python.failure.Failure twisted.internet.error.ConnectionLost: Connection to the other side was lost in a non-clean fashion: Connection lost.>]

# 5、没有的网站
# temp['title_url'] = "https://www.badfsdsdfsdfsdfsdddd.com/"  # 直接被过滤掉，如果没有被过滤，就返回域名解析错误：DNS lookup failed: no results for hostname lookup: www.badfsdsdfsdfsdfsdddd.com.
pass

scrapy爬虫举例

freebuf2.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy_FB.items import ScrapyFb2Item


# from util.logger import Logger

# logger_freebuf2 = Logger(logname=__name__, logpath='collection_log', logformat=1, loglevel=10).getlog()
# logger_freebuf2.debug('i am debug3')
# logger_freebuf2.info('i am info3')
# logger_freebuf2.warning('i am warning3')


class Freebuf2Spider(scrapy.Spider):
    """Walk FreeBuf list pages and request every article linked from them.

    The spider doubles as a test bench for how Scrapy reacts to different
    kinds of URLs: set ``first_url_override`` to replace the first article of
    each list page with a URL of your choice and watch whether the request
    reaches ``parse_detail`` or ``err``. Outcomes noted by the author:

    1. Filtered before being sent (domain not in ``allowed_domains``):
       ``https://www.baidu.com/``, ``http://open.freebuf.com/live?id=1021``,
       ``http://10.1.75.241`` (returns 200 once IP sites are allowed).
    2. Refused: ``http://www.freebuf.com/articles/database/151839.html`` (403,
       resource exists) and ``.../database/1518391.html`` (404, resource does
       not exist); Scrapy logs "Ignoring non-200 response".
    3. Redirected: ``http://www.freebuf.com/news/156654.html`` (301/302); the
       redirect target is fetched as a new request and 200 is reported.
    4. Connection cut (site blocked by the company network):
       ``https://www.douban.com/group/topic/49606658/`` fails with
       ``twisted.internet.error.ConnectionLost``.
    5. Nonexistent site: ``https://www.badfsdsdfsdfsdfsdddd.com/`` is filtered,
       or, if not filtered, fails with "DNS lookup failed".
    """

    name = 'freebuf2'
    allowed_domains = ['freebuf.com', 'douban.com']
    start_urls = ['http://www.freebuf.com/page/708']

    # When set to a URL string, it replaces the link of the first article on
    # every list page. Leave as None to crawl the real link.
    first_url_override = None

    def parse(self, response):
        """Parse one list page: request each article, then the next list page.

        :param response: response for a ``/page/<n>`` list URL.
        :return: generator of ``scrapy.Request`` objects.
        """
        cur_url = response.url  # URL of the current list page
        # Page number is the last path segment; tolerate a trailing slash.
        cur_page_num = int(cur_url.rstrip('/').rpartition('/')[-1])

        print('cur_url：%s' % cur_url)
        print('cur_page_num：%s' % cur_page_num)

        # One <a> node per article on the list page.
        node_list = response.xpath('//*[@id="timeline"]/div/div[2]/dl/dt/a[1]')
        count_node = len(node_list)  # number of articles on this list page
        print('len(node_list)：%s' % count_node)

        for i, node in enumerate(node_list):
            if i == 0 and self.first_url_override:
                # Experiment hook: probe Scrapy's handling of an arbitrary URL.
                title_url = self.first_url_override
            else:
                href = node.xpath('./@href').extract_first()
                if not href:
                    # Anchor without a link: nothing to request.
                    continue
                # Resolve relative links against the list page URL.
                title_url = response.urljoin(href)

            temp = ScrapyFb2Item()
            # extract_first avoids an IndexError on anchors without text.
            temp['title'] = node.xpath('./text()').extract_first(default='').strip()
            temp['title_url'] = title_url
            temp['page_num'] = str(cur_page_num)
            temp['line_num'] = i + 1
            temp['line_total'] = str(count_node)
            yield scrapy.Request(
                title_url,
                callback=self.parse_detail,
                meta={"meta_1": temp},
                errback=self.err,
            )

        # Keep paging until a list page comes back with no articles.
        if count_node:
            next_url = 'http://www.freebuf.com/page/{}'.format(cur_page_num + 1)
            yield scrapy.Request(next_url, callback=self.parse)

    def parse_detail(self, response):
        """Print the intended URL and the URL actually requested for an article.

        :param response: article response; ``meta['meta_1']`` holds the item
            built in ``parse``.
        """
        item = response.meta['meta_1']

        print(item['line_num'], item['title_url'])
        print(item['line_num'], response.request.url)

    def err(self, response):
        """Errback for article requests: print which request failed and why.

        :param response: despite its name, this is the ``twisted`` ``Failure``
            Scrapy passes to errbacks, not a response object. The name is kept
            so the method signature stays unchanged.
        """
        print('err:',response.request.url)
        print('err:',response.getErrorMessage())
        # Debug aid: list what the Failure object exposes.
        print(dir(response))

posted @ 2018-02-01 20:10 安迪9468 阅读(2063) 评论(0) 收藏举报

刷新页面返回顶部

requests获取所有状态码

公告