Python 爬虫遇到的问题小记

一、跟随重定向获取真实 URL

import requests
# Request a Baidu redirect link; requests follows redirects by default, so
# `rs.url` ends up holding the final (real) URL.
# NOTE(review): verify=False turns off TLS certificate verification --
# confirm this is intended before reusing the snippet.
rs = requests.get("http://www.baidu.com/link?url=49Ps-KJfS3X63hh9xC1lPZ-GqmGKb2SNFRE-I2mCr79JQ1gmZfDbTq9wv-lVp5eL",verify=False,timeout=10)
# Parenthesized form prints the same value on both Python 2 and Python 3.
print(rs.url)
# Browser-like request headers, sent by get_real_url with every request.
http_headers = { 'Accept': '*/*','Connection': 'keep-alive', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'}

重试机制(最多尝试 3 次,全部失败则返回原始 url):

def get_real_url(url, try_count=1):
    """Resolve *url* to the final URL reached after following redirects.

    Makes up to three attempts in total, counting from ``try_count``. An
    attempt fails when the request raises a ``requests`` exception or the
    response carries an HTTP error status (>= 400).

    Args:
        url: The URL to resolve.
        try_count: 1-based number of the attempt to start from; callers
            normally leave this at the default.

    Returns:
        ``rs.url`` of the first successful response, or the original
        ``url`` unchanged once all attempts are used up.
    """
    max_tries = 3
    for _ in range(try_count, max_tries + 1):
        try:
            rs = requests.get(url, headers=http_headers, timeout=10)
        except requests.RequestException:
            # Request-level failure (timeout, connection error, ...): retry.
            # Narrower than a bare ``except`` so that KeyboardInterrupt and
            # SystemExit still propagate to the caller.
            continue
        if rs.status_code >= 400:
            # 4xx/5xx, including 400 itself, counts as a failed attempt.
            continue
        return rs.url
    # Attempts exhausted: fall back to the URL we were given.
    return url

 

posted @ 2017-12-06 14:05  枫海坡  阅读(434)  评论(0)    收藏  举报