def request(url, data=None, get_or_post=None, timeout=10):
    """Fetch ``url`` and return the response body as correctly decoded text.

    Args:
        url: Target URL.
        data: Optional payload. For POST it is sent as the request body; for
            GET it is url-encoded and appended to ``url`` as the query string.
        get_or_post: Truthy selects POST; falsy (the default) selects GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled server.

    Returns:
        The decoded body as ``str`` when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    # NOTE: `headers` is not defined in this function; it is expected to be a
    # module-level dict defined elsewhere in this file.
    try:
        if get_or_post:
            response = requests.post(
                url=url, data=data, headers=headers, timeout=timeout
            )
        else:
            if data:
                # Pick the right separator so the URL is valid whether or not
                # the caller already ended it with '?' (or '&').
                if url.endswith(('?', '&')):
                    separator = ''
                elif '?' in url:
                    separator = '&'
                else:
                    separator = '?'
                url = url + separator + urlencode(data)
            response = requests.get(url=url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求' + url + '出错')
        return None

    if response.status_code != 200:
        return None

    # Why response.text is not returned directly:
    #
    # requests derives `response.encoding` from the Content-Type header only.
    # If the header carries a charset, that charset is used; if it carries no
    # charset but the type contains "text", requests falls back to
    # ISO-8859-1; if there is no Content-Type at all, the encoding is None.
    # The search page answers with a bare "text/html" (no charset), so
    # `response.encoding` becomes ISO-8859-1 although the body bytes are
    # really UTF-8, and `response.text` comes out garbled.
    #
    # `response.apparent_encoding` is the encoding detected from the body
    # bytes themselves (utf-8 for the search page), so the raw bytes are
    # decoded with it. This yields the same text as the older
    # `response.text.encode(response.encoding).decode(apparent_encoding)`
    # round-trip, but does not blow up when `response.encoding` is None or
    # when the round-trip is not lossless.
    encoding = response.apparent_encoding or response.encoding or 'utf-8'
    try:
        return response.content.decode(encoding, errors='replace')
    except LookupError:
        # The detected/declared codec name is unknown to Python.
        return response.content.decode('utf-8', errors='replace')
def search(keyword, page):
    """Request one page of search results for ``keyword``.

    Builds the query parameters (keyword, a fixed ``enc`` of utf-8, and the
    page number) and hands them to ``request`` together with the search URL.

    Returns:
        Whatever ``request`` returns for that URL and query.
    """
    query = dict(keyword=keyword, enc="utf-8", page=page)
    return request("https://search.jd.com/Search?", query)
# Fetch page 2 of the search results for the keyword '显卡' and keep the
# result in the module-level name `html`.
html = search(keyword='显卡', page=2)