1. Sending requests with requests
import requests

# url and data below stand for values defined elsewhere; these are reference snippets, not a full script
s = requests.Session()  # a Session reuses the TCP connection and persists cookies across requests
payload = {'key1': 'value1', 'key2': 'value2'}
proxies = {'http': 'http://47.98.163.18:8080', 'https': 'http://47.98.163.18:8080'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
requests.get(url, headers=headers, verify=False, params=payload, allow_redirects=False, proxies=proxies).content.decode('utf-8')
# headers          request headers
# data             POST body
# verify           SSL certificate verification
# allow_redirects  whether to follow redirects
# proxies          proxy settings
requests.post(url, headers=headers, data=data, verify=False, allow_redirects=False).content.decode('utf-8')
resp = requests.post(url, headers=headers, data=data, verify=False)
# Read the response cookies as a plain dict
requests.utils.dict_from_cookiejar(resp.cookies)
requests.packages.urllib3.disable_warnings()  # suppress the InsecureRequestWarning raised by verify=False
import urllib3
urllib3.disable_warnings()
requests.get('https://github.com', timeout=2)  # a single timeout value is used for both connect and read
requests.get('https://github.com', timeout=(3.05, 27))  # to set them separately, pass a (connect, read) tuple
requests.get('https://github.com', timeout=None)  # None makes the request wait forever
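A timeout raises an exception instead of returning a response, so real code usually wraps the call. A minimal sketch of handling it:

import requests

try:
    response = requests.get('https://github.com', timeout=(3.05, 27))
    response.raise_for_status()  # raise an HTTPError for 4xx/5xx status codes
except requests.exceptions.Timeout:
    print('connect or read timed out')
except requests.exceptions.RequestException as e:
    print('request failed:', e)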
2. requests responses: common attributes
import requests
kw = {'wd': '长城'}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}
# params accepts a dict or a string of query parameters; a dict is URL-encoded automatically, no urlencode() needed
response = requests.get("http://www.baidu.com/s?", params=kw, headers=headers)
# response.text returns the body decoded to Unicode
print(response.text)
# response.content returns the raw bytes
print(response.content)
# the final, fully resolved URL
print(response.url)
# the character encoding used to decode response.text
print(response.encoding)
# the HTTP status code
print(response.status_code)
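When the response body is JSON, response.json() parses it directly. A small sketch against the httpbin test service:

import requests

response = requests.get('https://httpbin.org/get', params={'q': 'test'})
if response.status_code == 200:
    data = response.json()  # decodes the JSON body; raises ValueError if it is not valid JSON
    print(data)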
3. POSTing a JSON request payload from a Python crawler
import requests
import json
payloadHeader = {
    'Host': 'sellercentral.amazon.com',
    'Content-Type': 'application/json',
}
# postUrl and payloadData are assumed to be defined elsewhere
requests.post(postUrl, data=json.dumps(payloadData), headers=payloadHeader)
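requests can also serialize the payload itself: passing json= sets the Content-Type to application/json automatically, so the call above is equivalent to this sketch:

requests.post(postUrl, json=payloadData, headers={'Host': 'sellercentral.amazon.com'})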
4. Storing a dict in a database (table columns must match the dict keys)
import pymysql


class MogujiePipeline(object):
    def __init__(self):
        # Open the database connection
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        # self.db = pymysql.connect(host='rm-bp195i4u0w1066u709o.mysql.rds.aliyuncs.com', port=3306, database='spider58',
        #                           user='spider58',
        #                           password='58spider@123',
        #                           charset='utf8')
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Skip rows whose clientUrl is already in the database
        print('checking clientUrl:', item["clientUrl"])
        num = self.cursor.execute('select id from mogujie where clientUrl=%s', (item["clientUrl"],))
        if not num:
            list_keys = []
            list_values = []
            for key, value in item.items():
                list_keys.append(key)
                list_values.append(str(value))
            # Build the SQL with %s placeholders so pymysql escapes the values safely
            insert_sql = 'insert into mogujie({}) values({})'.format(', '.join(list_keys),
                                                                     ', '.join(['%s'] * len(list_values)))
            print('insert_sql:', insert_sql)
            self.cursor.execute(insert_sql, list_values)
            self.db.commit()
        return item

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.db.close()
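For the pipeline to run it must be enabled in the Scrapy settings. A sketch, assuming the project package is named mogujie (adjust the dotted path to your project):

# settings.py
ITEM_PIPELINES = {
    'mogujie.pipelines.MogujiePipeline': 300,  # lower numbers run earlier (range 0-1000)
}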
5. Crawling JSON data
import requests
import json
import pymysql
import logging
logging.basicConfig(
    level=logging.INFO,  # log level written to the file; messages at or above this level are kept
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append


class yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        data_json = json.loads(html)
        for row in data_json:
            # The column names and values are left blank in the original; fill them in for your table
            num = self.cursor.execute('select id from catalogue where id={}'.format())
            if not num:
                # Insert a row
                self.cursor.execute(
                    'insert into catalogue() values()'.format())
                self.db.commit()
        # Query rows
        self.cursor.execute("select * from catalogue")
        data = self.cursor.fetchone()   # first row
        data = self.cursor.fetchall()   # remaining rows
        # Update rows
        self.cursor.execute("update catalogue set ''='{}', ''='{}' where id={}".format())
        self.db.commit()
        # Delete rows
        self.cursor.execute("delete from catalogue where id={}".format())
        self.db.commit()


if __name__ == '__main__':
    yibao()
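A filled-in version of the blank SQL templates above, using hypothetical id/name fields (the real names depend on the payload and table) and %s placeholders so pymysql escapes the values:

import pymysql
import requests

db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root',
                     password='root', charset='utf8')
cursor = db.cursor()
resp = requests.post('http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html',
                     data={'operationId': 'icdIds'})
for row in resp.json():  # .json() replaces content.decode('utf-8') + json.loads()
    # 'id' and 'name' are hypothetical keys/columns; match them to the real data
    if not cursor.execute('select id from catalogue where id=%s', (row['id'],)):
        cursor.execute('insert into catalogue(id, name) values(%s, %s)', (row['id'], row['name']))
        db.commit()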
6. HTML data (XPath with lxml)
import requests
import json
import time
import pymysql
import logging
import random
from lxml import etree
logging.basicConfig(
    level=logging.INFO,  # log level written to the file; messages at or above this level are kept
    format='%(asctime)s %(filename)s %(levelname)s : %(message)s',  # log line format
    datefmt='%Y-%m-%d %H:%M:%S',  # timestamp format
    filename='yibao.log',  # log file name
    filemode='a')  # write mode: 'w' to overwrite, 'a' to append


class yibao(object):
    def __init__(self):
        self.db = pymysql.connect(host='localhost', port=3306, database='cfda', user='root', password='root',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36", }
        self.url = 'http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html'
        self.parse_page()

    def parse_page(self):
        data = {
            'operationId': 'icdIds',
        }
        html = requests.post(url=self.url, headers=self.headers, data=data).content.decode('utf-8')
        etree_html = etree.HTML(html)
        data = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/a//text() | //div[@class="els-doc-con-left"]/a//text()')
        datas = etree_html.xpath(
            '//*[@id="classicont"]/div[@class="els-doc-h4"]/span//text() | //div[@class="els-doc-con-left"]/span//text()')
        for i in range(len(data)):
            # The column names and values are left blank in the original; fill them in for your table
            num = self.cursor.execute('select id from catalogue where id={}'.format())
            if not num:
                # Insert a row
                self.cursor.execute(
                    'insert into catalogue() values()'.format())
                self.db.commit()
        # Query rows
        self.cursor.execute("select * from catalogue")
        data = self.cursor.fetchone()   # first row
        data = self.cursor.fetchall()   # remaining rows
        # Update rows
        self.cursor.execute("update catalogue set ''='{}', ''='{}' where id={}".format())
        self.db.commit()
        # Delete rows
        self.cursor.execute("delete from catalogue where id={}".format())
        self.db.commit()


if __name__ == '__main__':
    yibao()
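When two XPath queries return parallel lists, zip() pairs the items directly instead of indexing with range(len(...)). A sketch:

import requests
from lxml import etree

html = requests.post('http://code.nhsa.gov.cn:8000/jbzd/public/toStdOperationTreeList.html',
                     data={'operationId': 'icdIds'}).content.decode('utf-8')
etree_html = etree.HTML(html)
names = etree_html.xpath('//div[@class="els-doc-con-left"]/a//text()')
codes = etree_html.xpath('//div[@class="els-doc-con-left"]/span//text()')
for name, code in zip(names, codes):  # zip pairs the two parallel result lists
    print(code.strip(), name.strip())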
7. Using proxies
proxies = {
    "http": "http://ip:port",
    "https": "https://ip:port",
}
requests.get(url, proxies=proxies)
# Proxies that require authentication take the credentials in the URL:
proxies = {
    "http": "http://username:password@ip:port",
    "https": "https://username:password@ip:port",
}
requests.get(url, proxies=proxies)
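requests also honors the standard proxy environment variables, so a proxy can be configured without touching the code. A sketch (the addresses are placeholders):

import os
import requests

os.environ['HTTP_PROXY'] = 'http://10.10.1.10:3128'   # placeholder proxy address
os.environ['HTTPS_PROXY'] = 'http://10.10.1.10:1080'  # placeholder proxy address
requests.get('http://example.com')  # picked up automatically when no proxies= is passed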
8. A simple requests wrapper (class version)
import json
import os
import time
import traceback
from urllib.parse import unquote
import requests
class yoyo_requests(object):
    def __init__(self):
        # Logger here is a project-specific logging helper, not part of requests
        Logger.config(level="debug", processname=os.path.splitext(os.path.basename(__file__))[0])
        self.logger = Logger
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}

    def reque_url(self, url: str, method="get", data={}, params={}, headers={}):
        '''
        :param url: URL to request
        :param method: request method ("get", "json", or form POST)
        :param data: POST body
        :param params: query parameters
        :param headers: extra request headers
        :return: response
        '''
        # Fallback response object returned when the request never succeeds
        class yo_response(object):
            status_code = 500
            apparent_encoding = "utf-8"
            encoding = "utf-8"
            content = b""
            text = ""
            headers = {}
            url = ''
        response = yo_response()
        if not url:
            return response
        # Use the previously requested URL as the Referer, falling back to the current one
        if not hasattr(self, "Referer"):
            self.Referer = url
        self.headers["Referer"] = self.Referer
        headers = {**headers, **self.headers}
        for i in range(2):  # retry once on failure
            self.logger.info("Requesting URL: {}".format(url))
            try:
                time.sleep(0.01)
                # Normalize percent-encoded payloads; "%7B%7D" is the encoded form of "{}"
                if isinstance(data, str) and '=' in data and method != "get":
                    data = data.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                if isinstance(data, dict):
                    for key, value in data.items():
                        if isinstance(value, str):
                            if "%7B%7D" in value:
                                value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                            data[key] = unquote(value)
                        if isinstance(value, list) or isinstance(value, dict):
                            if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                                data[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
                # A query string passed as "a=1&b=2" is converted to a dict
                if isinstance(params, str) and '=' in params and method != "get":
                    params = dict([j.split('=') for j in params.split('&')])
                if isinstance(params, dict):
                    for key, value in params.items():
                        if isinstance(value, str):
                            if "%7B%7D" in value:
                                value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                            params[key] = unquote(value)
                        if isinstance(value, list) or isinstance(value, dict):
                            if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                                params[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
                if data and method != "get":
                    self.logger.info("Request data: {}".format(data))
                if params:
                    self.logger.info("Request params: {}".format(params))
                # get_proxies() is a project helper; proxies are disabled here by the override
                proxies = get_proxies()
                proxies = {}
                url = unquote(url)
                if method == "get":
                    response = requests.get(url=url, verify=False, timeout=60, params=params, headers=headers, proxies=proxies)
                elif method == "json":
                    headers['Content-Type'] = "application/json;charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, json=data, proxies=proxies)
                else:
                    headers['Content-Type'] = "application/x-www-form-urlencoded; charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, data=data, proxies=proxies)
                status_code = response.status_code
                self.logger.info("status_code:{}".format(status_code))
                # Keep this URL as the Referer for the next request; latin1 keeps non-ASCII header-safe
                self.Referer = url.encode("utf-8").decode("latin1")
                if status_code == 200:
                    # Fall back to the detected encoding when the declared one is missing or not UTF-8
                    if not response.encoding or "utf" not in response.encoding.lower():
                        response.encoding = response.apparent_encoding
                    # ISO-8859-* / Windows-* guesses are usually wrong for Chinese pages; force UTF-8
                    if not response.encoding or "iso" in response.encoding.lower() or "indows" in response.encoding.lower():
                        response.encoding = "utf-8"
                    break
            except Exception as e:
                traceback.print_exc()
                self.logger.error("Request failed: {}-{}".format(url, e)[:1023])
        return response


if __name__ == "__main__":
    yoyo = yoyo_requests()
    response = yoyo.reque_url("http://jypt.bzggzyjy.cn/bzweb/jyxx/012001/list1.html")
    print(response)
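The manual retry loop above can also be delegated to urllib3's Retry through an HTTPAdapter mounted on a Session. A sketch of that alternative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(total=2, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)   # applies to every http:// URL
session.mount('https://', adapter)  # applies to every https:// URL
response = session.get('http://jypt.bzggzyjy.cn/bzweb/jyxx/012001/list1.html', timeout=60)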
9. A simple requests wrapper (function version)
import json
import os
import time
import traceback
from urllib.parse import unquote
import requests
# Logger here is a project-specific logging helper, not part of requests
Logger.config(level="debug", processname=os.path.splitext(os.path.basename(__file__))[0])


def reque_url(url: str, method="get", data={}, params={}, headers={}):
    '''
    :param url: URL to request
    :param method: request method ("get", "json", or form POST)
    :param data: POST body
    :param params: query parameters
    :param headers: extra request headers
    :return: response
    '''
    # Fallback response object returned when the request never succeeds
    class yo_response(object):
        status_code = 500
        apparent_encoding = "utf-8"
        encoding = "utf-8"
        content = b""
        text = ""
        headers = {}
        url = ''
    response = yo_response()
    if not url:
        return response
    # Referer is local to each call here, so this check always assigns; the class
    # version in section 8 is the variant that actually remembers the previous URL
    if 'Referer' not in locals() and 'Referer' not in globals():
        Referer = url
    headers = {**headers, **{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', "Referer": Referer}}
    for i in range(2):  # retry once on failure
        Logger.info("Requesting URL: {}".format(url))
        try:
            time.sleep(0.01)
            # Normalize percent-encoded payloads; "%7B%7D" is the encoded form of "{}"
            if isinstance(data, str) and '=' in data and method != "get":
                data = data.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
            if isinstance(data, dict):
                for key, value in data.items():
                    if isinstance(value, str):
                        if "%7B%7D" in value:
                            value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                        data[key] = unquote(value)
                    if isinstance(value, list) or isinstance(value, dict):
                        if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                            data[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
            # A query string passed as "a=1&b=2" is converted to a dict
            if isinstance(params, str) and '=' in params and method != "get":
                params = dict([j.split('=') for j in params.split('&')])
            if isinstance(params, dict):
                for key, value in params.items():
                    if isinstance(value, str):
                        if "%7B%7D" in value:
                            value = value.replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}")
                        params[key] = unquote(value)
                    if isinstance(value, list) or isinstance(value, dict):
                        if "%7B%7D" in json.dumps(value, ensure_ascii=False):
                            params[key] = json.loads(json.dumps(value, ensure_ascii=False).replace('"%7B%7D"', "{}").replace("'%7B%7D'", "{}"))
            if data and method != "get":
                Logger.info("Request data: {}".format(data))
            if params:
                Logger.info("Request params: {}".format(params))
            # get_proxies() is a project helper; proxies are disabled here by the override
            proxies = get_proxies()
            proxies = {}
            url = unquote(url)
            if method == "get":
                response = requests.get(url=url, verify=False, timeout=60, params=params, headers=headers, proxies=proxies)
            elif method == "json":
                headers['Content-Type'] = "application/json;charset=UTF-8"
                response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, json=data, proxies=proxies)
            else:
                headers['Content-Type'] = "application/x-www-form-urlencoded; charset=UTF-8"
                response = requests.post(url=url, verify=False, timeout=60, params=params, headers=headers, data=data, proxies=proxies)
            status_code = response.status_code
            Logger.info("status_code:{}".format(status_code))
            Referer = url.encode("utf-8").decode("latin1")
            if status_code == 200:
                # Fall back to the detected encoding when the declared one is missing or not UTF-8
                if not response.encoding or "utf" not in response.encoding.lower():
                    response.encoding = response.apparent_encoding
                # ISO-8859-* / Windows-* guesses are usually wrong for Chinese pages; force UTF-8
                if not response.encoding or "iso" in response.encoding.lower() or "indows" in response.encoding.lower():
                    response.encoding = "utf-8"
                break
        except Exception as e:
            traceback.print_exc()
            Logger.error("Request failed: {}-{}".format(url, e)[:1023])
    return response


if __name__ == "__main__":
    response = reque_url("http://jypt.bzggzyjy.cn/bzweb/jyxx/012001/list1.html")
    print(response)
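Both wrappers use mutable default arguments (data={}, params={}). Defaults are evaluated once per process, so keys written into them leak into later calls. The usual idiom, as a sketch, defaults to None instead:

def reque_url(url: str, method="get", data=None, params=None, headers=None):
    # fresh containers per call, so one request's data cannot leak into the next
    data = data if data is not None else {}
    params = params if params is not None else {}
    headers = headers if headers is not None else {}
    ...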
10. Getting cookies from a requests response
# image_url is assumed to be defined; reque_url is the wrapper from section 8
response = self.reque_url(url=image_url)
headers = {'Cookie': '; '.join([f'{key}={value}' for key, value in response.cookies.items()])}
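If the cookies only need to be carried over to later requests, a Session does that automatically, with no manual header building. A sketch (example.com stands in for a real site):

import requests

session = requests.Session()
session.get('https://example.com/login')  # cookies set by the server land in session.cookies
session.get('https://example.com/data')   # and are sent back automatically here
print(requests.utils.dict_from_cookiejar(session.cookies))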
11. Basic curl_cffi usage
import concurrent.futures
import random
import time
import traceback
import requests
from yscredit_tools.utils import get_proxies  # project-specific proxy helper
from city import city  # project-specific province/city/county tree
requests.packages.urllib3.disable_warnings()
# This import shadows the plain requests module above: from here on,
# requests.get/post go through curl_cffi and accept impersonate=
from curl_cffi import requests
class ZCQuery(object):
    def zc_query(self, ww):
        for i in range(1, 10000):
            index = 100
            data_list = []
            # Flatten the province/city/county tree into "province-city-county-page" strings
            for city1 in city:
                if not city1.get("children", ""):
                    data_list.append("{}-{}-{}-{}".format(city1["name"], "", "", random.randint(1, index)))
                else:
                    for city2 in city1["children"]:
                        if not city2.get("children", ""):
                            data_list.append("{}-{}-{}-{}".format(city1["name"], city2["name"], "", random.randint(1, index)))
                        else:
                            for city3 in city2["children"]:
                                data_list.append("{}-{}-{}-{}".format(city1["name"], city2["name"], city3["name"], random.randint(1, index)))
                                # self.get_info("{}-{}-{}-{}".format(city1["name"], city2["name"], city3["name"], random.randint(1, index)))
            with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
                future_to_url = executor.map(self.get_info, data_list)
                try:
                    print(future_to_url.__next__())  # pull the first result so any worker exception surfaces here
                except Exception as e:
                    print("Async execution error: {}".format(e))
    def get_info(self, data_):
        timestamp = int(time.time() * 1000)
        data = {
            "areaList": [
                {
                    "province": data_.split("-")[0],
                    "city": data_.split("-")[1],
                    "county": data_.split("-")[2]
                }
            ],
            "policyLevelList": [],
            "policyClassifyNewList": [],
            "policyPublishDeptNesList": [],
            "industryLabelList": [],
            "moneyType": None,
            "fuchiType": None,
            "publishDateFrom": "",
            "publishDateTo": "",
            "size": 20,
            "current": int(data_.split("-")[3]),
            "keywords": "",
            "policyType": None,
            "timestamp": timestamp
        }
        url = "https://aiqice.cn/data-api/mobile/query/policy?timestamp={}".format(timestamp)
        response = self.reque_url(url=url, method='json', data=data)
    def reque_url(self, url: str, method="get", data={}, params={}, headers={}):
        response = ""
        headers = {
            "sign": "5e043efdff7c5f7f06c1299dfe808c78b7efac73c3951cba65d20395cbfd3bfd",
            "token": "eyJhbGciOiJIUzI1NiJ9.eyJMT0dJTl9VU0VSX0tFWSI6IjQwNmE5ZmU0LWI1NWQtNDUxOC1hYmUzLTAwMWQ2ZWZiZTA3MiJ9.P1fvtXY4HHtAVzzU-8alFFliX_oiyTeqakn2DC6C9-A",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
        }
        cookies = {
            "acw_tc": "1a0acb4617423625536051182ea8cb04a7a93aecddedd9861a488e6c228f13",
            "Hm_lvt_5431ed3fb5a5e79364ba568e45ca8e86": "1742362555",
            "Hm_lpvt_5431ed3fb5a5e79364ba568e45ca8e86": "1742362555",
            "HMACCOUNT": "525C3484F6D91E35"
        }
        for i in range(2):  # retry once on failure
            try:
                time.sleep(0.01)
                proxies = get_proxies()
                # impersonate="chrome110" makes curl_cffi mimic Chrome's TLS fingerprint
                if method == "get":
                    response = requests.get(url=url, verify=False, timeout=1, params=params, headers=headers, cookies=cookies, proxies=proxies, impersonate="chrome110")
                elif method == "json":
                    headers['Content-Type'] = "application/json;charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=1, params=params, headers=headers, cookies=cookies, json=data, proxies=proxies, impersonate="chrome110")
                else:
                    headers['Content-Type'] = "application/x-www-form-urlencoded; charset=UTF-8"
                    response = requests.post(url=url, verify=False, timeout=1, params=params, headers=headers, cookies=cookies, data=data, proxies=proxies, impersonate="chrome110")
                status_code = response.status_code
                # if status_code == 401:
                #     self.get_token
                print("status_code:{}".format(status_code))
                if status_code == 200:
                    break
            except Exception as e:
                traceback.print_exc()
                print("Request failed: {}-{}".format(url, e)[:1023])
        return response
if __name__ == '__main__':
    ys = ZCQuery()
    # ys.zc_query("ww")
    while True:
        with concurrent.futures.ProcessPoolExecutor(max_workers=30) as executor:
            future_to_url = executor.map(ys.zc_query, [i for i in range(10000)])
            try:
                print(future_to_url.__next__())  # pull the first result so any worker exception surfaces here
            except Exception as e:
                print("Async execution error: {}".format(e))