一、关于数据存储的常用方法封装
1.存入csv
class SaveCsv:
def __init__(self, title_list=None, save_path=None):
"""初始化表头"""
import csv
self.path = save_path
self.title_list = title_list
if self.title_list:
self.write_title(self.title_list)
def write_title(self, title_list):
""" 创建表头"""
(代码清单在此处被截断:行号栏显示原清单共 29 行,此处仅保留了前 10 行。)
2.存入Mysql
class SaveMysql:
def __init__(self, h='localhost', u='root', p=None, db=None):
"""1.连接数据库和定义游标"""
self.db = pymysql.connect(host=h, user=u, password=p, database=db,
cursorclass=pymysql.cursors.DictCursor)
self.cursor = self.db.cursor()
def select(self, sql):
"""2.查询操作"""
self.cursor.execute(sql)
(代码清单在此处被截断:行号栏显示原清单共 23 行,此处仅保留了前 9 行。)
3.存入sqlite
class SaveSqlite:
def __init__(self, db_name=None):
"""链接库,创建游标"""
db_name = db_name if db_name else "DEMO.db"
self.con = sqlite3.connect(db_name)
self.cur = self.con.cursor()
def create_table(self, sql):
""" 创建数据表"""
try:
sql = "CREATE TABLE IF NOT EXISTS test(id INTEGER PRIMARY KEY,name TEXT,age INTEGER)"
(代码清单在此处被截断:行号栏显示原清单共 36 行,此处仅保留了前 10 行。)
二、读取数据时的一些方法封装
1.读取Excel
class ReadExcel:
def __init__(self, path=''):
"""打开Excel文件读取数据"""
import xlrd
self.xl = xlrd.open_workbook(path)
def get_sheetinfo_by_name(self, name):
"""通过sheet_name获取一个工作表"""
self.sheet = self.xl.sheet_by_name(name)
return self.get_sheet_info()
def get_sheet_info(self):
"""读取工作表中数据"""
(代码清单在此处被截断:行号栏显示原清单共 20 行,此处仅保留了前 11 行。)
三、爬虫过程中的一些方法封装
1.随机获取user-agent的方法封装
import random
# Pool of desktop browser User-Agent strings to rotate through.
# Kept at module level so the collection is built once, not on every call.
USER_AGENT_LIST = (
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
)


def get_random_ua():
    """Return one User-Agent string chosen at random from USER_AGENT_LIST.

    Returns:
        str: a browser User-Agent header value.
    """
    return random.choice(USER_AGENT_LIST)


# Backward-compatible alias: the Proxy helper in this file builds its request
# headers with get_user_agent(), a name that is not defined anywhere else here.
get_user_agent = get_random_ua
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
2.代理云免费代理IP的方法封装
import requests
from lxml import etree
class Proxy:
def __init__(self):
self.headers = {
'User-Agent': get_user_agent(),
}
print('======稍等几秒!正在获取一批代理IP=====')
(代码清单在此处被截断:行号栏显示原清单共 56 行,此处仅保留了前 8 行。)
3.给从浏览器直接复制过来的请求头加双引号
def change_headers(s):
    """Turn raw ``Name: value`` header lines into quoted dict-item lines.

    Every non-empty line of *s* is rewritten as ``"Name": "value",`` so that
    request headers copied straight from a browser can be pasted into a
    Python dict literal. The trailing comma after the last item is harmless
    inside a dict literal. Blank lines and line breaks are left untouched.

    Args:
        s: Header text, one ``Name: value`` pair per line.

    Returns:
        str: the text with each non-empty line converted to a quoted item.
    """
    import re

    def _escape(text):
        # Backslashes and double quotes must be escaped, otherwise a value
        # such as  "Chromium";v="88"  produces an invalid string literal.
        return text.replace('\\', '\\\\').replace('"', '\\"')

    def _as_item(match):
        return '"{}": "{}",'.format(_escape(match.group(1)),
                                    _escape(match.group(2)))

    # The optional separator after the colon is [ \t]?, not \s?: \s also
    # matches '\n', which made a line without a colon swallow the whole next
    # line as its value. re.M lets ^ and $ anchor at every line.
    return re.sub(r'^([^:\n]+):?[ \t]?(.*)$', _as_item, s, flags=re.M)
if __name__ == '__main__':
    # Demo: convert a block of headers copied from a browser and print the
    # quoted result. The lines inside the literal stay at column 0 because
    # they are string content, not code.
    # NOTE(review): the sample Cookie looks like a captured login session;
    # consider replacing it with placeholder values before publishing.
    s = '''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Cookie: SINAGLOBAL=7711055592191.014.1598584986158; _s_tentry=www.baidu.com; Apache=8858699216407.67.1605851743484; ULV=1605851743593:5:3:2:8858699216407.67.1605851743484:1605260335550; login_sid_t=a940c0ecce7edd058d9b6925adaa8ac7; cross_origin_proto=SSL; WBStorage=8daec78e6a891122|undefined; WBtopGlobal_register_version=2020120116; crossidccode=CODE-yf-1KK0U9-TFqHe-VvEt7P4wOQxtqQ512159c; UOR=,,graph.qq.com; appkey=; ALF=1638346907; SSOLoginState=1606810907; SUB=_2A25ywY1ODeRhGeNP7VsW9CzNyDiIHXVRtvmGrDV8PUNbmtAKLWWlkW9NTnvmvAEcJljlBInv5ztVocS0FPV0i1Ms; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF2dYvgeviQkaTo-eFXhMHM5JpX5KzhUgL.Fo-pSo.NShzpe0B2dJLoIceLxKBLBonL12BLxKqL1hBLB.qLxK.L1h5L12qLxK.L1-eLBozLxK-LBKML1KeLxK-L1K5L12BLxKML1-2L1hBLxK-L12qLB-qLxKML1hnLBo2LxKMLB.BL1K2LxK-L12qL12zt; wvr=6
Host: s.weibo.com
Referer: https://s.weibo.com/weibo/%25E9%25B2%258D%25E6%25AF%2593%25E6%2598%258E%25E6%25B6%2589%25E5%25AB%258C%25E6%2580%25A7%25E4%25BE%25B5%25E5%2585%25BB%25E5%25A5%25B3?topnav=1&wvr=6&b=1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'''
    print(change_headers(s))
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
![在这里插入图片描述]()
四、其他
1.打包命令
pyinstaller -F C:\Users\Admin\Desktop\其他代码\demo.py
https://www.researchgate.net/project/yashangzhengxingyiyuanzenmeyang
https://www.researchgate.net/project/naRhei7549
https://www.researchgate.net/project/shoudawoershoufgdhfgh
https://www.researchgate.net/project/zhengxingyiyuanasdasd
https://www.researchgate.net/project/shoudashoudawoer