Python: reusable wrappers for common crawler tasks, such as data storage, proxies, reading spreadsheets, and more


I. Common wrappers for data storage

1. Saving to CSV

import csv


class SaveCsv:
    def __init__(self, title_list=None, save_path=None):
        """Initialize with the output path and an optional header row."""
        self.path = save_path
        self.title_list = title_list
        if self.title_list:
            self.write_title(self.title_list)

    def write_title(self, title_list):
        """Create the header row."""
        # e.g. title_list = ['email', 'site', 'name', 'Amazon', 'Facebook', 'url']
        try:
            with open(self.path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow(title_list)
            return 'Header created successfully!'
        except OSError:
            return 'Failed to create header!'

    def write_row(self, content_list):
        """Append one row (a list of values) to the CSV file."""
        try:
            with open(self.path, 'a+', newline='', encoding='utf-8') as file:
                writer = csv.writer(file, delimiter=',')
                writer.writerow(content_list)
            print('>>> Row written successfully!')
        except OSError as e:
            print('>>> Failed to write row: {}'.format(e))
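
A minimal usage sketch; the file path and column names below are made up for illustration:

saver = SaveCsv(title_list=['name', 'url'], save_path='result.csv')  # hypothetical path and columns
saver.write_row(['example', 'https://example.com'])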

2. Saving to MySQL

import pymysql


class SaveMysql:
    def __init__(self, h='localhost', u='root', p=None, db=None):
        """1. Connect to the database and create a cursor."""
        self.db = pymysql.connect(host=h, user=u, password=p, database=db,
                                  cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.db.cursor()

    def select(self, sql):
        """2. Query operation."""
        self.cursor.execute(sql)  # execute the SQL statement
        return self.cursor.fetchall()  # fetch all result rows

    def change(self, sql):
        """3. Insert / delete / update operation."""
        self.cursor.execute(sql)
        self.db.commit()  # commit the transaction
        print('Operation succeeded!')
        return self.cursor.rowcount  # number of affected rows

    def __del__(self):
        """4. Close the connection."""
        self.cursor.close()
        self.db.close()
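
A quick usage sketch, assuming a local MySQL server; the password, database, table, and columns below are hypothetical:

db = SaveMysql(p='your_password', db='spider')  # hypothetical credentials and database
db.change("INSERT INTO items (name, url) VALUES ('example', 'https://example.com')")  # hypothetical table
print(db.select('SELECT name, url FROM items LIMIT 5'))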

3. Saving to SQLite

import sqlite3


class SaveSqlite:
    def __init__(self, db_name=None):
        """Connect to the database and create a cursor."""
        db_name = db_name if db_name else "DEMO.db"
        self.con = sqlite3.connect(db_name)
        self.cur = self.con.cursor()

    def create_table(self, sql=None):
        """Create a table; falls back to a demo schema when no SQL is given."""
        try:
            if not sql:
                sql = "CREATE TABLE IF NOT EXISTS test(id INTEGER PRIMARY KEY, name TEXT, age INTEGER)"
            self.cur.execute(sql)
            return "Table created successfully!"
        except sqlite3.Error:
            return 'Failed to create table!'

    def change_data(self, sql):
        """Insert / delete / update data."""
        try:
            # e.g. sql = "INSERT INTO test VALUES (1, 'Desire', 5)"
            self.cur.execute(sql)
            self.con.commit()
            return "Data changed successfully!"
        except sqlite3.Error:
            return 'Failed to change data!'

    def select_data(self, sql, num):
        """Query up to num rows."""
        self.cur.execute(sql)
        return self.cur.fetchmany(num)

    def __del__(self):
        """Close the cursor and disconnect from the database."""
        self.cur.close()
        self.con.close()
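
A minimal usage sketch against the demo schema above; the database file name is arbitrary:

store = SaveSqlite('DEMO.db')
print(store.create_table())  # creates the built-in demo table
print(store.change_data("INSERT OR REPLACE INTO test VALUES (1, 'Desire', 5)"))
print(store.select_data('SELECT * FROM test', 10))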

II. Wrappers for reading data

1. Reading Excel

import xlrd


class ReadExcel:
    def __init__(self, path=''):
        """Open an Excel workbook for reading."""
        self.xl = xlrd.open_workbook(path)

    def get_sheetinfo_by_name(self, name):
        """Select a worksheet by its sheet name and read it."""
        self.sheet = self.xl.sheet_by_name(name)
        return self.get_sheet_info()

    def get_sheet_info(self):
        """Read all rows from the current worksheet."""
        # list that holds the rows read from the sheet
        infolist = []
        for row in range(0, self.sheet.nrows):
            # self.sheet.nrows is the number of rows in the sheet
            info = self.sheet.row_values(row)
            infolist.append(info)
        return infolist
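
A short usage sketch with hypothetical file and sheet names. Note that xlrd 2.x only reads legacy .xls files; for .xlsx workbooks openpyxl is the usual choice:

reader = ReadExcel('data.xls')  # hypothetical file name
for row in reader.get_sheetinfo_by_name('Sheet1'):
    print(row)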

III. Wrappers for the crawling process

1. Getting a random User-Agent

import random


def get_random_ua():
    """ 随机获取useragent"""
    USER_AGENT_LIST = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)'
    ]
    return random.choice(USER_AGENT_LIST)
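
A sketch of plugging the random User-Agent into a requests call; the URL is only illustrative:

import requests

headers = {'User-Agent': get_random_ua()}
response = requests.get('https://httpbin.org/user-agent', headers=headers)  # illustrative URL
print(response.text)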

2. Fetching free proxy IPs from a free proxy site (ip3366.net)

import random

import requests
from lxml import etree


class Proxy:
    def __init__(self):
        self.headers = {
            # get_random_ua() is the helper defined in section III.1 above
            'User-Agent': get_random_ua(),
        }
        print('====== Please wait a few seconds, fetching a batch of proxy IPs ======')
        # 1. Scrape the proxy listings
        self.proxies_list = self.get_ip_list()
        # 2. Validate the proxies
        self.use_can_ip_list = self.check_ip(self.proxies_list)

    def get_ip_list(self):
        """Fetch proxy IPs from the listing pages."""
        base_url = 'http://www.ip3366.net/?stype=1&page={}'
        # list that collects the proxy dicts
        proxies_list = []
        for page in range(1, 3):
            # fetch one page of the proxy listing site
            response = requests.get(base_url.format(page), headers=self.headers)
            response.encoding = 'utf-8'
            html_ele = etree.HTML(response.text)
            # extract the fields
            IP_num = html_ele.xpath('//div[@id="list"]//tr/td[1]/text()')   # IP address
            IP_port = html_ele.xpath('//div[@id="list"]//tr/td[2]/text()')  # port
            IP_type = html_ele.xpath('//div[@id="list"]//tr/td[4]/text()')  # protocol type
            # join IP and port; requests expects lowercase scheme keys ('http' / 'https')
            for i in range(len(IP_num)):
                proxy_dict = {IP_type[i].lower(): IP_num[i] + ':' + IP_port[i]}
                proxies_list.append(proxy_dict)
        return proxies_list

    def check_ip(self, proxies_list):
        """Validate the proxies."""
        # A proxy that errors out or takes longer than 3 seconds is treated as unusable.
        can_use_ip_list = []
        # keep only the proxies that actually work
        for proxy in proxies_list:
            try:
                response = requests.get('https://www.baidu.com/', proxies=proxy,
                                        headers=self.headers, timeout=3)
                if response.status_code != 200:
                    continue
            except Exception:
                # print('Proxy {} is unusable'.format(proxy))
                continue
            can_use_ip_list.append(proxy)
        return can_use_ip_list

    def get_random_proxy(self):
        """Return a random proxy from the validated list."""
        return random.choice(self.use_can_ip_list)
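
A usage sketch, assuming the listing site is reachable and at least one proxy passes validation (otherwise get_random_proxy would raise on an empty list); the target URL is only illustrative:

pool = Proxy()
print('Usable proxies:', pool.use_can_ip_list)
resp = requests.get('https://httpbin.org/ip',                      # illustrative URL
                    proxies=pool.get_random_proxy(), timeout=5)
print(resp.text)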

3. Adding quotes to request headers copied straight from the browser

import re


def change_headers(s):
    """
    Quote headers copied straight from the browser so they can be pasted into a Python dict.
    re.M lets ^ and $ match each line; the replacement template must not contain
    metacharacters such as \s; a trailing comma after the last item is harmless.
    """
    s2 = re.sub(r'^([^:\n]+):?\s?(.*)$', r'"\1": "\2",', s, flags=re.M)
    return s2


if __name__ == '__main__':
    s = '''
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate, br
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Cookie: SINAGLOBAL=7711055592191.014.1598584986158; _s_tentry=www.baidu.com; Apache=8858699216407.67.1605851743484; ULV=1605851743593:5:3:2:8858699216407.67.1605851743484:1605260335550; login_sid_t=a940c0ecce7edd058d9b6925adaa8ac7; cross_origin_proto=SSL; WBStorage=8daec78e6a891122|undefined; WBtopGlobal_register_version=2020120116; crossidccode=CODE-yf-1KK0U9-TFqHe-VvEt7P4wOQxtqQ512159c; UOR=,,graph.qq.com; appkey=; ALF=1638346907; SSOLoginState=1606810907; SUB=_2A25ywY1ODeRhGeNP7VsW9CzNyDiIHXVRtvmGrDV8PUNbmtAKLWWlkW9NTnvmvAEcJljlBInv5ztVocS0FPV0i1Ms; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF2dYvgeviQkaTo-eFXhMHM5JpX5KzhUgL.Fo-pSo.NShzpe0B2dJLoIceLxKBLBonL12BLxKqL1hBLB.qLxK.L1h5L12qLxK.L1-eLBozLxK-LBKML1KeLxK-L1K5L12BLxKML1-2L1hBLxK-L12qLB-qLxKML1hnLBo2LxKMLB.BL1K2LxK-L12qL12zt; wvr=6
Host: s.weibo.com
Referer: https://s.weibo.com/weibo/%25E9%25B2%258D%25E6%25AF%2593%25E6%2598%258E%25E6%25B6%2589%25E5%25AB%258C%25E6%2580%25A7%25E4%25BE%25B5%25E5%2585%25BB%25E5%25A5%25B3?topnav=1&wvr=6&b=1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4208.400'''
    print(change_headers(s))


IV. Miscellaneous

1. Packaging command

pyinstaller -F C:\Users\Admin\Desktop\其他代码\demo.py
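
A few other commonly used PyInstaller options, shown on the same script for reference; the icon file and executable name below are illustrative:

pyinstaller -F -w -i app.ico -n demo C:\Users\Admin\Desktop\其他代码\demo.py
# -F  bundle everything into a single executable
# -w  do not open a console window (useful for GUI programs)
# -i  set the icon of the executable
# -n  name of the generated executable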
