A Scrapy use case
Crawling Tencent Comics information with Python
1. Tech stack
- Python
- Scrapy
- CrawlSpider
- PyCharm
2. Tencent Comics (腾讯动漫)
# Site
https://ac.qq.com/
# Listing page
https://ac.qq.com/Comic/all/search/hot/page/1
# The page reports 5410 results in total
- First collect the URLs of all pages in the paginator
- Collect the URL of every comic on each page
- Open each comic's detail page and extract the fields we need
- Persist the data
3. Create a Scrapy project
3.1 Create a virtual environment with PyCharm
3.2 Install the required packages
pip install pywin32
pip install scrapy
pip install scrapy-redis
3.3 Create the project
scrapy startproject <project name>
scrapy genspider -t crawl <spider name> <start url>
scrapy startproject comic
cd comic
scrapy genspider -t crawl cinfo https://ac.qq.com/Comic/all/page/1
# If the crawl template fails to generate the spider, create it without the template
scrapy genspider cinfo https://ac.qq.com/Comic/all/page/1
# Skip this if the crawl template worked
# cinfo.py (as generated)
import scrapy
class CinfoSpider(scrapy.Spider):
    name = 'cinfo'
    allowed_domains = ['ac.qq.com']
    start_urls = ['http://ac.qq.com/']
    def parse(self, response):
        pass
# Change it to:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider  # only needed for the distributed version in section 10
class CinfoSpider(CrawlSpider):
    name = 'cinfo'
    # allowed_domains = ['ac.qq.com']
    start_urls = ['https://ac.qq.com/Comic/all/page/1']
    # link extractor: matches comic detail pages
    link = LinkExtractor(allow=r"/Comic/comicInfo/id/\d+")
    rules = (
        # rule: request every extracted link and hand the response to parse_item
        Rule(link, callback="parse_item", follow=True),
    )
    def parse_item(self, response):
        pass
3.4 link and rules
CinfoSpider no longer inherits from scrapy.Spider but from CrawlSpider.
- link, the link extractor: `link = LinkExtractor(allow=r"/Comic/comicInfo/id/\d+")`. `allow` takes a regular expression. Opening a comic gives a detail URL such as https://ac.qq.com/Comic/comicInfo/id/630166, and inspecting the `<a>` tag shows href="/Comic/comicInfo/id/630166". Only the trailing number differs between comics, so the pattern is "/Comic/comicInfo/id/\d+".
- rules, the rule set: `rules = (Rule(link, callback="parse_item", follow=True),)`. `rules` must be an iterable, so with a single rule keep the trailing `,` to make it a tuple.
- parse_item, the callback: Scrapy requests every extracted URL and hands the response to this method, e.g. `def parse_item(self, response): print(response)`.
Run the cinfo spider:
scrapy crawl cinfo
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://ac.qq.com/robots.txt> (referer: None)
[scrapy.core.engine] DEBUG: Crawled (200) <GET https://ac.qq.com/Comic/all/page/1> (referer: None)
4. settings configuration
4.1 The robots.txt protocol
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False
4.2 User-Agent spoofing
# Option 1: a fixed User-Agent
# USER_AGENT = 'comic (+http://www.yourdomain.com)'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
# Option 2: default request headers
DEFAULT_REQUEST_HEADERS = {
  # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.5,en;q=0.3",
    "Accept-Encoding": "gzip, deflate",
    'Content-Length': '0',
    "Connection": "keep-alive",
    "referer": "https://ac.qq.com/Comic/all/page/1"
}
# Option 3: a downloader middleware that rotates the User-Agent
# (a fragment of middlewares.py; the class name here is only an example)
import random
from scrapy.utils.project import get_project_settings

class RandomUserAgentMiddleware:
    settings = get_project_settings()
    USER_AGENT_LIST = settings.get("USER_AGENT_LIST")

    def process_request(self, request, spider):
        # UA spoofing: pick a random User-Agent for every outgoing request
        request.headers["User-Agent"] = random.choice(self.USER_AGENT_LIST)
        return None
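For option 3 to actually run, the middleware has to be registered in settings.py and a USER_AGENT_LIST has to be defined there. A minimal sketch, assuming the example class name above and the default comic/middlewares.py location (the UA strings are just placeholders):
# settings.py
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36',
]
DOWNLOADER_MIDDLEWARES = {
    'comic.middlewares.RandomUserAgentMiddleware': 543,
}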
5. Overriding start_urls
On the listing page the paginator is rendered with JavaScript, so the URL of each page cannot be extracted directly; the page URLs are computed up front instead.
import math
import requests
from bs4 import BeautifulSoup
from lxml import etree

def get_start_urls(start_urls):
    # request the first listing page once to work out how many pages exist
    response = requests.get(url=start_urls[0])
    pages = etree.HTML(response.content)
    # total number of comics, read from the "5410 results in total" counter on the page
    all_comic = pages.xpath("""/html/body/div[3]/div[2]/div/div[1]/span/em/text()""")
    all_comic = all_comic[0]
    print(f"Starting to crawl Tencent Comics, about {all_comic} comics expected")
    pages_bs4 = BeautifulSoup(response.text, 'html.parser')
    pages_comic = pages_bs4.find_all("li", class_="ret-search-item clearfix")
    print(f"One listing page holds {len(pages_comic)} comics")
    max_num = math.ceil(int(all_comic) / len(pages_comic))
    print(f"{max_num} pages in total")
    start_urls = [start_urls[0] + f"{i}" for i in range(1, max_num + 1)]
    return start_urls
# inside the CinfoSpider class body, start_urls is then replaced with the computed list
    start_urls = ['https://ac.qq.com/Comic/all/page/']
    print("--------------- starting the crawler --------------")
    start_urls = get_start_urls(start_urls)
6. Extracting the chapter URLs
    # link extractor for comic detail pages
    link = LinkExtractor(allow=r"/Comic/comicInfo/id/\d+")
    # link extractor for chapter reader pages
    # TODO: handle paid chapters
    link_comic = LinkExtractor(allow=r"ComicView/index/id/\d+/cid/\d+")
    rules = (
        Rule(link, callback="parse_item", follow=True),
        Rule(link_comic, callback="parse_comic_item", follow=True),
    )
Because some chapters are paid, the extracted chapter URLs do not always return the chapter content, so this part is skipped for now.
7. Extracting the data we need (XPath and bs4)
# items.py
class ComicItem(scrapy.Item):
    # define the fields for your item here like:
    comic_id = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    popularity = scrapy.Field()
    collect = scrapy.Field()
    introductory = scrapy.Field()
    score = scrapy.Field()
    score_people_nums = scrapy.Field()
    status = scrapy.Field()
    cover = scrapy.Field()
    author_image = scrapy.Field()
    # these imports are needed at the top of the spider file for parse_item:
    # import unicodedata
    # from lxml import etree
    # from bs4 import BeautifulSoup
    def parse_item(self, response):
        # response = requests.get(url=self.url)
        print(response)
        comic_id = response.url.split("/")[-1]
        # print(comic_id)
        pages = etree.HTML(response.text)
        title = pages.xpath("""//*[@id="special_bg"]/div[3]/div/div/div[2]/div/div[1]/h2/strong/text()""")
        print("Comic title >>>", title)
        authors = pages.xpath("""//*[@id="special_bg"]/div[3]/div/div/div[2]/div/p/span/em//text()""")
        author = unicodedata.normalize('NFKC', authors[0]).strip()
        print(f"Author: {author}, popularity: {authors[1]}, favorites: {authors[-1]}")
        pages_bs4 = BeautifulSoup(response.text, 'html.parser')
        introductory = pages_bs4.find('p', class_="works-intro-short ui-text-gray9").text
        # print(introductory.strip())
        tags = pages_bs4.find_all("span", class_="tags-show")
        # print(tags)
        # TODO: involves reversing the site's JS; to be done later
        score = pages.xpath("""//*[@id="special_bg"]/div[3]/div/div/div[2]/div/div[2]/p/strong/text()""")
        score_people_nums = pages.xpath("""//*[@id="special_bg"]/div[3]/div/div/div[2]/div/div[2]/p/span/text()""")
        # print(score)
        print(score or "Too few ratings; no score yet")
        # print(score_people_nums)
        status = pages.xpath("""//*[@id="special_bg"]/div[3]/div/div/div[1]/label/text()""")
        # print(status)
        image_banner = pages.xpath(
            """//*[@id="special_bg"]/div[3]/div/div/div[1]/a/img/@src """
        )
        # | //*[@id="special_bg"]/div[3]/div/div/div[1]/a/img/@alt
        cover, author_image = image_banner
        # print(image_banner)
        # print(len(score))
        # score = [i.strip() for i in score]
        # print(score)
        icon = pages_bs4.find("div", class_="works-intro-head clearfix").children
        # print(pages_bs4)
        icon = [i.text for i in icon][1::2]
        icon.remove(title[0])
        # print('icon',icon)
        # items = {}
        items = ComicItem()
        items["comic_id"] = comic_id
        items["title"] = title[0]
        items["author"] = author
        items["popularity"] = authors[1]
        items["collect"] = authors[-1]
        items["introductory"] = introductory.strip()
        items["score"] = score[0] if score else "评分人数太少,不予统计"
        items["score_people_nums"] = score_people_nums[0]
        items["status"] = status
        items["cover"] = cover
        items["author_image"] = author_image
        # print(items)
        yield items
8. Persisting to MySQL (Scrapy + pymysql + dbutils)
# db_context.py
"""
Database connection helper
"""
import pymysql
import traceback
import logging
from dbutils.pooled_db import PooledDB
from scrapy.utils.project import get_project_settings
class MysqlUtil(object):
    # read the MySQL config from settings.py
    settings = get_project_settings()
    config = {
        'host': settings.get('MYSQL_HOST'),
        'port': settings.get('MYSQL_PORT'),
        'database': settings.get('MYSQL_DATABASE'),
        'user': settings.get('MYSQL_USER'),
        'password': settings.get('MYSQL_PASSWORD'),
        'charset': settings.get('MYSQL_CHARSET')
    }
    """
    MYSQL数据库对象,负责产生数据库连接 , 此类中的连接采用连接池实现获取连接对象:conn = Mysql.getConn()
            释放连接对象;conn.close()或del conn
    """
    # 连接池对象
    __pool = None
    def __init__(self):
        # take a connection from the pool and create a cursor
        self._conn = MysqlUtil.get_conn()
        self._cursor = self._conn.cursor()
    # get a connection
    @staticmethod
    def get_conn():
        """
        @summary: static method, take a connection from the pool
        @return: a pooled pymysql connection
        """
        # create the pool once and cache it on the class
        if MysqlUtil.__pool is None:
            MysqlUtil.__pool = PooledDB(creator=pymysql, mincached=1, maxcached=20, host=MysqlUtil.config['host'], port=MysqlUtil.config['port'], user=MysqlUtil.config['user'], passwd=MysqlUtil.config['password'], db=MysqlUtil.config['database'], charset=MysqlUtil.config['charset'])
        return MysqlUtil.__pool.connection()
    # fetch all rows
    def get_all(self, sql, param=None):
        """
        @summary: run a query and fetch the whole result set
        @param sql: the query; put placeholders in the SQL and pass the condition values via param
        @param param: optional tuple/list of parameter values
        @return: list of rows, or False when nothing matched
        """
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            print("count>>>", count,type(count))
            if count > 0:
                result = self._cursor.fetchall()
            else:
                result = False
            return result
        except Exception as e:
            logging.error(f"Error: {e}")
    # fetch a single row
    def get_one(self, sql, param=None):
        """
        @summary: run a query and fetch the first row
        @param sql: the query; put placeholders in the SQL and pass the condition values via param
        @param param: optional tuple/list of parameter values
        @return: a row, or False when nothing matched
        """
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            if count > 0:
                result = self._cursor.fetchone()
            else:
                result = False
            return result
        except Exception as e:
            logging.error(f"Error: {e}")
    # count rows
    def get_count(self, sql, param=None):
        """
        @summary: run a query and return the number of matched rows
        @param sql: the query; put placeholders in the SQL and pass the condition values via param
        @param param: optional tuple/list of parameter values
        @return: the row count
        """
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            return count
        except Exception as e:
            logging.error(f"Error: {e}")
    # fetch a limited number of rows
    def get_many(self, sql, num, param=None):
        """
        @summary: run a query and fetch num rows
        @param sql: the query; put placeholders in the SQL and pass the condition values via param
        @param num: number of rows to fetch
        @param param: optional tuple/list of parameter values
        @return: list of rows, or False when nothing matched
        """
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            if count > 0:
                result = self._cursor.fetchmany(num)
            else:
                result = False
            return result
        except Exception as e:
            logging.error(f"Error: {e}")
    # insert one record
    def insert_one(self, sql, value):
        """
        @summary: insert one record into a table
        @param sql: the INSERT statement with placeholders
        @param value: tuple/list of values to insert
        @return: number of affected rows
        """
        try:
            row_count = self._cursor.execute(sql, value)
            return row_count
        except Exception as e:
            logging.error(f"Error: {e}")
            self.end("rollback")
    # insert multiple records
    def insert_many(self, sql, values):
        """
        @summary: insert multiple records into a table
        @param sql: the INSERT statement with placeholders
        @param values: tuple of tuples / list of lists of values to insert
        @return: number of affected rows
        """
        try:
            row_count = self._cursor.executemany(sql, values)
            return row_count
        except Exception as e:
            logging.error(f"Error: {e}")
            self.end("rollback")
    # def __get_insert_id(self):
    #     """
    #     Get the id generated by the last insert on this connection, 0 if there is none
    #     """
    #     self._cursor.execute("SELECT @@IDENTITY AS id")
    #     result = self._cursor.fetchall()
    #     return result[0]['id']
    # run a raw SQL statement
    def __query(self, sql, param=None):
        try:
            if param is None:
                count = self._cursor.execute(sql)
            else:
                count = self._cursor.execute(sql, param)
            return count
        except Exception as e:
            logging.error(f"Error: {e}")
    # update
    def update(self, sql, param=None):
        """
        @summary: update records in a table
        @param sql: the UPDATE statement with %s placeholders
        @param param: tuple/list of values to update with
        @return: number of affected rows
        """
        return self.__query(sql, param)
    # delete
    def delete(self, sql, param=None):
        """
        @summary: delete records from a table
        @param sql: the DELETE statement with %s placeholders
        @param param: tuple/list of condition values
        @return: number of affected rows
        """
        return self.__query(sql, param)
    def begin(self):
        """
        @summary: 开启事务
        """
        self._conn.autocommit(0)
    def end(self, option='commit'):
        """
        @summary: 结束事务
        """
        if option == 'commit':
            self._conn.commit()
        else:
            self._conn.rollback()
    def dispose(self, is_end=1):
        """
        @summary: 释放连接池资源
        """
        if is_end == 1:
            self.end('commit')
        else:
            self.end('rollback')
        self._cursor.close()
        self._conn.close()
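A quick usage sketch of MysqlUtil on its own, outside Scrapy; the query and the id 630166 are only illustrative and assume the comic table used by the pipeline below already exists:
db = MysqlUtil()
row = db.get_one("select id, title from comic where id = %s", (630166,))
print(row)
db.dispose()  # commit and release the connection back to the pool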
# pipelines.py
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
import scrapy
from comic.db_context import MysqlUtil
import logging
import traceback
class ComicPipeline:
    pool = None
    def open_spider(self, spider):
        try:
            self.pool = MysqlUtil()
        except Exception as e:
            logging.error(f"Database connection failed: {e}")
            print(f"Database connection failed: {e}")
    def process_item(self, item, spider):
        try:
            sql_select = "select id,title from comic where id = %s"
            # get_one returns the (id, title) row when the comic is already stored
            flag = self.pool.get_one(sql_select, (item["comic_id"],))
            if flag:
                print("flag>>>>", flag)
                logging.info(f"Comic {flag[1]} already exists")
            else:
                sql_insert = """insert into comic(id,title,author,introductory,status,platform_id)  values
                             (%(id)s,%(title)s,%(author)s,%(introductory)s,%(status)s,1)"""
                params = {'id': item['comic_id'],
                          "title": item["title"],
                          "author": item["author"],
                          "introductory": item["introductory"],
                          "status": item["status"], }
                print(f"漫画{item['title']}基本数据插入成功")
                self.pool.insert_one(sql_insert, params)
                self.pool.end("commit")
        except Exception as e:
            logging.error(f"发生异常:{e}")
            self.pool.end("rollback")
    def close_spider(self, spider):
        pass
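For items to reach ComicPipeline and for MysqlUtil to find its configuration, settings.py also needs the pipeline registration and the MYSQL_* keys that MysqlUtil reads. A minimal sketch with placeholder credentials:
# settings.py (placeholder values)
ITEM_PIPELINES = {
    'comic.pipelines.ComicPipeline': 300,
}
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'comic'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'your_password'
MYSQL_CHARSET = 'utf8mb4'
The comic table itself must already exist with at least the columns used by the INSERT above: id, title, author, introductory, status and platform_id.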
9. Logging
# start logger configuration (settings.py)
import datetime
import os

current_day = datetime.datetime.now()
LOG_ENABLED = True       # enable logging
LOG_ENCODING = 'utf-8'
LOG_LEVEL = "ERROR"
# LOG_STDOUT = True      # also redirect stdout (e.g. print) into the log file
# LOG_FILE must be a file path, so write a dated file inside a logs/ directory
log_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "logs")
os.makedirs(log_dir, exist_ok=True)
LOG_FILE = os.path.join(log_dir, f"{current_day.strftime('%Y-%m-%d')}.log")
# end logger configuration
10. Distributed crawling
10.1 Install the module
pip install scrapy_redis
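The original write-up stops here. As a rough sketch of where this is headed (not the author's final setup), scrapy-redis is usually wired in by pointing the scheduler and dupefilter at Redis in settings.py and switching the spider to RedisCrawlSpider; the Redis URL and key below are assumptions:
# settings.py additions for scrapy-redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # schedule requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared request fingerprints
SCHEDULER_PERSIST = True                                     # keep the queue between runs
REDIS_URL = "redis://127.0.0.1:6379"                         # assumed local Redis instance

# cinfo.py: read start URLs from Redis instead of hard-coding them
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class CinfoSpider(RedisCrawlSpider):
    name = "cinfo"
    redis_key = "cinfo:start_urls"   # seed with: lpush cinfo:start_urls https://ac.qq.com/Comic/all/page/1
    rules = (
        Rule(LinkExtractor(allow=r"/Comic/comicInfo/id/\d+"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        pass  # same parsing logic as in section 7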