The Scrapy crawler framework

Using the Scrapy framework to crawl Baidu Tieba posts and store them in a database.

Reference: http://cuiqingcai.com/3472.html/3

scrapySpiders.mysqlpipelines.pipelines.ScrapyPipelines
# -*- coding: utf-8 -*-
__author__ = 'mmc'

from sql import Sql
from scrapySpiders.items import ScrapyspidersItem


class ScrapyPipelines(object):
    # Scrapy calls process_item once for every item the spider yields
    def process_item(self, item, spider):
        if isinstance(item, ScrapyspidersItem):
            print "start save data in database"
            tb_name = item['name']
            tb_content = item['content']
            Sql.insert_content(tb_name, tb_content)
        return item
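
The ScrapyspidersItem imported above is not shown in the post; a minimal sketch of scrapySpiders/items.py, assuming only the two fields the pipeline reads:

# -*- coding: utf-8 -*-
# scrapySpiders/items.py (sketch: just the two fields used by the pipeline)
import scrapy


class ScrapyspidersItem(scrapy.Item):
    name = scrapy.Field()     # stored as tb_name by the pipeline
    content = scrapy.Field()  # stored as tb_content by the pipeline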

Register the pipeline in settings.py and the data is stored automatically; a minimal sketch of that entry follows (the priority value 300 is an arbitrary choice):
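
# settings.py: enable the MySQL pipeline for every crawled item
ITEM_PIPELINES = {
    'scrapySpiders.mysqlpipelines.pipelines.ScrapyPipelines': 300,
}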

 

spiders

# -*- coding: utf-8 -*-
__author__ = 'mmc'

import scrapy
from bs4 import BeautifulSoup
from scrapy.http import Request
from scrapySpiders.items import ScrapyspidersItem


class Myspiders(scrapy.Spider):

    name = 'TieBa'
    base_url = 'https://tieba.baidu.com/p/'

    def start_requests(self):
        # see_lz=1 ("only the original poster") limits the thread to the OP's posts
        url = self.base_url + '4420237089' + '?see_lz=1'
        yield Request(url, callback=self.parse)

    def parse(self, response):
        # the last <span> inside li.l_reply_num holds the total page count
        max_num = BeautifulSoup(response.text, 'lxml').find('li', class_='l_reply_num').find_all('span')[-1].get_text()
        print max_num
        baseurl = str(response.url)
        for num in range(1, int(max_num) + 1):
            url = baseurl + '&pn=' + str(num)
            yield Request(url, callback=self.get_name)

    def get_name(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        posts = soup.find_all("div", {"class": "d_post_content j_d_post_content "})
        infos = soup.find_all("span", {"class": "tail-info"})
        # each floor carries three tail-info spans, so step through them in threes
        for val, f in zip(posts, infos[1:-1:3]):
            item = ScrapyspidersItem()
            item['name'] = val.get_text().strip()
            item['content'] = f.get_text().strip()
            print item
            # yield the item so the pipeline registered in settings.py stores it;
            # calling ScrapyPipelines().process_item() by hand is unnecessary
            yield item
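
With the pipeline enabled, the spider is launched from the project directory with Scrapy's command-line tool (the argument matches the spider's name attribute):

scrapy crawl TieBa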

sql.py

# -*- coding: utf-8 -*-
__author__ = 'mmc'

import MySQLdb


db_para = {'host': '127.0.0.1',
           'port': 3306,
           'user': 'mmc',
           'passwd': 'mmc123456',
           'db': 'test'}
dbcon = MySQLdb.connect(**db_para)
cur = dbcon.cursor()
# fix for UnicodeEncodeError: 'latin-1' codec can't encode character
dbcon.set_character_set('utf8')
cur.execute('SET NAMES utf8;')
cur.execute('SET CHARACTER SET utf8;')
cur.execute('SET character_set_connection=utf8;')

class Sql:
    @classmethod
    def insert_content(cls, tb_name, tb_content):
        # the placeholder order must match the column order
        sql = """INSERT INTO `test`.`tb_name` (`tb_name`, `tb_content`)
                 VALUES (%(tb_name)s, %(tb_content)s)"""
        value = {'tb_name': tb_name, 'tb_content': tb_content}
        cur.execute(sql, value)
        dbcon.commit()
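
The INSERT above assumes the table already exists; a minimal sketch of creating it (only the table and column names come from the INSERT statement, the column types are assumptions):

# run once: create the target table (column types are assumptions)
cur.execute("""CREATE TABLE IF NOT EXISTS `test`.`tb_name` (
    `id` INT AUTO_INCREMENT PRIMARY KEY,
    `tb_name` TEXT,
    `tb_content` TEXT
) DEFAULT CHARSET=utf8""")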

 

Scrapy query syntax:

When crawling a large number of pages, writing regular expressions by hand is tedious and time-consuming. Fortunately, Scrapy has built-in support for a simpler query syntax that finds the tags, tag contents, and tag attributes we need in the HTML. Each form is introduced below, with a minimal demo after the list:

  • Select a tag anywhere among the descendants (div as the example): //div
  • Select a tag among the direct children (div as the example): /div
  • Select tags carrying a given class attribute: //div[@class='c1'], i.e. any descendant div with class='c1'
  • Select tags with class='c1' and a custom attribute name='alex': //div[@class='c1'][@name='alex']
  • Select a tag's text content: //div/span/text(), i.e. the text of span tags under any descendant div
  • Select an attribute's value (e.g. an a tag's href): //a/@href
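
A minimal, self-contained demo of these patterns using Scrapy's Selector on an inline snippet (the HTML string here is made up for illustration):

from scrapy.selector import Selector

html = '<div class="c1" name="alex"><span>hello</span><a href="/x">link</a></div>'
sel = Selector(text=html)
print sel.xpath('//div').extract()                             # any descendant div
print sel.xpath('//div[@class="c1"]').extract()                # class match
print sel.xpath('//div[@class="c1"][@name="alex"]').extract()  # both attributes
print sel.xpath('//div/span/text()').extract()                 # -> [u'hello']
print sel.xpath('//a/@href').extract()                         # -> [u'/x']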

Example code:

 


import os
import re
import urllib

from scrapy.selector import Selector


def parse(self, response):
    # parse the page: find the content matching our rules (the photos), save it,
    # then collect the page's a tags and follow them level by level
    sel = Selector(response)  # build the query object

    # only handle this site's list pages: http://www.xiaohuar.com/list-1-\d+.html
    if re.match('http://www.xiaohuar.com/list-1-\d+.html', response.url):
        # the query target goes in xpath(), written in the syntax above
        for div in sel.xpath('//div[@class="item_list infinite_scroll"]/div'):
            src = div.xpath('.//div[@class="img"]/a/img/@src').extract()    # photo address
            name = div.xpath('.//div[@class="img"]/span/text()').extract()  # name
            school = div.xpath('.//div[@class="img"]/div[@class="btns"]/a/text()').extract()  # school
            if src:
                ab_src = "http://www.xiaohuar.com" + src[0]  # join the relative path
                # Python 2 defaults to unicode strings, so encode the file name to utf-8
                file_name = "%s_%s.jpg" % (school[0].encode('utf-8'), name[0].encode('utf-8'))
                file_path = os.path.join("/Users/wupeiqi/PycharmProjects/beauty/pic", file_name)
                urllib.urlretrieve(ab_src, file_path)
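
The opening comment mentions following the page's a tags level by level, but the snippet stops short of that. A minimal sketch of the step, assuming it runs at the end of the same parse method with Request imported from scrapy.http (yielding turns parse into a generator):

# follow every matching link on the page, feeding it back into parse
for href in sel.xpath('//a/@href').extract():
    if re.match('http://www.xiaohuar.com/list-1-\d+.html', href):
        yield Request(href, callback=self.parse)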

 
