Data Collection Technology: Assignment 4

Assignment ①

1) Goal: become proficient with serializing and outputting Item and Pipeline data in Scrapy; crawl book data from dangdang.com using the Scrapy + XPath + MySQL database storage route.

Code
booksspider.py

import scrapy
from ..items import BooksproItem
from bs4 import UnicodeDammit
class BooksspiderSpider(scrapy.Spider):
    name = 'booksspider'
    #allowed_domains = ['www.xxx.com']
    key = 'python'
    source_url = "http://search.dangdang.com/"
    def start_requests(self):
        url = BooksspiderSpider.source_url + "?key=" + BooksspiderSpider.key  # build the search url with the keyword
        yield scrapy.Request(url=url, callback=self.parse)  # every request uses the same parse callback


    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body,["utf-8","gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            #li_list = selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            li_list = selector.xpath("//*[@id='component_59']/li")  # locate the <ul> by its id first, then take the <li> items under it
            for li in li_list:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()= last()-1]/text()").extract_first()
                publisher = li.xpath(
                    "./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                # detail may be empty for some books
                detail = li.xpath("./p[@class='detail']/text()").extract_first()

                item = BooksproItem()
                item['title'] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item

                # link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
                # if link:
                #     url = response.urljoin(link)
                #     yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)

items.py

import scrapy

class BooksproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    detail = scrapy.Field()
    price = scrapy.Field()
    pass

settings.py

BOT_NAME = 'booksPro'

SPIDER_MODULES = ['booksPro.spiders']
NEWSPIDER_MODULE = 'booksPro.spiders'
LOG_LEVEL = 'ERROR'  # show only error-level log messages


USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre'  # spoof the User-Agent
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'booksPro (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
   'booksPro.pipelines.BooksproPipeline': 300,
}

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class BooksproPipeline(object):
    def open_spider(self,spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="chu836083241",db="test",charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from books")  # optionally clear the table first
            self.opened = True
            self.count = 1       
        except Exception as err:
            print(err)
            self.opened = False
    
    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取",self.count,"本书籍")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values(%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"], item["date"], item["price"], item["detail"]))
                self.count+=1
        except Exception as err:
            print(err)
        return item

The results are shown below.

2) Reflections

This assignment was mainly a reproduction of the code from the textbook. After finishing it I can use the Scrapy framework more fluently, and I learned how to operate a MySQL table from Python statements; one thing to watch out for is that the column names defined in the MySQL table must match the ones used in the Python insert statement. I also parsed the data with XPath, which turned out to be very flexible.
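
As a reference for the column-name point above, here is a minimal sketch (not part of the assignment code) of a table definition that matches the insert statement in pipelines.py; the column types and lengths are assumptions and can be adjusted as needed.

import pymysql

# one-off helper: create the books table that the pipeline above expects
con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="chu836083241", db="test", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS books (
        bTitle     VARCHAR(512),
        bAuthor    VARCHAR(256),
        bPublisher VARCHAR(256),
        bDate      VARCHAR(32),
        bPrice     VARCHAR(32),
        bDetail    TEXT
    ) DEFAULT CHARSET = utf8
""")
con.commit()
con.close()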

Assignment ②

1) Goal: become proficient with serializing and outputting Item and Pipeline data in Scrapy; crawl stock information using the Scrapy + XPath + MySQL database storage route.

Code
stockspider.py

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from ..items import StockItem

class StockspiderSpider(scrapy.Spider):
    name = 'stockspider'
    #allowed_domains = ['www.xxx.com']
    start_urls = ['http://quote.eastmoney.com/center/gridlist.html#hs_a_board']

    def __init__(self):    # executed only once
        self.chrome_options = Options()
        self.chrome_options.add_argument('--headless')      # start Chrome without a visible window
        self.chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=self.chrome_options)  # 'chrome_options=' is deprecated; 'options=' is equivalent


    def parse(self, response):  
        tr_list = response.xpath('//*[@id="table_wrapper-table"]/tbody/tr')
        for tr in tr_list:
            Id = tr.xpath('./td[1]/text()').extract_first().strip()
            code = tr.xpath('./td[2]/a/text()').extract_first().strip()
            name = tr.xpath('./td[3]/a/text()').extract_first().strip()
            new_price = tr.xpath('./td[5]/span/text()').extract_first().strip()
            zhangdiefu = tr.xpath('./td[6]/span/text()').extract_first().strip()
            zhangdiee = tr.xpath('./td[7]/span/text()').extract_first().strip()
            # print(Id,code,name,new_price,zhangdiefu,zhangdiee)
            item = StockItem()
            item['Id'] = Id
            item['code'] = code
            item['name'] = name
            item['new_price'] = new_price
            item['zhangdiefu'] = zhangdiefu
            item['zhangdiee'] = zhangdiee
            yield item

    def closed(self, spider):  # runs once when the spider closes: quit the browser driver
        self.driver.quit()

settings.py

USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre'  # spoof the User-Agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

DOWNLOADER_MIDDLEWARES = {   # enable the downloader middleware
   'stock.middlewares.StockDownloaderMiddleware': 543,
}

ITEM_PIPELINES = {      # enable the item pipeline
   'stock.pipelines.StockPipeline': 300,
}

pipelines.py

from itemadapter import ItemAdapter
import pymysql

class StockPipeline:
    
    def open_spider(self,spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="chu836083241",db="test",charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from stock")  # optionally clear the table first
            self.opened = True
            self.count = 1       
        except Exception as err:
            print(err)
            self.opened = False



    def process_item(self, item, spider):
        try:
            print(item['Id'])
            print(item['code'])
            print(item['name'])
            print(item['new_price'])
            print(item['zhangdiefu'])
            print(item['zhangdiee'])
            if self.opened:
                self.cursor.execute(
                    "insert into stock (Id,code,name,new_price,zhangdiefu,zhangdiee) values(%s,%s,%s,%s,%s,%s)",
                    (item['Id'], item['code'], item['name'], item['new_price'], item['zhangdiefu'], item['zhangdiee']))
                self.count+=1
        except Exception as err:
            print(err)
        return item

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取",self.count-1,"条记录")

The difference this time is that one more .py file has to be edited. The main change is its process_response method, which intercepts and modifies the response object so that it carries the rendered page source; stockspider.py then receives a new response object that can be parsed directly with XPath.
middlewares.py

# This method belongs to the StockDownloaderMiddleware class; it also needs
#   from time import sleep
#   from scrapy.http import HtmlResponse
# at the top of middlewares.py.
    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Only the targeted response is tampered with:
        #   the request is selected by its url, and its response is rebuilt below.
        bro = spider.driver
        if request.url in spider.start_urls:
            # Use Selenium to fetch the dynamically loaded page, then wrap the rendered
            # source in a new HtmlResponse that replaces the original response.
            bro.get(request.url)
            sleep(3)
            page_text = bro.page_source
            new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
            return new_response
        else:
            return response

Results

2) Reflections

This assignment combines Selenium with Scrapy, and it taught me what Scrapy's downloader middleware is for. The main work was writing the process_response method: by tampering with the response object we can hand the rendered page source straight to the spider and parse it with XPath. Because time was tight I have not implemented pagination yet; I plan to try it later by locating the next-page button with XPath and triggering click() (a rough sketch of this idea follows below). I also learned that the downloader middleware can tamper with request objects as well, for example to set up a User-Agent pool or IP proxies. After this assignment I have a much deeper understanding of the Scrapy framework.
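
A minimal sketch of that pagination idea, written against the Selenium driver already used above; it is not part of the submitted code. The XPath for the next-page link (text "下一页") is an assumption and needs to be checked against the live East Money page.

from time import sleep
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def collect_pages(driver, url, max_pages=3):
    """Return the rendered page source of the first max_pages list pages."""
    driver.get(url)
    sleep(3)
    pages = []
    for _ in range(max_pages):
        pages.append(driver.page_source)               # keep the current rendered page
        try:
            # locate the next-page link via XPath (assumed link text: 下一页)
            next_btn = driver.find_element(By.XPATH, '//a[contains(text(), "下一页")]')
        except NoSuchElementException:
            break                                       # no next-page link: last page reached
        next_btn.click()
        sleep(3)                                        # wait for the table to re-render
    return pages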

Assignment ③

1) Goal: become proficient with serializing and outputting Item and Pipeline data in Scrapy; crawl data from a foreign-exchange website using the Scrapy framework + XPath + MySQL database storage route.

Code
myspider.py

import scrapy
from bs4 import UnicodeDammit
from ..items import CurrencyItem


class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    #allowed_domains = ['www.xxx.com']
    start_url = 'http://fx.cmbchina.com/hq/'

    def start_requests(self):
        url = MyspiderSpider.start_url
        yield scrapy.Request(url=url,callback=self.parse)

    def parse(self, response):
        dammit = UnicodeDammit(response.body,["utf-8","gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        # table/tbody/tr was tried first, but with tbody in the path nothing matched
        tr_list = selector.xpath('//*[@id="realRateInfo"]/table//tr')
        self.index = 0
        for tr in tr_list:
            if(self.index == 0):
                self.index += 1
                continue
            else:
                item = CurrencyItem()
                Currency = tr.xpath('./td[1]/text()').extract_first().strip()
                TSP = tr.xpath('./td[4]/text()').extract_first().strip()
                CSP = tr.xpath('./td[5]/text()').extract_first().strip()
                TBP = tr.xpath('./td[6]/text()').extract_first().strip()
                CBP = tr.xpath('./td[7]/text()').extract_first().strip()
                Time = tr.xpath('./td[8]/text()').extract_first().strip()
                item['Currency'] = Currency
                item['TSP'] = TSP
                item['CSP'] = CSP
                item['TBP'] = TBP
                item['CBP'] = CBP
                item['Time'] = Time
                yield item           

items.py

import scrapy


class CurrencyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()
    pass

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class CurrencyPipeline(object):

    def open_spider(self,spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="chu836083241",db="test",charset='utf8')
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from currency")  # optionally clear the table first
            self.opened = True
            self.count = 1       
        except Exception as err:
            print(err)
            self.opened = False
    
    def process_item(self, item, spider):
        try:
            print(self.count)
            print(item['Currency'])
            print(item['TSP'])
            print(item['CSP'])
            print(item['TBP'])
            print(item['CBP'])
            print(item['Time'])
            if self.opened:
                self.cursor.execute(
                    "insert into currency (Id,Currency,TSP,CSP,TBP,CBP,Time) values(%s,%s,%s,%s,%s,%s,%s)",
                    (self.count,item["Currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["Time"]))
                self.count+=1
        except Exception as err:
            print(err)
        return item


    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取",self.count-1,"条记录")

Results

2) Reflections

The page source for this assignment is not dynamically loaded, so it was not very difficult. The one pitfall was that including the tbody tag in the XPath expression returned an empty result, while the same thing did not happen in Assignment ②. I have not fully resolved this yet, but a likely explanation is that the raw HTML returned by the server contains no <tbody> at all: browsers insert it when building the DOM, which is why it shows up in the Selenium-rendered source used in Assignment ② but not in the raw response that Scrapy parses here.
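
A quick way to test that guess (a hypothetical check, not part of the spider): fetch the raw page and see whether it contains a tbody tag at all.

import requests

raw = requests.get("http://fx.cmbchina.com/hq/").text
print("<tbody" in raw.lower())   # if this prints False, the tag exists only in the browser-built DOM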
