Data Collection: Assignment 4

Task ①:

Requirements: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl book data from dangdang.com.

Candidate site: http://www.dangdang.com/

Code:

bookSpiders.py (this spider crawls 10 pages of search results):

import scrapy
from ..items import BooksItem
from bs4 import UnicodeDammit

class BookspiderSpider(scrapy.Spider):
    name = 'bookSpider'
    #key='python'
    start_urls = ['http://search.dangdang.com/?key=python']

    # def start_requests(self):
    #     url=BookspiderSpider.start_urls+"?key="+BookspiderSpider.key
    #     print(url)
    #     yield scrapy.Request(url=url,callback=self.parse)

    def parse(self, response):
        try:
            dammit=UnicodeDammit(response.body,["utf-8","gbk"])
            data=dammit.unicode_markup
            selector=scrapy.Selector(text=data)
            lis=selector.xpath("//ul[@class='bigimg']/li")
            print(lis)
            for li in lis:
                title=li.xpath("./a[position()=1]/@title").extract_first()
                price=li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author=li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date=li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher=li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail=li.xpath("./p[@class='detail']/text()").extract_first()

                item=BooksItem()
                item["title"]=title.strip() if title else ""
                item["author"]=author.strip() if title else ""
                item["date"]=date.strip()[1:] if date else ""
                item["publisher"]=publisher.strip() if publisher else ""
                item["price"]=price.strip() if price else ""
                item["detail"]=detail.strip() if detail else ""
                yield item
            # Queue the remaining result pages; Scrapy's duplicate filter drops repeated requests.
            pagenum=10
            for page in range(2,pagenum+1):
                url='http://search.dangdang.com/?key=python&page_index={}'.format(page)
                yield scrapy.Request(url=url,callback=self.parse)
            # link=selector.xpath("//div[@class='paging']/ul[@name='Fy']/li/a[@class='null']/@href").extract_first()
            # if link:
            #     url=response.urljoin(link)
            #     yield scrapy.Request(url=url,callback=self.parse)
        except Exception as err:
            print(err) 
        pass

pipelines.py:

from itemadapter import ItemAdapter
import pymysql

class BooksPipeline(object):
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="**********",db="mydb",charset="utf8")#链接数据库#密码不是真实密码,只是不想改密码了
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")
            self.opened=True
            self.count=0
            self.num=1
        except Exception as err:
            print(err)
            self.opened=False


    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                n=str(self.num)
                self.cursor.execute("insert into books(bNum,bTitle,bAuthor,bPublisher,bDate, bPrice,bDetail)values(%s,%s,%s,%s,%s,%s,%s)",(n,item["title"],item["author"],item["publisher"],item["date"],item["price"],item["detail"]))
                self.count+=1
                self.num+=1
        except Exception as err:
            print(err)
        return item

    
    def close_spider(self,spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")
        print("总共爬取",self.count,"本书籍")

items.py:

import scrapy


class BooksItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title=scrapy.Field()
    author=scrapy.Field()
    date=scrapy.Field()
    publisher=scrapy.Field()
    detail=scrapy.Field()
    price=scrapy.Field()
    pass

settings.py:

ITEM_PIPELINES = {
    'books.pipelines.BooksPipeline': 300,
}

run.py:

from scrapy import cmdline
cmdline.execute("scrapy crawl bookSpider -s LOG_ENABLED=False".split())

Results:

Reflections:

This Dangdang book crawl reproduces the example from the textbook. At first the original code ran but fetched no data, so I modified part of it; when I later went back to the original code, it worked. The problem seemed to lie in the URL, and I still have not figured out why removing the square brackets fixed it.
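A likely explanation, judging from the commented-out start_requests above: start_urls is a list, so concatenating it directly with a string (start_urls + "?key=" + key) raises a TypeError, whereas indexing the first element (or using a plain string variable without the brackets) gives a valid URL. A minimal sketch of the working pattern, reusing the same example keyword:

import scrapy

class BookspiderSpider(scrapy.Spider):
    name = 'bookSpider'
    key = 'python'
    start_urls = ['http://search.dangdang.com/']   # start_urls must be a list of strings

    def start_requests(self):
        # Index the list to get a str; start_urls + "?key=..." would raise TypeError (list + str).
        url = BookspiderSpider.start_urls[0] + "?key=" + BookspiderSpider.key
        yield scrapy.Request(url=url, callback=self.parse)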

Task ②

Requirements: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl stock data.

Candidate sites: Eastmoney: https://www.eastmoney.com/

Sina Finance (stocks): http://finance.sina.com.cn/stock/

Code:

Scrapy + MySQL: starting from the code in Task ①, only pipelines.py needs small changes. My first version captured the site's JS data requests and then extracted the fields from the returned payload.
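For reference, a minimal sketch of that packet-capture idea, assuming the quote list comes back as JSON from an endpoint found in the browser's network panel; the URL and the JSON key names below are illustrative assumptions, not taken from my original spider:

import json
import scrapy

class StocksApiSpider(scrapy.Spider):
    name = 'stocksApi'
    # Hypothetical JSON endpoint discovered via packet capture (assumption).
    start_urls = ['http://example.com/api/stock/list?page=1']

    def parse(self, response):
        payload = json.loads(response.text)            # the captured response body is JSON
        for record in payload.get('data', []):         # 'data' is an assumed key
            item = {
                'Code': record.get('code'),            # JSON key names are placeholders
                'name': record.get('name'),
                'Latest_price': record.get('price'),
                'UD_range': record.get('change_pct'),
                'UD_price': record.get('change'),
            }
            # ...the remaining fields (Deal_num, Deal_price, Amplitude, etc.) are mapped the same way
            yield item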
pipelines.py:

from itemadapter import ItemAdapter
import pymysql

class StocksPipeline:
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="**********",db="mydb",charset="utf8")#链接数据库#密码不是真实密码,只是不想改密码了
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks1")
            self.opened=True
            self.count=0
            self.num=1
        except Exception as err:
            print(err)
            self.opened=False


    def process_item(self, item, spider):
        try:

            if self.opened:
                n=str(self.num)
                self.cursor.execute("insert into stocks1(序号,股票代码,股票名称,最新报价,涨跌幅,涨跌额,成交量,成交额,振幅,最高,最低,今开,昨收)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(n,item["Code"] , item["name"] , item["Latest_price"],item["UD_range"] , item["UD_price"] ,item["Deal_num"] ,item["Deal_price"] ,item["Amplitude"], item["Up_est"] ,item["Down_est"], item["Today"] ,item["Yesterday"]))
                self.count+=1
                self.num+=1
        except Exception as err:
            print(err)
        return item

    
    def close_spider(self,spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")

Since this page is rendered dynamically and I still want to use XPath, I switched to Scrapy + Selenium + XPath + MySQL.
Note that the stocks1 table must be cleared first (the pipeline issues a DELETE in open_spider).
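For completeness, a sketch of how the stocks1 table could be created; the column names come from the INSERT statement in the pipeline below, while the types and lengths are assumptions:

import pymysql

# One-off helper to (re)create the stocks1 table; column types are guesses.
con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="**********", db="mydb", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS stocks1 (
            序号 VARCHAR(16),
            股票代码 VARCHAR(16),
            股票名称 VARCHAR(64),
            最新报价 VARCHAR(16),
            涨跌幅 VARCHAR(16),
            涨跌额 VARCHAR(16),
            成交量 VARCHAR(32),
            成交额 VARCHAR(32),
            振幅 VARCHAR(16),
            最高 VARCHAR(16),
            最低 VARCHAR(16),
            今开 VARCHAR(16),
            昨收 VARCHAR(16)
        ) DEFAULT CHARSET=utf8
    """)
con.commit()
con.close()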

stocksdemo.py:

import scrapy
from ..items import SeleniumstockItem
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
class StocksdemoSpider(scrapy.Spider):
    name = 'stocksdemo'
    
    def start_requests(self):
        url = 'http://quote.eastmoney.com/center/gridlist.html'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        browser = webdriver.Chrome()
        print("正在打开网页...")
        browser.get("http://quote.eastmoney.com/center/gridlist.html")
        print("等待网页响应...")
        # 需要等一下,直到页面加载完成
        #wait = WebDriverWait(browser, 10)
        #wait.until(EC.presence_of_element_located((By.CLASS_NAME, "grid")))
        browser.implicitly_wait(10) 
        print("正在获取网页数据...")
        trs=browser.find_elements_by_xpath("//table[@class='table_wrapper-table']/tbody/tr")
        for tr in trs:
            td=tr.find_elements_by_xpath("./td")
            item=SeleniumstockItem()
            item["Snumber"]=td[0].text
            item["Code"]=td[1].text
            item["Name"]=td[2].text
            item["Latest_price"]=td[4].text
            item["UD_range"]=td[5].text
            item["UD_price"]=td[6].text
            item["Deal_num"]=td[7].text
            item["Deal_price"]=td[8].text
            item["Amplitude"]=td[9].text
            item["Up_est"]=td[10].text
            item["Down_est"]=td[11].text
            item["Today"]=td[12].text
            item["Yesterday"]=td[13].text
            #print(item["Snumber"]+"  "+item["Code"] + '  ' + item["Name"] + '  ' + item["Latest_price"] + '  ' + item["UD_range"] + '  ' + item["UD_price"] + '  ' + item["Deal_num"] + '  ' + item["Deal_price"] + '  ' +item["Amplitude"] + '  ' + item["Up_est"] + '  ' + item["Down_est"] + '  ' + item["Today"] + '  ' + item["Yesterday"])
            yield item 
            
        browser.close()
        pass
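The commented-out lines in parse hint at an explicit wait, which is usually more robust than implicitly_wait because it blocks until the quote table itself has been rendered. A sketch of that variant, locating the table by the class name already used in the XPath above:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("http://quote.eastmoney.com/center/gridlist.html")
# Block for up to 10 seconds until the JS-rendered quote table is present.
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, "table_wrapper-table"))
)
trs = browser.find_elements_by_xpath("//table[@class='table_wrapper-table']/tbody/tr")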

pipelines.py:

from itemadapter import ItemAdapter
import pymysql

class SeleniumstockPipeline(object):
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="********",db="mydb",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks1")
            self.opened=True
            self.count=0
            self.num=1
        except Exception as err:
            print(err)
            self.opened=False


    def process_item(self, item, spider):
        try:

            if self.opened:
                n=str(self.num)
                self.cursor.execute("insert into stocks1(序号,股票代码,股票名称,最新报价,涨跌幅,涨跌额,成交量,成交额,振幅,最高,最低,今开,昨收)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(item["Snumber"],item["Code"] , item["Name"] , item["Latest_price"],item["UD_range"] , item["UD_price"] ,item["Deal_num"] ,item["Deal_price"] ,item["Amplitude"], item["Up_est"] ,item["Down_est"], item["Today"] ,item["Yesterday"]))
                self.count+=1
                self.num+=1
        except Exception as err:
            print(err)
        return item

    
    def close_spider(self,spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")

run.py:

from scrapy import cmdline
cmdline.execute("scrapy crawl stocksdemo -s LOG_ENABLED=False".split())

settings.py and items.py are the same as before.
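For reference, items.py presumably just declares one Field per attribute set in stocksdemo.py, and settings.py registers the pipeline the same way as in Task ①; a sketch inferred from the field names above (the 'seleniumStock' package name is a guess):

import scrapy


class SeleniumstockItem(scrapy.Item):
    Snumber = scrapy.Field()
    Code = scrapy.Field()
    Name = scrapy.Field()
    Latest_price = scrapy.Field()
    UD_range = scrapy.Field()
    UD_price = scrapy.Field()
    Deal_num = scrapy.Field()
    Deal_price = scrapy.Field()
    Amplitude = scrapy.Field()
    Up_est = scrapy.Field()
    Down_est = scrapy.Field()
    Today = scrapy.Field()
    Yesterday = scrapy.Field()


# settings.py (the package name is a guess):
# ITEM_PIPELINES = {
#     'seleniumStock.pipelines.SeleniumstockPipeline': 300,
# }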

Results:

Reflections:

This exercise gave me a deeper understanding of the Selenium framework.

Task ③:

Requirements: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl foreign-exchange data.

Candidate site: China Merchants Bank FX rates: http://fx.cmbchina.com/hq/

Output: data stored in a MySQL database, in the required output format.

Code:

Exchange.py:

import scrapy
from ..items import CurrencyItem
from bs4 import UnicodeDammit

class ExchangeSpider(scrapy.Spider):
    name = 'Exchange'
    start_urls = ['http://fx.cmbchina.com/hq/']

    def parse(self, response):
        try:
            dammit=UnicodeDammit(response.body,["utf-8","gbk"])
            data=dammit.unicode_markup
            selector=scrapy.Selector(text=data)
            trs=selector.xpath("//div[@id='realRateInfo']/table[@class='data']//tr")
            num=0
            for tr in trs:
                if (num==0):
                    num=num+1
                    continue
                Currency=tr.xpath("./td[@class='fontbold']/text()").extract_first()
                TSP=tr.xpath("./td[position()=4]/text()").extract_first()
                CSP=tr.xpath("./td[position()=5]/text()").extract_first()
                TBP=tr.xpath("./td[position()=6]/text()").extract_first()
                CBP=tr.xpath("./td[position()=7]/text()").extract_first()
                Time=tr.xpath("./td[position()=8]/text()").extract_first()
                item=CurrencyItem()
                item["Currency"]=Currency.strip() if Currency else ""
                item["TSP"]=TSP.strip() if TSP else ""
                item["CSP"]=CSP.strip()[1:] if CSP else ""
                item["TBP"]=TBP.strip() if TBP else ""
                item["CBP"]=CBP.strip() if CBP else ""
                item["Time"]=Time.strip() if Time else ""
                yield item
            
           
        except Exception as err:
            print(err) 
        pass

pipelines.py:


from itemadapter import ItemAdapter
import pymysql

class CurrencyPipeline(object):
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="**********",db="foreignExchange",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from currency")
            self.opened=True
            self.count=0
        except Exception as err:
            print(err)
            self.opened=False


    def process_item(self, item, spider):
        try:
            if self.opened:
                self.count+=1
                n=str(self.count)
                print(n+" "+item["Currency"]+" "+item["TSP"]+" "+item["CSP"]+" "+item["TBP"]+" "+item["CBP"]+" "+item["Time"])
                self.cursor.execute("insert into currency(bNum,bCurrency,bTSP,bCSP, bTBP,bCBP,bTime)values(%s,%s,%s,%s,%s,%s,%s)",(n,item["Currency"],item["TSP"],item["CSP"],item["TBP"],item["CBP"],item["Time"]))
                
        except Exception as err:
            print(err)
        return item

    
    def close_spider(self,spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened=False
        print("closed")

items.py:

import scrapy


class CurrencyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    Currency=scrapy.Field()
    TSP=scrapy.Field()
    CSP=scrapy.Field()
    TBP=scrapy.Field()
    CBP=scrapy.Field()
    Time=scrapy.Field()
    pass

settings.py:

ITEM_PIPELINES = {
    'currency.pipelines.CurrencyPipeline': 300,
}

run.py:

from scrapy import cmdline
cmdline.execute("scrapy crawl Exchange -s LOG_ENABLED=False".split())

Results:

Reflections:

This task scrapes a static page; once the first example is understood, there is not much more to say about this one.
