Experiment 4

Assignment 1

Requirements:

Become proficient in serializing and outputting Item and Pipeline data in scrapy, and use the Scrapy + XPath + MySQL storage route to scrape book data from the Dangdang website.

Candidate website

http://www.dangdang.com/

Output:

The MySQL storage and output format are as follows:

Create the MySQL database mydb and the books table

cursor.execute('CREATE DATABASE IF NOT EXISTS mydb DEFAULT CHARSET utf8 COLLATE utf8_general_ci;')

cursor.execute('drop table if exists books')
sql="""CREATE TABLE IF NOT EXISTS `books` (
	  `bTitle` varchar(512) NOT NULL,
	  `bAuthor` varchar(256),
	  `bPublisher` varchar(256),
	  `bDate` varchar(32),
	  `bPrice` varchar(16),
	  `bDetail` text,
	  PRIMARY KEY (`bTitle`)
	) ENGINE=InnoDB  DEFAULT CHARSET=utf8"""

cursor.execute(sql)
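
The statements above assume an open pymysql connection and cursor that the post does not show. A minimal sketch of that setup, following the same pattern Assignment 3 uses later in this post (the password is a placeholder):

import pymysql

# connect to the MySQL server (the password is a placeholder)
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                       passwd="your_password", charset="utf8")
cursor = conn.cursor()
# create the database, switch to it, then run the table DDL shown above
cursor.execute('CREATE DATABASE IF NOT EXISTS mydb DEFAULT CHARSET utf8 '
               'COLLATE utf8_general_ci;')
conn.select_db('mydb')
cursor = conn.cursor()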

Create the scrapy project and enter the project directory

scrapy startproject dangdang
# after the command above succeeds, enter the dangdang project directory
cd dangdang

Create the spider file

scrapy genspider -t basic dd dangdang.com

Open the project folder in PyCharm and edit the following files

items.py

import scrapy


class BookItem(scrapy.Item):
    # fields for one Dangdang book record
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    detail = scrapy.Field()
    price = scrapy.Field()

settings.py

# ignore the site's robots.txt
ROBOTSTXT_OBEY = False
# enable the item pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'dangdang.pipelines.DangdPipeline': 300,
}

Create run.py

from scrapy import cmdline
# run run.py directly instead of switching to a command prompt
cmdline.execute("scrapy crawl dd".split())

Write the spider file dd.py

import scrapy
from bs4 import UnicodeDammit

from dangdang.items import BookItem


class DdSpider(scrapy.Spider):
    name = "dd"
    key = "python"  # search keyword (example value; the original post does not show it)
    # starting URL of Dangdang's search page
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        url = DdSpider.source_url + "?key=" + DdSpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    # parse the page: extract each book's fields and follow the next-page link
    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector=scrapy.Selector(text=data)
            lis=selector.xpath("//li['@ddt-pit'][starts-with(@class,'line')]")
            for li in lis:
                title=li.xpath("./a[position()=1]/@title").extract_first()
                price =li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date =li.xpath("./p[@class='search_book_author']/span[position()=last()- 1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, in which case the result is None
                item=BookItem()
                item["title"]=title.strip() if title else ""
                item["author"]=author.strip() if author else ""
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # on the last page there is no "next" link, so link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)

Write pipelines.py

In Scrapy, this class's open_spider(self, spider) method runs as soon as the spider is opened, and close_spider(self, spider) runs when the spider closes. The program therefore connects to the MySQL database and creates the cursor self.cursor in open_spider, then commits and closes the connection in close_spider; a count variable tracks how many books are scraped. Each time an item reaches the processing function, its contents are printed and inserted into the database with an INSERT statement.

import pymysql


class DangdPipeline(object):
    def open_spider(self,spider):
        print("opened")
        try:
            self.con=pymysql.connect(host="127.0.0.1",port=3306,user="root",passwd="278676",db="mydb",charset="utf8")
            self.cursor=self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "本书籍")

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                self.cursor.execute("insert into books (bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) values( % s, % s, % s, % s, % s, % s)",(item["title"],item["author"],item["publisher"],item["date"],item["price"],item["detail"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item

Results:

Gitee repository:

Assignment 1:
https://gitee.com/flyme10086/data-excavate/tree/master/作业4/任务一

Reflections:

By reproducing the textbook code, this experiment deepened my understanding of scrapy and XPath and of storing data in a MySQL database.

Assignment 2

Requirements:

Become proficient in serializing and outputting Item and Pipeline data in scrapy; use the scrapy framework + XPath + MySQL storage route to scrape foreign-exchange data from a bank website.

Candidate website:

China Merchants Bank (招商银行): http://fx.cmbchina.com/hq/

Output:

MySQL storage and output format
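
The post does not show the DDL for the fcs table that the pipeline below inserts into (it connects to the pydb database). A minimal sketch consistent with the insert statement in pipelines.py; the column names come from that code, while the varchar lengths are assumptions:

cursor.execute('drop table if exists fcs')
sql = """CREATE TABLE IF NOT EXISTS `fcs` (
      `id` varchar(16) NOT NULL,
      `Currency` varchar(32),
      `TSP` varchar(16),
      `CSP` varchar(16),
      `TBP` varchar(16),
      `CBP` varchar(16),
      `Time` varchar(32),
      PRIMARY KEY (`id`)
    ) ENGINE=InnoDB  DEFAULT CHARSET=utf8"""
cursor.execute(sql)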

Create the scrapy project and enter the project directory

scrapy startproject cmbchina
# after the command above succeeds, enter the cmbchina project directory
cd cmbchina

Create the spider file

scrapy genspider -t basic cmb fx.cmbchina.com

Open the project folder in PyCharm and edit the following files

items.py

import scrapy


class CmbchinaItem(scrapy.Item):
    # currency name, four quoted rates (TSP/CSP/TBP/CBP) and the quote time
    Currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    Time = scrapy.Field()

settings.py

# ignore the site's robots.txt
ROBOTSTXT_OBEY = False
# enable the item pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'cmbchina.pipelines.CmbchinaPipeline': 300,
}

Create run.py

from scrapy import cmdline
# run run.py directly instead of switching to a command prompt
cmdline.execute("scrapy crawl cmb".split())

Write the spider file cmb.py

import scrapy

from cmbchina.items import CmbchinaItem


class CmbSpider(scrapy.Spider):
    name = "cmb"
    # start from the candidate page listed above
    start_urls = ["http://fx.cmbchina.com/hq/"]

    # fetch the rate table and strip the whitespace from every field
    def parse(self, response):
        item = CmbchinaItem()
        currency = response.xpath('//div[@id="realRateInfo"]/table/tr/td[@class="fontbold"][1]/text()').extract()
        item["Currency"] = [x.strip() for x in currency if x.strip() != '']
        tsp = response.xpath('//*[@id="realRateInfo"]/table/tr/td[@class="numberright"][1]/text()').extract()
        item["TSP"]= [x.strip() for x in tsp if x.strip() != '']
        csp = response.xpath('//*[@id="realRateInfo"]/table/tr/td[@class="numberright"][2]/text()').extract()
        item["CSP"]= [x.strip() for x in csp if x.strip() != '']
        tbp = response.xpath('//*[@id="realRateInfo"]/table/tr/td[@class="numberright"][3]/text()').extract()
        item["TBP"]= [x.strip() for x in tbp if x.strip() != '']
        cbp = response.xpath('//*[@id="realRateInfo"]/table/tr/td[@class="numberright"][4]/text()').extract()
        item["CBP"] = [x.strip() for x in cbp if x.strip() != '']
        tim = response.xpath('//*[@id="realRateInfo"]/table/tr/td[@align="center"][3]/text()').extract()
        item["Time"] = [x.strip() for x in tim if x.strip() != '']
        yield item

Write pipelines.py

import pymysql


class CmbchinaPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="278676", db="pydb",charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from fcs")
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("总共爬取", self.count, "条数据")

    def process_item(self, item, spider):
        try:

            if self.opened:
                for i in range(len(item["Currency"])):
                    num = str(self.count+1)
                    print(num,end='\t')
                    print(item["Currency"][i],end='\t')
                    print(item["TSP"][i],end='\t')
                    print(item["CSP"][i],end='\t')
                    print(item["TBP"][i],end='\t')
                    print(item["CBP"][i],end='\t')
                    print(item["Time"][i])
                    print()
                    self.cursor.execute(
                        "insert into fcs (id,Currency,TSP,CSP,TBP,CBP,Time) values( % s,% s, % s, % s, % s, % s, % s)",
                        (num, item["Currency"][i], item["TSP"][i], item["CSP"][i], item["TBP"][i], item["CBP"][i], item["Time"][i]))
                    self.count += 1
        except Exception as err:
            print(err)
        return item

Results:

Gitee repository:

Assignment 2:
https://gitee.com/flyme10086/data-excavate/tree/master/作业4/任务二

Reflections:

By working through the scrapy + XPath + MySQL storage route, this assignment gave me a firmer grasp of the scrapy framework and a better understanding of MySQL insert, delete, update and query operations.

Assignment 3

Requirements:

Become proficient in using Selenium to locate HTML elements, scrape Ajax-rendered pages, and wait for HTML elements; use the Selenium framework + MySQL storage route to scrape stock data from the 沪深A股, 上证A股 and 深证A股 boards.

Candidate website

East Money (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board

Output:

Column headers should use English names defined by the students themselves, for example id for the row number, bStockNo for the stock code, and so on.

Launch Chrome in visible (non-headless) mode so that the cause of errors is easier to spot

# imports used throughout the script below
import random
import time

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()

Open the start page

driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
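
The stock grid is filled in by Ajax after the page loads. The script below relies on time.sleep(random.random()) before reading the table; a minimal sketch of an explicit wait with WebDriverWait, as an alternative that is not part of the original script:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the first data cell of the grid to appear
# before reading any table rows
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//tbody/tr/td[2]'))
)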

Database setup

# open the database connection
conn = pymysql.connect(host="localhost",user="root",passwd="278676",port=3306,charset="utf8")
# get a cursor
cursor = conn.cursor()
# create the mydb database
cursor.execute('CREATE DATABASE IF NOT EXISTS mydb DEFAULT CHARSET utf8 COLLATE utf8_general_ci;')
# select the database
conn.select_db('mydb')
# get a cursor on it
cursor = conn.cursor()
# create the stocks_hs table
cursor.execute('drop table if exists stocks_hs')
sql="""CREATE TABLE IF NOT EXISTS `stocks_hs` (
	  `ID` varchar(16) NOT NULL,
	  `sID` varchar(16) NOT NULL,
	  `sName` varchar(64) NOT NULL,
	  `sPrice` varchar(16),
	  `sRFExtent` varchar(16),
	  `sRFQuota` varchar(16),
	  `sNum` varchar(16),
	  `sQuota` varchar(16),
	  `sExtent` varchar(16),
	  `sHigh` varchar(16),
	  `sLow` varchar(16),
	  `sToday` varchar(16),
	  `sYesterday` varchar(16),
	  PRIMARY KEY (`sName`)
	) ENGINE=InnoDB  DEFAULT CHARSET=utf8"""
cursor.execute(sql)

Fetch the page data

# the 沪深A (hs_a) board
time.sleep(random.random())
hsA=driver.find_element(By.XPATH,'//*[@id="nav_hs_a_board"]/a')
hsA.click()
ID = driver.find_elements(By.XPATH,'//tbody/tr/td[2]')
name = driver.find_elements(By.XPATH,'//tbody/tr/td[3]')
price = driver.find_elements(By.XPATH,'//tbody/tr/td[5]')
RFextent = driver.find_elements(By.XPATH,'//tbody/tr/td[6]')
RFquota = driver.find_elements(By.XPATH,'//tbody/tr/td[7]')
num = driver.find_elements(By.XPATH,'//tbody/tr/td[8]')
quota = driver.find_elements(By.XPATH,'//tbody/tr/td[9]')
extent = driver.find_elements(By.XPATH,'//tbody/tr/td[10]')
high = driver.find_elements(By.XPATH,'//tbody/tr/td[11]')
low = driver.find_elements(By.XPATH,'//tbody/tr/td[12]')
today = driver.find_elements(By.XPATH,'//tbody/tr/td[13]')
yesterday = driver.find_elements(By.XPATH,'//tbody/tr/td[14]')

Insert the data into the database

for i in range(len(ID)):
    cursor.execute("insert into stocks_hs (ID,sID,sName,sPrice,sRFExtent,sRFQuota,sNum,sQuota,sExtent,sHigh,sLow,sToday,sYesterday) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                   (str(i+1),ID[i].text, name[i].text, price[i].text, RFextent[i].text, RFquota[i].text, num[i].text, quota[i].text, extent[i].text, high[i].text, low[i].text, today[i].text, yesterday[i].text))
    print(str(i+1),ID[i].text, name[i].text, price[i].text, RFextent[i].text, RFquota[i].text, num[i].text, quota[i].text, extent[i].text, high[i].text, low[i].text, today[i].text, yesterday[i].text)
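
The code above scrapes only the 沪深A board, while the assignment covers three boards. A sketch of how the remaining two could be handled the same way, reusing the driver and cursor from the script above; the nav element ids (nav_sh_a_board, nav_sz_a_board) are assumed to follow the nav_hs_a_board pattern, and the per-board table names stocks_sh and stocks_sz are hypothetical and would need to be created like stocks_hs:

import random
import time

from selenium.webdriver.common.by import By

# hypothetical loop over the other two boards
boards = (("nav_sh_a_board", "stocks_sh"), ("nav_sz_a_board", "stocks_sz"))
columns = [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]   # same cells as scraped above
for nav_id, table in boards:
    time.sleep(random.random())
    driver.find_element(By.XPATH, '//*[@id="' + nav_id + '"]/a').click()
    time.sleep(2)                                     # let the Ajax grid refresh
    cols = [driver.find_elements(By.XPATH, '//tbody/tr/td[%d]' % c) for c in columns]
    for i, row in enumerate(zip(*cols)):
        cursor.execute(
            "insert into " + table + " (ID,sID,sName,sPrice,sRFExtent,sRFQuota,"
            "sNum,sQuota,sExtent,sHigh,sLow,sToday,sYesterday) "
            "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(i + 1),) + tuple(cell.text for cell in row))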

Commit and close the database connection

# commit the data
conn.commit()
# close the cursor first
cursor.close()
# then close the connection
conn.close()

Results:

Gitee repository:

Assignment 3:
https://gitee.com/flyme10086/data-excavate/tree/master/作业4/任务三

Reflections:

This assignment required a deeper command of Selenium. Through it I gained a better feel for driving Chrome with Selenium to collect data, and a further understanding of storing the results in MySQL.
