# 「数据采集」实验四

## 作业①

• 要求：熟练掌握 Scrapy 中 Item、Pipeline 数据的序列化输出方法，使用 Scrapy+XPath+MySQL 数据库存储技术路线爬取当当网站图书数据
• 候选网站：http://www.dangdang.com/
• 关键词：学生自由选择
• 输出信息：
id title author publisher date price detail
1 Python算法图解 何韬 清华大学出版社 2021-04-01 ¥34.50 用到算法。数据结构是算法的基础，数组、字典、堆、栈、链表...
.. .. .. .. .. .. ..

Gitee链接 : 作业4-1

### 1.网页解析

• 找到信息所处位置
• xpath获取信息
lis=selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
• 定位

• xpath获取信息
title=li.xpath("./a[position()=1]/@title").extract_first()
price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
detail = li.xpath("./p[@class='detail']/text()").extract_first()

### 2.编写items.py中的数据项目类

class BookItem(scrapy.Item):
    """One book record scraped from a dangdang search-result page."""

    title = scrapy.Field()      # book title
    author = scrapy.Field()     # first listed author
    date = scrapy.Field()       # publication date string
    publisher = scrapy.Field()  # publisher name
    detail = scrapy.Field()     # short description (may be empty)
    price = scrapy.Field()      # current price text, e.g. "¥34.50"


### 3.编写pipelines.py中的数据处理类

class BookPipeline(object):
    """Scrapy item pipeline that stores dangdang book items in SQLite.

    NOTE(review): the assignment text asks for MySQL, but this code uses the
    stdlib sqlite3 module and a local "books.db" file — confirm which is wanted.
    """

    def open_spider(self, spider):
        """Open the DB connection and create (or clear) the books table."""
        print("opened")
        self.con = sqlite3.connect("books.db")
        self.cursor = self.con.cursor()
        try:
            try:
                self.cursor.execute(
                    "create table books("
                    "bTitle varchar(512),bAuthor varchar(256),"
                    "bPublisher varchar(256),bDate varchar(32),"
                    "bPrice varchar(16),bDetail varchar(256),"
                    "constraint pk_books primary key (bTitle,bAuthor));")
            except sqlite3.OperationalError:
                # Table already exists from a previous run: start from a
                # clean slate instead of failing on the duplicate "create".
                self.cursor.execute("delete from books")
            self.opened = True
            self.count = 1  # next row number; rows inserted == count - 1
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        """Commit, close, and report how many books were actually stored."""
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        # count starts at 1 and is bumped once per insert, so the number of
        # stored books is count - 1 (the original printed count — one too
        # many — while the sibling BankPipeline correctly printed count - 1).
        print("总共爬取", self.count - 1, "本书籍")

    def process_item(self, item, spider):
        """Echo one book to the console and insert it; always return item."""
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            print()
            if self.opened:
                # Parameterized insert avoids quoting/SQL-injection issues.
                self.cursor.execute(
                    "insert into books (bTitle,bAuthor,bPublisher,bDate,"
                    "bPrice,bDetail) values (?,?,?,?,?,?)",
                    (item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item


### 4.编写setting.py

# Scrapy project settings for the dangdang book crawler.
BOT_NAME = 'bank'

SPIDER_MODULES = ['bank.spiders']
NEWSPIDER_MODULE = 'bank.spiders'
# NOTE(review): the original registered 'bank.pipelines.BankPipeline' here,
# but the pipeline written for this assignment is BookPipeline; with the
# wrong class name, book items would never reach the database.
ITEM_PIPELINES = {'bank.pipelines.BookPipeline': 300, }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'bank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


### 5.编写MySpider.py爬虫程序

class MySpider(scrapy.Spider):
    """Crawl dangdang search results for ``key`` and yield BookItem objects,
    following the "next page" link until it disappears on the last page."""

    name = "mySpider"
    key = 'python'
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        """Issue the initial search request for the configured keyword."""
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract every book on the page, then schedule the next page."""
        try:
            # Search pages may be gbk-encoded; let UnicodeDammit decide.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # BUGFIX: the original predicate ['@ddt-pit'] is a non-empty
            # string literal — always true.  [@ddt-pit] actually restricts
            # the match to <li> elements that carry the ddt-pit attribute.
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                # detail is sometimes absent, in which case this is None.
                detail = li.xpath("./p[@class='detail']/text()").extract_first()

                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # [1:] drops the leading separator character of the date
                # cell — presumably "/"; confirm against the live markup.
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item

            # On the last page there is no "next" link and link is None.
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)

        except Exception as err:
            print(err)


• 控制台输出
• 数据库截图

### 7.心得体会

• 本题为复现课本中代码，未遇到困难。

## 作业②

• 要求：熟练掌握 Scrapy 中 Item、Pipeline 数据的序列化输出方法；使用 Scrapy 框架+XPath+MySQL 数据库存储技术路线爬取外汇网站数据。
• 候选网站：招商银行网：http://fx.cmbchina.com/hq/
• 输出信息：MYSQL数据库存储和输出格式
id currency tsp csp tbp cbp time
1 港币 82.23 82.23 81.91 81.33 10:22:05
.. .. .. .. .. .. ..

Gitee链接：作业4-2

### 1.网页解析

• xpath 获取信息
currency = selector.xpath('//div[@id="realRateInfo"]//td[1]/text()').extract()
tsp = selector.xpath('//div[@id="realRateInfo"]//td[4]/text()').extract()
csp = selector.xpath('//div[@id="realRateInfo"]//td[5]/text()').extract()
tbp = selector.xpath('//div[@id="realRateInfo"]//td[6]/text()').extract()
cbp = selector.xpath('//div[@id="realRateInfo"]//td[7]/text()').extract()
time = selector.xpath('//div[@id="realRateInfo"]//td[8]/text()').extract()

### 2.编写items.py中的数据项目类

class BankItem(scrapy.Item):
    """One foreign-exchange quote row from the CMB realtime rate table."""

    id = scrapy.Field()        # sequential row id
    currency = scrapy.Field()  # currency name, e.g. 港币
    tsp = scrapy.Field()       # 现汇卖出价 (telegraphic selling price)
    csp = scrapy.Field()       # 现钞卖出价 (cash selling price)
    tbp = scrapy.Field()       # 现汇买入价 (telegraphic buying price)
    cbp = scrapy.Field()       # 现钞买入价 (cash buying price)
    time = scrapy.Field()      # quote time, e.g. 10:22:05


### 3.编写pipelines.py中的数据处理类

class BankPipeline:
    """Scrapy item pipeline that stores CMB foreign-exchange rows in SQLite.

    NOTE(review): the assignment text asks for MySQL, but this code uses the
    stdlib sqlite3 module and a local "bank.db" file — confirm which is wanted.
    """

    def open_spider(self, spider):
        """Open the DB connection and create (or clear) the bank table."""
        print("opened")
        self.con = sqlite3.connect("bank.db")
        self.cursor = self.con.cursor()
        try:
            try:
                # BUGFIX: the original had a stray '"' before "constraint",
                # which made this statement a Python syntax error.
                self.cursor.execute(
                    "create table bank ("
                    "bId varchar(4),bCurrency varchar(16),bTsp varchar(8),"
                    "bCsp varchar(8),bTbp varchar(8),bCbp varchar(8),"
                    "bTime varchar(32),"
                    "constraint pk_bank primary key (bId,bCurrency));")
            except sqlite3.OperationalError:
                # Table already exists from a previous run: just empty it.
                self.cursor.execute("delete from bank")
            self.opened = True
            self.count = 1  # next bId; rows inserted == count - 1
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        """Commit, close, and report how many rows were stored."""
        try:
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened = False
        except Exception as err:
            print(err)
        print("closed")
        print("总共爬取", self.count - 1, "项信息")

    def process_item(self, item, spider):
        """Echo one FX row and insert it with a sequential bId."""
        try:
            print(self.count, item['currency'], item['tsp'], item['csp'],
                  item['tbp'], item['cbp'], item['time'])
            if self.opened:
                self.cursor.execute(
                    "insert into bank (bId,bCurrency,bTsp,bCsp,bTbp,bCbp,bTime) "
                    "values(?,?,?,?,?,?,?)",
                    (self.count, item['currency'], item['tsp'], item['csp'],
                     item['tbp'], item['cbp'], item['time']))
                self.count += 1
        except Exception as err:
            print(err)
        return item


### 4.编写MySpider.py爬虫程序

class MySpider(scrapy.Spider):
    """Scrape the realtime FX rate table from fx.cmbchina.com (one page)."""

    name = "bankSpider"
    source_url = "http://fx.cmbchina.com/hq/"
    page = 0   # pages parsed so far (informational only)
    count = 1

    def start_requests(self):
        """Issue the single request for the rate page."""
        yield scrapy.Request(url=MySpider.source_url, callback=self.parse)

    def parse(self, response):
        """Extract each data row of the rate table and yield a BankItem."""
        try:
            # BUGFIX: the original caught UnicodeDammit errors in a nested
            # try and then used `data` anyway, raising NameError when the
            # decode failed.  One try block handles the failure once.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)

            cell = '//div[@id="realRateInfo"]//td[%d]/text()'
            currency = selector.xpath(cell % 1).extract()
            tsp = selector.xpath(cell % 4).extract()
            csp = selector.xpath(cell % 5).extract()
            tbp = selector.xpath(cell % 6).extract()
            cbp = selector.xpath(cell % 7).extract()
            # Quote timestamps; renamed so the local no longer reads like
            # the stdlib `time` module.
            quote_time = selector.xpath(cell % 8).extract()

            # Row 0 is the table header, so data rows start at index 1.
            for i in range(1, len(currency)):
                item = BankItem()
                item["currency"] = currency[i].strip() if currency[i] else ""
                item["tsp"] = tsp[i].strip() if tsp[i] else ""
                item["csp"] = csp[i].strip() if csp[i] else ""
                item["tbp"] = tbp[i].strip() if tbp[i] else ""
                item["cbp"] = cbp[i].strip() if cbp[i] else ""
                item["time"] = quote_time[i].strip() if quote_time[i] else ""
                yield item

            MySpider.page += 1
            print("MySpider.page:", MySpider.page)

        except Exception as err:
            print(err)


• 控制台输出
• 数据库截图

### 6.心得体会

• 对于Scrapy爬虫程序的框架使用逐渐熟练；
• 本题未进行翻页爬取，难度较低，未遇到困难。

## 作业③

• 要求：熟练掌握 Selenium查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容；使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
• 候选网站：东方财富网
• 输出信息：MySQL数据库存储和输出格式如下

1 688093 N世华 28.47 62.22% 10.92 26.13万 7.6亿 22.3% 32.0 28.08 30.2 17.55
2 ... ... ... ...

Gitee链接：作业4-3

### 1.网页解析

• 板块切换

lis = driver.find_elements(By.XPATH, '//div[@id="tab"]/ul/li')
tab = lis[i].find_element(By.XPATH,'.//a')
driver.execute_script("arguments[0].click();", tab)
• 翻页

time.sleep(3)
input = driver.find_element(By.XPATH,'//div[@class="dataTables_wrapper"]//input')
input.clear()
input.send_keys(页码)
go = driver.find_element(By.XPATH,'//div[@class="dataTables_wrapper"]//a[@class="paginte_go"]')
go.click()
• 信息定位

### 2.数据库操作

class StocksDB:
def openDB(self):

def closeDB(self):

def insert(self, tab,id, num, name, new, up, upprice, com, comprice, f, max,min,today,yes):



### 3.获取网页login()

def login():
    """Open the East-Money hs_a_board stock-list page in the shared driver."""
    driver.get('http://quote.eastmoney.com/center/gridlist.html#hs_a_board')


### 4.拉动侧边滑动条

def drop_scroll():
    """Scroll down the page in steps so Ajax-loaded rows render fully."""
    # Same positions the original computed as x/10 for x in 1, 3, 5, 7, 9.
    for fraction in (0.1, 0.3, 0.5, 0.7, 0.9):
        time.sleep(0.5)
        # Move the scrollbar to the given fraction of the full page height.
        script = ('document.documentElement.scrollTop = '
                  'document.documentElement.scrollHeight * %f' % fraction)
        driver.execute_script(script)


### 5.获取信息get_info()

def get_info(tab):
    """Scrape every row of the currently displayed stock table and insert
    it into the module-level database helper ``db``.

    tab: the board name (e.g. 沪深A股) stored alongside each row.
    """
    # Each <tr> of the table body is one stock.
    trs = driver.find_elements(By.XPATH, '//tbody/tr')
    for tr in trs:
        # Locals renamed: the original shadowed the builtins id/max/min.
        serial = tr.find_element(By.XPATH, './/td[1]').text            # 序号
        code = tr.find_element(By.XPATH, './/td[2]/a').text            # 股票代码
        name = tr.find_element(By.XPATH, './/td[3]/a').text            # 股票名称
        latest = tr.find_element(By.XPATH, './/td[5]/span').text       # 最新价
        change_pct = tr.find_element(By.XPATH, './/td[6]/span').text   # 涨跌幅
        change_amt = tr.find_element(By.XPATH, './/td[7]/span').text   # 涨跌额
        volume = tr.find_element(By.XPATH, './/td[8]').text            # 成交量
        turnover = tr.find_element(By.XPATH, './/td[9]').text          # 成交额
        amplitude = tr.find_element(By.XPATH, './/td[10]').text        # 振幅
        high = tr.find_element(By.XPATH, './/td[11]/span').text        # 最高
        low = tr.find_element(By.XPATH, './/td[12]/span').text         # 最低
        today_open = tr.find_element(By.XPATH, './/td[13]/span').text  # 今开
        prev_close = tr.find_element(By.XPATH, './/td[14]').text       # 昨收

        print(tab, serial, code, name, latest, change_pct, change_amt,
              volume, turnover, amplitude, high, low, today_open, prev_close)
        db.insert(tab, serial, code, name, latest, change_pct, change_amt,
                  volume, turnover, amplitude, high, low, today_open, prev_close)
        print(str(serial) + ' has been inserted')


### 6.切换页面

def to_searchs():
    """Visit the three A-share boards and scrape three pages from each."""
    for board_idx in range(3):
        # The page re-renders after a board switch, so previously fetched
        # <li> handles go stale ("element is not attached to the page
        # document"); wait, then re-query the tab list each iteration.
        time.sleep(3)
        tabs = driver.find_elements(By.XPATH, '//div[@id="tab"]/ul/li')
        tab = tabs[board_idx].find_element(By.XPATH, './/a')
        board_name = tab.text
        print(board_name)
        # tab.click() raises "element click intercepted"; a JS click works.
        driver.execute_script("arguments[0].click();", tab)

        # The original reused `i` for both loops and shadowed builtin
        # `input`; renamed for clarity (outer iteration was unaffected).
        for page_idx in range(3):
            time.sleep(3)  # let the Ajax table attach before touching it
            page_box = driver.find_element(
                By.XPATH, '//div[@class="dataTables_wrapper"]//input')
            page_box.clear()
            page_box.send_keys(page_idx + 1)
            go = driver.find_element(
                By.XPATH,
                '//div[@class="dataTables_wrapper"]//a[@class="paginte_go"]')
            go.click()
            # Scroll so every lazily-loaded row is present, then harvest.
            drop_scroll()
            get_info(board_name)


• 控制台输出
• 数据库截图

### 8.心得体会

• 通过本题熟悉了selenium框架的使用；
• 在切换页面时会出现element is not attached to the page document的错误，原因是在刚进入新的页面时页面数据未加载完全，time.sleep(3)即可解决问题；
• 在切换板块时若使用tab.click()会报错element click intercepted，改为driver.execute_script("arguments[0].click();", tab)
posted @ 2021-11-10 12:55  Sevennnn  阅读(41)  评论(0编辑  收藏  举报