Data Collection Practice: Assignment 4

Assignment ①

1) Crawling book data from Dangdang

Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl book data from Dangdang along the Scrapy + XPath + MySQL storage technical route.

Candidate site: http://search.dangdang.com/?key=python&act=input

Keyword: freely chosen by the student

Output: the MySQL output is shown below; a possible books table definition is sketched right after.

(screenshot: sample rows of the books table with columns bTitle, bAuthor, bPublisher, bDate, bPrice, bDetail)

 
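The pipeline in this assignment writes into a books table in the mydb database, but the table itself is never defined in the post. A minimal one-off helper to create it might look like the sketch below; the column names come from the pipeline's insert statement, while the file name and the column types are my own assumptions.

# create_books_table.py: hypothetical one-off helper; column types are assumptions
import pymysql

con = pymysql.connect(host="localhost", port=3306, user="root",
                      password="123456", db="mydb", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("""
        create table if not exists books (
            bID int auto_increment primary key,
            bTitle varchar(512),
            bAuthor varchar(256),
            bPublisher varchar(256),
            bDate varchar(32),
            bPrice varchar(32),
            bDetail text
        ) default charset=utf8
    """)
con.commit()
con.close()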

Main code:

BookSpider.py

import scrapy
from book.items import BookItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = "books"
    key = 'python'
    source_url = 'http://search.dangdang.com/'

    def start_requests(self):
        url = MySpider.source_url + "?key=" + MySpider.key
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            # Dangdang pages may be GBK-encoded, so let UnicodeDammit guess the encoding
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                # detail is sometimes missing, in which case the result is None

                item = BookItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                # the date text starts with a '/' separator, so drop the first character
                item["date"] = date.strip()[1:] if date else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item

            # on the last page the "next" link is None
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as err:
            print(err)

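BookSpider.py imports BookItem from book/items.py, which is not shown above. A minimal sketch of it, with field names taken from the keys the spider fills in, would be:

book/items.py

import scrapy


class BookItem(scrapy.Item):
    # fields match the keys used in MySpider.parse
    title = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    publisher = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()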
 

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class BookPipeline(object):
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", password="123456",
                                       db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")  # start from an empty table
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("crawled", self.count, "books in total")

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.cursor.execute("insert into books(bTitle,bAuthor,bPublisher,bDate,bPrice,bDetail) "
                                    "values(%s,%s,%s,%s,%s,%s)",
                                    (item['title'], item["author"], item["publisher"], item["date"],
                                     item["price"], item["detail"]))
                self.count += 1
                print("book " + str(self.count) + " inserted")
        except Exception as err:
            print(err)
        return item

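As the comment at the top of pipelines.py notes, the pipeline only runs if it is registered in the project's settings.py. A minimal excerpt, assuming the Scrapy project is named book as implied by the import in BookSpider.py:

settings.py (excerpt)

BOT_NAME = 'book'

# enable the MySQL pipeline; 300 is an arbitrary priority value
ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,
}

# depending on the target site's robots.txt, this may also be needed for the exercise
ROBOTSTXT_OBEY = False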
 

Experiment screenshot:

 

2) Reflections

This task mainly reproduces the textbook example; the problems I ran into were mostly with inserting the data into MySQL and connecting to the database.

Assignment ②

1) Crawling foreign-exchange data

  • Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data along the Scrapy framework + XPath + MySQL storage technical route.

  • Candidate site: China Merchants Bank: http://fx.cmbchina.com/hq/

  • Output: MySQL storage and output format (a possible forexs table definition is sketched after the sample table):

      Id   Currency   TSP     CSP     TBP     CBP     Time
      1    港币       86.60   86.60   86.26   85.65   15:36:30
      2    ......

 
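As in assignment ①, the pipeline below expects a forexs table to exist in mydb. A minimal one-off helper to create it is sketched here; the column names come from the pipeline's insert statement, while the file name and column types are assumptions.

# create_forexs_table.py: hypothetical one-off helper; column types are assumptions
import pymysql

con = pymysql.connect(host="localhost", port=3306, user="root",
                      password="123456", db="mydb", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("""
        create table if not exists forexs (
            Id int auto_increment primary key,
            Currency varchar(32),
            TSP varchar(16),
            CSP varchar(16),
            TBP varchar(16),
            CBP varchar(16),
            Time varchar(16)
        ) default charset=utf8
    """)
con.commit()
con.close()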

Main code:

ForexSpider.py

import scrapy
from forex.items import ForexItem
from bs4 import UnicodeDammit


class MySpider(scrapy.Spider):
    name = "forex"
    source_url = 'http://fx.cmbchina.com/Hq/'

    def start_requests(self):
        url = MySpider.source_url
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "utf-16", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            # rows of the rate table, skipping the header row
            trs = selector.xpath('//*[@id="realRateInfo"]//tr[position()>1]')
            for tr in trs:
                currency = tr.xpath("./td[@class='fontbold'][1]/text()").extract_first()
                tsp = tr.xpath('./td[4]/text()').extract_first()
                csp = tr.xpath('./td[5]/text()').extract_first()
                tbp = tr.xpath('./td[6]/text()').extract_first()
                cbp = tr.xpath('./td[7]/text()').extract_first()
                last_time = tr.xpath('./td[8]/text()').extract_first()
                item = ForexItem()
                item['currency'] = currency.strip() if currency else ""
                item['tsp'] = tsp.strip() if tsp else ""
                item['csp'] = csp.strip() if csp else ""
                item['tbp'] = tbp.strip() if tbp else ""
                item['cbp'] = cbp.strip() if cbp else ""
                item['last_time'] = last_time.strip() if last_time else ""
                yield item

        except Exception as err:
            print(err)

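ForexSpider.py imports ForexItem from forex/items.py, which again is not shown. A minimal sketch with the field names used by the spider:

forex/items.py

import scrapy


class ForexItem(scrapy.Item):
    # fields match the keys used in MySpider.parse
    currency = scrapy.Field()
    tsp = scrapy.Field()
    csp = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    last_time = scrapy.Field()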
 

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class ForexPipeline(object):
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", password="123456",
                                       db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from forexs")  # start from an empty table
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("crawled", self.count, "records in total")

    def process_item(self, item, spider):
        print()
        try:
            print(item['currency'])
            print(item['tsp'])
            print(item['csp'])
            print(item['tbp'])
            print(item['cbp'])
            print(item['last_time'])
            print()
            if self.opened:
                self.cursor.execute("insert into forexs(Currency,TSP,CSP,TBP,CBP,Time) "
                                    "values(%s,%s,%s,%s,%s,%s)",
                                    (item['currency'], item["tsp"], item["csp"], item["tbp"], item["cbp"],
                                     item["last_time"]))
                self.count += 1
                print("record " + str(self.count) + " inserted")
        except Exception as err:
            print(err)
        return item

 

Experiment screenshot:

 

2) Reflections:

This assignment deepened my understanding of the order in which the parts of a Scrapy project are written, and of XPath.

Assignment ③

1)

  • Requirement: become proficient with locating HTML elements, crawling Ajax-rendered pages, and waiting for HTML elements with Selenium; use the Selenium framework + MySQL storage technical route to crawl stock data from the three boards 沪深A股 (Shanghai and Shenzhen A shares), 上证A股 (Shanghai A shares) and 深证A股 (Shenzhen A shares).

  • Candidate site: East Money: http://quote.eastmoney.com/center/gridlist.html#hs_a_board

  • Output: MySQL storage and output format as below; the column headers should use English names, e.g. id for the serial number, bStockNo for the stock code, and so on, designed by the students themselves (a possible table definition is sketched after the sample table):

    No.  Code    Name    Latest  Change%  Change  Volume   Turnover  Amplitude  High  Low    Open  PrevClose
    1    688093  N世华   28.47   62.22%   10.92   26.13万  7.6亿     22.34      32.0  28.08  30.2  17.55
    2    ......

 
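The script below clears and fills a Stock_sz table (and its show() method reads a Stock_hs table), so one table per board seems to be intended. A minimal one-off helper for one of them is sketched here; the column names come from the insert statement in StockDB, while the file name and column types are assumptions.

# create_stock_table.py: hypothetical one-off helper; column types are assumptions
import pymysql

con = pymysql.connect(host="localhost", port=3306, user="root",
                      password="123456", db="mydb", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("""
        create table if not exists Stock_sz (
            Id varchar(8),
            StockCode varchar(16),
            StockName varchar(32),
            NewPrice varchar(16),
            RiseFallPercent varchar(16),
            RiseFallNum varchar(16),
            Turnover varchar(32),
            DealNum varchar(32),
            Amplitude varchar(16),
            Highest varchar(16),
            Lowest varchar(16),
            Today varchar(16),
            Yesterday varchar(16)
        ) default charset=utf8
    """)
con.commit()
con.close()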

Main code:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql


class StockDB:
    def openDB(self):
        print("opened")
        try:
            self.con = pymysql.connect(host="localhost", port=3306, user="root", password="123456",
                                       db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from Stock_sz")  # start from an empty table
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def insert(self, stockList):
        try:
            self.cursor.executemany(
                "insert into Stock_sz(Id,StockCode,StockName,NewPrice,RiseFallPercent,RiseFallNum,Turnover,"
                "DealNum,Amplitude,Highest,Lowest,Today,Yesterday) "
                "values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                stockList)
        except Exception as err:
            print(err)

    def closeDB(self):
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def show(self):
        # print the Stock_hs table; DictCursor returns rows as dicts, so columns are accessed by name
        self.cursor.execute("select * from Stock_hs")
        rows = self.cursor.fetchall()
        fmt = "{:8}\t{:16}\t{:8}\t{:8}\t{:8}\t{:8}\t{:16}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}"
        print(fmt.format("Code", "Name", "Latest", "Change%", "Change", "Volume",
                         "Turnover", "Amplitude", "High", "Low", "Open", "PrevClose"))
        for row in rows:
            print(fmt.format(row["StockCode"], row["StockName"], row["NewPrice"], row["RiseFallPercent"],
                             row["RiseFallNum"], row["Turnover"], row["DealNum"], row["Amplitude"],
                             row["Highest"], row["Lowest"], row["Today"], row["Yesterday"]))


class stock:
    def getStockData(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        driver = webdriver.Chrome(options=chrome_options)
        driver.get("http://quote.eastmoney.com/center/gridlist.html#sz_a_board")
        # find_elements_by_xpath is the Selenium 3 API; on Selenium 4 use find_elements(By.XPATH, ...)
        trs = driver.find_elements_by_xpath('//tbody/tr')
        stocks = []
        for tr in trs:
            tds = tr.find_elements_by_xpath('./td')
            td = [x.text for x in tds]
            stocks.append(td)
        stockInfo = []
        for s in stocks:
            # column 3 (the links cell) is skipped
            stockInfo.append((s[0], s[1], s[2], s[4], s[5], s[6], s[7], s[8], s[9],
                              s[10], s[11], s[12], s[13]))
        return stockInfo

    def process(self):
        self.db = StockDB()
        self.db.openDB()
        stockInfo = self.getStockData()
        print(stockInfo)
        self.db.insert(stockInfo)
        # self.db.show()
        self.db.closeDB()


if __name__ == "__main__":
    s = stock()
    s.process()
    print("completed")

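getStockData() above only opens the Shenzhen A-share board and reads whatever rows have already been rendered. Since the grid is filled in by Ajax, a hedged variation that visits all three boards and explicitly waits for the table rows before reading them could look like the sketch below. It uses Selenium 4 style locators; the sh_a_board URL fragment for 上证A股 is my assumption, while the other two fragments appear in the assignment and in the code above.

# board_sketch.py: visit each board and wait for the Ajax-rendered rows (a sketch, not the original code)
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

BOARDS = {
    "Stock_hs": "http://quote.eastmoney.com/center/gridlist.html#hs_a_board",
    "Stock_sh": "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",  # assumed fragment
    "Stock_sz": "http://quote.eastmoney.com/center/gridlist.html#sz_a_board",
}


def get_board_rows(driver, url):
    driver.get(url)
    driver.refresh()  # only the URL fragment changes between boards, so force a reload defensively
    # the grid is loaded by Ajax: wait until at least one data cell is present
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//tbody/tr/td"))
    )
    rows = []
    for tr in driver.find_elements(By.XPATH, "//tbody/tr"):
        rows.append([td.text for td in tr.find_elements(By.XPATH, "./td")])
    return rows


if __name__ == "__main__":
    options = Options()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    for table, url in BOARDS.items():
        rows = get_board_rows(driver, url)
        print(table, len(rows), "rows")  # each board's rows would then be inserted into its own table
    driver.quit()

The table names mirror the Stock_hs/Stock_sz naming already used above; whether the refresh is really needed after changing only the URL hash depends on the page, so it is kept here only as a defensive step.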
 

Experiment screenshots:

沪深A股 (Shanghai and Shenzhen A shares)

 

上证A股 (Shanghai A shares)

 

深证A股 (Shenzhen A shares)

 

2) Reflections:

This assignment gave me an understanding of how Selenium works end to end.
