Assignment 4
I. Assignment Content
Assignment ①: Requirements:
Master the serialized output of Item and Pipeline data in Scrapy;
use the Scrapy + XPath + MySQL storage pipeline to crawl book data from the Dangdang website.
Candidate site: http://www.dangdang.com/
Keyword: chosen freely by the student
Output: the MySQL output is formatted as follows
sql
create table books (
    btitle varchar(512) primary key,
    bauthor varchar(256),
    bpublisher varchar(256),
    bprice varchar(16),
    bdate varchar(32),
    bdetail text
);
items.py
import scrapy


class DangdangItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    publisher = scrapy.Field()
    date = scrapy.Field()
    price = scrapy.Field()
    detail = scrapy.Field()
pipelines.py
import pymysql


class DangdangPipeline(object):
    def open_spider(self, spider):
        print("opened")
        try:
            # connect to MySQL with account and password
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="", db="book", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from books")  # clear old rows (table name matches the DDL above)
            self.opened = True
            self.count = 0  # row counter
        except Exception as error:
            print(error)
            self.opened = False

    def close_spider(self, spider):
        # commit the data and close the database connection
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")
        print("Total books crawled:", self.count)

    def process_item(self, item, spider):
        try:
            print(item["title"])
            print(item["author"])
            print(item["publisher"])
            print(item["date"])
            print(item["price"])
            print(item["detail"])
            if self.opened:
                self.cursor.execute(
                    "insert into books (bTitle, bAuthor, bPublisher, bDate, bPrice, bDetail) "
                    "values (%s,%s,%s,%s,%s,%s)",
                    (item["title"], item["author"], item["publisher"],
                     item["date"], item["price"], item["detail"]))
                self.count += 1
        except Exception as error:
            print(error)
        return item
setting.py
BOT_NAME = 'dangdang'

SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'dangdang (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'dangdang.pipelines.DangdangPipeline': 300,
}
mydangdand.py
import scrapy
from bs4 import UnicodeDammit

from ..items import DangdangItem


class BookSpider(scrapy.Spider):
    name = 'mydangdang'
    key = "python"
    start_url = 'http://search.dangdang.com/'

    def start_requests(self):
        url = BookSpider.start_url + "?key=" + BookSpider.key + "&act=input"
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            lis = selector.xpath("//li[@ddt-pit][starts-with(@class,'line')]")
            for li in lis:
                title = li.xpath("./a[position()=1]/@title").extract_first()
                price = li.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
                author = li.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
                date = li.xpath("./p[@class='search_book_author']/span[position()=last()-1]/text()").extract_first()
                publisher = li.xpath("./p[@class='search_book_author']/span[position()=last()]/a/@title").extract_first()
                detail = li.xpath("./p[@class='detail']/text()").extract_first()
                item = DangdangItem()
                item["title"] = title.strip() if title else ""
                item["author"] = author.strip() if author else ""
                item["publisher"] = publisher.strip() if publisher else ""
                item["date"] = date.strip()[1:] if date else ""
                item["price"] = price.strip() if price else ""
                item["detail"] = detail.strip() if detail else ""
                yield item
            # the "next page" link is None on the last page; follow it to keep crawling successive pages
            link = selector.xpath("//div[@class='paging']/ul[@name='Fy']/li[@class='next']/a/@href").extract_first()
            if link:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        except Exception as error:
            print(error)
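For reference, the spider can also be launched from a small helper script instead of the scrapy crawl command line. The following is a minimal sketch of such a script (run.py is a hypothetical file, not part of the original project), assuming the default layout produced by scrapy startproject dangdang:

# run.py -- hypothetical helper script, not part of the original project
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load the project's settings.py
process.crawl("mydangdang")                       # spider name defined in BookSpider.name
process.start()                                   # blocks until the crawl finishes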

Summary: This assignment reproduces the code from the textbook and reinforced my understanding of Scrapy. Since MySQL has not been installed successfully yet, only the crawling results are shown.
Assignment ②

- Requirements: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy + XPath + MySQL storage pipeline to crawl stock information.
- Candidate site: Eastmoney: https://www.eastmoney.com/
- Output: stored in a MySQL database in the format below; the column headers should be named in English (e.g. sequence number id, stock code bStockNo, ...) and are designed by the students themselves.
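Since the table design is left to the students, the snippet below is only a sketch of one possible table, created through pymysql. The table name stocks and all column names other than id and bStockNo are my own assumptions, not specified by the assignment:

# create_stocks_table.py -- hypothetical setup script; the table and column names are assumptions
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="", db="stock", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    create table if not exists stocks (
        id int primary key,
        bStockNo varchar(16),
        bName varchar(64),
        bNewPrice varchar(16),
        bRiseFallPercent varchar(16),
        bRiseFallAmount varchar(16),
        bVolume varchar(32),
        bTurnover varchar(32),
        bAmplitude varchar(16),
        bHigh varchar(16),
        bLow varchar(16),
        bToday varchar(16),
        bYesterday varchar(16)
    )
""")
con.commit()
con.close()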
item.py
import scrapy


class StockItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    code = scrapy.Field()       # stock code          f12
    name = scrapy.Field()       # stock name          f14
    newp = scrapy.Field()       # latest price        f2
    zdf = scrapy.Field()        # rise/fall percent   f3
    zde = scrapy.Field()        # rise/fall amount    f4
    cjl = scrapy.Field()        # trading volume      f5
    cje = scrapy.Field()        # trading turnover    f6
    zf = scrapy.Field()         # amplitude           f7
    max = scrapy.Field()        # daily high          f15
    min = scrapy.Field()        # daily low           f16
    today = scrapy.Field()      # today's open        f17
    yesterday = scrapy.Field()  # previous close      f18
stock.py
import json

import scrapy

from stock.items import StockItem


class StocksSpider(scrapy.Spider):
    name = 'stocks'
    start_urls = [
        'http://34.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112406554985928274808_1604410757799&pn=1&pz=20&po=1&np=1'
        '&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80'
        '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
        '&_=1604410757861']

    def parse(self, response):
        # strip the jQuery JSONP callback wrapper to get the raw JSON
        jsons = response.text[41:][:-2]
        text_json = json.loads(jsons)
        for data in text_json['data']['diff']:
            item = StockItem()
            item["code"] = data['f12']
            item["name"] = data['f14']
            item["newp"] = data['f2']
            item["zdf"] = data['f3']
            item["zde"] = data['f4']
            item["cjl"] = data['f5']
            item["cje"] = data['f6']
            item["zf"] = data['f7']
            item["max"] = data['f15']
            item["min"] = data['f16']
            item["today"] = data['f17']
            item["yesterday"] = data['f18']
            yield item
        print("done")
        # then crawl pages 2 to 10 as well
        for i in range(2, 11):
            new_url = ('http://34.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240918880626239239_1602070531441'
                       '&pn=' + str(i) +
                       '&pz=20&po=1&np=3&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3'
                       '&fs=m:1+t:2,m:1+t:23'
                       '&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,'
                       'f22,f11,f62,f128,f136,f115,f152&_=1604410757861')
            if new_url:
                yield scrapy.Request(new_url, callback=self.parse)
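Note that the slice response.text[41:][:-2] only works while the jQuery callback name keeps exactly the same length. A slightly more robust way to unwrap the JSONP response (a sketch, not part of the original code) is to take everything between the outermost parentheses:

# sketch: unwrap a JSONP response without relying on a fixed character offset
import json
import re

def jsonp_to_json(text):
    # capture the JSON payload between the first "(" and the last ")" of the callback wrapper
    match = re.search(r'\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else None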
settings
BOT_NAME = 'stock'

SPIDER_MODULES = ['stock.spiders']
NEWSPIDER_MODULE = 'stock.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stock (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'stock.pipelines.StockPipeline': 300,
}
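The settings above enable stock.pipelines.StockPipeline, but that file is not shown in this write-up. Below is a minimal sketch of what it could look like, modeled on the Dangdang pipeline from assignment ①; the database stock, the table stocks and its columns follow the table sketch given earlier and are assumptions rather than the original code:

# pipelines.py -- a sketch only; the database, table and column names are assumptions
import pymysql


class StockPipeline(object):
    def open_spider(self, spider):
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="", db="stock", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from stocks")  # clear old rows
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        # commit the inserts and close the connection
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False

    def process_item(self, item, spider):
        try:
            if self.opened:
                self.count += 1
                self.cursor.execute(
                    "insert into stocks (id, bStockNo, bName, bNewPrice, bRiseFallPercent, bRiseFallAmount,"
                    " bVolume, bTurnover, bAmplitude, bHigh, bLow, bToday, bYesterday)"
                    " values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (self.count, item["code"], item["name"], item["newp"], item["zdf"], item["zde"],
                     item["cjl"], item["cje"], item["zf"], item["max"], item["min"],
                     item["today"], item["yesterday"]))
        except Exception as err:
            print(err)
        return item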
Assignment ③
(1) Requirements: master the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage pipeline to crawl data from a foreign-exchange website.
item
import scrapy


class CurrencyItem(scrapy.Item):
    currency = scrapy.Field()  # currency name
    tsp = scrapy.Field()       # spot exchange selling price
    csp = scrapy.Field()       # cash selling price
    tbp = scrapy.Field()       # spot exchange buying price
    cbp = scrapy.Field()       # cash buying price
    time = scrapy.Field()      # quote time
setting
BOT_NAME = 'currency'

SPIDER_MODULES = ['currency.spiders']
NEWSPIDER_MODULE = 'currency.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'currency (+http://www.yourdomain.com)'

ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'currency.pipelines.CurrencyPipeline': 300,
}
pipeline
import pymysql


class CurrencyPipeline:
    def open_spider(self, spider):
        print("opened")
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="host", passwd="123456",
                                       db="mydata", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            self.cursor.execute("delete from currency")  # clear old rows
            self.opened = True
            self.count = 0
        except Exception as err:
            print(err)
            self.opened = False

    def close_spider(self, spider):
        # commit the data and close the connection (otherwise the inserts are never persisted)
        if self.opened:
            self.con.commit()
            self.con.close()
            self.opened = False
        print("closed")

    def process_item(self, item, spider):
        try:
            print(self.count)
            print(item["currency"])
            print(item["tsp"])
            print(item["csp"])
            print(item["tbp"])
            print(item["cbp"])
            print(item["time"])
            print()
            if self.opened:
                self.cursor.execute(
                    "insert into currency (id, currency, tsp, csp, tbp, cbp, time) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",  # pymysql only accepts %s placeholders
                    (self.count, item["currency"], item["tsp"], item["csp"],
                     item["tbp"], item["cbp"], item["time"]))
                self.count += 1
        except Exception as err:
            print(err)
        return item
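The pipeline assumes that a currency table already exists in the mydata database, but its DDL is not shown in this write-up. One possible layout is sketched below; the column types and sizes are assumptions:

# create_currency_table.py -- hypothetical setup script; column types/sizes are assumptions
import pymysql

con = pymysql.connect(host="127.0.0.1", port=3306, user="host", passwd="123456",
                      db="mydata", charset="utf8")
cursor = con.cursor()
cursor.execute("""
    create table if not exists currency (
        id int primary key,
        currency varchar(64),
        tsp varchar(16),
        csp varchar(16),
        tbp varchar(16),
        cbp varchar(16),
        time varchar(32)
    )
""")
con.commit()
con.close()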
spider
import scrapy
from bs4 import UnicodeDammit

from ..items import CurrencyItem


class CurrencySpider(scrapy.Spider):
    name = 'currency'
    # allowed_domains = ['fx.cmbchina.com']
    start_urls = ["http://fx.cmbchina.com/hq/"]

    def parse(self, response):
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        selector = scrapy.Selector(text=data)
        lis = selector.xpath("//div[@id='realRateInfo']/table/tr")
        for li in lis[1:]:  # skip the header row
            item = CurrencyItem()
            currency = li.xpath("./td[position()=1][@class='fontbold']/text()").extract_first()
            tsp = li.xpath("./td[position()=4][@class='numberright']/text()").extract_first()
            csp = li.xpath("./td[position()=5][@class='numberright']/text()").extract_first()
            tbp = li.xpath("./td[position()=6][@class='numberright']/text()").extract_first()
            cbp = li.xpath("./td[position()=7][@class='numberright']/text()").extract_first()
            time = li.xpath("./td[position()=8][@align='center']/text()").extract_first()
            item["currency"] = str(currency.strip())
            item["tsp"] = str(tsp.strip())
            item["csp"] = str(csp.strip())
            item["tbp"] = str(tbp.strip())
            item["cbp"] = str(cbp.strip())
            item["time"] = str(time.strip())
            yield item
