Scraping in practice: crawling Dangdang's top 500 books

1. This seems to be a must-do project for scraping beginners, so let's use it to practice.

  Practice site: http://bang.dangdang.com/books/fivestars

 

2. The requests + bs4 pattern; since this site is fairly simple, I won't say more than necessary.

   (This time the downloaded content is not written to a text file; it is stored in a MongoDB database instead.)

#!/usr/bin/env python
#-*- coding:utf-8 -*-


'''Scrape Dangdang's top 500 books
'''

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'
}
class SpiderDangdang(object):

    def __init__(self, url):
        self.url = url

    def get_collection(self):
        '''Return the MongoDB collection that the scraped data goes into.'''
        client = MongoClient('localhost', 27017)
        database = client.spider
        collection = database.dangdang
        return collection

    def get_response(self):
        try:
            response = requests.get(self.url, headers=headers)
            response.raise_for_status()  # must be called, not just referenced
            response.encoding = response.apparent_encoding
            return response.text
        except Exception as e:
            print('request failed:', e)
            return 'None'

    def get_soup(self, response):
        return BeautifulSoup(response, 'html.parser')

    def get_items(self, soup):
        return soup.select('div.bang_list_box>ul>li')

    def get_item_content(self, item):
        num = item.select('div.list_num')[0].text.strip()
        name = item.select('div.name')[0].text.strip()
        star = item.select('div.star')[0].text.strip()
        author = item.select('div.publisher_info')[0].text.strip()
        # select() returns an empty list when nothing matches, so a plain
        # if/else is enough here; no try/except is needed around the lookups
        if item.select('div.price>p>span.price_n'):
            price_n = item.select('div.price>p>span.price_n')[0].text.strip()
        else:
            price_n = 'None'
        if item.select('div.price>p>span.price_r'):
            price_r = item.select('div.price>p>span.price_r')[0].text.strip()
        else:
            price_r = 'None'
        if item.select('div.price>p>span.price_s'):
            price_s = item.select('div.price>p>span.price_s')[0].text.strip()
        else:
            price_s = 'None'
        content = {
            'rank': num,
            'title': name,
            'rating': star,
            'author': author,
            'current_price': price_n,
            'original_price': price_r,
            'discount': price_s,
        }
        return content

    def start(self):
        collection = self.get_collection()
        response = self.get_response()
        soup = self.get_soup(response)
        items = self.get_items(soup)
        for item in items:
            content = self.get_item_content(item)
            query = {'rank': content['rank']}
            if collection.find_one(query):
                print('\033[1;31mitem already stored, skipping\033[0m')
            else:
                collection.insert_one(content)
                print('\033[1;32mnew item, storing\033[0m')


if __name__ == '__main__':
    urls = ['http://bang.dangdang.com/books/fivestars/1-{page}'.format(page=page)
            for page in range(1, 26)]
    for page, url in enumerate(urls):
        print('\033[1;33mscraping page {page}\033[0m'.format(page=page + 1))
        ss = SpiderDangdang(url)
        ss.start()
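After a full run you can sanity-check what actually landed in the database. A minimal sketch, assuming the same localhost MongoDB and the spider.dangdang collection used by the script above:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)    # same connection as the spider
collection = client.spider.dangdang
print(collection.count_documents({}))       # 25 pages x 20 items, so up to 500 docs
for doc in collection.find().limit(3):      # peek at a few stored records
    print(doc['rank'], doc['title'], doc['current_price'])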

 

3. When storing the data, add the appropriate check so that items that have already been stored are skipped.
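The find_one-then-insert_one check in start() works, but pymongo can also do the same deduplication in a single atomic call. A sketch of the upsert alternative (not what the script above uses), keyed on the rank field:

collection.update_one(
    {'rank': content['rank']},      # match on the ranking number
    {'$setOnInsert': content},      # write the document only if it is new
    upsert=True
)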

For simple for loops like this, list/dict/set comprehensions (or generator expressions) are recommended; they are concise and clear, as in the example below.
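For instance, the urls list in the script replaces an explicit loop with a one-line comprehension:

# explicit loop version
urls = []
for page in range(1, 26):
    urls.append('http://bang.dangdang.com/books/fivestars/1-{page}'.format(page=page))

# equivalent list comprehension, as used in the script
urls = ['http://bang.dangdang.com/books/fivestars/1-{page}'.format(page=page)
        for page in range(1, 26)]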

Finally, print the necessary status messages so you can tell which stage the program is currently at.
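The \033[...m sequences in the print calls are ANSI escape codes for terminal colors: 1;31 is bold red, 1;32 bold green, 1;33 bold yellow, and \033[0m resets the style. A minimal sketch of the same idea with named constants:

RED, GREEN, YELLOW, RESET = '\033[1;31m', '\033[1;32m', '\033[1;33m', '\033[0m'
print(YELLOW + 'scraping page 1' + RESET)              # stage marker
print(GREEN + 'new item, storing' + RESET)             # success path
print(RED + 'item already stored, skipping' + RESET)   # duplicate path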

 
