Python Crawler Exercise 2: Suning Book Information
For learning purposes only; commercial use is prohibited and at your own risk.
import json
import re
from copy import deepcopy
from urllib.parse import urlencode

import requests
import scrapy
from bs4 import BeautifulSoup


class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['suning.com']
    start_urls = ['http://book.suning.com/']

    def parse(self, response):
        """Parse the book home page for category and subcategory links.

        For demonstration purposes only the first category at each level
        is followed (note the one-element slices in each loop).
        """
        one_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-item"]')
        two_types = response.xpath('//div[@class="menu-list"]/div[@class="menu-sub"]/div[@class="submenu-left"]')
        for one_index, menu_item in enumerate([one_types[0]]):
            one_type = menu_item.xpath('./dl/dt/h3/a/text()').extract_first()
            two_type_list = two_types[one_index].xpath('./p[@class="submenu-item"]/a/text()').extract()
            for two_index, two_type in enumerate([two_type_list[0]]):
                three_types = two_types[one_index].xpath('./ul')[two_index].xpath('./li')
                for three_type_a in [three_types[0]]:
                    item = {}
                    item["one_type"] = one_type
                    item["two_type"] = two_type
                    item["three_type"] = three_type_a.xpath('./a/text()').extract_first()
                    item["type_url"] = three_type_a.xpath('./a/@href').extract_first()
                    yield scrapy.Request(item["type_url"],
                                         callback=self.get_book_page_num,
                                         meta={"item": deepcopy(item)})

    def get_book_page_num(self, response):
        """Read the page count from a category listing page, then request
        the detail-page URLs through the list API.

        The listing page only renders 30 items by default, so the detail
        URLs are fetched from the list API instead. Only the first page
        is requested here; iterate over range(item["page_num"]) to crawl
        every page.
        """
        item = response.meta.get("item", {})
        page_num = int(response.xpath('//div[@id="bottom_pager"]/a[@role="menuitem"]')[-1].xpath('./@pagenum').extract_first())
        item["page_num"] = page_num
        ci = item["type_url"].split("-")[1]
        for i in range(1):
            # A list rather than a tuple, so the extra paging parameters
            # for the second half of the page can be appended below.
            params = [
                ('ci', str(ci)),
                ('pg', '03'),
                ('cp', str(i)),  # page index
                ('il', '0'),
                ('iy', '0'),
                ('adNumber', '0'),
                ('n', '1'),
                ('ch', '4'),
                ('prune', '0'),
                ('sesab', 'ACBAABC'),
                ('id', 'IDENTIFYING'),
                ('cc', '089'),
            ]
            book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
            # First 30 items
            yield scrapy.Request(book_list_api, callback=self.parse_book_list,
                                 meta={"item": deepcopy(item)})
            # Remaining 30 items: append the paging parameters and rebuild the URL
            params.append(('paging', '1'))
            params.append(('sub', '0'))
            book_list_api = "https://list.suning.com/emall/showProductList.do?" + urlencode(params)
            yield scrapy.Request(book_list_api, callback=self.parse_book_list,
                                 meta={"item": deepcopy(item)})

    def parse_book_list(self, response):
        """Extract the detail-page URLs from the list API response.

        The API returns incomplete HTML that XPath mis-parses, so
        BeautifulSoup is used instead.
        """
        item = response.meta.get("item", {})
        soup = BeautifulSoup(response.text, "lxml")
        books = soup.find_all('a', attrs={'class': 'sellPoint'})
        for book in books:
            detail_url = "https:" + book.get('href')
            yield scrapy.Request(detail_url, callback=self.parse_book_detail,
                                 meta={"item": deepcopy(item)})

    def parse_book_detail(self, response):
        """Parse a detail page for the book's name, price, author,
        publisher, and publication date.

        The detail page has anti-scraping measures that break XPath, so
        BeautifulSoup is used for the HTML fields.
        """
        price = self.get_price(response)
        item = response.meta.get("item", {})
        soup = BeautifulSoup(response.text, "html.parser")
        li_list = soup.find_all('li', attrs={'class': 'pb-item'})
        if len(li_list) > 0:
            item["author"] = self.replace(li_list[0].text)
        if len(li_list) > 1:
            item["press"] = self.replace(li_list[1].text)
        if len(li_list) > 2:
            item["time"] = self.replace(li_list[2].text)
        name = soup.find('h1', attrs={"id": "itemDisplayName"}).text.replace("\n", "").replace("\u3000", " ")
        image_url = response.xpath('//div[@class="imgzoom-main"]/a/img/@src').extract_first()
        item["name"] = name
        item["price"] = price
        item["image_url"] = "https:" + image_url
        print(item)  # swap for `yield item` to hand results to an item pipeline

    def get_price(self, response):
        """Fetch the price through the pricing API.

        Analysing the API shows that only passPartNumber and vendorCode
        control the price lookup, so those two values are pulled out of
        the page source with regular expressions.
        """
        passPartNumber_str = re.findall(r'"passPartNumber":"[0-9]*?"', response.text)[0]
        passPartNumber = passPartNumber_str.split('"')[-2]
        vendorCode_str = re.findall(r'"vendorCode":"[0-9]*?"', response.text)[0]
        vendorCode = vendorCode_str.split('"')[-2]
        url = "https://pas.suning.com/nspcsale_0_{}_{}_{}_300_089_0890199_502282_1000347_8999_100138_Z001___R9011205_3.0____0001400PA____0___16.0_2__502320_502687_.html?callback=pcData&_=1637305043921".format(
            passPartNumber, passPartNumber, vendorCode
        )
        r = requests.get(url=url)
        # Strip the JSONP wrapper pcData(...); before decoding the payload
        json_data = r.text.replace("pcData(", "")[:-2]
        price = json.loads(json_data)["data"]["price"]["saleInfo"][0]["netPrice"]
        return price

    def replace(self, text):
        """Clean up special characters: drop newlines, tabs, and plain
        spaces; normalise ideographic spaces (U+3000) and non-breaking
        spaces (U+00A0) to regular spaces.
        """
        return (text.replace("\n", "").replace("\t", "").replace(" ", "")
                    .replace("\u3000", " ").replace("\xa0", " "))
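The two-step request in get_book_page_num is easiest to see outside the spider. The minimal sketch below shows how urlencode assembles the list-API URL and why the URL must be rebuilt after the paging parameters are appended; the ci value here is a hypothetical category id, not one taken from a live page.

# Minimal sketch of the list-API URL assembly; runnable on its own.
from urllib.parse import urlencode

base = "https://list.suning.com/emall/showProductList.do?"
params = [
    ('ci', '502687'),  # hypothetical category id; the spider parses it from type_url
    ('pg', '03'),
    ('cp', '0'),       # page index
]
print(base + urlencode(params))
# https://list.suning.com/emall/showProductList.do?ci=502687&pg=03&cp=0

# The second half of each page needs two extra parameters. Appending them
# does not change a URL string that was already built, so rebuild it:
params += [('paging', '1'), ('sub', '0')]
print(base + urlencode(params))
# https://list.suning.com/emall/showProductList.do?ci=502687&pg=03&cp=0&paging=1&sub=0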
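To try the spider without scaffolding a full Scrapy project, it can be launched through Scrapy's CrawlerProcess. A minimal sketch, assuming the code above is saved as book_spider.py; the settings are illustrative, not ones the site was tested with.

# run.py - minimal standalone runner; assumes the spider above is saved
# as book_spider.py in the same directory.
from scrapy.crawler import CrawlerProcess

from book_spider import BookSpider

process = CrawlerProcess(settings={
    "USER_AGENT": "Mozilla/5.0",  # assumption: a browser-like UA; adjust as needed
    "DOWNLOAD_DELAY": 1,          # crawl politely; this is a learning exercise
})
process.crawl(BookSpider)
process.start()  # blocks until the crawl finishes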