堆糖图集抓取

堆糖的数据抓取:堆糖返回的是 json 数据,没有加密,也没有反爬措施,对同一个 url 重复请求即可获取到不同的数据。
但是数据会有部分相同,所以需要进行去重操作

"""Duitang (duitang.com) atlas scraper.

Repeatedly hits the recommend-feed JSON endpoint (returns different
items per request while logged in), dedupes atlases through MongoDB,
and downloads each new atlas's images to the local disk.
"""
import json
import os
import random
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor

import requests
from pymongo import MongoClient


class Save(object):
    """MongoDB-backed dedup store for scraped atlases."""

    def __init__(self, host):
        self.client = MongoClient(host=host, port=27017)
        # Database used for all image-set collections.
        self.db = self.client.ImageSet

    def _save_data_mongodb(self, collect_name, data):
        """Upsert *data* keyed by its ``title_id``.

        Returns True when the record is new (caller should download it),
        False when an atlas with the same ``title_id`` already exists.
        """
        self.collect_name = self.db[collect_name]
        history_record = self.collect_name.find_one({"_id": data['title_id']})
        if history_record:
            # Already scraped this atlas before — skip it.
            return False
        # Not present yet: insert (upsert keeps this idempotent under races).
        self.collect_name.update_one(
            {'_id': data['title_id']}, {'$set': data}, upsert=True)
        return True


class DT:
    """Scraper for duitang.com recommended-feed image atlases."""

    def __init__(self, cookie):
        self.start_url = "https://www.duitang.com/napi/vienna/feed/list/by_recommend/?start=0&limit=18"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": cookie,
            "Host": "www.duitang.com",
        }
        self.temp_url = "https://www.duitang.com/p/atlas/?id={}"

    def get_image_list(self):
        """Fetch one feed page and return a list of atlas dicts.

        Each dict has ``title_id``, ``title_url``, ``title`` and
        ``image_url`` (list of image URLs). Atlases with fewer than
        three images, or missing the expected keys, are skipped.
        """
        response = requests.get(url=self.start_url, headers=self.headers)
        content = json.loads(response.content.decode())
        object_list = content['data']['object_list']
        title_url_temp = "https://www.duitang.com/p/atlas/?id={}"
        content_list = []
        for object_l in object_list:
            item = {}
            item['title_id'] = int(object_l['resource_id'])
            item['title_url'] = title_url_temp.format(object_l['resource_id'])
            try:
                item['title'] = object_l['atlas']['desc']
                image_list_demo = object_l['atlas']['blogs']
            except KeyError:
                # Feed entries that are not atlases lack these keys — skip.
                continue
            # Trailing 5 chars of 'path' are a thumbnail suffix — strip them
            # to get the full-size image URL (presumably '_webp'; verify).
            image_url_list = [x['photo']['path'][:-5] for x in image_list_demo]
            if len(image_url_list) < 3:
                # Too few images to be worth saving.
                continue
            item['image_url'] = image_url_list
            content_list.append(item)
        print(content_list)
        return content_list

    def save_(self, image):
        """Download one atlas to D:/堆糖/<date>/<sanitized title>/."""
        upload_time = time.strftime("%Y-%m-%d", time.localtime())
        print("开始写入")
        # Strip whitespace and any non-word characters so the title is a
        # valid directory name.
        rule = re.compile(r'\s*', re.S)
        rule2 = re.compile(r'\W*', re.S)
        title = rule.sub('', image['title'])
        title = rule2.sub('', title)
        if title == "":
            # Title was all punctuation/whitespace — invent a random name.
            # (random.randint requires two bounds; a single arg raises.)
            title = "新建文件夹上传{}".format(random.randint(100, 999))
        path = 'D:/堆糖/' + str(upload_time) + '/' + title
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/content.txt', 'w', encoding='utf8') as fb:
            fb.write(str([image['title']]))
        # enumerate instead of list.index: correct with duplicate URLs and
        # avoids an O(n^2) scan per image.
        for x_index, x in enumerate(image['image_url']):
            with open(path + '/{}.jpg'.format(str(x_index)), 'wb') as f:
                response = requests.get(url=x)
                f.write(response.content)
        print(title + '写入完成')

    def run(self):
        """Fetch one feed page, dedupe via MongoDB, save new atlases."""
        image_content = self.get_image_list()
        # One Save (one MongoClient connection) for the whole batch instead
        # of a fresh connection per image.
        saver = Save("localhost")
        for image in image_content:
            collect_name = "堆糖"
            result = saver._save_data_mongodb(collect_name, data=image)
            if result:
                self.save_(image)


if __name__ == '__main__':
    # Duitang only returns varying data when the session is logged in;
    # without a login cookie every request returns the same items.
    cookie = "js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1562840503,1562840556,1562925082,1562981276; __dtac=; sessionid=7a4a7191-7aff-413d-abaa-a7858cea4e62; dt_auth=eyJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1NjQxOTA4ODYsInN1YiI6IuS4jeWPr-aPj-i_sOeahOe7huiKgiIsImlkIjoyMDQwNDUxNSwicGxhdGZvcm0iOiJXRUIiLCJ2ZXJzaW9uIjoxfQ.bPhwCNPpIq0t63oFIM7VwNEtQnyErrz8njJKC7vqpPI; _auth_user_id=20404515; username=%E4%B8%8D%E5%8F%AF%E6%8F%8F%E8%BF%B0%E7%9A%84%E7%BB%86%E8%8A%82; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1562981293"
    with ThreadPoolExecutor(10) as executor:
        dt = DT(cookie)
        for num in range(1, 1000):
            executor.submit(dt.run)
            time.sleep(1)

  

posted @ 2019-07-16 14:14  不可描述的细节  阅读(159)  评论(0)    收藏  举报