"""Duitang (duitang.com) feed scraper.

Duitang returns plain JSON — no encryption and no anti-scraping measures.
Repeatedly requesting the same feed URL yields different batches of data,
but batches partially overlap, so results must be deduplicated (done here
via MongoDB upserts keyed on the atlas id).
"""
import os
import random
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor
import requests
import json
from pymongo import MongoClient
class Save(object):
    """Persist scraped atlas items to MongoDB, skipping duplicates."""

    def __init__(self, host):
        # Connect to the MongoDB server; the "ImageSet" database holds one
        # collection per scrape target.
        self.client = MongoClient(host=host, port=27017)
        self.db = self.client.ImageSet

    def _save_data_mongodb(self, collect_name, data):
        """Insert *data* into collection *collect_name*, keyed by title_id.

        Returns True when the record was newly inserted, False when a
        record with the same _id already existed (i.e. a duplicate).
        """
        collection = self.db[collect_name]
        # A single atomic upsert replaces the original find_one-then-insert
        # pattern, which was two round trips and racy when several worker
        # threads saw the same item concurrently. $setOnInsert keeps the
        # original semantics: an existing document is never overwritten.
        result = collection.update_one(
            {'_id': data['title_id']},
            {'$setOnInsert': data},
            upsert=True,
        )
        # upserted_id is non-None only when the upsert actually inserted.
        return result.upserted_id is not None
class DT:
    """Scraper for the duitang.com recommendation feed."""

    def __init__(self, cookie):
        # Repeated GETs to this one endpoint return different item batches
        # as long as the session cookie is valid.
        self.start_url = "https://www.duitang.com/napi/vienna/feed/list/by_recommend/?start=0&limit=18"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": cookie,
            "Host": "www.duitang.com",
        }
        # URL template for an atlas (photo album) page.
        self.temp_url = "https://www.duitang.com/p/atlas/?id={}"

    def get_image_list(self):
        """Fetch one feed page and return a list of atlas item dicts.

        Each item has keys: title_id (int), title_url, title, image_url
        (list of raw image URLs). Entries that are not atlases, or that
        contain fewer than 3 images, are skipped.
        """
        # timeout prevents a stalled connection from hanging a worker
        # thread forever (the original call had no timeout).
        response = requests.get(url=self.start_url, headers=self.headers,
                                timeout=10)
        content = response.json()
        object_list = content['data']['object_list']
        content_list = []
        for obj in object_list:
            item = {
                'title_id': int(obj['resource_id']),
                # Reuse the template already built in __init__ instead of
                # redefining an identical local one.
                'title_url': self.temp_url.format(obj['resource_id']),
            }
            try:
                item['title'] = obj['atlas']['desc']
                blogs = obj['atlas']['blogs']
            except (KeyError, TypeError):
                # Not an atlas entry (e.g. an ad or single photo) — skip.
                # Narrowed from a bare except, which also swallowed
                # KeyboardInterrupt/SystemExit.
                continue
            # [:-5] strips a 5-char suffix (presumably "_webp" — TODO
            # confirm against live data) to obtain the raw image URL.
            image_url_list = [blog['photo']['path'][:-5] for blog in blogs]
            if len(image_url_list) < 3:
                continue
            item['image_url'] = image_url_list
            content_list.append(item)
        print(content_list)
        return content_list

    def save_(self, image):
        """Download one atlas item's images into D:/堆糖/<date>/<title>/."""
        upload_time = time.strftime("%Y-%m-%d", time.localtime())
        print("开始写入")
        # Build a filesystem-safe directory name by stripping whitespace
        # and all non-word characters from the title.
        rule = re.compile(r'\s*', re.S)
        rule2 = re.compile(r'\W*', re.S)
        title = rule.sub('', image['title'])
        title = rule2.sub('', title)
        if title == "":
            # BUG FIX: random.randint requires (a, b) bounds; the original
            # random.randint(100) raised TypeError whenever the cleaned
            # title was empty.
            title = "新建文件夹上传{}".format(random.randint(100, 999))
        path = 'D:/堆糖/' + str(upload_time) + '/' + title
        if not os.path.exists(path):
            os.makedirs(path)
        with open(path + '/content.txt', 'w', encoding='utf8') as fb:
            fb.write(str([image['title']]))
        # enumerate replaces the original list.index(x) lookup, which was
        # O(n) per image and mis-numbered files when two URLs were equal.
        for idx, url in enumerate(image['image_url']):
            response = requests.get(url=url, timeout=10)
            with open(path + '/{}.jpg'.format(idx), 'wb') as f:
                f.write(response.content)
        print(title + '写入完成')

    def run(self):
        """Fetch one feed page and persist/download every unseen item."""
        for image in self.get_image_list():
            # MongoDB dedup: _save_data_mongodb returns True only for
            # items inserted for the first time.
            if Save("localhost")._save_data_mongodb("堆糖", data=image):
                self.save_(image)
if __name__ == '__main__':
    # Duitang only returns varying feed data while logged in; without a
    # valid session cookie every request yields the same batch.
    # NOTE: the original source split this literal around __dtac="" — the
    # adjacent string literals concatenated to the single value below.
    cookie = ("js=1; Hm_lvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1562840503,1562840556,1562925082,1562981276; __dtac=; sessionid=7a4a7191-7aff-413d-abaa-a7858cea4e62; dt_auth=eyJhbGciOiJIUzI1NiJ9.eyJleHAiOjE1NjQxOTA4ODYsInN1YiI6IuS4jeWPr-aPj-i_sOeahOe7huiKgiIsImlkIjoyMDQwNDUxNSwicGxhdGZvcm0iOiJXRUIiLCJ2ZXJzaW9uIjoxfQ.bPhwCNPpIq0t63oFIM7VwNEtQnyErrz8njJKC7vqpPI; _auth_user_id=20404515; username=%E4%B8%8D%E5%8F%AF%E6%8F%8F%E8%BF%B0%E7%9A%84%E7%BB%86%E8%8A%82; Hm_lpvt_d8276dcc8bdfef6bb9d5bc9e3bcfcaf4=1562981293")
    scraper = DT(cookie)
    with ThreadPoolExecutor(10) as pool:
        # Queue 999 feed fetches, pacing submissions one second apart so
        # the server sees a steady trickle rather than a burst.
        for _ in range(1, 1000):
            pool.submit(scraper.run)
            time.sleep(1)