import requests
import re
import time
import hashlib
import threading
from pymysql.converters import escape_string
from fake_useragent import UserAgent
from mylib.module import *
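# `DB` comes from the project-local mylib.module via the wildcard import above.
# A minimal sketch of the interface this script assumes (not the real code):
#
#     class DB:
#         def __init__(self, host, user, password, database): ...
#         def findone(self, sql): ...   # first matching row, or None
#         def insert(self, sql): ...    # execute an INSERT and commit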
# A random desktop User-Agent per run makes naive UA-based blocking less likely.
headers = {
    'User-Agent': UserAgent().random
}
def set_hash(string):
    """Return the hex MD5 digest of a string; used to deduplicate article titles."""
    md5 = hashlib.md5()
    md5.update(string.encode('utf-8'))
    return md5.hexdigest()
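# e.g. set_hash('abc') -> '900150983cd24fb0d6963f7d28e17f72'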
def get_pro(url):
    """Scrape the category index: each province's URL, name, and pagination prefix."""
    ls = []
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf-8'
    html = res.text
    # The province links all sit inside the <div class="right2a"> block.
    address = re.findall(r'<div class="right2a">([\s\S]*?)</div>', html)[0]
    href = re.findall(r'href="(.*?)"', address)
    cname = re.findall(r'">(.*?)</a>', address)
for add_url, name in zip(href, cname):
dt = {}
dt['pro_url'] = add_url
add_res = requests.get(url=add_url, headers=headers)
add_res.encoding = 'utf-8'
add_html = add_res.text
        try:
            # '下一页' = "next page", '末页' = "last page". The last-page link
            # exposes the numeric prefix used by paginated URLs (e.g. '152_2.html').
            a_href = re.findall(r'>下一页</a><a href="(.*?)">末页</a>', add_html)[0]
            page_prefix = a_href.split('/')[-1].split('_')[0]
            time.sleep(0.2)
        except IndexError:
            page_prefix = False  # single-page category: no pagination links
dt['pro_name'] = name
dt['page_pre'] = page_prefix
ls.append(dt)
print(name)
return ls
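# get_pro(...) returns entries shaped like:
#   {'pro_url': 'https://www.shui5.cn/article/BeiJingShiCaiShuiFaGui/',
#    'pro_name': '北京市', 'page_pre': '152'}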
def parse_page(ls, total_page=5):
    """Crawl up to total_page listing pages per province and store new articles."""
for item in ls:
page_pre = item['page_pre']
pro_url = item['pro_url']
pro_name = item['pro_name']
title_flag = False
        for page in range(1, total_page + 1):
            # Page 1 is the bare category URL; later pages are '<prefix>_<n>.html'.
            if page == 1:
                p = ''
            else:
                if not page_pre:
                    break  # no pagination prefix means the category has one page
                p = page_pre + '_{}.format'.format(page) if False else page_pre + '_{}.html'.format(page)
url = pro_url + p
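            # e.g. page 2 of 北京市: '.../BeiJingShiCaiShuiFaGui/' + '152_2.html'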
headers = {
'User-Agent': UserAgent().random
}
res = requests.get(url=url, headers=headers)
res.encoding = 'utf-8'
html = res.text
            # Each article teaser sits in a <div class="xwt2_a"> block.
            a_ls = re.findall(r'<div class="xwt2_a">(.*?)</div>', html)
            for a in a_ls:
                href = re.findall(r'href="(.*?)"', a)[0]
con_res = requests.get(url=href, headers=headers)
con_res.encoding = 'utf-8'
con_html = con_res.text
title = re.findall(r'<h1>(.*?)</h1>', con_html)[0]
title_hash = set_hash(title)
                # Deduplicate on the title hash: if it is already stored, the
                # rest of this province's pages were crawled on an earlier run.
                with lock:
                    existing = db.findone(
                        f'select id from shui5 where title_hash="{title_hash}"')
                if existing:
                    title_flag = True  # duplicate found
                    break
                try:
                    # '来源' = "source"
                    source = re.findall(r"<span>来源:(.*?)</span>", con_html)[0]
                    source_name = re.findall(
                        r'target=_blank>(.*?)</a>', source)[0]
                except IndexError:
                    source_name = '暂无'  # '暂无' = "not available"
                # '作者' = "author"
                author_name = re.findall(
                    r'<span class="fa">作者:(.*?)</span>', con_html)[0]
                # '人气' (hits) is injected by a <script src=...> include; fetching
                # that URL returns JS whose first integer is the hit count.
                hot_url = re.findall(
                    r'<span>人气:<script src="(.*?)"></script></script>', con_html)[0]
                hot_num = re.findall(
                    r'\d+', requests.get(url=hot_url, headers=headers).text)[0]
                # '时间' = publication time
                timer = re.findall(
                    r'<span class="m_none">时间:(.*?)</span>', con_html)[0]
                try:
                    # '摘要' = summary/abstract
                    des = re.findall(
                        r'<div class="articleDes">摘要:(.*?)</div>', con_html)[0].strip()
                    des = escape_string(des)
                except IndexError:
                    des = '暂无'  # no summary on the page
                content = re.findall(
                    r'<div class="arcContent" id="tupain">[\s\S]*?</div>', con_html)[0]
                content = escape_string(content)
                # Escape the remaining free-text fields too, so embedded quotes
                # cannot break (or inject into) the statement.
                sql = f'''insert into shui5 values(NULL, "{pro_name}", "{escape_string(title)}",
                    "{title_hash}", "{escape_string(source_name)}", "{escape_string(author_name)}",
                    "{hot_num}", "{timer}", "{des}", "{content}", now())'''
with lock:
db.insert(sql)
                    print('Stored one article')
            if title_flag:
                print('Duplicate title found; stop crawling this province')
                break
            print(f'Page {page} done, sleeping 1s')
            time.sleep(1)
if __name__ == '__main__':
lock = threading.Lock()
db = DB('127.0.0.1', 'root', 'sqsyq402', 'hello')
url = 'https://www.shui5.cn/article/DiFangCaiShuiFaGui/'
    # ls = get_pro(url)  # uncomment to re-crawl the category index
    # print(ls)
    # Cached output of get_pro(url), hardcoded to skip re-crawling it each run:
ls = [{'pro_url': 'https://www.shui5.cn/article/BeiJingShiCaiShuiFaGui/', 'pro_name': '北京市', 'page_pre': '152'}, {'pro_url': 'https://www.shui5.cn/article/ShangHaiShiCaiShuiFaGui/', 'pro_name': '上海市', 'page_pre': '153'}, {'pro_url': 'https://www.shui5.cn/article/ZhongQingShiCaiShuiFaGui/', 'pro_name': '重庆市', 'page_pre': '154'}, {'pro_url': 'https://www.shui5.cn/article/TianJinShiCaiShuiFaGui/', 'pro_name': '天津市', 'page_pre': '155'}, {'pro_url': 'https://www.shui5.cn/article/HeBeiShengCaiShuiFaGui/', 'pro_name': '河北省', 'page_pre': '158'}, {'pro_url': 'https://www.shui5.cn/article/GuangDongShengCaiShuiFaGu/', 'pro_name': '广东省', 'page_pre': '159'}, {'pro_url': 'https://www.shui5.cn/article/AnHuiShengCaiShuiFaGui/', 'pro_name': '安徽省', 'page_pre': '160'}, {'pro_url': 'https://www.shui5.cn/article/JiLinShengCaiShuiFaGui/', 'pro_name': '吉林省', 'page_pre': '161'}, {'pro_url': 'https://www.shui5.cn/article/HuBeiShengCaiShuiFaGui/', 'pro_name': '湖北省', 'page_pre': '162'}, {'pro_url': 'https://www.shui5.cn/article/XiCangZiZhiQuCaiShuiFaGui/', 'pro_name': '西藏区', 'page_pre': '163'}, {'pro_url': 'https://www.shui5.cn/article/ZheJiangShengCaiShuiFaGui/', 'pro_name': '浙江省', 'page_pre': '164'}, {'pro_url': 'https://www.shui5.cn/article/JiangSuShengCaiShuiFaGui/', 'pro_name': '江苏省', 'page_pre': '165'}, {'pro_url': 'https://www.shui5.cn/article/ShanDongShengCaiShuiFaGui/', 'pro_name': '山东省', 'page_pre': '166'}, {'pro_url': 'https://www.shui5.cn/article/SiChuanShengCaiShuiFaGui/', 'pro_name': '四川省', 'page_pre': '167'}, {'pro_url': 'https://www.shui5.cn/article/HeNanShengCaiShuiFaGui/', 'pro_name': '河南省', 'page_pre': '168'}, {'pro_url': 'https://www.shui5.cn/article/LiaoNingShengCaiShuiFaGui/', 'pro_name': '辽宁省', 'page_pre': '169'}, {'pro_url': 'https://www.shui5.cn/article/JiangXiShengCaiShuiFaGui/', 'pro_name': '江西省', 'page_pre': '170'}, {'pro_url': 'https://www.shui5.cn/article/HuNanShengCaiShuiFaGui/', 'pro_name': '湖南省', 'page_pre': '171'}, {'pro_url': 'https://www.shui5.cn/article/HeiLongJiangCaiShuiFaGui/', 'pro_name': '黑龙江', 'page_pre': '172'}, {'pro_url': 'https://www.shui5.cn/article/GuangXiZiZhiQuCaiShuiFaGu/', 'pro_name': '广西区',
'page_pre': '173'}, {'pro_url': 'https://www.shui5.cn/article/YunNanShengCaiShuiFaGui/', 'pro_name': '云南省', 'page_pre': '174'}, {'pro_url': 'https://www.shui5.cn/article/ShanXiShengCaiShuiFaGui/', 'pro_name': '陕西省', 'page_pre': '175'}, {'pro_url': 'https://www.shui5.cn/article/HaiNanShengCaiShuiFaGui/', 'pro_name': '海南省', 'page_pre': '176'}, {'pro_url': 'https://www.shui5.cn/article/GuiZhouShengCaiShuiFaGui/', 'pro_name': '贵州省', 'page_pre': '177'}, {'pro_url': 'https://www.shui5.cn/article/NingXiaZiZhiQuCaiShuiFaGu/', 'pro_name': '宁夏区', 'page_pre': '178'}, {'pro_url': 'https://www.shui5.cn/article/GanSuShengCaiShuiFaGui/', 'pro_name': '甘肃省', 'page_pre': '179'}, {'pro_url': 'https://www.shui5.cn/article/NeiMengGuzizhiquQuCaiShuiFa/', 'pro_name': '内蒙古', 'page_pre': '180'}, {'pro_url': 'https://www.shui5.cn/article/category_181/', 'pro_name': '山西省', 'page_pre': '181'}, {'pro_url': 'https://www.shui5.cn/article/QingHaiShengCaiShuiFaGui/', 'pro_name': '青海省', 'page_pre': '182'}, {'pro_url': 'https://www.shui5.cn/article/XinJiangZiZhiQuCaiShuiFaG/', 'pro_name': '新疆区', 'page_pre': '183'}, {'pro_url': 'https://www.shui5.cn/article/FuJianShengCaiShuiFaGui/', 'pro_name': '福建省', 'page_pre': '184'}, {'pro_url': 'https://www.shui5.cn/article/XiangGangDiQuCaiShuiFaGui/', 'pro_name': '香港区', 'page_pre': False}, {'pro_url': 'https://www.shui5.cn/article/TaiWanDiQuCaiShuiFaGui/', 'pro_name': '台湾区', 'page_pre': False}, {'pro_url': 'https://www.shui5.cn/article/AoMenDiQuCaiShuiFaGui/', 'pro_name': '澳门区', 'page_pre': False}, {'pro_url': 'https://www.shui5.cn/article/GuoWaiCaiShuiFaGui/', 'pro_name': '国外', 'page_pre': False}, {'pro_url': 'https://www.shui5.cn/article/DaiLianShiFaGui/', 'pro_name': '大连市', 'page_pre': '12667'}, {'pro_url': 'https://www.shui5.cn/article/QingDaoShiFaGui/', 'pro_name': '青岛市', 'page_pre': '12669'}, {'pro_url': 'https://www.shui5.cn/article/NinBoShiFaGui/', 'pro_name': '宁波市', 'page_pre': '12670'}, {'pro_url': 'https://www.shui5.cn/article/XiaMenShiFaGui/', 'pro_name': '厦门市', 'page_pre': '12672'}, {'pro_url': 'https://www.shui5.cn/article/ShenZhengShiFaGui/', 'pro_name': '深圳市', 'page_pre': '12673'}]
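    # Split the 40 provinces into 5 chunks of 8 (thread_num * thread_work_num)
    # and give each chunk its own thread.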
thread_num, thread_work_num = 5, 8
threads = [threading.Thread(target=parse_page, args=(
ls[i * thread_work_num: (i + 1) * thread_work_num], )) for i in range(thread_num)]
for t in threads:
t.start()
for t in threads:
t.join()
    print('All threads finished')
# --- Separate snippet: bulk-download image previews from stock.tuchong.com ---
import os
headers = {
'User-Agent': UserAgent().random
}
url = 'https://stock.tuchong.com/activity?availableOnly=&page=1&platform=image&search_id=7147858628085547301&size=100&sortBy=0&topic_id='
res = requests.get(url=url, headers=headers)
html = res.text
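# The page embeds its results as JSON; pull every "image_id" value out of it.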
a = re.findall(r'"image_id":"(.*?)",', html)
os.makedirs('./img', exist_ok=True)  # make sure the output folder exists
for i in a:
    # Build the CDN URL for each image id and save the raw .webp bytes.
    b = 'https://cdn9-banquan.ituchong.com/weili/image/ml/' + i + '.webp'
    ret = requests.get(b, headers=headers)
    with open('./img/' + i + '.webp', 'wb') as f:
        f.write(ret.content)
# cdn9-banquan.ituchong.com/weili/image/ml/540018839618979425.webp