# python 爬虫 (web crawler scripts)

import requests
import re
import time
import hashlib
import threading
from pymysql.converters import escape_string
from fake_useragent import UserAgent

from mylib.module import *

# Default request headers; a randomized User-Agent reduces the chance of
# being blocked by the target site.
headers = {'User-Agent': UserAgent().random}

def set_hash(string):
    """Return the hexadecimal MD5 digest of *string* (UTF-8 encoded).

    Used as a stable fingerprint of an article title for de-duplication.
    """
    return hashlib.md5(string.encode('utf-8')).hexdigest()


def get_pro(url):
    """Scrape the province index page and return one dict per province.

    Each dict has keys:
      - ``pro_url``:  absolute URL of the province's article list
      - ``pro_name``: province display name
      - ``page_pre``: pagination prefix (str) used to build page 2+ URLs,
                      or ``False`` when the province has a single page

    Parameters
    ----------
    url : str
        URL of the index page listing all provinces.
    """
    ls = []
    res = requests.get(url=url, headers=headers)
    res.encoding = 'utf-8'
    html = res.text

    # The province links all live inside the first "right2a" div.
    address = re.findall(r'<div class="right2a">([\s\S]*?)</div>', html)[0]
    href = re.findall(r'href="(.*?)"', address)
    cname = re.findall(r'">(.*?)</a>', address)

    for add_url, name in zip(href, cname):
        dt = {'pro_url': add_url}
        add_res = requests.get(url=add_url, headers=headers)
        add_res.encoding = 'utf-8'
        add_html = add_res.text

        try:
            # "末页" (last page) link reveals the pagination prefix,
            # e.g. ".../152_9.html" -> "152".
            a_href = re.findall(r'>下一页</a><a href="(.*?)">末页</a>', add_html)[0]
            page_prefix = a_href.split('/')[-1].split('_')[0]
            time.sleep(0.2)
        except IndexError:
            # No "next page" link: the province fits on a single page.
            page_prefix = False

        dt['pro_name'] = name
        dt['page_pre'] = page_prefix
        ls.append(dt)

        print(name)

    return ls


def parse_page(ls, total_page=5):
    """Crawl article pages for each province in *ls* and store them in MySQL.

    For every province dict (see ``get_pro``), fetches up to *total_page*
    listing pages, follows each article link, extracts title/source/author/
    popularity/date/summary/content and inserts a row into table ``shui5``.
    Articles already present (matched by MD5 of the title) stop the crawl
    for that province, assuming older pages were already stored.

    Parameters
    ----------
    ls : list[dict]
        Province descriptors with 'pro_url', 'pro_name', 'page_pre'.
    total_page : int
        Maximum number of listing pages to crawl per province.

    Relies on module-level globals ``db`` (DB wrapper) and ``lock``
    (threading.Lock) — DB access is serialized across worker threads.
    """
    for item in ls:
        page_pre = item['page_pre']
        pro_url = item['pro_url']
        pro_name = item['pro_name']

        title_flag = False  # becomes True when a duplicate title is seen
        for page in range(1, total_page + 1):
            if page == 1:
                p = ''
            else:
                if not page_pre:
                    break  # single-page province: nothing beyond page 1
                p = page_pre + '_{}.html'.format(page)

            url = pro_url + p
            # Fresh random UA per listing page (shadows the module default
            # on purpose) to reduce the chance of being blocked.
            headers = {
                'User-Agent': UserAgent().random
            }
            res = requests.get(url=url, headers=headers)
            res.encoding = 'utf-8'
            html = res.text
            a_ls = re.findall(r'<div class="xwt2_a">(.*?)</div>', html)
            for a in a_ls:
                href = re.findall(r'href="(.*?)"', a)[0]
                con_res = requests.get(url=href, headers=headers)
                con_res.encoding = 'utf-8'
                con_html = con_res.text

                title = re.findall(r'<h1>(.*?)</h1>', con_html)[0]
                title_hash = set_hash(title)

                # De-duplicate on title hash (hex digest — safe to inline).
                with lock:
                    user_id = db.findone(
                        f'select id from shui5 where title_hash="{title_hash}"')
                if user_id:
                    title_flag = True  # duplicate found
                    break

                try:
                    source = re.findall(r"<span>来源:(.*?)</span>", con_html)[0]
                    source_name = re.findall(
                        r'target=_blank>(.*?)</a>', source)[0]
                except IndexError:
                    source_name = '暂无'
                author_name = re.findall(
                    r'<span class="fa">作者:(.*?)</span>', con_html)[0]
                hot_url = re.findall(
                    r'<span>人气:<script src="(.*?)"></script></script>', con_html)[0]
                hot_num = re.findall(r'\d+', requests.get(url=hot_url, headers=headers).text)[0]
                timer = re.findall(
                    r'<span class="m_none">时间:(.*?)</span>', con_html)[0]
                try:
                    des = re.findall(
                        r'<div class="articleDes">摘要:(.*?)</div>', con_html)[0].strip()
                    des = escape_string(des)
                except IndexError:
                    des = '暂无'
                content = re.findall(
                    r'<div class="arcContent" id="tupain">[\s\S]*?</div>', con_html)[0]
                content = escape_string(content)

                # Escape every free-text field before interpolation — a quote
                # in a title/author/source would otherwise break the INSERT.
                title = escape_string(title)
                source_name = escape_string(source_name)
                author_name = escape_string(author_name)

                sql = f'''insert into shui5 values(NULL, "{pro_name}", "{title}", "{title_hash}", "{source_name}", 
                "{author_name}", "{hot_num}", "{timer}", "{des}", "{content}", now())'''
                with lock:
                    db.insert(sql)

                print('存储一条')

            if title_flag:
                print('标题重复,停止爬取')
                break

            print(f'第{page}页爬完, 等待1s')
            time.sleep(1)


if __name__ == '__main__':
    # Shared globals read by parse_page() from every worker thread.
    lock = threading.Lock()
    db = DB('127.0.0.1', 'root', 'sqsyq402', 'hello')
    url = 'https://www.shui5.cn/article/DiFangCaiShuiFaGui/'
    # Snapshot of get_pro(url) output, inlined to avoid re-crawling the
    # index page on every run.
    ls = [
        {'pro_url': 'https://www.shui5.cn/article/BeiJingShiCaiShuiFaGui/', 'pro_name': '北京市', 'page_pre': '152'},
        {'pro_url': 'https://www.shui5.cn/article/ShangHaiShiCaiShuiFaGui/', 'pro_name': '上海市', 'page_pre': '153'},
        {'pro_url': 'https://www.shui5.cn/article/ZhongQingShiCaiShuiFaGui/', 'pro_name': '重庆市', 'page_pre': '154'},
        {'pro_url': 'https://www.shui5.cn/article/TianJinShiCaiShuiFaGui/', 'pro_name': '天津市', 'page_pre': '155'},
        {'pro_url': 'https://www.shui5.cn/article/HeBeiShengCaiShuiFaGui/', 'pro_name': '河北省', 'page_pre': '158'},
        {'pro_url': 'https://www.shui5.cn/article/GuangDongShengCaiShuiFaGu/', 'pro_name': '广东省', 'page_pre': '159'},
        {'pro_url': 'https://www.shui5.cn/article/AnHuiShengCaiShuiFaGui/', 'pro_name': '安徽省', 'page_pre': '160'},
        {'pro_url': 'https://www.shui5.cn/article/JiLinShengCaiShuiFaGui/', 'pro_name': '吉林省', 'page_pre': '161'},
        {'pro_url': 'https://www.shui5.cn/article/HuBeiShengCaiShuiFaGui/', 'pro_name': '湖北省', 'page_pre': '162'},
        {'pro_url': 'https://www.shui5.cn/article/XiCangZiZhiQuCaiShuiFaGui/', 'pro_name': '西藏区', 'page_pre': '163'},
        {'pro_url': 'https://www.shui5.cn/article/ZheJiangShengCaiShuiFaGui/', 'pro_name': '浙江省', 'page_pre': '164'},
        {'pro_url': 'https://www.shui5.cn/article/JiangSuShengCaiShuiFaGui/', 'pro_name': '江苏省', 'page_pre': '165'},
        {'pro_url': 'https://www.shui5.cn/article/ShanDongShengCaiShuiFaGui/', 'pro_name': '山东省', 'page_pre': '166'},
        {'pro_url': 'https://www.shui5.cn/article/SiChuanShengCaiShuiFaGui/', 'pro_name': '四川省', 'page_pre': '167'},
        {'pro_url': 'https://www.shui5.cn/article/HeNanShengCaiShuiFaGui/', 'pro_name': '河南省', 'page_pre': '168'},
        {'pro_url': 'https://www.shui5.cn/article/LiaoNingShengCaiShuiFaGui/', 'pro_name': '辽宁省', 'page_pre': '169'},
        {'pro_url': 'https://www.shui5.cn/article/JiangXiShengCaiShuiFaGui/', 'pro_name': '江西省', 'page_pre': '170'},
        {'pro_url': 'https://www.shui5.cn/article/HuNanShengCaiShuiFaGui/', 'pro_name': '湖南省', 'page_pre': '171'},
        {'pro_url': 'https://www.shui5.cn/article/HeiLongJiangCaiShuiFaGui/', 'pro_name': '黑龙江', 'page_pre': '172'},
        {'pro_url': 'https://www.shui5.cn/article/GuangXiZiZhiQuCaiShuiFaGu/', 'pro_name': '广西区', 'page_pre': '173'},
        {'pro_url': 'https://www.shui5.cn/article/YunNanShengCaiShuiFaGui/', 'pro_name': '云南省', 'page_pre': '174'},
        {'pro_url': 'https://www.shui5.cn/article/ShanXiShengCaiShuiFaGui/', 'pro_name': '陕西省', 'page_pre': '175'},
        {'pro_url': 'https://www.shui5.cn/article/HaiNanShengCaiShuiFaGui/', 'pro_name': '海南省', 'page_pre': '176'},
        {'pro_url': 'https://www.shui5.cn/article/GuiZhouShengCaiShuiFaGui/', 'pro_name': '贵州省', 'page_pre': '177'},
        {'pro_url': 'https://www.shui5.cn/article/NingXiaZiZhiQuCaiShuiFaGu/', 'pro_name': '宁夏区', 'page_pre': '178'},
        {'pro_url': 'https://www.shui5.cn/article/GanSuShengCaiShuiFaGui/', 'pro_name': '甘肃省', 'page_pre': '179'},
        {'pro_url': 'https://www.shui5.cn/article/NeiMengGuzizhiquQuCaiShuiFa/', 'pro_name': '内蒙古', 'page_pre': '180'},
        {'pro_url': 'https://www.shui5.cn/article/category_181/', 'pro_name': '山西省', 'page_pre': '181'},
        {'pro_url': 'https://www.shui5.cn/article/QingHaiShengCaiShuiFaGui/', 'pro_name': '青海省', 'page_pre': '182'},
        {'pro_url': 'https://www.shui5.cn/article/XinJiangZiZhiQuCaiShuiFaG/', 'pro_name': '新疆区', 'page_pre': '183'},
        {'pro_url': 'https://www.shui5.cn/article/FuJianShengCaiShuiFaGui/', 'pro_name': '福建省', 'page_pre': '184'},
        {'pro_url': 'https://www.shui5.cn/article/XiangGangDiQuCaiShuiFaGui/', 'pro_name': '香港区', 'page_pre': False},
        {'pro_url': 'https://www.shui5.cn/article/TaiWanDiQuCaiShuiFaGui/', 'pro_name': '台湾区', 'page_pre': False},
        {'pro_url': 'https://www.shui5.cn/article/AoMenDiQuCaiShuiFaGui/', 'pro_name': '澳门区', 'page_pre': False},
        {'pro_url': 'https://www.shui5.cn/article/GuoWaiCaiShuiFaGui/', 'pro_name': '国外', 'page_pre': False},
        {'pro_url': 'https://www.shui5.cn/article/DaiLianShiFaGui/', 'pro_name': '大连市', 'page_pre': '12667'},
        {'pro_url': 'https://www.shui5.cn/article/QingDaoShiFaGui/', 'pro_name': '青岛市', 'page_pre': '12669'},
        {'pro_url': 'https://www.shui5.cn/article/NinBoShiFaGui/', 'pro_name': '宁波市', 'page_pre': '12670'},
        {'pro_url': 'https://www.shui5.cn/article/XiaMenShiFaGui/', 'pro_name': '厦门市', 'page_pre': '12672'},
        {'pro_url': 'https://www.shui5.cn/article/ShenZhengShiFaGui/', 'pro_name': '深圳市', 'page_pre': '12673'},
    ]

    # Partition ls into chunks of thread_work_num provinces per thread.
    # Thread count is derived from len(ls) so no item is ever dropped
    # (the old fixed 5x8 layout silently ignored anything past index 39).
    thread_work_num = 8
    thread_num = (len(ls) + thread_work_num - 1) // thread_work_num
    threads = [
        threading.Thread(
            target=parse_page,
            args=(ls[i * thread_work_num: (i + 1) * thread_work_num], ))
        for i in range(thread_num)
    ]

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print('线程结束')

from lxml import etree
import re, requests
from fake_useragent import UserAgent

import os

headers = {
    'User-Agent': UserAgent().random
}

# Tuchong stock-photo search results (page 1, 100 items).
url = 'https://stock.tuchong.com/activity?availableOnly=&page=1&platform=image&search_id=7147858628085547301&size=100&sortBy=0&topic_id='
res = requests.get(url=url, headers=headers)
html = res.text

# Ensure the output directory exists — the original crashed with
# FileNotFoundError when ./img was missing.
os.makedirs('./img', exist_ok=True)

# Image IDs are embedded as "image_id":"..." in the page's inline JSON.
a = re.findall(r'"image_id":"(.*?)",', html)
for i in a:
    # CDN URL pattern observed for full-size webp images.
    b = 'https://cdn9-banquan.ituchong.com/weili/image/ml/' + i + '.webp'
    ret = requests.get(b, headers=headers)
    with open('./img/' + i + '.webp', 'wb') as f:
        f.write(ret.content)

# cdn9-banquan.ituchong.com/weili/image/ml/540018839618979425.webp
# posted @ 2023-07-18 10:30  hacker_dvd  阅读(25)  评论(0)  收藏  举报