telethon代码运行

import asyncio
import datetime
import json

import pymongo
import requests
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from redis import Redis
from telethon import TelegramClient
# https://my.telegram.org/apps
from telethon.utils import get_display_name

# Basic connection settings for the database backend.
db_configs = {
    "type": "mongo",
    "host": "127.0.0.1",
    "port": "27017",  # kept as a string; it is interpolated into the connection URI
    "user": "",
    "password": "",
    "db_name": "new_spider",
}


class Mongo():
    """Thin wrapper around a pymongo client bound to the configured database.

    Connection settings are read from the module-level ``db_configs`` mapping.
    """

    def __init__(self):
        """Create the client and select the target database.

        Credentials are applied only when both a user name and a password are
        configured; otherwise the connection is unauthenticated.
        """
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config key is "password"; "passwd" is still honoured as a
        # fallback for configurations written against the previous lookup.
        self.password = db_configs.get("password") or db_configs.get("passwd")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() no longer exists in PyMongo 4, so the
            # credentials are handed to the client instead. authSource keeps
            # authentication on the same database as before.
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(f'mongodb://{self.host}:{self.port}', **client_options)
        self.db = self.client[self.db_name]

    def update(self, item, col="tg_spider"):
        """Upsert one document, or each document of a list, keyed by ``message_id``.

        Args:
            item: A single document (dict) or a list of documents. Every
                document must contain a ``message_id`` key.
            col: Name of the target collection.

        Raises:
            KeyError: If a document has no ``message_id``.
        """
        items = item if isinstance(item, list) else [item]
        if not items:
            return

        coll = self.db[col]
        for each_item in items:
            coll.update_one({"message_id": each_item["message_id"]}, {'$set': each_item}, upsert=True)


# Redis connection; the code below stores the last processed message id of
# each chat under the key "tg_cache_id_<chat id>".
redis_cli = Redis.from_url("redis://@localhost:6379")
# Date from which history is read for a chat that has no cached message id yet.
last_offset_date = datetime.datetime.strptime("2022-10-01", "%Y-%m-%d")  # start date
# NOTE(review): `xxxx` / `xxxxxx` look like redacted placeholders; presumably
# they must be replaced with the API credentials from https://my.telegram.org/apps.
api_id = xxxx

api_hash = xxxxxx

# NOTE(review): appears unused in the visible code; load_history_to_save binds
# a local variable with the same name.
channel_dict = {}

# Telegram client using the session name 'ztelegrm'.
client = TelegramClient('ztelegrm', api_id, api_hash, connection_retries=15, retry_delay=3)
# NOTE(review): redacted placeholder; presumably the webhook URL for robot_warning.
robot_url = xxxxx

# Shared MongoDB wrapper used to persist the collected messages.
mongo_cli = Mongo()


def robot_warning(text, url, timeout=10):
    """Post a plain-text message to a WeCom (Enterprise WeChat) robot webhook.

    Args:
        text: Message content. A dict or list is first serialised to a JSON
            string with non-ASCII characters kept as-is.
        url: Webhook URL the message is posted to.
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        The ``requests.Response`` returned by the POST.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    if isinstance(text, (dict, list)):
        text = json.dumps(text, ensure_ascii=False)
    headers = {
        "Content-Type": "application/json"
    }
    data = {
        "msgtype": "text",
        "text": {
            "content": text,
        }
    }
    return requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)


async def format_message(message, chat_id):
    """Turn one Telegram message into the document that is stored in MongoDB.

    As a side effect the id of the inspected message is written to Redis
    under ``tg_cache_id_<chat_id>``, for kept and skipped messages alike.

    Args:
        message: Message object as produced by ``client.iter_messages``.
        chat_id: Id of the chat the message belongs to.

    Returns:
        A dict with the keys ``timestamp``, ``sender``, ``channel``,
        ``channel_id``, ``text`` and ``message_id``, or ``None`` when the
        message has no text or carries media.
    """
    cursor_key = f"tg_cache_id_{chat_id}"

    # Only plain-text messages are kept: a message without text, or with any
    # media attached, is skipped.
    if not message.text or message.media:
        # Skipped messages must move the cursor too; otherwise a batch made
        # up only of skipped messages would be fetched again on every run.
        redis_cli.set(cursor_key, message.id)
        return None

    chat = await message.get_chat()
    chat_display_name = get_display_name(chat)
    sender_user = await message.get_sender()

    redis_cli.set(cursor_key, message.id)

    return {
        # strftime("%s") is a non-portable platform extension and ignores the
        # tzinfo of aware datetimes; timestamp() gives the true epoch seconds.
        "timestamp": str(int(message.date.timestamp())),
        "sender": {
            # getattr with a default also covers a missing sender (None).
            "username": getattr(sender_user, 'username', ''),
            "firstName": getattr(sender_user, 'first_name', ''),
            "lastName": getattr(sender_user, 'last_name', '')
        },
        "channel": chat_display_name,
        "channel_id": chat_id,
        "text": message.text,
        "message_id": message.id
    }


async def get_channel_dict():
    """Yield an ``(id, name)`` pair for every dialog of the module-level client.

    Despite the name, no dict is built: the pairs are produced lazily, one
    per dialog returned by ``client.iter_dialogs()``.
    """
    async for dialog in client.iter_dialogs():
        yield dialog.entity.id, dialog.name


async def _save_channel_batch(channel_id, channel_name):
    """Fetch the next batch (at most 50 messages) of one chat and upsert it into MongoDB."""
    cached_id = redis_cli.get(f"tg_cache_id_{channel_id}")
    print(channel_id, channel_name)
    entity = await client.get_entity(channel_id)
    # Pause between chats (presumably to stay clear of Telegram rate limits).
    await asyncio.sleep(10)

    if cached_id is None:
        # First run for this chat: start at the configured date.
        query = {"offset_date": last_offset_date}
    else:
        # Resume after the last message id remembered in Redis.
        query = {"min_id": int(cached_id)}

    message_list = []
    # reverse=True walks from older to newer messages, so message ids grow.
    # Each run reads at most 50 messages per chat.
    async for message in client.iter_messages(entity, reverse=True, limit=50, **query):
        doc_data = await format_message(message, channel_id)
        if doc_data:
            message_list.append(doc_data)

    mongo_cli.update(message_list)
    print("发送完成")


async def load_history_to_save():
    """Store the next batch of messages for every dialog of the client.

    Each chat is handled independently: an error while processing one chat is
    reported and the remaining chats are still processed. Previously a single
    failing chat aborted the run, so every chat listed after it was never
    reached on any scheduled run.
    """
    async for channel_id, channel_name in get_channel_dict():
        try:
            await _save_channel_batch(channel_id, channel_name)
        except asyncio.CancelledError:
            # Never swallow task cancellation.
            raise
        except Exception as exc:
            print(f"channel {channel_id} ({channel_name}) failed: {exc!r}")


def start():
    """Run the scraper: connect the client, schedule the job, loop forever.

    The history job fires immediately and then every five minutes, with at
    most one instance running at a time.
    """
    with client:
        job_scheduler = AsyncIOScheduler()
        job_scheduler.add_job(
            load_history_to_save,
            trigger='interval',
            minutes=5,
            max_instances=1,
            next_run_time=datetime.datetime.now(),
        )
        job_scheduler.start()
        client.loop.run_forever()


# Start the scraper only when the file is executed as a script.
if __name__ == '__main__':
    start()

posted @ 2022-12-20 16:30  公众号python学习开发  阅读(438)  评论(0)  编辑  收藏  举报