【2022.8.10】写了一个用于爬取 Exchange 邮件的爬虫:使用 exchangelib 提供的接口登录邮箱、下载附件,将 hash 保存到本地用于去重,并使用多进程(multiprocessing.Pool)提升爬取速度。
    
#!/usr/bin/python3
# coding=utf8
from __future__ import print_function
import shutil
from exchangelib import Credentials, Account, Configuration, DELEGATE, FileAttachment, EWSDateTime
from multiprocessing.pool import Pool
from exchangelib.protocol import BaseProtocol
from exchangelib.protocol import NoVerifyHTTPAdapter
from urllib3.exceptions import InsecureRequestWarning
import urllib3
import time, os, sys, linecache
# Silence urllib3's InsecureRequestWarning for the whole process.
urllib3.disable_warnings(InsecureRequestWarning)
# NOTE(review): URL is not referenced by any code visible in this file.
URL = ""
# Server name handed to exchangelib's Configuration(server=...) in getAccount().
MAIL_SERVER = ""
suffix = "" # mail domain suffix; getAccount() appends "@" + suffix to usernames that do not end with it
# Tell exchangelib to use this adapter class instead of the default
# (NoVerifyHTTPAdapter is exchangelib's adapter that skips TLS certificate verification).
BaseProtocol.HTTP_ADAPTER_CLS = NoVerifyHTTPAdapter
# Browser-like User-Agent string assigned to exchangelib's BaseProtocol.
BaseProtocol.USERAGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
# NOTE(review): presumably mailbox/sender names to skip; not used by any code visible here -- confirm.
FILTER = ['admin_meical', 'e_news', 'LanguageCenter', 'rtaf_news', 'weather', "Welfare", "dict1"]
# Substring an attachment name must contain to be downloaded (checked in download_attachments()).
AttachFilter = 'ppt'
# Date window intended for filtering on datetime_received.
# NOTE(review): not applied by any active code visible in this file.
TSTART = EWSDateTime(2020, 7, 3)
TEND = EWSDateTime(2022, 7, 3)
# Shared folder (under the current working directory) that receives a copy of
# every downloaded attachment.
pwd_path = os.getcwd()
AttachDir = os.path.join(pwd_path, 'attach')
# exist_ok=True replaces the isdir()/mkdir() pair: that check-then-create
# sequence can raise FileExistsError when several worker processes re-import
# this module at the same time.
os.makedirs(AttachDir, exist_ok=True)
def R(message):
    """Return *message* wrapped in the ANSI bold bright-red escape sequence."""
    return f"\033[1;91m{message}\033[0;m"
def G(message):
    """Return *message* wrapped in the ANSI bold bright-green escape sequence."""
    return f"\033[1;92m{message}\033[0;m"
def Y(message):
    """Return *message* wrapped in the ANSI bold bright-yellow escape sequence."""
    return f"\033[1;93m{message}\033[0;m"
def B(message):
    """Return *message* wrapped in the ANSI bold bright-blue escape sequence."""
    return f"\033[1;94m{message}\033[0;m"
def PrintException():
    """Print the line number, source line and message of the active exception.

    Intended to be called from inside an ``except`` block.  The reported line
    is the one recorded in the first traceback entry, i.e. the line in the
    frame that caught the exception.  When no exception is being handled the
    function returns without printing anything.
    """
    _, exc_obj, tb = sys.exc_info()
    if tb is None:
        # No active exception: sys.exc_info() returned (None, None, None),
        # so there is no traceback to inspect.
        return
    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename
    # Refresh linecache in case the file changed on disk since it was cached.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)
    print(R('EXCEPTION IN (LINE {} "{}"): {}'.format(lineno, line.strip(), exc_obj)))
def getAccount(username, password):
    """Create an exchangelib ``Account`` for the given credentials.

    If *username* does not end with the module-level ``suffix``, ``"@" + suffix``
    is appended first.  The account is built against ``MAIL_SERVER`` with
    autodiscover disabled and delegate access.

    Args:
        username: Login name, with or without the mail suffix.
        password: Password for that login.

    Returns:
        The constructed ``Account`` object.
    """
    address = username if username.endswith(suffix) else username + "@" + suffix
    creds = Credentials(address, password)
    server_config = Configuration(server=MAIL_SERVER, credentials=creds)
    return Account(
        primary_smtp_address=address,
        config=server_config,
        autodiscover=False,
        access_type=DELEGATE,
    )
def log(user, text):
    """Append *text* plus a newline to ``log.txt`` inside the *user* directory.

    Args:
        user: Directory that holds the log file.
        text: Line to append (without trailing newline).
    """
    log_path = os.path.join(user, "log.txt")
    with open(log_path, mode="a", encoding="utf-8") as handle:
        handle.write(text + "\n")
def getinfo(user, account):
    """Print the mailbox statistics of *account*.

    Prints the inbox total and unread counts prefixed by *user*, followed by
    the total item counts of the trash, outbox and sent folders.

    Args:
        user: Label printed in front of the inbox summary.
        account: Object exposing ``inbox``, ``trash``, ``outbox`` and ``sent``
            folders with ``total_count`` (and ``unread_count`` for the inbox).
    """
    inbox_summary = "[*]Found {} mails in inbox, {} unread".format(
        account.inbox.total_count, account.inbox.unread_count)
    print(user, inbox_summary)
    for folder_name in ("trash", "outbox", "sent"):
        print(folder_name, getattr(account, folder_name).total_count)
def mkuserdir(user):
    """Create the directory *user*, or print a notice if it already exists.

    Args:
        user: Path of the per-account directory.
    """
    if os.path.isdir(user):
        print(B("[*]Dir %s already exists" % user))
        return
    os.mkdir(user)
def _attachment_filename(raw_name, max_len=60, keep=55):
    """Return a safe on-disk file name derived from an attachment name.

    Only the last path component is kept, so a name such as ``../../x.ppt``
    cannot escape the target directory.  Names longer than *max_len* have
    their stem cut to *keep* characters while the extension is preserved;
    names without any dot are simply truncated.
    """
    name = os.path.basename(raw_name.replace("\\", "/"))
    if len(name) <= max_len:
        return name
    stem, ext = os.path.splitext(name)
    return stem[:keep] + ext


def download_attachments(items, user):
    """Save the matching file attachments of *items* below the *user* directory.

    For every item that has attachments, each ``FileAttachment`` whose name
    contains ``AttachFilter`` is written to
    ``<user>/<sender local part>_<received date>_<importance>/`` and then
    copied into ``AttachDir``.  Every saved attachment is recorded through
    ``log`` and echoed to stdout.

    Args:
        items: Iterable of mail items exposing ``sender``,
            ``datetime_received``, ``importance``, ``has_attachments`` and
            ``attachments``.
        user: Per-account directory; also used in the log and console output.

    A failure while handling one item is reported with ``PrintException`` and
    the remaining items are still processed.
    """
    for item in items:
        try:
            if not item.has_attachments:
                continue
            sender = item.sender.email_address.split("@")[0]
            received = str(item.datetime_received).split()[0]
            item_dir = os.path.join(user, "%s_%s_%s" % (sender, received, item.importance))
            for attachment in item.attachments:
                if not isinstance(attachment, FileAttachment):
                    continue
                # A missing name cannot match the filter (and `in None` would raise).
                if not attachment.name or AttachFilter not in attachment.name:
                    continue
                attach_name = _attachment_filename(attachment.name)
                if not attach_name:
                    # Name consisted only of a directory part; nothing to write to.
                    continue
                os.makedirs(item_dir, exist_ok=True)
                attach_path = os.path.join(item_dir, attach_name)
                # attachment.fp streams the content, so the file is copied in
                # chunks instead of being loaded into memory at once.
                with open(attach_path, 'wb') as f, attachment.fp as fp:
                    shutil.copyfileobj(fp, f)
                shutil.copy(attach_path, AttachDir)
                log(user, '[+]Attachment saved: ' + attachment.name)
                print(G("[+]Saved attachment: %s for user: %s" % (attachment.name, user)))
        except Exception:
            # Best effort per item: report the problem instead of hiding it,
            # then continue with the next item.
            PrintException()
def getinbox(account, user):
    """Download the matching attachments of every item in the inbox.

    Args:
        account: Account object whose ``inbox`` folder is scanned.
        user: Per-account directory passed on to ``download_attachments``.
    """
    # NOTE: the TSTART/TEND date window is not applied here; every inbox item
    # is scanned.
    items = account.inbox.all()
    download_attachments(items, user)
def gettrash(account, user):
    """Download the matching attachments of every item in the trash folder.

    Args:
        account: Account object whose ``trash`` folder is scanned.
        user: Per-account directory passed on to ``download_attachments``.

    Returns:
        Always 0.
    """
    # Read the trash folder itself (previously this scanned the inbox again).
    items = account.trash.all()
    download_attachments(items, user)
    return 0
def getoutbox(account, user):
    """Download the matching attachments of every item in the outbox folder.

    Args:
        account: Account object whose ``outbox`` folder is scanned.
        user: Per-account directory passed on to ``download_attachments``.

    Returns:
        Always 0.
    """
    # Read the outbox folder itself (previously this scanned the inbox again).
    items = account.outbox.all()
    download_attachments(items, user)
    return 0
def getsent(account, user):
    """Download the matching attachments of every item in the sent folder.

    Args:
        account: Account object whose ``sent`` folder is scanned.
        user: Per-account directory passed on to ``download_attachments``.

    Returns:
        Always 0.
    """
    # Read the sent folder itself (previously this scanned the inbox again).
    items = account.sent.all()
    download_attachments(items, user)
    return 0
def usermail(user, passwd):
    """Log in as *user* and download attachments from the account's folders.

    Creates the per-account directory, tries to log in (two attempts, 20 s
    pause after each failure) and then runs ``getinbox``, ``gettrash``,
    ``getoutbox`` and ``getsent`` (two attempts, 10 s pause after each
    failure).

    Args:
        user: Account name; also the name of the per-account directory.
        passwd: Password for the account.

    Returns:
        True when all four folder passes completed, False when the login
        failed on every attempt or the download still failed after the
        retries.
    """
    mkuserdir(user)
    max_tries = 2

    for _ in range(max_tries):
        try:
            account = getAccount(user, passwd)
            break
        except Exception:
            # Best effort: any login problem just triggers a delayed retry.
            time.sleep(20)
    else:
        # Every login attempt failed; without an account there is nothing
        # to download, so give up right away.
        return False

    for _ in range(max_tries):
        try:
            getinbox(account, user)
            gettrash(account, user)
            getoutbox(account, user)
            getsent(account, user)
            return True
        except Exception:
            # Best effort: wait briefly and restart the whole folder pass.
            time.sleep(10)
    return False
def account_entry(fp):
    """Run ``usermail`` for every ``user password`` line in *fp*.

    Args:
        fp: Iterable of text lines, each holding a username followed by
            whitespace and a password.

    Blank lines and lines without both fields are skipped so that one
    malformed entry does not abort the rest of the batch.  Only the first run
    of whitespace separates the fields, so a password may itself contain
    spaces.
    """
    for line in fp:
        fields = line.split(None, 1)
        if len(fields) != 2:
            continue
        user, passwd = fields[0], fields[1].strip()
        usermail(user, passwd)
if __name__ == '__main__':
    # new.txt holds one "user password" pair per line.  It can be produced
    # from a "user:hash:password" dump (ori.txt) by keeping the user and
    # password fields.
    with open("new.txt", encoding="utf-8") as account_file:
        accounts = [line.strip() for line in account_file if line.strip()]

    # Split the accounts into batches of `thread_jobs`; each batch is handled
    # by one worker process of the pool (e.g. 1600 accounts -> 20 workers).
    thread_jobs = 80
    all_list = [accounts[start:start + thread_jobs]
                for start in range(0, len(accounts), thread_jobs)]
    thread_num = len(all_list)

    if thread_num == 0:
        # Pool(processes=0) would raise ValueError, so stop early.
        print(R("[-]No accounts found in new.txt"))
    else:
        print(G("[*]Thread num is: %s" % str(thread_num)))
        # The context manager releases the worker processes once map() returns.
        with Pool(processes=thread_num) as pool:
            pool.map(account_entry, all_list)