开启多线程-爬取柯林斯词典

import re
import random
import requests

from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count

# ------------------------- Build the English word list --------------------------------------

# Matches any run of hyphens, ampersands, parentheses, slashes or dots.
# worker() skips every entry in which this pattern is found.
rex = re.compile(r'[-&()/\.]+')


def bar(url, timeout=10):
    """Download a browse page and return the links inside its word list.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        The ``<a>`` tags found inside the first
        ``<ul class="columns2 browse-list">`` element, or an empty list when
        the page contains no such element.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Without a timeout a stalled connection would block the calling thread
    # indefinitely.
    response = requests.get(url=url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # ``attrs`` is a mapping of attribute name -> wanted value; the previous
    # ``{'class', '...'}`` was a set literal (comma typed instead of colon).
    ul_obj = soup.find(name='ul', attrs={'class': 'columns2 browse-list'})
    if ul_obj is None:
        # Page layout differs (error page, empty listing, ...): nothing to
        # return rather than failing with AttributeError on None.
        return []
    return ul_obj.find_all(name='a')


def worker(url):
    """Collect every word for one starting letter and write it to a file.

    The index page for the letter, e.g.
    https://www.collinsdictionary.com/browse/english/words-starting-with-a,
    links to sub-pages; each sub-page in turn lists the individual entries.

    Args:
        url: A ``(letter, file)`` pair despite the parameter name: the
            letter selects the index page, and ``file`` is an open, writable
            text file that receives one accepted entry per line.

    Raises:
        requests.RequestException: If the index page itself cannot be
            fetched. Failures on individual sub-pages are reported with
            ``print`` and skipped.
    """
    letter, out_file = url
    index_url = 'https://www.collinsdictionary.com/browse/english/words-starting-with-{}'.format(letter)
    for item in bar(url=index_url):
        href = item.get('href')
        if not href:
            # An anchor without a target cannot be followed.
            continue
        try:
            entries = bar(href)
        except requests.RequestException as exc:
            # One unreachable sub-page should not abort the whole letter.
            print('skipping {}: {}'.format(href, exc))
            continue
        for entry in entries:
            res = entry.text
            # Keep only entries longer than two characters that contain none
            # of the punctuation matched by ``rex``; the cheap length check
            # runs first.
            if len(res) > 2 and not rex.search(res):
                print(res)
                out_file.write('{}\n'.format(res))

def spider_collins():
    """Crawl the Collins dictionary word lists and append them to ``w.txt``.

    The site is walked three levels deep:
      1. one index page per letter, 'a' through 'z' (26 pages);
      2. the sub-pages linked from each index page;
      3. the individual words or phrases listed on every sub-page.

    One ``worker`` task is submitted per letter to a thread pool of
    ``cpu_count() * 5`` threads. All tasks write to the same file object,
    which is opened in append mode with UTF-8 encoding.

    A task that raises does not stop the others; its error is printed once
    all tasks have finished.
    """
    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]  # 'a' ~ 'z'
    # ``with`` guarantees the file is closed even if submitting a task fails.
    with open('w.txt', 'a', encoding='utf8') as f:
        # Leaving the executor's ``with`` block waits for every task, so the
        # file is still open for as long as any worker may write to it.
        with ThreadPoolExecutor(cpu_count() * 5) as pool:
            futures = {letter: pool.submit(worker, (letter, f)) for letter in letters}
        # Exceptions raised inside a task are stored on its future; report
        # them instead of letting them vanish silently.
        for letter, future in futures.items():
            error = future.exception()
            if error is not None:
                print('words starting with {!r} failed: {!r}'.format(letter, error))

 

posted on 2019-08-29 08:50  江湖乄夜雨  阅读(287)  评论(0)  编辑  收藏  举报