#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
# File : english-001.py
# Time :2024/1/2 20:37
# Author :lrtao2010
# version :python 3.10.1
# Description:记录指定英文字母数量的英语单词
"""
#导入模块
import os
import random
import re
import time

import requests  # HTTP client used to download the pages
# Matches one result row and captures the text of its single cell as "name".
# re.S lets "." match newlines, so a row that spans several lines still matches.
_WORD_ROW_RE = re.compile(
    r'<tr data-key=".*?"><td>(?P<name>.*?)</td></tr>', re.S
)


def spider(url, output_path="./shuju/yingyu-3.txt"):
    """Download one listing page and append the words found on it to a file.

    Every ``<tr data-key="..."><td>...</td></tr>`` row in the response body
    contributes one line to the output file. Nothing is written (and the file
    is not created) when the page contains no matching rows.

    Args:
        url: Address of the page to download.
        output_path: File the words are appended to, UTF-8 encoded. Its
            parent directory is created if it does not exist yet.

    Raises:
        requests.RequestException: On connection errors, timeouts, or when
            the server answers with a 4xx/5xx status.
        OSError: If the output file cannot be created or written.
    """
    # Custom request headers.
    my_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        # 'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Host': 'XXXX.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
    }
    # timeout is (connect timeout, read timeout) in seconds.
    resp = requests.get(url, headers=my_headers, timeout=(30, 30))
    # Fail loudly on an HTTP error instead of parsing the error page and
    # silently recording nothing for this URL.
    resp.raise_for_status()

    lines = [
        "|".join(match.groupdict().values()) + "\n"
        for match in _WORD_ROW_RE.finditer(resp.text)
    ]
    if not lines:
        return

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Open the file once per page rather than once per row; append mode keeps
    # the words collected from earlier pages.
    with open(output_path, 'a', encoding='utf-8') as out_file:
        out_file.writelines(lines)
# Main program.
# NOTE: the word length (3) appears both in the URL below and in the output
# file name used by spider(); change them together.
if __name__ == '__main__':
    last_page = 107
    failed_pages = []
    for page in range(1, last_page + 1):
        url = f"https://XXXX.com/zh-cn/find-english-words-by-length/any/3/{page}"
        print(url)
        try:
            spider(url)
        except requests.RequestException as exc:
            # One bad page must not abort the whole crawl: the output file is
            # opened in append mode, so a restart from page 1 would duplicate
            # every word already saved.
            print(f"failed to fetch page {page}: {exc}")
            failed_pages.append(page)
        # Pause 2 or 3 seconds between requests; there is nothing to wait for
        # after the final page.
        if page != last_page:
            time.sleep(random.randint(2, 3))
    if failed_pages:
        print(f"pages that need to be fetched again: {failed_pages}")