wordsinasentence.com 单词英英翻译 17987个 含获取nodejs代码

wordsinasentence.com 单词英英翻译 17987个
这个网站的英英单词释义非常不错,我把它们汇总成了 Markdown,方便查询

官方在线查询地址
https://wordsinasentence.com/vocabulary-word-list/

由于直接贴上来,页面就崩了,所以改为附件了,本地查阅吧
https://files.cnblogs.com/files/pengchenggang/wordsinasentence-wordsArr.zip?t=1722836509&download=true

wordsinasentence.js

// wordsinasentence.com 单词的英语翻译
const { getPageListUrl, loadData, getTxt, saveData } = require('../getHtmlUtils.js')
const axios = require("axios")

// Inclusive range of list-page numbers: step1 downloads these pages and step2 parses them.
const startNum = 1
const endNum = 37
const rootPath = 'wordsIn/' // root output directory; must be created manually
const folderPath = 'wordsinasentence/' // sub-directory for the list pages; must be created manually (author's note: call it "list" next time)
const detailPath = 'details/' // sub-directory for the detail pages; must be created manually

/**
 * Step 1: download every vocabulary list page (startNum..endNum) into
 * rootPath + folderPath by delegating to getPageListUrl.
 */
const step1 = () => {
  // `$num` is the page-number placeholder handed to getPageListUrl.
  const listUrlTemplate = 'https://wordsinasentence.com/vocabulary-word-list/?_page=$num'
  const listDir = rootPath + folderPath
  getPageListUrl(listUrlTemplate, startNum, endNum, listDir)
}

/**
 * Step 2: read the locally saved list pages, extract the [link, word] pairs
 * from each one, and store the combined array as rootPath + 'linkArr.json'.
 */
const step2 = () => {
  const collected = []
  let page = startNum
  while (page <= endNum) {
    const pageHtml = loadData(`${rootPath}${folderPath}${page}.html`)
    // Narrow the page down to the word-list section before matching anchors.
    const listSection = dataFilter(pageHtml)
    for (const pair of txtReg(listSection)) {
      collected.push(pair)
    }
    page += 1
  }
  saveData(rootPath + 'linkArr.json', JSON.stringify(collected))
}

/**
 * Extract every `<a href="...">text</a>` anchor from an HTML fragment.
 *
 * @param {string} txt - HTML fragment to scan.
 * @returns {Array<[string, string]>} One [href, trimmed link text] pair per anchor, in document order.
 */
const txtReg = txt => {
  const anchorPattern = /<a href="([^"]+)">([^<]+)<\/a>/gs
  const pairs = []
  for (const [, href, label] of txt.matchAll(anchorPattern)) {
    pairs.push([href, label.trim()])
  }
  return pairs
}

/**
 * Cut a list page down to the section between the content-view wrapper and
 * the pagination wrapper, using getTxt with those two markers.
 *
 * @param {string} data - Full HTML of a list page.
 * @returns {string} Whatever getTxt returns for the two markers.
 */
const dataFilter = data => getTxt(
  `<div class="pt-cv-wrapper"><div class="pt-cv-view`,
  `<div class="text-left pt-cv-pagination-wrapper"><ul`,
  data
)

// step2()

/**
 * Step 3: download the detail pages listed in rootPath + 'linkArr.json'.
 */
const step3 = () => {
  const serialized = loadData(rootPath + 'linkArr.json')
  const linkArr = JSON.parse(serialized)
  // getLinkManager starts with the entry AFTER the given index, so 499 means
  // downloading begins at linkArr[500].
  getLinkManager(linkArr, 499)
}
// Index of the most recently dispatched link; shared by getLinkManager and getLink.
let currIndex = 0

/**
 * Kick off the detail-page downloads.
 *
 * @param {Array<[string, string]>} linkArr - [link, word] pairs.
 * @param {number} index - Index just before the first entry to fetch.
 */
const getLinkManager = (linkArr, index) => {
  currIndex = index
  // Start 20 concurrent request chains to speed things up; each chain picks
  // up the next unclaimed index after its own request completes (see getLink).
  let remaining = 20
  while (remaining > 0) {
    currIndex += 1
    getLink(linkArr, currIndex)
    remaining -= 1
  }
}

/**
 * Download one detail page, save it as rootPath + detailPath + index + '.html',
 * then continue with the next unclaimed index (tracked in currIndex).
 *
 * A failed request is logged and skipped instead of being left as an
 * unhandled rejection, so one bad link no longer stops the chain (or, on
 * Node 15+, the whole process).
 *
 * @param {Array<[string, string]>} linkArr - [link, word] pairs.
 * @param {number} index - Position in linkArr to fetch; out-of-range ends the chain.
 */
const getLink = (linkArr, index) => {
  if (index >= linkArr.length) return
  const [link, word] = linkArr[index]
  axios.get(link)
    .then(res => {
      console.info('获取 word:' + word)
      saveData(rootPath + detailPath + index + '.html', res.data)
    })
    .catch(err => {
      // Keep going: the index is logged so the page can be re-fetched later.
      console.error('获取失败 index:' + index + ' word:' + word + ' ' + (err && err.message))
    })
    .then(() => {
      // Claim the next index only after this request has settled.
      currIndex += 1
      getLink(linkArr, currIndex)
    })
}


// step3()

/**
 * Step 4: read the saved detail pages 0..17986, pull the word and its
 * definition out of each, and write everything to rootPath + 'wordsArr.txt'
 * as one "## word" heading followed by the definition per page.
 */
const step4 = () => {
  const lastIndex = 17986
  const sections = []
  for (let n = 0; n <= lastIndex; n += 1) {
    console.info('i', n)
    const rawHtml = loadData(`${rootPath}${detailPath}${n}.html`)
    const [title, content] = txtRegByStep4(dataFilterByStep4(rawHtml))
    sections.push(`## ${title} \n${content} \n \n`)
  }
  saveData(rootPath + 'wordsArr.txt', sections.join(''))
}
/**
 * Extract the word and its definition from a detail-page fragment.
 *
 * The inner HTML of the first `<p>` is taken as the title (with the first
 * 'Definition of ' occurrence removed) and that of the second `<p>` as the
 * definition.
 *
 * @param {string} html2 - HTML fragment of a detail page.
 * @returns {[string, string]} [title, content]; a missing paragraph yields ''
 *   instead of throwing or producing `undefined`.
 */
const txtRegByStep4 = html2 => {
  const paragraphPattern = /<p[^>]*>(.*?)<\/p>/gs
  // Defaults cover fragments with fewer than two paragraphs, e.g. when the
  // page markers were not found and the fragment is empty.
  const [title = '', content = ''] = [...html2.matchAll(paragraphPattern)]
    .slice(0, 2)
    .map(match => match[1])
  return [title.replace('Definition of ', ''), content]
}
/**
 * Cut a detail page down to the section between the headline-area comment
 * and the "SENTENCE 1" comment, using getTxt with those two markers.
 *
 * @param {string} data - Full HTML of a detail page.
 * @returns {string} Whatever getTxt returns for the two markers.
 */
const dataFilterByStep4 = data => getTxt(
  `<!--.headline_area-->`,
  `<!–– SENTENCE 1 */ ––>`,
  data
)

// Only step 4 is enabled here. The step2()/step3() calls in this file are
// commented out and step1 is not invoked; presumably the steps are run one at
// a time by editing these calls.
step4()

getHtmlUtils.js

const axios = require("axios")
const fs = require('fs')

exports.getPageListUrl = (url, startNum, endNum, folderPath) => {
  // let i = startNum
  for (let i = startNum; i <= endNum; i++) {
    axios.get(url.replace('$num', i)).then(res => {
      const html = res.data
      const filePath = '.\\' + folderPath + i + '.html'
      saveData(filePath, html)
    })
  }
}

exports.saveData = (filePath, data) => {
  fs.writeFile(filePath, data, function () {
    console.info(filePath + ' 写入完成!')
  })
}

exports.loadData = (filePath) => {
  return fs.readFileSync(filePath, 'utf8')
}

exports.getTxt = (mainStartPosition, mainEndPosition, content) => {
  // const mainStartPosition = '<div class="lacontent">'
  // const mainEndPosition = '<div class="contextdhall clearfix">'
  let pos1 = 0, pos2 = 0
  pos1 = content.indexOf(mainStartPosition) // + mainStartPosition.length
  pos2 = content.indexOf(mainEndPosition) // + mainEndPosition.length
  // console.info('pos1', pos1)
  // console.info('pos2', pos2)

  const listContent = content.substring(pos1, pos2)
  // console.info('listContent', listContent)
  return listContent

}
posted @ 2024-08-05 13:33  彭成刚  阅读(43)  评论(0)  编辑  收藏  举报