# Scrape 300 classical Chinese poems (爬取300首古诗)

import time

import requests
from lxml import etree
from multiprocessing import Pool


# Create an "lp" folder first — the script writes one text file per poem into it




def zxc(qwe, hercx):
  """Download one poem detail page and append title/author/verse lines to lp/<title>.txt.

  Args:
    qwe: full URL of the poem detail page on so.gushiwen.cn.
    hercx: HTTP headers dict (User-Agent) forwarded to requests.get.

  Side effects:
    Writes (append mode) a UTF-8 text file named after the poem title
    inside the pre-existing ``lp/`` directory and prints a progress line.
  """
  # timeout keeps a pool worker from hanging forever on a dead connection
  qwe_op = requests.get(qwe, headers=hercx, timeout=10).text
  html1 = etree.HTML(qwe_op)
  # Poem title
  roto = html1.xpath('//*[@id="sonsyuanwen"]/div[1]/h1/text()')
  # Author
  roto1 = html1.xpath('//*[@id="sonsyuanwen"]/div[1]/p/a[1]/text()')
  # Verse lines
  textuio = html1.xpath('/html/body/div[2]/div[1]/div[2]/div[1]/div[2]/text()')
  # Guard: some linked pages have a different layout; the original code crashed
  # with IndexError on roto[0] here. Skip those pages instead of killing the worker.
  if not roto or not roto1:
    print('跳过 {}'.format(qwe))
    return
  with open('lp/' + roto[0] + '.txt', 'a+', encoding='utf-8') as ll:
    ll.write('{}\n'.format(roto[0]))
    ll.write('{}\n'.format(roto1[0]))
    for i in textuio:
      # strip the layout padding spaces inside each verse line
      ll.write('{}\n'.format(str(i).replace(' ', '')))
  print('完成{}下载'.format(roto[0]))


# Use a process pool: one worker task per poem page
if __name__ == '__main__':
   import os

   qwe1 = time.time()
   hercx = {
     'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36 Edg/105.0.1343.25'
   }
   # Make sure the output directory exists before any worker tries to write into it
   # (the original script required creating it by hand).
   os.makedirs('lp', exist_ok=True)

   # Fetch the index page listing all 300 Tang poems.
   # NOTE: dropped the stray trailing slash after "tangshi.aspx" — the page
   # is served at the extension-terminated path.
   index_html = requests.get('https://so.gushiwen.cn/gushi/tangshi.aspx',
                             headers=hercx, timeout=10).text
   html = etree.HTML(index_html)

   # Relative hrefs of the 300 poem detail pages
   qwe_300 = '//*[@id="html"]/body/div/div/div/div/span/a/@href'
   vbn_to = html.xpath(qwe_300)

   adp = 'https://so.gushiwen.cn'
   pool = Pool()
   for i in vbn_to:
      pool.apply_async(zxc, args=(adp + str(i), hercx))
   pool.close()
   pool.join()
   # Read the clock only AFTER join(): the original sampled time.time() before
   # waiting for the workers, so the printed duration excluded the downloads.
   print('{}秒'.format(time.time() - qwe1))

 



# posted @ 2022-09-09 20:14 — blog-export footer (kept as a comment so the file parses)