1031 - Re-collecting Poet Information and Finding Related Poets

Re-collecting Poet Information

Collecting all poets

This pass adds poet images and dynasties. The previous crawl collected only 2,000-odd poets, because it kept only the poets that had a biography page; all other poets were skipped.

This time every poet is collected, and each one has a personal introduction, from which useful information can also be extracted.

This pass collected 20,088 poets in total.

import requests
from bs4 import BeautifulSoup
import re

# Browser-like User-Agent so the site does not reject the requests
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}
pom_list = []
k = 1
for i in range(1, 2010):
    # Paginated author index, one page per request
    url = 'https://www.xungushici.com/authors/p-' + str(i)
    r = requests.get(url, headers=headers)
    page = r.content.decode('utf-8')
    soup = BeautifulSoup(page, 'html.parser')

    hed = soup.find('div', class_='col col-sm-12 col-lg-9')
    card_list = hed.find_all('div', class_='card mt-3')   # one card per poet

    origin_url = 'https://www.xungushici.com'

    for card in card_list:
        content = {}
        # 1.1 Poet name and dynasty from the card title
        title = card.find('h4', class_='card-title')
        poemauthor = title.find_all('a')[1].text
        desty = title.find_all('a')[0].text
        # Poet portrait; fall back to a placeholder image if the card has none
        if card.find('a', class_='ml-2 d-none d-md-block') is not None:
            src = card.find('a', class_='ml-2 d-none d-md-block').img['src']
        else:
            src = "http://www.huihua8.com/uploads/allimg/20190802kkk01/1531722472-EPucovIBNQ.jpg"
        content['src'] = src
        href = title.find_all('a')[1]['href']
        # Link to the poet's detail page
        real_href = origin_url + href
        # Short introduction (also contains the poem count)
        text = card.find('p', class_='card-text').text
        # Number of poems by this poet
        numtext = card.find('p', class_='card-text').a.text
        pattern = re.compile(r'\d+')
        num = re.findall(pattern, numtext)[0]

        # Visit the detail page and look for a biography tab
        r2 = requests.get(real_href, headers=headers)
        content2 = r2.content.decode('utf-8')
        soup2 = BeautifulSoup(content2, 'html.parser')
        ul = soup2.find('ul', class_='nav nav-tabs bg-primary')
        exp = ""
        if ul is not None:
            list_li = ul.find_all('li', class_='nav-item')
            for li in list_li:
                if li.a.text == "人物生平" or li.a.text == "人物" or li.a.text == "生平":
                    urlsp = origin_url + li.a['href']
                    r3 = requests.get(urlsp, headers=headers)
                    content3 = r3.content.decode('utf-8')
                    soup3 = BeautifulSoup(content3, 'html.parser')
                    list_p = soup3.select('body > div.container > div > div.col.col-sm-12.col-lg-9 > div:nth-child(3) > div.card > div')
                    for p in list_p:
                        exp = p.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
        content['author'] = poemauthor
        content['produce'] = text
        content['num'] = num
        content['desty'] = desty
        content['experience'] = exp
        pom_list.append(content)
        print(k)   # progress counter
        k = k + 1

import xlwt

xl = xlwt.Workbook()
# Add a worksheet to the workbook
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

sheet1.write(0, 0, 'author')
sheet1.write(0, 1, 'produce')
sheet1.write(0, 2, 'num')
sheet1.write(0, 3, 'experience')
sheet1.write(0, 4, 'src')
sheet1.write(0, 5, 'desty')

for i in range(0, len(pom_list)):
    sheet1.write(i + 1, 0, pom_list[i]['author'])
    sheet1.write(i + 1, 1, pom_list[i]['produce'])
    sheet1.write(i + 1, 2, pom_list[i]['num'])
    sheet1.write(i + 1, 3, pom_list[i]['experience'])
    sheet1.write(i + 1, 4, pom_list[i]['src'])
    sheet1.write(i + 1, 5, pom_list[i]['desty'])
# Note: xlwt actually writes the legacy .xls binary format; the .xlsx
# extension is kept only so the later script that reads "author3.xlsx" works
xl.save("author3.xlsx")

Saved result:

Related Poets

Finding related figures through the poems

Start with Tang-dynasty poets who are related to one another. The steps are as follows:

For each author (that is, each poet), analyze the poem title, the appreciation text, and the writing background, and pick out the names of other poets that appear there; store them in a list.

Since many poems share the same author, this involves a dictionary together with list merging and deduplication (a small sketch follows the two one-liners below):

List merging: list += list2

List deduplication: a = list(set(a))
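
A minimal sketch of how those two one-liners combine when accumulating related names per author; the sample names and collections.defaultdict are illustrative only, the actual script below does the same thing with a plain dict:

from collections import defaultdict

# Hypothetical sample: (author, poet names found in one poem's text)
mentions = [('李白', ['杜甫', '孟浩然']),
            ('李白', ['杜甫', '王昌龄']),
            ('杜甫', ['李白'])]

friends = defaultdict(list)
for author, found in mentions:
    friends[author] += found                       # list merging
    friends[author] = list(set(friends[author]))   # deduplication

print(dict(friends))
# e.g. {'李白': ['王昌龄', '杜甫', '孟浩然'], '杜甫': ['李白']} (set order varies)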

When deciding whether an author counts as related, there are the following checks:

First: search the text against all poets and find the related poets that exist in the poet database.

Second: once a related poet is found, check whether they are from the same dynasty as the author.

Third: one could also check whether the two poets' lifetimes overlap (not implemented, since the birth and death dates of some poets are unknown); see the sketch after this list.
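
Though not implemented here, the third check might look like the sketch below, assuming birth and death years can be pulled out of the introduction text with a regex. Both the year pattern and the overlap rule are assumptions for illustration, not part of the original scripts:

import re

# Assumed pattern like "(701年-762年)" inside the introduction text;
# real pages vary, so this regex is an illustration only
YEARS = re.compile(r'(\d{3,4})年[-—~](\d{3,4})年')

def life_span(intro):
    m = YEARS.search(intro)
    return (int(m.group(1)), int(m.group(2))) if m else None

def lifetimes_overlap(intro_a, intro_b):
    a, b = life_span(intro_a), life_span(intro_b)
    if a is None or b is None:
        return None   # unknown dates: cannot decide, as noted above
    return a[0] <= b[1] and b[0] <= a[1]

print(lifetimes_overlap('李白(701年-762年)……', '杜甫(712年-770年)……'))  # True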

import xlwt
import pandas as pd

def read_author():
    # Load the full poet table produced by the collection script above
    file = "author3.xlsx"
    data = pd.read_excel(file).fillna("")
    author = list(data.author)
    all_desty = list(data.desty)
    experience = list(data.experience)

    # Map each poet to their dynasty (and biography, kept for possible later use)
    dict_exp = {}
    dict_des = {}
    for i in range(len(author)):
        dict_exp[author[i]] = experience[i]
        dict_des[author[i]] = all_desty[i]

    # Deduplicate the poet names
    author = list(set(author))

    # Load the Tang poem table
    file2 = "tang.xlsx"
    tang = pd.read_excel(file2).fillna("")
    author_name = list(tang.author)
    new_author = list(set(author_name))

    # One (initially empty) friend list per Tang author
    relation = {}
    for it in new_author:
        relation[it] = []

    title = list(tang.title)
    appear = list(tang.appear)
    back = list(tang.background)
    for i in range(len(author_name)):
        name = author_name[i]
        if name == '佚名':   # skip anonymous authors
            continue
        # Text to search: poem title + appreciation + writing background
        content = title[i] + appear[i] + back[i]

        # Check 1: the text mentions a poet from the database;
        # Check 2: that poet's dynasty is Tang (唐代)
        friend = []
        for j in range(len(author)):
            if content.find(author[j]) != -1 and dict_des[author[j]] == "唐代":
                friend.append(author[j])
        # Merge into this author's list and deduplicate
        relation[name] += friend
        relation[name] = list(set(relation[name]))

    print("--------------------ans--------------------------")
    ans_author = []
    ans_friend = []
    for it in relation:
        if relation[it] != []:
            ans_author.append(it)
            ans_friend.append(','.join(relation[it]))
            print(it + ":" + str(relation[it]))

    xl = xlwt.Workbook()
    # Add a worksheet to the workbook
    sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)

    sheet1.write(0, 0, 'author')
    sheet1.write(0, 1, 'friend')

    for i in range(0, len(ans_author)):
        sheet1.write(i + 1, 0, ans_author[i])
        sheet1.write(i + 1, 1, ans_friend[i])

    xl.save("friend.xlsx")

if __name__ == '__main__':
    read_author()

Result:

Some of the friend lists include the original author themselves; this will be cleaned up in a later pass. A possible cleaning step is sketched below.
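
A minimal sketch of that cleanup, assuming friend.xlsx is read back with pandas (the column names match the script above; the drop-self rule and the output name friend_clean.xlsx are assumptions, not part of the original post):

import pandas as pd

df = pd.read_excel("friend.xlsx").fillna("")

def drop_self(row):
    # Remove the author's own name from their comma-separated friend list
    names = [n for n in row['friend'].split(',') if n and n != row['author']]
    return ','.join(names)

df['friend'] = df.apply(drop_self, axis=1)
df = df[df['friend'] != ""]   # drop authors left with no friends
df.to_excel("friend_clean.xlsx", index=False)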
