1117-古今地名映射爬取与经纬度检索&诗人,牌名,朝代,飞花令词实体导入

古今地名映射

爬取来源

从百度百科调用它的搜索接口:检索两个内容,一个是它的现地名,另一个是它的简介,再从简介中进行词性分析找出对应的地名

 

 

 代码

import urllib.request
import urllib.parse
from lxml import etree
from pyhanlp import *
import pandas as pd

def query(content, timeout=10):
    """Look up ``content`` on Baidu Baike and return the first place name found.

    The page text is taken from the first XPath below, falling back to the
    lemma summary block when that yields nothing.  The text is segmented with
    HanLP's CRF segmenter and the first term whose nature tag is ``ns``
    (used here as the place-name tag) is returned.

    Args:
        content: The (ancient) place name to search for.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The first ``ns`` term as a string, or ``""`` when the request fails,
        the page cannot be parsed, or no such term is found.
    """
    # Request URL
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)
    print(url)
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            text = response.read().decode('utf-8')
    except OSError as exc:
        # URLError/HTTPError and socket timeouts are all OSError subclasses.
        # A single failed lookup must not abort the whole batch, so report it
        # and fall back to the "not found" result.
        print(exc)
        return ""

    html = etree.HTML(text)
    if html is None:
        # lxml could not build a tree (e.g. empty body).
        return ""

    sen_list = html.xpath('/html/body/div[3]/div[2]/div/div[1]/dl[1]/dd/h2//text()')
    if not sen_list:
        sen_list = html.xpath(
            '//div[contains(@class,"lemma-summary") or contains(@class,"lemmaWgt-lemmaSummary")]//text()')
    if not sen_list:
        return ""

    # Drop surrounding newlines from each fragment and join into one string.
    text = ''.join(item.strip('\n') for item in sen_list)
    segmenter = HanLP.newSegment("crf")
    where_list = [str(term.word) for term in segmenter.seg(text)
                  if str(term.nature) == 'ns']
    if not where_list:
        return ""
    print(where_list)
    return where_list[0]


from xlrd import open_workbook
from xlutils.copy import copy

#将分类结果重新写入原excel中
def write_to(data, file):
    """Write ``data`` into the second column of the first sheet of ``file``.

    The workbook is opened with xlrd, copied into a writable workbook with
    xlutils, given a ``jin_where`` header in row 0 / column 1, filled with one
    value per row below it, and saved back to the same path.

    Args:
        data: Sequence of cell values, one per data row.
        file: Path of the Excel workbook to read and overwrite.
    """
    print(len(data))
    workbook = copy(open_workbook(file))
    sheet = workbook.get_sheet(0)

    sheet.write(0, 1, "jin_where")
    # Row 0 holds the header, so values start at row 1.
    for row, value in enumerate(data, start=1):
        sheet.write(row, 1, value)

    workbook.save(file)

if __name__ == '__main__':
    # Resolve every ancient place name in gu_where.xlsx to a present-day name
    # and write the results back into the same workbook.
    data = pd.read_excel('gu_where.xlsx')
    gu_where = data.gu_where
    jin_list = []
    for content in gu_where:
        print(content)
        result = query(content)
        print("查询结果:%s" % result)
        jin_list.append(result)
    write_to(jin_list, 'gu_where.xlsx')

结果

 

 

 现今地点经纬度

首先对古代地点进行经纬度获取,获取后保存获得的现今地名

若古代地名获取不到经纬度,则用现今地名获取经纬度,同样保存获得的现今地名和经纬度

在进行高德地图经纬度调用的时候要注意一次不能太多:500个地名经纬度能容忍(别问我是如何知道的!!!惨痛的实践)

import pandas as pd
import requests
import json
def coords(city, timeout=10):
    """Geocode ``city`` with the AMap geocoding endpoint.

    Args:
        city: Address or place name to look up.
        timeout: Seconds to wait for the HTTP response.

    Returns:
        A ``(location, formatted_address)`` tuple taken from the first
        geocode in the response, where ``location`` is the string callers
        split on ``','``.  Returns ``('', '')`` when ``city`` is missing or
        blank, the request fails, the body is not valid JSON, or the response
        carries no geocodes.
    """
    # Empty spreadsheet cells arrive as None/NaN; do not spend a request on them.
    if city is None or (isinstance(city, float) and city != city):
        return '', ''
    if isinstance(city, str) and not city.strip():
        return '', ''

    # Fixed part of the API URL (everything before the '?')
    url = 'https://restapi.amap.com/v3/geocode/geo'

    # NOTE(review): the API key is hard-coded in source; consider moving it to
    # configuration or an environment variable.
    params = {'key': 'cd0c1ab60e3a22a87009a4196abd94e0',
              'address': city}
    try:
        res = requests.get(url, params=params, timeout=timeout)
        jd = json.loads(res.text)
    except (requests.RequestException, ValueError) as exc:
        # One failed lookup should not abort a long batch run; report and
        # treat it as "no result".
        print(exc)
        return '', ''

    # A response without a 'geocodes' key used to raise KeyError here;
    # treat a missing key like an empty result list.
    geocodes = jd.get('geocodes') if isinstance(jd, dict) else None
    if not geocodes:
        return '', ''

    print(jd)
    first = geocodes[0]
    location = first['location']
    address = first['formatted_address']
    print(address)
    return location, address

if __name__ == '__main__':
    # Geocode each row of gu_where.xlsx, trying the ancient name first and the
    # present-day name second, then save every hit to a new workbook.
    table = pd.read_excel('gu_where.xlsx')
    gu_name = list(table.gu_where)
    jin_name = list(table.jin_where)

    # Each hit is (ancient name, formatted address, longitude, latitude).
    hits = []
    # Processing starts at row 6500, as in the original script.
    for gu, jin in zip(gu_name[6500:], jin_name[6500:]):
        for candidate in (gu, jin):
            loca, address = coords(candidate)
            if loca == '':
                continue
            loca_list = loca.split(',')
            hits.append((gu, address, loca_list[0], loca_list[1]))
            print(gu+" "+address+" "+str(loca_list[0])+" "+str(loca_list[1]))
            break

    import xlwt

    # NOTE(review): xlwt presumably writes the legacy .xls format, so the
    # .xlsx extension below may be misleading — confirm before changing.
    book = xlwt.Workbook()
    sheet = book.add_sheet('sheet1', cell_overwrite_ok=True)

    sheet.write(0, 0, "gu_name")
    sheet.write(0, 1, "jin_name")
    sheet.write(0, 2, "lng")
    sheet.write(0, 3, "lat")
    for row, (gu_value, jin_value, lng_value, lat_value) in enumerate(hits, start=1):
        sheet.write(row, 0, gu_value)
        sheet.write(row, 1, jin_value)
        sheet.write(row, 2, lng_value)
        sheet.write(row, 3, lat_value)

    book.save("gu_jin_lng_lat2.xlsx")

清洗后数据

对获得经纬度的地名进行相应的保存

 

 

 部分实体导入

诗人与朝代实体

import pandas as pd
import numpy as np
import re
from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher

# 创建节点
def CreateNode(m_graph,m_label,m_attrs):
    """Create a node unless one with the same label and name already exists.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label: Label of the node.
        m_attrs: Node properties; must contain a ``'name'`` key, which is the
            value used for the existence check.

    Returns:
        ``None`` when a matching node already exists; otherwise whatever
        ``m_graph.create`` returns.
    """
    matcher = NodeMatcher(m_graph)
    # Match on the property value instead of splicing the name into a
    # hand-built predicate, so quotes in a name cannot break the query.
    existing = matcher.match(m_label, name=m_attrs['name']).first()
    if existing is not None:
        return None
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Node(m_label, **m_attrs))
# 查询节点
def MatchNode(m_graph,m_label,m_attrs):
    """Return the first node with label ``m_label`` whose name matches.

    Args:
        m_graph: The py2neo graph to search.
        m_label: Label of the node.
        m_attrs: Mapping containing a ``'name'`` key; only that key is used.

    Returns:
        The first matching node, or ``None`` when there is none.
    """
    matcher = NodeMatcher(m_graph)
    # Property-based match: the name is passed as a value rather than being
    # concatenated into the predicate text, so quotes in it are harmless.
    return matcher.match(m_label, name=m_attrs['name']).first()
# 创建关系
def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Create a relationship of type ``m_r_name`` between two existing nodes.

    Both endpoints are looked up with ``MatchNode``.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Relationship type.

    Returns:
        ``False`` when either node is missing; otherwise whatever
        ``m_graph.create`` returns.
    """
    start_node = MatchNode(m_graph,m_label1,m_attrs1)
    end_node = MatchNode(m_graph,m_label2,m_attrs2)
    if start_node is None or end_node is None:
        return False
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Relationship(start_node,m_r_name,end_node))

#查找关系
def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Build a Relationship object between two nodes found in the graph.

    Both endpoints are looked up with ``MatchNode``.  The relationship itself
    is only constructed locally from ``m_r_name['name']``; this function does
    not issue a relationship lookup of its own.

    Returns:
        The constructed ``Relationship``, or ``False`` when either node is
        missing.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is not None and end_node is not None:
        return Relationship(start_node, m_r_name['name'], end_node)
    return False

def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Merge a relationship carrying ``value`` and ``danwei`` properties.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Mapping with keys ``'name'`` (relationship type),
            ``'value'`` and ``'danwei'``.

    Returns:
        ``False`` when either node is missing; otherwise ``None``.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is None or end_node is None:
        return False
    print(m_r_name)
    properties = {'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
    relation = Relationship(start_node, m_r_name['name'], end_node, **properties)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.merge(relation)

#修改节点属性
def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
    """Update the properties of an existing node and push the change.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the node.
        new_attrs: Mapping of properties to set on the node.

    Returns:
        ``False`` when the node is not found; otherwise ``None``.
    """
    node = MatchNode(m_graph, m_label1, m_attrs1)
    if node is None:
        return False
    node.update(new_attrs)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.push(node)



# Module-level Neo4j connection shared by the functions in this script.
# NOTE(review): the URL and credentials are hard-coded here.
graph = Graph('http://localhost:7474',username='neo4j',password='fengge666')


def create_author():
    """Import poets and their dynasties from ``./data2/author.xlsx``.

    For every spreadsheet row this creates an ``author`` node (if absent),
    a ``desty`` node for the row's dynasty (if absent), and two relationships
    between them: author -[朝代]-> dynasty and dynasty -[包含]-> author.
    Missing cells are replaced by empty strings before import.
    """
    file = './data2/author.xlsx'
    data = pd.read_excel(file).fillna("")
    author_label = 'author'
    desty_label = 'desty'
    rows = zip(data.author, data.produce, data.num, data.src, data.desty,
               data.begin_time, data.end_time, data.zi, data.hao)
    for i, (author, produce, num, src, desty, bg_time, ed_time, zi, hao) in enumerate(rows):
        print(str(i))
        attr1 = {"name": author, "produce": produce, "num": num,
                 "src": src, "bg_time": bg_time, "ed_time": ed_time,
                 "zi": zi, "hao": hao}
        CreateNode(graph, author_label, attr1)
        print("创建诗人:" + author + "成功!!")
        attr2 = {"name": desty}
        if MatchNode(graph, desty_label, attr2) is None:
            CreateNode(graph, desty_label, attr2)
            print("创建朝代:"+desty+"成功!!")
        # Link the poet and the dynasty in both directions.
        CreateRelationship(graph, author_label, attr1, desty_label, attr2, "朝代")
        print("创建关系:"+author+"-所属朝代-"+desty+"成功")
        CreateRelationship(graph, desty_label, attr2, author_label, attr1, "包含")
        print("创建关系:" + desty + "-包含-" + author + "成功")



# Script entry point: run the author/dynasty import when executed directly.
if __name__ == '__main__':
    create_author()

导入效果

 

 

 牌名

包含词牌名,曲牌名

import pandas as pd
import numpy as np
import re
from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher

# 创建节点
def CreateNode(m_graph,m_label,m_attrs):
    """Create a node unless one with the same label and name already exists.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label: Label of the node.
        m_attrs: Node properties; must contain a ``'name'`` key, which is the
            value used for the existence check.

    Returns:
        ``None`` when a matching node already exists; otherwise whatever
        ``m_graph.create`` returns.
    """
    matcher = NodeMatcher(m_graph)
    # Match on the property value instead of splicing the name into a
    # hand-built predicate, so quotes in a name cannot break the query.
    existing = matcher.match(m_label, name=m_attrs['name']).first()
    if existing is not None:
        return None
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Node(m_label, **m_attrs))
# 查询节点
def MatchNode(m_graph,m_label,m_attrs):
    """Return the first node with label ``m_label`` whose name matches.

    Args:
        m_graph: The py2neo graph to search.
        m_label: Label of the node.
        m_attrs: Mapping containing a ``'name'`` key; only that key is used.

    Returns:
        The first matching node, or ``None`` when there is none.
    """
    matcher = NodeMatcher(m_graph)
    # Property-based match: the name is passed as a value rather than being
    # concatenated into the predicate text, so quotes in it are harmless.
    return matcher.match(m_label, name=m_attrs['name']).first()
# 创建关系
def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Create a relationship of type ``m_r_name`` between two existing nodes.

    Both endpoints are looked up with ``MatchNode``.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Relationship type.

    Returns:
        ``False`` when either node is missing; otherwise whatever
        ``m_graph.create`` returns.
    """
    start_node = MatchNode(m_graph,m_label1,m_attrs1)
    end_node = MatchNode(m_graph,m_label2,m_attrs2)
    if start_node is None or end_node is None:
        return False
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Relationship(start_node,m_r_name,end_node))

#查找关系
def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Build a Relationship object between two nodes found in the graph.

    Both endpoints are looked up with ``MatchNode``.  The relationship itself
    is only constructed locally from ``m_r_name['name']``; this function does
    not issue a relationship lookup of its own.

    Returns:
        The constructed ``Relationship``, or ``False`` when either node is
        missing.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is not None and end_node is not None:
        return Relationship(start_node, m_r_name['name'], end_node)
    return False

def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Merge a relationship carrying ``value`` and ``danwei`` properties.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Mapping with keys ``'name'`` (relationship type),
            ``'value'`` and ``'danwei'``.

    Returns:
        ``False`` when either node is missing; otherwise ``None``.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is None or end_node is None:
        return False
    print(m_r_name)
    properties = {'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
    relation = Relationship(start_node, m_r_name['name'], end_node, **properties)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.merge(relation)

#修改节点属性
def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
    """Update the properties of an existing node and push the change.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the node.
        new_attrs: Mapping of properties to set on the node.

    Returns:
        ``False`` when the node is not found; otherwise ``None``.
    """
    node = MatchNode(m_graph, m_label1, m_attrs1)
    if node is None:
        return False
    node.update(new_attrs)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.push(node)



# Module-level Neo4j connection shared by the functions in this script.
# NOTE(review): the URL and credentials are hard-coded here.
graph = Graph('http://localhost:7474',username='neo4j',password='fengge666')

def create_pai_name():
    """Import ci-pai and qu-pai names as nodes.

    Reads the ``title`` column of ``./data2/cipai_name.xlsx`` into ``ci_pai``
    nodes, then the ``qu_name`` column of ``./data2/qupai_name.xlsx`` into
    ``qu_pai`` nodes.  Missing cells are replaced by empty strings first.
    """
    def import_names(path, column, label, prefix):
        # Load one spreadsheet column and create a node per entry.
        names = list(getattr(pd.read_excel(path).fillna(""), column))
        for name in names:
            CreateNode(graph, label, {"name": name})
            print(prefix + name + "成功!!")

    import_names('./data2/cipai_name.xlsx', 'title', "ci_pai", "创建词牌名")
    import_names('./data2/qupai_name.xlsx', 'qu_name', "qu_pai", "创建曲牌名")



# Script entry point: run the ci-pai / qu-pai import when executed directly.
if __name__ == '__main__':
    create_pai_name()

导入效果

 

 

 曲牌名:

 

 

 

飞花令

import pandas as pd
import numpy as np
import re
from py2neo import Node,Relationship,Graph,NodeMatcher,RelationshipMatcher

# 创建节点
def CreateNode(m_graph,m_label,m_attrs):
    """Create a node unless one with the same label and name already exists.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label: Label of the node.
        m_attrs: Node properties; must contain a ``'name'`` key, which is the
            value used for the existence check.

    Returns:
        ``None`` when a matching node already exists; otherwise whatever
        ``m_graph.create`` returns.
    """
    matcher = NodeMatcher(m_graph)
    # Match on the property value instead of splicing the name into a
    # hand-built predicate, so quotes in a name cannot break the query.
    existing = matcher.match(m_label, name=m_attrs['name']).first()
    if existing is not None:
        return None
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Node(m_label, **m_attrs))
# 查询节点
def MatchNode(m_graph,m_label,m_attrs):
    """Return the first node with label ``m_label`` whose name matches.

    Args:
        m_graph: The py2neo graph to search.
        m_label: Label of the node.
        m_attrs: Mapping containing a ``'name'`` key; only that key is used.

    Returns:
        The first matching node, or ``None`` when there is none.
    """
    matcher = NodeMatcher(m_graph)
    # Property-based match: the name is passed as a value rather than being
    # concatenated into the predicate text, so quotes in it are harmless.
    return matcher.match(m_label, name=m_attrs['name']).first()
# 创建关系
def CreateRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Create a relationship of type ``m_r_name`` between two existing nodes.

    Both endpoints are looked up with ``MatchNode``.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Relationship type.

    Returns:
        ``False`` when either node is missing; otherwise whatever
        ``m_graph.create`` returns.
    """
    start_node = MatchNode(m_graph,m_label1,m_attrs1)
    end_node = MatchNode(m_graph,m_label2,m_attrs2)
    if start_node is None or end_node is None:
        return False
    # Write to the graph that was passed in, not the module-level global.
    return m_graph.create(Relationship(start_node,m_r_name,end_node))

#查找关系
def findRelationship(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Build a Relationship object between two nodes found in the graph.

    Both endpoints are looked up with ``MatchNode``.  The relationship itself
    is only constructed locally from ``m_r_name['name']``; this function does
    not issue a relationship lookup of its own.

    Returns:
        The constructed ``Relationship``, or ``False`` when either node is
        missing.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is not None and end_node is not None:
        return Relationship(start_node, m_r_name['name'], end_node)
    return False

def updateRelation(m_graph,m_label1,m_attrs1,m_label2,m_attrs2,m_r_name):
    """Merge a relationship carrying ``value`` and ``danwei`` properties.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the start node.
        m_label2, m_attrs2: Label and attributes identifying the end node.
        m_r_name: Mapping with keys ``'name'`` (relationship type),
            ``'value'`` and ``'danwei'``.

    Returns:
        ``False`` when either node is missing; otherwise ``None``.
    """
    start_node = MatchNode(m_graph, m_label1, m_attrs1)
    end_node = MatchNode(m_graph, m_label2, m_attrs2)
    if start_node is None or end_node is None:
        return False
    print(m_r_name)
    properties = {'value': m_r_name['value'], 'danwei': m_r_name['danwei']}
    relation = Relationship(start_node, m_r_name['name'], end_node, **properties)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.merge(relation)

#修改节点属性
def updateNode(m_graph,m_label1,m_attrs1,new_attrs):
    """Update the properties of an existing node and push the change.

    Args:
        m_graph: The py2neo graph to search and write to.
        m_label1, m_attrs1: Label and attributes identifying the node.
        new_attrs: Mapping of properties to set on the node.

    Returns:
        ``False`` when the node is not found; otherwise ``None``.
    """
    node = MatchNode(m_graph, m_label1, m_attrs1)
    if node is None:
        return False
    node.update(new_attrs)
    # Write to the graph that was passed in, not the module-level global.
    m_graph.push(node)



# Module-level Neo4j connection shared by the functions in this script.
# NOTE(review): the URL and credentials are hard-coded here.
graph = Graph('http://localhost:7474',username='neo4j',password='fengge666')

def create_word():
    """Import the ``word`` column of ``./data2/word.xlsx`` as ``word`` nodes.

    Missing cells are replaced by empty strings before import; one node is
    created per entry unless a node with that name already exists.
    """
    sheet = pd.read_excel('./data2/word.xlsx').fillna("")
    entries = list(sheet.word)
    for entry in entries:
        CreateNode(graph, "word", {"name": entry})
        print("创建飞花令:"+entry+"成功!!")



# Script entry point: run the word import when executed directly.
if __name__ == '__main__':
    create_word()

导入效果

 

 

 

posted @ 2021-11-17 22:39  清风紫雪  阅读(205)  评论(0编辑  收藏  举报