import json
import re
import pandas as pd
import nltk
import save_csv
# NLTK named-entity recognition example: extract person names, place names, organizations, etc. from text
def parse_document(document):
    """Split a document into a list of whitespace-trimmed sentences.

    Newlines are replaced with spaces and the text is stripped before it is
    handed to ``nltk.sent_tokenize``; each resulting sentence is stripped
    again.

    Args:
        document: The text to split. Must be a ``str``.

    Returns:
        A list of sentence strings.

    Raises:
        ValueError: If ``document`` is not a string.
    """
    # Validate before any string operation: running the regex first would
    # surface a TypeError from ``re`` instead of the intended ValueError.
    if not isinstance(document, str):
        raise ValueError('Document is not string!')
    flattened = re.sub('\n', ' ', document).strip()
    return [sentence.strip() for sentence in nltk.sent_tokenize(flattened)]
# Load the input document. The parsed JSON must be an iterable of records
# that each expose a "text" key (see the loop over json.loads below).
# JSON text is UTF-8 by specification, so decode explicitly rather than
# relying on the platform default; the context manager closes the handle.
with open(r'test.json', "r", encoding="utf-8") as json_file:
    text = json_file.read()

# Concatenate every record's text, each one preceded by a single space.
text1 = "".join(" " + item["text"] for item in json.loads(text))
# print(text1)

# Alternative inline sample document, kept for quick experiments:
# text = """
# FIFA was founded in 1904 to oversee international competition among the national associations of Belgium,
# Denmark, France, Germany, the Netherlands, Spain, Sweden, and Switzerland. Headquartered in Zürich, its
# membership now comprises 211 national associations. Member countries must each also be members of one of
# the six regional confederations into which the world is divided: Africa, Asia, Europe, North & Central America
# and the Caribbean, Oceania, and South America.
# """

# Split into sentences, then into word tokens.
sentences = parse_document(text1)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# Part-of-speech tag each sentence and run NLTK's named-entity chunker on it.
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]

# Collect (entity name, entity type) pairs from every chunked sentence.
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # Only chunk subtrees expose ``label``; plain (token, tag) tuples
        # are not named entities and are skipped.
        if hasattr(tagged_tree, 'label'):
            entity_name = ' '.join(c[0] for c in tagged_tree.leaves())  # NE name
            entity_type = tagged_tree.label()  # NE category
            named_entities.append((entity_name, entity_type))

# De-duplicate, and sort so the printed list and the CSV rows come out in the
# same order on every run (bare set iteration order varies between runs).
named_entities = sorted(set(named_entities))

# Show the entities before they are written out.
print(named_entities)

# Store the named entities in a data frame.
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])

# Write to CSV; utf_8_sig emits a BOM at the start of the file.
entity_frame.to_csv('data_df.csv', encoding='utf_8_sig')

# Display results.
print(entity_frame)
# save_csv.save_csv_data(entity_frame)
# (stray empty Markdown image tag "![]()" left over from the source page; not part of the script)