#coding=utf-8
from pymongo import MongoClient
from lxml import etree
import requests
# Exact text content of the <p> element that labels the institution list;
# interpolated into the XPath predicate p[text()='...'] in standard_dict, so
# the surrounding line breaks and spaces are significant.
jigou = u"\r\n 【机构】\r\n "
# Exact text content of the <p> element that labels the author list; used
# the same way as ``jigou``.
zuozhe = u"\r\n 【作者】\r\n "
def get_db():
    """Open the MongoDB connection and return the ``cnki`` database.

    Connects to ``localhost:27017`` and authenticates against the ``cnki``
    database before handing it back.

    Returns:
        The ``cnki`` database object of the connected client.
    """
    mongo = MongoClient('localhost', 27017)
    database = mongo["cnki"]
    # NOTE(review): the two literals read "username" / "password" in Chinese
    # and look like placeholders -- confirm real credentials are supplied.
    database.authenticate("用户名", "密码")
    return database
def get_data(table, num):
    """Return the ``html`` field of the ``num``-th document (1-based).

    Documents are read in the order ``table.find`` yields them, projecting
    only the ``html`` field. Once the counter reaches ``num`` it stops
    advancing, so if the ``num``-th document has a missing or empty ``html``
    the first later document with a non-empty ``html`` is returned instead.

    Args:
        table: Object exposing ``find(filter, projection)`` that yields
            dict-like documents.
        num: 1-based position of the wanted document.

    Returns:
        The ``html`` value, or ``None`` when no suitable document exists
        (including ``num`` smaller than 1 or larger than the result count).
    """
    position = 1
    for doc in table.find({}, {"html": 1, "_id": 0}):
        if position != num:
            position += 1
            continue
        # dict.get replaces the Python-2-only has_key() lookup; a missing
        # key and an empty value are both treated as "no html".
        html = doc.get('html')
        if html:
            return html
    return None
def list_str(list):
    """Return the first element of ``list``, or ``""`` when it is empty.

    Args:
        list: Sequence of values (e.g. the result of an XPath query).

    Returns:
        ``list[0]`` if there is at least one element, otherwise ``""``.
    """
    if len(list) == 0:
        return ""
    return list[0]
def en_ls(list, length1, length2):
    """Split the English author line into author and institution names.

    The first element of ``list`` is stripped of its label and line breaks
    and split on ``";"``. The split is accepted only when it yields exactly
    ``length1 + length2 + 1`` pieces; the last piece is discarded.

    Args:
        list: Text nodes of the English author paragraph; only the first
            one is used.
        length1: Number of authors expected.
        length2: Number of institutions expected.

    Returns:
        ``(authors, institutions)``, each a ``";"``-joined string, or
        ``("", "")`` when ``list`` is empty or the piece count is wrong.
    """
    nothing = ("", "")
    if len(list) == 0:
        return nothing

    raw = list[0].replace(u"【Author】", "")
    raw = raw.replace("\r\n", "").strip()
    pieces = raw.split(";")

    # One piece per author and institution plus one trailing piece
    # (presumably the empty string left by a final ";").
    if len(pieces) != length1 + length2 + 1:
        return nothing

    authors = ";".join(pieces[:length1])
    institutions = ";".join(pieces[length1:-1])
    return authors, institutions
def hyxx(list):
    """Extract the conference fields from labelled text items.

    Each item is matched against the known labels in a fixed order; the
    first label found decides which field the item fills. The label and all
    ``"\\r\\n"`` sequences are removed and the rest is stripped. For the
    organiser field the ideographic comma is additionally turned into
    ``";"``. When several items carry the same label, the last one wins.

    Args:
        list: Iterable of text strings, one per labelled entry.

    Returns:
        A 6-tuple of strings: proceedings title, conference name, conference
        date, conference place, classification number, organisers. Fields
        with no matching item are ``""``.
    """
    labels = (
        u"【会议录名称】",
        u"【会议名称】",
        u"【会议时间】",
        u"【会议地点】",
        u"【分类号】",
        u"【主办单位】",
    )
    organiser_label = labels[-1]

    # Every field defaults to "" so the return type is the same whether or
    # not a label was found (the classification number used to default to
    # an empty list, unlike the other fields and the empty-input case).
    values = [""] * len(labels)

    for item in list:
        for index, label in enumerate(labels):
            if label not in item:
                continue
            text = item.replace(label, "")
            if label == organiser_label:
                # Organisers are separated by the ideographic comma.
                text = text.replace(u"、", ";")
            values[index] = text.replace("\r\n", "").strip()
            # An item fills at most one field.
            break

    return tuple(values)
def list2str(list):
    """Join the elements of ``list`` with ``";"``.

    Args:
        list: Sequence of strings.

    Returns:
        The joined string, or ``""`` when ``list`` is empty.
    """
    if len(list) == 0:
        return ""
    return ";".join(list)
def standard_dict(html):
    """Build the cleaned record for one conference-paper page.

    Parses ``html`` with lxml and fills a dict from fixed XPath queries.

    Args:
        html: HTML source of the page.

    Returns:
        dict with the keys ``title``, ``title_eng``, ``author``,
        ``organization``, ``author_eng``, ``organization_eng``, ``summary``,
        ``summary_eng``, ``keywords``, ``keywords_eng``, ``proceeding_title``,
        ``conference_title``, ``conference_date``, ``conference_place``,
        ``huiyflh`` and ``conference_org``.
    """
    dc = {}
    # Progress marker; the call form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(1)
    tree = etree.HTML(html)

    # Run the author / institution queries once and reuse the results for
    # both the joined string and the element count.
    author_names = tree.xpath("//p[text()='%s']/a/text()" % zuozhe)
    organization_names = tree.xpath("//p[text()='%s']/a/text()" % jigou)

    # Titles (original language and English).
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))

    # Authors and institutions.
    dc["author"] = list2str(author_names)
    dc["organization"] = list2str(organization_names)

    # The English names come as a single string; the counts above tell
    # en_ls how many pieces belong to authors and how many to institutions.
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"),
        len(author_names),
        len(organization_names),
    )

    # Abstracts.
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))

    # Keywords: the first <span> feeds ``keywords``, the second ``keywords_eng``.
    dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
    dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))

    # Conference information parsed from the summary list items.
    (
        dc["proceeding_title"],
        dc["conference_title"],
        dc["conference_date"],
        dc["conference_place"],
        dc["huiyflh"],
        dc["conference_org"],
    ) = hyxx(tree.xpath("//div[@class='summary']/ul/li/text()"))

    if dc["proceeding_title"] == "":
        # Fallback: take the proceedings title from the link text inside
        # the first list when no plain-text entry supplied it.
        print(2)
        dc["proceeding_title"] = list_str(
            tree.xpath("//div[@class='summary']/ul[1]/li/a/text()")
        )
    return dc
def main():
    """Clean every stored page and save the result.

    Reads the ``html`` field of each document in the ``conference``
    collection, converts it with ``standard_dict`` and inserts the resulting
    dict into ``conference_cleaned``. Documents with a missing or empty
    ``html`` field are skipped.
    """
    db = get_db()
    raw_pages = db.conference
    cleaned_pages = db.conference_cleaned
    for item in raw_pages.find({}, {"html": 1, "_id": 0}):
        # dict.get replaces the Python-2-only has_key() lookup.
        html = item.get('html')
        if not html:
            continue
        # NOTE(review): Collection.insert is the legacy PyMongo write call;
        # kept because the PyMongo version in use is not visible here --
        # confirm before switching to insert_one.
        cleaned_pages.insert(standard_dict(html))
# Script entry point: run the cleaning job only when executed directly.
if __name__ == "__main__":
    main()
# The code below is for testing the cleaning of one specific record
# db = get_db()
# collection=db.conference
# data = get_data(collection, 1)
# dc = standard_dict(data)
# for k,v in dc.items():
# print k,v