#!/usr/bin/python
#coding=utf-8
# simpleHandleData

import os

##########################################################################
# Guess the character encoding of a chunk of data.
def obtainFileType(ff):
    """Return the encoding that chardet reports for ``ff``.

    ``ff`` is handed to ``chardet.detect`` unchanged and the value stored
    under the ``'encoding'`` key of its result is returned.
    """
    import chardet  # third-party package, imported inside the function

    return chardet.detect(ff)['encoding']

#########################################################################
# Check whether a path exists.
def fileExists(filepath):
    """Return 1 when ``filepath`` exists according to os.path.exists, else 0."""
    return 1 if os.path.exists(filepath) else 0

#########################################################################
# Split a keyword line into its "/"-separated keywords.
# string: the keyword line, expected to start with the "【标引词】" tag
def ParticipleBySprit(string):
    """Return the keywords of a "【标引词】" line as a list.

    The first ``len("【标引词】")`` characters are dropped (the tag is not
    checked), surrounding whitespace is stripped and the rest is split
    on "/".
    """
    tagLength = len("【标引词】")
    return string[tagLength:].strip().split("/")

#########################################################################
# Extract the article title from a title line.
# string: the title line, expected to start with a tag such as "【来源文献】"
def havingArticleTitle(string):
    """Return the title text that follows the leading tag.

    The first ``len("【来源文献】")`` characters are dropped (the tag is not
    checked) and surrounding whitespace is stripped from the remainder.
    """
    skip = len("【来源文献】")
    title = string[skip:]
    return title.strip()

#########################################################################
# Counting model 1: look each keyword up in the article title and count
# the keywords that are found.
# A keyword contributes at most 1, however often it occurs in the title.
def simpleFrequencyOne(keyList, articleTitleStr, frequence):
    """Append to ``frequence`` the number of keywords found in the title.

    keyList         -- keywords to look for
    articleTitleStr -- title that is searched
    frequence       -- list that receives the resulting count
    """
    hits = 0
    for keyword in keyList:
        # One successful lookup is enough; repeats are not counted.
        if articleTitleStr.find(keyword) != -1:
            hits += 1

    frequence.append(hits)


#########################################################################
# Counting model 2: look each keyword up in the article title and count
# every occurrence.
# The same keyword may be counted several times; context is ignored, this
# is a plain occurrence count.
def simpleFrequencyTwo(keyList, articleTitleStr, frequence):
    """Append to ``frequence`` the total number of keyword occurrences.

    For every keyword the non-overlapping occurrences in the title are
    counted, scanning left to right, and the counts are summed.

    keyList         -- keywords to look for
    articleTitleStr -- title that is searched
    frequence       -- list that receives the resulting count
    """
    total = 0
    for keyword in keyList:
        # An empty keyword (e.g. from "a//b" or a trailing "/") matches at
        # every position without advancing, so it is skipped instead of
        # being searched for.
        if not keyword:
            continue
        # count() reports non-overlapping matches, i.e. the same result as
        # repeatedly calling find() and restarting after each match.
        total += articleTitleStr.count(keyword)

    frequence.append(total)


#########################################################################
# Counting model 3: look each keyword up in the article title and count
# its occurrences.
# The same keyword may be counted several times, but a character position
# of the title can be claimed by only one match: a match that touches an
# already claimed position is not counted.
def simpleFrequencyThr(keyList, articleTitleStr, frequence):
    """Append to ``frequence`` the number of matches on unclaimed positions.

    Keywords are processed in list order, so earlier keywords claim
    positions first. Byte strings are decoded as UTF-8 so that positions
    are character positions; text that is already decoded is used as is.

    keyList         -- keywords to look for
    articleTitleStr -- title that is searched
    frequence       -- list that receives the resulting count
    """
    def _as_text(value):
        # Decode UTF-8 byte strings; leave decoded text untouched.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    title = _as_text(articleTitleStr)        # decoded once, not per lookup
    occupied = [False] * len(title)          # one flag per title character
    count = 0

    for rawKeyword in keyList:
        keyword = _as_text(rawKeyword)
        # An empty keyword matches everywhere without advancing the search
        # position, which would never terminate; skip it.
        if not keyword:
            continue

        width = len(keyword)
        start = 0
        while True:
            pos = title.find(keyword, start)
            if pos == -1:
                break
            end = pos + width
            if not any(occupied[pos:end]):
                # Claim every character of this match for this keyword.
                occupied[pos:end] = [True] * width
                count += 1
            # The search resumes after the match whether or not it counted.
            start = end

    frequence.append(count)

#########################################################################
# Record the statistics of one article: title length, number of keywords
# and how often the keywords occur in the title.
# keyList          list of keywords
# articleTitleStr  article title
# switchCase       counting model to use: "1", "2" or "3"
#     The three lists below are filled in step, one entry per article.
#     titleLen     list of title lengths
#     keyWordsNum  list of keyword counts
#     frequence    list of keyword occurrence counts
def count_KeyWords_In_ArticleTitle_Frequency(keyList, articleTitleStr, switchCase, titleLen, keyWordsNum, frequence):
    """Append one entry to each of ``titleLen``, ``keyWordsNum``, ``frequence``.

    Raises ValueError when ``switchCase`` is not "1", "2" or "3". The check
    happens before any list is modified, so the three lists never get out
    of step with each other.
    """
    if switchCase not in ("1", "2", "3"):
        raise ValueError("unknown statistics model: %r" % (switchCase,))

    # The title length is measured in characters: UTF-8 byte strings are
    # decoded first, already decoded text is measured directly.
    if isinstance(articleTitleStr, bytes):
        titleText = articleTitleStr.decode('utf-8')
    else:
        titleText = articleTitleStr
    titleLen.append(len(titleText))
    keyWordsNum.append(len(keyList))

    # The counting functions receive the title exactly as it was passed in.
    if switchCase == "1":
        simpleFrequencyOne(keyList, articleTitleStr, frequence)
    elif switchCase == "2":
        simpleFrequencyTwo(keyList, articleTitleStr, frequence)
    else:
        simpleFrequencyThr(keyList, articleTitleStr, frequence)
#########################################################################
# Weighted mean of the occurrence counts, weighted by the keyword counts.
# k  keyword counts, one per article
# f  occurrence counts, one per article
def countWeightedMeans(k, f):
    """Print and return ``sum(k[i] * f[i]) / sum(k[i])``.

    Entries are converted with ``int`` before use. When the keyword counts
    sum to zero (for example when both lists are empty) the mean is
    undefined: a notice is printed and None is returned instead of
    dividing by zero.
    """
    numerator = 0
    denominator = 0

    for i, keywordCount in enumerate(k):
        weight = int(keywordCount)
        numerator += weight * int(f[i])
        denominator += weight

    if denominator == 0:
        print("no keyword data, weighted mean is undefined")
        return None

    mean = numerator * 1.0 / denominator
    # A single parenthesised argument prints the same text as the statement
    # form `print label, mean` (label, one space, value).
    print(u"无权重的加权平均值: " + u" " + str(mean))
    return mean
        

#########################################################################    
#打印获取的列表数据
def PrintList(articleTitleList, keyStrList, titleLen, keyWordsNum, frequence):
    #print " frequence", len(frequence)
    if len(titleLen) == len(keyWordsNum) and len(frequence) == len(keyWordsNum):
        for i in range(len(titleLen)):
            print i
            print "        题  目:".decode('utf-8'), articleTitleList[i].decode('utf-8')
            print "        关键词:", keyStrList[i][15:].strip().decode('utf-8')
            print "title length:".decode('utf-8'), titleLen[i], "  key words number:".decode('utf-8'), keyWordsNum[i], "  frequence:".decode('utf-8'), frequence[i]
            print "\n"

#########################################################################
#数据录入 ,读取存入文件
#参数:文件名(绝对路径)
def ReadFileAndSimpleHadnle(filePath):
    if 0 == fileExists(filePath):
        print "\n", filePath, "file is not exist\n"
        return 0

    articleTitleStr = ""    #标题字符串初始化
    articleTitleList = []   #标题列表初始化
    keyStrList = []         #关键词字符串初始化
        
    keyList = []            #存储关键词的列表初始化
    titleLen = []           #文章标题的长度的列表初始化
    keyWordsNum = []        #关键词的个数的列表初始化
    frequence = []          #出现的次数的列表初始化
    
    print "\n"
    print "第一种: 统计关键词, 一个关键词只计算一次(最多出现一次)\n"
    print "第一种: 统计关键词, 出现一次, 计数器加一, 同一个词, 可能出现多次.\n       这里不考虑语境, 只是单纯计数, 计算关键词出现的次数\n"
    print "第三种: 统计关键词, 同一个词, 可以出现多次, 但是, 同样一个位置, 不能被统计2次. \n       也就是说, 一个位置只能被唯一一个关键词所占有\n"
    switchCase = raw_input('choose difference Statistics model : ')     #选择不同的统计模型
    
    fileData = open(filePath)
    fileData.seek(0)
    lines = fileData.readlines()
    if len(lines) < 1:          #至少也要有一篇, 一个题目,一组关键词
        print "\n" + " " * 15 +"file is empty\n"
        return 0    #没有数据,就退出
    #print len(lines)

    ipos = 0
    for line in lines:
        #if ipos >= 20:
            #break
        
        if 0 == line.find("【来源篇名】") or 0 == line.find("【来源文献】"):
            articleTitleStr = havingArticleTitle(line)      #获取标题
            articleTitleList.append(articleTitleStr)
            #print line.decode('utf-8')      #在控制台真确输出
            

        elif 0 == line.find("【标引词】"):
            keyList = ParticipleBySprit(line)       #获取关键词
            keyStrList.append(line)
            #print line.decode('utf-8')      #在控制台真确输出

            #进行数据处理,计算关键字在文章题目中出现的频率
            #print 'ipos', ipos
            count_KeyWords_In_ArticleTitle_Frequency(keyList, articleTitleStr, switchCase, titleLen, keyWordsNum, frequence)
            ipos = ipos + 1
        
    fileData.close()
    #print len(articleTitleList)
    #PrintList(articleTitleList, keyStrList, titleLen, keyWordsNum, frequence)
    countWeightedMeans(keyWordsNum, frequence)      #计算频率,显示结果


#########################################################################
if __name__ == "__main__":
    import sys

    # Absolute path of the data file; a different file may be given as the
    # first command-line argument.
    filePath = 'S:/date/articleData/articleTitle.txt'
    if len(sys.argv) > 1:
        filePath = sys.argv[1]
    ReadFileAndSimpleHadnle(filePath)
#stringtest = "【标引词】制度现代化/公共选择/民本主义/自由与民主"
#print len("【标引词】")
#print stringtest.index("【标引词】")
# posted on 2015-09-27 20:16 HGonlyWJ  views(231)  comments(0)  bookmark  report