说是“终结”……但任务越来越重也没个终结的实感……
那么今天来捡起顶会热词统计来做个词语拆分吧(问就是要求……)
import re
import operator
import argparse
import codecs

# Compiled once at import time: splits text on any character that is not
# alphanumeric, '_', '+', '-' or '/'. Hoisted out of separateWords so the
# pattern is not recompiled on every call.
_WORD_SPLITTER = re.compile('[^a-zA-Z0-9_\\+\\-/]')


def isNumber(s):
    """Return True if *s* parses as a number.

    Strings containing '.' are tried as float, everything else as int,
    mirroring the scoring rule that numeric tokens stay inside phrases
    but are never counted as score-carrying words.
    """
    try:
        float(s) if '.' in s else int(s)
        return True
    except ValueError:
        return False


class Rake:
    """Rapid Automatic Keyword Extraction (RAKE).

    Reads one document per line from *inputFilePath*, splits each document
    into candidate phrases at stopwords and punctuation, scores phrases by
    the classic degree/frequency word metric, and writes the top third of
    phrases per document to *outputFilePath*.
    """

    def __init__(self, inputFilePath, stopwordsFilePath, outputFilePath,
                 minPhraseChar, maxPhraseLength):
        """Load documents and stopwords; precompile the stopword pattern.

        :param inputFilePath: UTF-8 file, one document per line.
        :param stopwordsFilePath: UTF-8 file, one stopword per line.
        :param outputFilePath: where keyword lists are written.
        :param minPhraseChar: minimum character length of a kept phrase.
        :param maxPhraseLength: maximum number of words in a kept phrase.
        """
        self.outputFilePath = outputFilePath
        self.minPhraseChar = minPhraseChar
        self.maxPhraseLength = maxPhraseLength

        # Read documents; close the handle deterministically (the original
        # iterated an unclosed codecs stream).
        with codecs.open(inputFilePath, 'r', 'utf-8') as docFile:
            self.docs = [document for document in docFile]

        # Build one big alternation pattern out of all stopwords.
        # BUG FIX 1: re.escape() — a stopword containing a regex
        # metacharacter previously corrupted the combined pattern.
        # BUG FIX 2: skip blank lines — an empty stopword used to inject
        # the alternative r'\b(?![\w-])', which matches at every word
        # boundary and shredded every sentence into single words.
        stopwordsRegex = []
        with codecs.open(stopwordsFilePath, 'r', 'utf-8') as swFile:
            for line in swFile:
                word = line.strip()
                if word:
                    stopwordsRegex.append(r'\b' + re.escape(word) + r'(?![\w-])')
        self.stopwordsPattern = re.compile('|'.join(stopwordsRegex),
                                           re.IGNORECASE)

    def separateWords(self, text):
        """Split *text* into lowercase words, dropping empties and numbers.

        Numbers are left inside phrases elsewhere, but they are excluded
        here so they never receive a word score (they tend to invalidate
        the scores of their phrases).
        """
        words = []
        for word in _WORD_SPLITTER.split(text):
            word = word.strip().lower()
            if word and not isNumber(word):
                words.append(word)
        return words

    def calculatePhraseScore(self, phrases):
        """Score each phrase as the sum of its words' degree/frequency ratios.

        word degree  = (co-occurrence degree within phrases) + frequency
        word score   = degree / frequency
        phrase score = sum of its word scores
        """
        wordFrequency = {}
        wordDegree = {}
        for phrase in phrases:
            wordList = self.separateWords(phrase)
            # Degree contribution: number of OTHER words in the phrase.
            wordListDegree = len(wordList) - 1
            for word in wordList:
                wordFrequency[word] = wordFrequency.get(word, 0) + 1
                wordDegree[word] = wordDegree.get(word, 0) + wordListDegree
        # A word always co-occurs with itself once per appearance.
        for word in wordFrequency:
            wordDegree[word] += wordFrequency[word]

        wordScore = {word: wordDegree[word] * 1.0 / wordFrequency[word]
                     for word in wordFrequency}

        phraseScore = {}
        for phrase in phrases:
            wordList = self.separateWords(phrase)
            phraseScore[phrase] = sum(wordScore[word] for word in wordList)
        return phraseScore

    def execute(self):
        """Run extraction over all documents; write top-third keywords per line."""
        sentenceDelimiters = re.compile(u'[.!?,;:\t\\\\"\\(\\)\\\'\u2019\u2013]|\\s\\-\\s')
        # BUG FIX: the output handle is now closed even if scoring raises.
        with codecs.open(self.outputFilePath, 'w', 'utf-8') as outFile:
            for document in self.docs:
                # Split a document into sentences, then sentences into
                # candidate phrases at stopword positions.
                sentences = sentenceDelimiters.split(document)
                phrases = []
                for s in sentences:
                    tmp = re.sub(self.stopwordsPattern, '|', s.strip())
                    for phrase in tmp.split("|"):
                        phrase = phrase.strip().lower()
                        if (phrase
                                and len(phrase) >= self.minPhraseChar
                                and len(phrase.split()) <= self.maxPhraseLength):
                            phrases.append(phrase)

                phraseScore = self.calculatePhraseScore(phrases)
                keywords = sorted(phraseScore.items(),
                                  key=operator.itemgetter(1), reverse=True)
                # Keep the top third, same output format as before.
                outFile.write(str(keywords[0:len(keywords) // 3]) + "\n")


def readParamsFromCmd():
    """Parse command-line arguments for the RAKE runner."""
    parser = argparse.ArgumentParser(description = "This is a python implementation of rake(rapid automatic keyword extraction).")
    parser.add_argument('inputFilePath', help = 'The file path of input document(s). One line represents a document.')
    parser.add_argument('stopwordsFilePath', help = 'The file path of stopwords, each line represents a word.')
    parser.add_argument('-o', '--outputFilePath', help = 'The file path of output. (default output.txt in current dir).', default = 'output.txt')
    parser.add_argument('-m', '--minPhraseChar', type = int, help = 'The minimum number of characters of a phrase.(default 1)', default = 1)
    parser.add_argument('-a', '--maxPhraseLength', type = int, help = 'The maximum length of a phrase.(default 3)', default = 3)
    return parser.parse_args()


def main():
    """CLI entry point: parse arguments and run RAKE over the input file."""
    params = readParamsFromCmd().__dict__
    rake = Rake(params['inputFilePath'], params['stopwordsFilePath'],
                params['outputFilePath'], params['minPhraseChar'],
                params['maxPhraseLength'])
    rake.execute()


# BUG FIX: the original ran argparse and the full extraction at import
# time; guard so the module can be imported without side effects.
if __name__ == "__main__":
    main()
之前的词语统计都是去掉停用词之后一个一个硬拆的,这次我们要求按照词组拆分
拆分之后是输出进txt的,效果大概是这样

之后我们要将其存进数据库……接下来的就下期再说吧!
浙公网安备 33010602011771号