Python,Most frequently used words in a text

# Most frequently used words in a text
# 找出一段文本中出现频率最高的三个单词
# 单词只包含英文字母和撇号('), 别的符号均被视为单词间分隔
# 单词不能只包含撇号, 例如"'"或"''"是无效的单词
# 匹配时大小写不敏感(A和a被视为相同的单词),返回的结果中的字母应被小写化
# 若有多个单词出现频率相同且只能选一个, 选什么都可以
# 如果单词不足三个,有几个返回几个,没有则返回空数组

#我的解法
# Completed in 0.96ms
# Completed in 108.68ms
from collections import defaultdict
#heapq模块主要提供对堆数据结构的支持,其底层实现为二叉堆,可以高效地进行最小/最大值的查找和管理
import heapq
def my_nlargest(n, iterable, key=None):
    """Return the ``n`` largest elements of ``iterable``, largest first.

    A hand-rolled counterpart of ``heapq.nlargest`` built on a binary
    max-heap that is stored in a list with 1-based indexing.

    Args:
        n: Maximum number of elements to return. Values ``<= 0`` give ``[]``.
        iterable: Elements to rank.
        key: Optional one-argument function that extracts the comparison
            value from each element. When ``None``, the elements themselves
            are compared.

    Returns:
        A list of at most ``n`` elements ordered by descending key.
    """
    if n <= 0:
        return []
    if key is None:
        def key(x):
            return x

    # 1-based layout: the node at index i has its parent at i // 2 and its
    # children at 2 * i and 2 * i + 1. Slot 0 is an unused placeholder.
    heap = [None]
    for item in iterable:
        heap.append((item, key(item)))
        pos = len(heap) - 1
        # Sift up. The bounds check comes first so the root is never compared
        # against the placeholder (which would fail for non-numeric keys).
        while pos > 1 and heap[pos][1] > heap[pos // 2][1]:
            parent = pos // 2
            heap[pos], heap[parent] = heap[parent], heap[pos]
            pos = parent

    size = len(heap) - 1
    # A node at depth d has d ancestors that are at least as large, so n
    # largest elements can always be found in the first n levels, i.e. at
    # indices 1 .. 2**n - 1. Capping the level count at the heap's height
    # avoids building a huge power of two when n far exceeds the input size.
    levels = min(n, size.bit_length())
    candidates = heap[1:1 << levels]
    candidates.sort(key=lambda entry: entry[1], reverse=True)
    return [item for item, _ in candidates[:n]]

def top_3_words(text):
    """Return up to three most frequent words in ``text``, most frequent first.

    A word is a run of English letters and apostrophes that contains at least
    one letter; every other character separates words. Matching is
    case-insensitive and the returned words are lowercase.

    Args:
        text: The text to analyse.

    Returns:
        A list of at most three lowercase words ordered by descending
        frequency; empty when the text holds no valid word.
    """
    allowed_chars = "abcdefghijklmnopqrstuvwxyz'"
    word_count_map = defaultdict(int)
    current = []
    has_letter = False
    # The appended space is a sentinel separator that flushes the final word.
    for ch in text.lower() + ' ':
        if ch in allowed_chars:
            current.append(ch)
            if ch != "'":
                has_letter = True
            continue
        if has_letter:
            word_count_map[''.join(current)] += 1
        # Reset on every separator so that an apostrophe-only run is discarded
        # instead of being glued onto the front of the next word.
        current = []
        has_letter = False
    # version 2
    # heapq.nlargest(n, iterable, key=None) returns the n largest elements
    # of the iterable, largest first.
    return heapq.nlargest(3, word_count_map, key=word_count_map.get)

    # version 3
    # return my_nlargest(3,word_count_map.keys(),key = lambda x:word_count_map[x])

    # version 1
    # word_count_list = sorted(word_count_map.keys(), key=lambda x:word_count_map[x],reverse=True)
    # #reverse=True表示降序排列
    # return [word for word in word_count_list[0:3]]




# 别人的解法 1
#Approach: Wrote a nice little personalized parser that implements a generator to be able
# to iterate over the tokens in the body of text.  Use of a default dictionary (factory int)
# to easily count frequencies (key is word, value is frequency).  Use of operator.itemgetter
# to easily obtain the key with the largest value

import operator
from collections import defaultdict
from typing import Iterator


class Parser(object):
    """Splits a body of text into word tokens.

    A token is a maximal run of English letters (a-z, A-Z) and apostrophes
    that contains at least one letter. Every other character is treated as
    whitespace, i.e. as a separator between tokens.
    """

    def __init__(self, textToParse: str) -> None:
        # A trailing space is appended so the tokenizer always meets a
        # separator after the last token. This covers the edge case where the
        # text ends in a token character, e.g. the final 'c' in 'a b c c c'.
        self.body = textToParse + " "

    def _isBlackspace(self, char: str) -> bool:
        """Return True when ``char`` may be part of a word."""
        # Explicit ASCII ranges are used instead of str.isalpha(), which also
        # accepts non-English letters such as 'é'.
        return 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char == "'"

    def _isWhitespace(self, char: str) -> bool:
        """Return True when ``char`` separates words (anything not blackspace)."""
        return not self._isBlackspace(char)

    #Generator that produces tokens of blackspace that are not all apostrophes
    def tokenizer(self) -> Iterator[str]:
        """Yield each token of the body in order of appearance.

        Runs made up solely of apostrophes are skipped. Tokens keep the
        letter case they have in the body.
        """
        token = []
        for char in self.body:
            if self._isBlackspace(char):
                token.append(char)
            elif token:
                # A separator closes the current run. The run only holds
                # letters and apostrophes, so any non-apostrophe is a letter,
                # and a valid token must contain at least one letter.
                if any(c != "'" for c in token):
                    yield ''.join(token)
                token = []
    #end function
#end class


#Given a body of text, get at most the three words with the largest frequency
def top_3_words(text:str)->list:
    """Return at most three words of ``text`` with the highest frequency.

    Tokens produced by ``Parser`` are lowercased before counting, so the
    count is case-insensitive. The result is ordered from most to least
    frequent; words with equal counts keep the order in which they first
    appeared.
    """
    frequencies = defaultdict(int)
    for word in Parser(text).tokenizer():
        frequencies[word.lower()] += 1

    # operator.itemgetter(1) picks the count out of each (word, count) pair.
    # The sort is stable, so ties stay in first-seen order.
    ranked = sorted(frequencies.items(), key=operator.itemgetter(1), reverse=True)
    return [word for word, _ in ranked[:3]]
#end function
posted @ 2025-03-25 15:17 Kazuma_124 阅读(14) 评论(0) 收藏举报
刷新页面返回顶部
Kazuma_124

Python,Most frequently used words in a text

公告