英文文章词频统计:

 

功能:统计一篇英文文章中不同单词的总数及每个单词的出现频数并输出,之后按频数排序,输出频数最高的5个单词及其频数。

实现方法:使用C语言,用fopen函数打开txt文件,fscanf函数逐个读入单词,结构体wordNode存储单词及其频数,以链表的形式连接在一起,最后使用归并排序按频数从高到低排序,输出频数最高的5个单词。

 

 头文件

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

 

 定义宏

/* Status code returned by wordCount() when no node could be obtained for a word. */
#define ERROR 1
/* Success status code (wordCount() currently returns a literal 0 instead of this macro). */
#define OK 0
/* Size in bytes of every word buffer, including the terminating '\0'. */
#define WORD_LENGTH 250

 

 自定义数据类型

/* Integer status code type used for function results (see ERROR / OK). */
typedef int status;

/* One entry of the singly linked word list: a word and how often it was seen. */
typedef struct Node
{
    char word[WORD_LENGTH];   /* NUL-terminated word text */
    int time;                 /* occurrence count; set to 0 on creation, incremented by wordCount() */
    struct Node *next;        /* following node in the list */
}wordNode;

 

 定义全局变量

/* Head of the global word list; NULL while no word has been stored. */
wordNode *headNode = NULL;

 

 声明所有使用的函数

/* Look up `word` in the global list, adding a new node (and bumping *num) when it is absent. */
wordNode *wordSearch(char *word,int *num);
/* Increment the occurrence count of `word`; returns ERROR if no node is available. */
status wordCount(char *word,int *num);
/* Print the total stored in *num followed by every word with its count. */
void printCountList(int *num);
/* Print at most the first five nodes of the list (the top five once sorted). */
void PrintFirstFiveTimes();
/* Sort the list at *head by descending count using merge sort. */
void mergeSort(wordNode **head);
/* Split the list `head` into a front half (*pre) and a back half (*next). */
void FrontBackSplit(wordNode *head,wordNode **pre,wordNode **next);
/* Normalise a word in place: lower-case letters and remove non-letter characters. */
void wordJob(char word[]);
/* Merge two lists sorted by descending count into one and return its head. */
wordNode *SortedMerge(wordNode *pre,wordNode *next);
/* Free every node of the global list. */
void release();

 

 主函数

/*
 * Program entry point.
 *
 * Reads whitespace-separated tokens from a text file, normalises each one with
 * wordJob(), counts it with wordCount(), prints every word with its count,
 * sorts the list by descending count and prints the first five entries.
 *
 * argv[1], when given, is the path of the file to analyse; otherwise the
 * original hard-coded path is used.
 *
 * Returns 0 on success and 1 when the file cannot be opened or a word cannot
 * be stored.
 */
status main(int argc,char *argv[])
{
    char temp[WORD_LENGTH];          /* scratch buffer for the token being read */
    char format[16];                 /* "%<WORD_LENGTH-1>s", built at run time */
    const char *path = "F:\\zc\\c\\yjs\\file.txt";
    FILE *file;
    int articleWordNum = 0;          /* number of nodes (distinct words) in the list */
    int *num = &articleWordNum;

    if(argc > 1)
    {
        path = argv[1];
    }
    if((file = fopen(path, "r")) == NULL)
    {
        printf("文件读取失败!");
        return 1;
    }

    /* Bound the read so a token longer than the buffer cannot overflow temp. */
    snprintf(format, sizeof format, "%%%ds", WORD_LENGTH - 1);

    while(fscanf(file, format, temp) == 1)
    {
        wordJob(temp);
        if(temp[0] == '\0')
        {
            /* The token held no letters (e.g. "123" or "--"): nothing to count. */
            continue;
        }
        if(wordCount(temp,num) != 0)
        {
            /* wordCount() reports failure when no node could be obtained. */
            printf("内存分配失败!");
            fclose(file);
            release();
            return 1;
        }
    }
    fclose(file);

    printf("\n输出所有单词的频数\n");
    printCountList(num);
    printf("\n输出词频最高的5个词\n");
    mergeSort(&headNode);            /* sort by descending count */
    PrintFirstFiveTimes();
    release();
    return 0;
}

 

查找单词所在结点并返回其地址

/*
 * Allocate a detached node holding a copy of `word` with a count of 0.
 * Returns NULL when the word does not fit into the node's buffer or when
 * memory allocation fails.
 */
static wordNode *createWordNode(const char *word)
{
    wordNode *node;

    if(strlen(word) >= WORD_LENGTH)
    {
        return NULL;
    }
    node = malloc(sizeof *node);
    if(node == NULL)
    {
        return NULL;
    }
    strcpy(node->word, word);        /* length checked above */
    node->time = 0;
    node->next = NULL;               /* never leave the link uninitialised */
    return node;
}

/*
 * Find the node that stores `word` in the global list.
 *
 * When the word is not present a new node with a count of 0 is created,
 * linked into the list and *num is incremented. The first node becomes the
 * head; later nodes are inserted directly after the head.
 *
 * word: NUL-terminated word to look up.
 * num:  counter of nodes in the list, incremented on insertion.
 *
 * Returns the matching or newly created node, or NULL if a new node could
 * not be created.
 */
wordNode *wordSearch(char *word,int *num)
{
    wordNode *current;
    wordNode *node;

    for(current = headNode; current != NULL; current = current->next)
    {
        if(strcmp(current->word, word) == 0)
        {
            return current;
        }
    }

    node = createWordNode(word);
    if(node == NULL)
    {
        return NULL;
    }
    if(headNode == NULL)
    {
        headNode = node;
    }
    else
    {
        node->next = headNode->next;
        headNode->next = node;
    }
    *num += 1;
    return node;
}

 

进行词频统计

/*
 * Record one occurrence of `word`.
 *
 * Obtains the word's node through wordSearch() and increments its count.
 * Returns 0 on success, or ERROR when wordSearch() yields no node.
 */
status wordCount(char *word,int *num)
{
    wordNode *entry = wordSearch(word,num);   /* node that holds `word` */

    if(entry != NULL)
    {
        entry->time += 1;
        return 0;
    }
    return ERROR;
}

 

输出所有词频

/*
 * Print the value stored in *num followed by every word in the global list
 * together with its count. Prints a notice instead when the list is empty.
 */
void printCountList(int *num)
{
    wordNode *cursor;

    if(headNode == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    printf("\n\t总计 %d \n",*num);
    for(cursor = headNode; cursor != NULL; cursor = cursor->next)
    {
        printf("\n\t%s:%d次\n",cursor->word,cursor->time);
    }
}

 

输出词频最高的5个词

/*
 * Print at most the first five nodes of the global list with their counts.
 * Call after the list has been sorted to get the five most frequent words.
 * Prints a notice instead when the list is empty.
 */
void PrintFirstFiveTimes()
{
    wordNode *cursor;
    int printed;

    if(headNode == NULL)
    {
        printf("该文件无内容!");
        return;
    }

    cursor = headNode;
    for(printed = 0; printed < 5 && cursor != NULL; printed++)
    {
        printf("\n\t%s:%d次\n",cursor->word,cursor->time);
        cursor = cursor->next;
    }
}

 

对词频统计结果进行归并排序

/*
 * Sort the list referenced by *headnode with merge sort.
 *
 * The list is split into two halves, each half is sorted recursively and the
 * halves are merged with SortedMerge(); *headnode is updated to the new head.
 * Lists with fewer than two nodes are left untouched.
 */
void mergeSort(wordNode **headnode)
{
    wordNode *list = *headnode;
    wordNode *front;
    wordNode *back;

    if(list != NULL && list->next != NULL)
    {
        FrontBackSplit(list,&front,&back);
        mergeSort(&front);
        mergeSort(&back);
        *headnode = SortedMerge(front,back);
    }
}

 

将链表从中间拆分为前后两个子链表

/*
 * Split the list `source` into two halves.
 *
 * *pre receives the front half (starting at `source`) and *next the back
 * half; the link between them is cut. When the length is odd the extra node
 * stays in the front half. For a list with fewer than two nodes *pre is
 * `source` and *next is NULL.
 */
void FrontBackSplit(wordNode *source,wordNode **pre,wordNode **next)
{
    wordNode *mid;     /* advances one node per round */
    wordNode *probe;   /* advances two nodes per round */

    if(source == NULL || source->next == NULL)
    {
        *pre = source;
        *next = NULL;
        return;
    }

    mid = source;
    probe = source->next;
    while(probe != NULL)
    {
        probe = probe->next;
        if(probe == NULL)
        {
            break;
        }
        mid = mid->next;
        probe = probe->next;
    }

    /* `mid` is now the last node of the front half. */
    *pre = source;
    *next = mid->next;
    mid->next = NULL;
}

 

按频数从高到低合并两个有序链表

/*
 * Merge two lists that are each sorted by descending count into a single
 * list sorted by descending count, and return its head.
 *
 * On equal counts the node from `pre` is placed first. Either argument may
 * be NULL, in which case the other list is returned unchanged.
 *
 * The merge is iterative so that stack usage does not grow with the number
 * of nodes being merged.
 */
wordNode *SortedMerge(wordNode *pre,wordNode *next)
{
    wordNode *merged = NULL;
    wordNode **tail = &merged;   /* link to fill with the next node */

    while(pre != NULL && next != NULL)
    {
        if(pre->time >= next->time)
        {
            *tail = pre;
            pre = pre->next;
        }
        else
        {
            *tail = next;
            next = next->next;
        }
        tail = &(*tail)->next;
    }

    /* At most one list still has nodes; append it as-is. */
    *tail = (pre != NULL) ? pre : next;
    return merged;
}

 

处理单词

/*
 * Normalise a word in place.
 *
 * ASCII upper-case letters are converted to lower case and every character
 * that is not an ASCII letter is removed, so the result contains only the
 * characters 'a'..'z' (and may be empty).
 *
 * word: NUL-terminated, writable string; modified in place.
 */
void wordJob(char word[])
{
    size_t src;
    size_t dst = 0;

    /* Single pass: copy each kept character down to the next free slot. */
    for(src = 0; word[src] != '\0'; src++)
    {
        char c = word[src];

        if(c >= 'A' && c <= 'Z')
        {
            c = (char)(c - 'A' + 'a');
        }
        if(c >= 'a' && c <= 'z')
        {
            word[dst] = c;
            dst++;
        }
    }
    word[dst] = '\0';
}

 

释放所有结点内存

/*
 * Free every node of the global list and leave headNode set to NULL.
 */
void release()
{
    wordNode *cur = headNode;

    while(cur != NULL)
    {
        wordNode *following = cur->next;   /* read the link before freeing */
        free(cur);
        cur = following;
    }
    headNode = NULL;
}

 

 

git@git.coding.net:amberpass/Calculate_words.git

 

https://git.coding.net/amberpass/Calculate_words.git

 

程序运行结果