第一次个人项目【词频统计】——关键程序思路详述

　　考虑使用平衡二叉树实现单词和词组数目的统计。

　　二叉树节点定义：

1 struct tnode {    // one node of the binary tree used to tally words or phrases
2     char *word;    // text stored in this node: a single word or a phrase
3     int count;    // tally for this entry; count_tree sums this field over the tree
4     int height; // height of this node in the tree (presumably used for rebalancing - confirm)
5     struct tnode *left;    // left child, NULL when absent
6     struct tnode *right;    // right child, NULL when absent
7 };

　　节点中的成员char *word既可以存储单词，也可以存储词组。比较好的体现了该数据结构的复用属性。

　　【判断字符数逻辑】

　　逐字符读取文件，如果该字符在32-126之间，则字符总数＋1。

　　【判断行数逻辑】

　　逐字符读取，如果该字符为‘\n’，则总行数＋1；单个非空文件读取结束后总行数再＋1（因为文件最后一行以EOF结束，而不是‘\n’）。

　　【判断单词数逻辑】

　　遍历单词树，采用递归的方式计算总单词数。

1 long count_tree(struct tnode *root) {    // returns the sum of the count fields of every node under root
2     if (root == NULL) return 0;    // an empty subtree contributes nothing
3     else return count_tree((*root).left) + count_tree((*root).right) + root->count;    // left subtree + right subtree + this node
4 }

　　【判断单词逻辑】

　　设置单词开始标志符bool isWordStart，然后根据标志符和当前读入的字符综合判断后续操作。尚未处于单词中且读到了字母，则开始记录单词；已处于单词中且读到的是字母或者数字，则继续记录；已处于单词中且读到了非字母数字字符，则停止记录并将标志符置为false。

// State machine step for one input character, driven by isWordStart.
if (isWordStart) {
    if (isNumorCharacter(charTemp)) {
        // Still inside a word: append the letter or digit.
        str[lengthofStr++] = charTemp;
    }
    else {
        // First separator after a word: close the word and terminate the string.
        isWordStart = false;
        str[lengthofStr] = '\0';
    }
}
else if (isCharacter(charTemp)) {
    // Outside a word and a letter arrives: begin a new word with it.
    isWordStart = true;
    lengthofStr = 1;
    str[0] = charTemp;
}

　　【单词树的动态维护】

　　由于最后输出要输出同类型词组排序最小的单词，因此在插入单词树时需要动态维护节点存储的单词。

　　【词组树的判断逻辑】

　　设置一个临时变量存储上次读到的单词。

　　如果当前已读到的单词总数为1，则不插入词组树；否则将本次读到的单词和上次读到的单词用空格拼接在一起，插入词组树。

// Accept the token only when it has at least four characters and the
// characters at index 1..3 are letters.
if (lengthofStr >= 4 && isCharacter(str[1]) && isCharacter(str[2]) && isCharacter(str[3])) {
    // Record the word itself.
    numofAllWord++;
    root = insert_balance(root, str);

    // From the second word onwards, also record "previous current" as a phrase.
    if (numofAllWord != 1) {
        phrase = str_connectwithSpace(str_before, str);
        rootofPhrase = insert_balance_phrase(rootofPhrase, phrase);
    }

    // In either case the current word becomes the "previous" word.
    str_copy(str_before, str);
}

　　核心计数函数：

/**
 * Scan one file character by character and accumulate its statistics.
 *
 * @param filename         path of the file to read; if it cannot be opened the
 *                         function returns without touching any output.
 * @param numofChararcter  incremented once for every character in the range 32..126.
 * @param numofLine        incremented once per '\n', plus once more for any
 *                         non-empty file (the last line ends at EOF, not '\n').
 * @param root             word tree; every accepted word is inserted via insert_balance.
 * @param rootofPhrase     phrase tree; from the second accepted word of this file
 *                         onwards, "previous current" is inserted via insert_balance_phrase.
 *
 * A token starts at a letter and continues over letters and digits. It is
 * accepted as a word when it has at least four characters and the characters
 * at index 1..3 are letters. Tokens longer than STRING_MAX_LEN - 1 characters
 * are truncated to fit the buffer.
 */
void getInformationofOneFile(char *filename, long &numofChararcter, long &numofLine, tnode *&root, tnode *&rootofPhrase) {
    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        return;
    }

    char str[STRING_MAX_LEN], str_before[STRING_MAX_LEN], *phrase = NULL;
    long numofAllCharacter = 0;    // number of fgetc results consumed so far (used only to detect an empty file)
    long numofAllWord = 0;         // number of accepted words in this file
    bool isWordStart = false;
    bool isEmptyFile = false;
    int lengthofStr = 0;
    int charRead;                  // must be int: fgetc returns EOF outside the range of any byte value
    do {
        charRead = fgetc(fp);

        if (charRead == EOF && numofAllCharacter == 0) {
            isEmptyFile = true;
            break;
        }
        numofAllCharacter++;

        // The EOF iteration is processed on purpose: converted to char it is
        // not a letter or digit, so it acts as the separator that flushes a
        // word ending exactly at end of file.
        char charTemp = (char)charRead;

        if (charTemp >= 32 && charTemp <= 126) numofChararcter++;
        if (charTemp == '\n') numofLine++;

        // A letter outside a word starts a new word.
        if (!isWordStart && isCharacter(charTemp)) {
            isWordStart = true;
            lengthofStr = 1;
            str[0] = charTemp;
        }
        // Inside a word: append letters and digits.
        else if (isWordStart && isNumorCharacter(charTemp)) {
            // Keep one slot for the terminator; drop the excess of over-long tokens
            // instead of writing past the end of str.
            if (lengthofStr < STRING_MAX_LEN - 1) {
                str[lengthofStr++] = charTemp;
            }
        }
        // First separator after a word: terminate and evaluate the token.
        else if (isWordStart && !isNumorCharacter(charTemp)) {
            isWordStart = false;
            str[lengthofStr] = '\0';
            // Insert only tokens that satisfy the word rule.
            if (lengthofStr >= 4 && isCharacter(str[1]) && isCharacter(str[2]) && isCharacter(str[3])) {
                numofAllWord++;
                root = insert_balance(root, str);

                // Only one word seen so far: nothing to pair it with yet.
                if (numofAllWord == 1) {
                    str_copy(str_before, str);
                }
                // At least two words seen: insert "previous current" as a phrase.
                else {
                    // NOTE(review): phrase is never released here; confirm whether
                    // insert_balance_phrase takes ownership of the returned string.
                    phrase = str_connectwithSpace(str_before, str);
                    rootofPhrase = insert_balance_phrase(rootofPhrase, phrase);
                    str_copy(str_before, str);
                }
            }
        }
    } while (charRead != EOF);

    // A non-empty file contributes one more line than its number of '\n'.
    if (!isEmptyFile) numofLine++;

    fclose(fp);
}

posted @ 2018-03-30 17:42 ^TP^ 阅读(347) 评论(1) 收藏举报

刷新页面返回顶部

^TP^

第一次个人项目【词频统计】——关键程序思路详述

公告