词频统计
要求:
(1). 实现一个控制台程序,给定一段英文字符串,统计其中各个英文单词(4字符以上含4字符)的出现频率。 附加要求:读入一段文本文件,统计该文本文件中单词的频率。
(2). 性能分析:
- 对C++代码运行VS的性能分析工具,找出性能问题并进行优化。
- 对Java程序运行性能分析工具 NetBeans IDE 6.0,找出性能问题并进行优化。
预计完成时间:4H 实际完成时间:5-6H
GitHub:https://github.com/yuanchenhui/zuoye
源代码如下:
#include <ctype.h>

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
struct Word //单词结构体
{
string Str;
int Count=0;
void exchange(Word &word) //单词交换,用于排序
{
string tStr = word.Str;
int tCount = word.Count;
word.Str = Str;
word.Count = Count;
Str = tStr;
Count = tCount;
}
};
void CalcCount(Word *words, string &newWord, int size) //词频统计
{
int i = 0;
for (; i < size; i++)
{
if (words[i].Str == newWord)
{
words[i].Count++;
return;
}
else if (words[i].Str == "")
break;
}
words[i].Str = newWord;
words[i].Count = 1;
}
void SortWordDown(Word * words, int size) //根据词频降序排序
{
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size - 1; j++)
{
if (words[j].Count < words[j + 1].Count)
{
words[j].exchange(words[j + 1]);
}
}
}
}
int main()
{
Word * words;
string content;
cout << "Word is case insensitive, i.e. “file123”, “123FILE” and “File” are considered the same word.";
getline(cin, content);
int wCount = 1; //记录单词总数
for (unsigned int i = 0; i < content.length(); i++)
{
if (isalnum(content[i]) == 0) //非字母数字
{
wCount++;
}
}
words = new Word[wCount];
string::size_type offset = content.find(‘ ‘); //单词分隔;size_type用以保存string对象的长度
while (offset != string::npos)
{
string wStr = content.substr(0, offset); //string.substr()返回一个从指定位置返回指定长度的字符串
if (wStr.length() < 4) //除去长度小于4的单词
{
wCount--;
content.erase(0, offset + 1);
offset = content.find(‘ ‘);
continue;
}
content.erase(0, offset + 1); //string.erase()删除从指定位置开始的指定长度的字符
transform(wStr.begin(), wStr.end(), wStr.begin(), ::tolower);
CalcCount(words, wStr, wCount);
offset = content.find(‘ ‘);
}
if (content.length() >= 4)
{
transform(content.begin(), content.end(), content.begin(), ::tolower);
CalcCount(words, content, wCount); //计算最后一个单词
}
else wCount--;
for (int i = 0; i < wCount; i++)
{
if (words[i].Str == "")
{
wCount--;
}
}
SortWordDown(words, wCount);
cout << "词频统计:" << endl;
for (int i = 0; i < wCount - 1; i++)
{
cout << words[i].Str << "频率:" << words[i].Count << "次" << endl;
}
cin.get();
delete[] words;
return 0;
}
运行结果:

浙公网安备 33010602011771号