// C#统计文本单词个数2 (C#: counting word occurrences in text files, part 2)
using System;
using System.Collections;
using System.Linq;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
/// <summary>
/// Command-line word-frequency counter. Recursively scans a directory for
/// .txt/.cpp/.cs/.h files, counts case-insensitive alphanumeric words, prints the
/// ranking to the console and writes it to "ypfei.txt" inside the scanned directory.
/// Usage: <c>program &lt;dir&gt;</c> (normal mode) or <c>program -e &lt;dir&gt;</c> (extended mode).
/// </summary>
class getFiles
{
    // Minimum number of characters a token needs in order to be counted.
    // NOTE(review): the numeric literal was lost from the published listing; 2 is
    // inferred from the original comment's examples (two-character words) - confirm.
    private const int MinWordLength = 2;

    // Name of the report file written into the scanned directory.
    private const string OutputFileName = "ypfei.txt";

    // A word is a maximal run of ASCII letters and digits. Built once instead of per line.
    private static readonly Regex WordPattern = new Regex("[A-Za-z0-9]+");

    // Trailing digit run, removed in extended mode (e.g. "win32" -> "win").
    private static readonly Regex TrailingDigits = new Regex("[0-9]+$");

    /// <summary>
    /// Counts the words of one text file into <paramref name="wordList"/> (normal mode).
    /// Keys are lower-cased words, values are boxed <see cref="int"/> occurrence counts.
    /// </summary>
    /// <param name="pathName">Path of the text file to read.</param>
    /// <param name="wordList">Table that accumulates the counts; existing entries are incremented.</param>
    public static void getWordList(string pathName, ref Hashtable wordList)
    {
        countWordsInFile(pathName, wordList, false);
    }

    /// <summary>
    /// Extended-mode variant of <see cref="getWordList"/>: for words that do not start
    /// with a digit, trailing digits are stripped before counting, so "win32" and
    /// "win64" are both counted as "win". Words starting with a digit are left unchanged.
    /// </summary>
    /// <param name="pathName">Path of the text file to read.</param>
    /// <param name="wordList">Table that accumulates the counts; existing entries are incremented.</param>
    public static void getWordListExt(string pathName, ref Hashtable wordList)
    {
        countWordsInFile(pathName, wordList, true);
    }

    // Shared implementation of both counting modes: reads the file line by line and
    // increments the count of every sufficiently long word.
    private static void countWordsInFile(string pathName, Hashtable wordList, bool stripTrailingDigits)
    {
        // "using" guarantees the reader is closed even when reading throws.
        using (StreamReader reader = new StreamReader(pathName))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                foreach (Match match in WordPattern.Matches(line))
                {
                    // Invariant lower-casing keeps ASCII words stable under every UI culture.
                    string word = match.Value.ToLowerInvariant();
                    if (word.Length < MinWordLength)
                    {
                        continue;
                    }

                    // The length check intentionally precedes the trimming, as in the original.
                    if (stripTrailingDigits && (word[0] < '0' || word[0] > '9'))
                    {
                        word = TrailingDigits.Replace(word, "");
                    }

                    object current = wordList[word];
                    wordList[word] = current == null ? 1 : (int)current + 1;
                }
            }
        }
    }

    // True for the file extensions this tool treats as text (case-sensitive match).
    private static bool isTargetFile(string fileName)
    {
        string fileExt = Path.GetExtension(fileName);
        return fileExt == ".txt" || fileExt == ".cpp" || fileExt == ".cs" || fileExt == ".h";
    }

    /// <summary>
    /// Recursively walks <paramref name="pathName"/> and counts the words of every
    /// .txt/.cpp/.cs/.h file found. Each processed file name is echoed to the console.
    /// </summary>
    /// <param name="pathName">Directory to scan.</param>
    /// <param name="wordList">Table that accumulates the counts.</param>
    /// <param name="extFlag">0 selects normal mode; any other value selects extended mode.</param>
    public static void getFilesDir(string pathName, ref Hashtable wordList, int extFlag)
    {
        foreach (string subFile in Directory.GetFiles(pathName))
        {
            if (!isTargetFile(subFile))
            {
                continue;
            }

            Console.WriteLine(subFile);
            try
            {
                if (extFlag == 0)
                {
                    getWordList(subFile, ref wordList);
                }
                else
                {
                    getWordListExt(subFile, ref wordList);
                }
            }
            catch (Exception e)
            {
                // Best effort: report the unreadable file and carry on with the rest.
                Console.WriteLine(e.Message);
            }
        }

        foreach (string subDir in Directory.GetDirectories(pathName))
        {
            getFilesDir(subDir, ref wordList, extFlag);
        }
    }

    /// <summary>
    /// Entry point. Parses the arguments, counts the words below the given directory
    /// and outputs them ordered by descending count, ties in alphabetical order.
    /// </summary>
    /// <param name="args">Either { dir } for normal mode or { "-e", dir } for extended mode.</param>
    public static void Main(string[] args)
    {
        string pathName;
        int extFlag;
        if (args.Length == 1)
        {
            pathName = args[0];
            extFlag = 0;
        }
        else if (args.Length == 2 && args[0] == "-e")
        {
            pathName = args[1];
            extFlag = 1;
        }
        else
        {
            // Stop here; previously execution fell through and also reported a missing directory.
            Console.WriteLine("参数输入错误!");
            return;
        }

        try
        {
            if (!Directory.Exists(pathName))
            {
                Console.WriteLine("目录不存在!");
                return;
            }

            Hashtable wordList = new Hashtable();
            getFilesDir(pathName, ref wordList, extFlag);

            // LINQ ordering is stable, so this yields the same order as the former
            // alphabetical sort followed by a stable insertion sort on the counts.
            string[] ranked = wordList.Keys.Cast<string>()
                .OrderByDescending(word => (int)wordList[word])
                .ThenBy(word => word)
                .ToArray();

            // NOTE(review): the report lands inside the scanned directory and ends in
            // ".txt", so a later run over the same directory will count it as input.
            using (StreamWriter writer = new StreamWriter(Path.Combine(pathName, OutputFileName)))
            {
                foreach (string word in ranked)
                {
                    object count = wordList[word];
                    Console.WriteLine("{0} {1}", word, count);
                    writer.WriteLine("<{0}>:{1}", word, count);
                }
            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
    }
}
// 栋栋
//
// 浙公网安备 33010602011771号