C#统计文本单词个数2

  using System;
  using System.Collections;
  using System.Linq;
  using System.Text;
  using System.IO;
  using System.Text.RegularExpressions;
 
  class getFiles
  {
      /// <summary>
      /// Counts word frequencies in a text file and accumulates them into <paramref name="wordList"/>.
      /// </summary>
      /// <remarks>
      /// The file is read line by line. Every run of ASCII letters and digits is treated as a
      /// candidate word, lower-cased (so counting is case-insensitive), and counted if it is
      /// long enough. Existing entries in the table are incremented, so the same table can be
      /// passed for several files to obtain combined totals.
      /// </remarks>
      /// <param name="pathName">Path of the text file to read.</param>
      /// <param name="wordList">Table mapping each lower-cased word to its occurrence count (boxed int); updated in place.</param>
      public static void getWordList(string pathName, ref Hashtable wordList)
      {
          // NOTE(review): the numeric literal for the minimum length was lost in the published
          // listing; 3 is assumed here -- confirm against the original requirements.
          const int minWordLength = 3;

          // Built once per call instead of once per line: the pattern never changes.
          Regex wordPattern = new Regex("[A-Za-z0-9]+");

          // 'using' guarantees the reader is closed even if reading throws.
          using (StreamReader sr = new StreamReader(pathName))
          {
              string line;
              while ((line = sr.ReadLine()) != null)
              {
                  foreach (Match match in wordPattern.Matches(line))
                  {
                      // Invariant lower-casing keeps ASCII letters stable regardless of the
                      // current culture (e.g. 'I' under a Turkish locale).
                      string word = match.Value.ToLowerInvariant();
                      if (word.Length < minWordLength)
                      {
                          continue;
                      }

                      if (wordList.ContainsKey(word))
                      {
                          // Seen before: bump the stored count.
                          wordList[word] = (int)wordList[word] + 1;
                      }
                      else
                      {
                          // First occurrence: register the word with a count of one.
                          wordList.Add(word, 1);
                      }
                  }
              }
          }
      }
 
      /// <summary>
      /// Extended-mode variant of getWordList: counts word frequencies in a text file, folding
      /// words that differ only by a trailing number into one entry.
      /// </summary>
      /// <remarks>
      /// Every run of ASCII letters and digits is lower-cased and, if long enough, counted.
      /// When the token does not start with a digit, any digits at its end are removed before
      /// counting (for example "win95" and "win98" are both counted as "win"); digits in the
      /// middle of a token are left alone. Tokens that start with a digit are counted unchanged.
      /// The length check is applied to the token before its trailing digits are removed.
      /// </remarks>
      /// <param name="pathName">Path of the text file to read.</param>
      /// <param name="wordList">Table mapping each normalised word to its occurrence count (boxed int); updated in place.</param>
      public static void getWordListExt(string pathName, ref Hashtable wordList)
      {
          // NOTE(review): the numeric literal for the minimum length was lost in the published
          // listing; 3 is assumed here -- confirm against the original requirements.
          const int minWordLength = 3;

          // Both patterns are constant, so build them once rather than per line / per token.
          Regex wordPattern = new Regex("[A-Za-z0-9]+");
          Regex trailingDigits = new Regex("[0-9]+$");

          // 'using' guarantees the reader is closed even if reading throws.
          using (StreamReader sr = new StreamReader(pathName))
          {
              string line;
              while ((line = sr.ReadLine()) != null)
              {
                  foreach (Match match in wordPattern.Matches(line))
                  {
                      // Invariant lower-casing keeps ASCII letters stable regardless of culture.
                      string word = match.Value.ToLowerInvariant();
                      if (word.Length < minWordLength)
                      {
                          continue;
                      }

                      // Only tokens that do not begin with a digit lose their numeric suffix.
                      bool startsWithDigit = word[0] >= '0' && word[0] <= '9';
                      if (!startsWithDigit)
                      {
                          word = trailingDigits.Replace(word, "");
                      }

                      if (wordList.ContainsKey(word))
                      {
                          wordList[word] = (int)wordList[word] + 1;
                      }
                      else
                      {
                          wordList.Add(word, 1);
                      }
                  }
              }
          }
      }
 
 
      /// <summary>
      /// Recursively walks a directory tree and counts the words of every .txt, .cpp, .cs and
      /// .h file found, accumulating the totals in <paramref name="wordList"/>.
      /// </summary>
      /// <remarks>
      /// Each matching file's path is echoed to the console before it is processed. A failure
      /// while processing one file is reported on the console and does not stop the traversal.
      /// Exceptions raised while listing a directory are not caught here and reach the caller.
      /// </remarks>
      /// <param name="pathName">Directory to scan.</param>
      /// <param name="wordList">Word-to-count table shared by the whole traversal; updated in place.</param>
      /// <param name="extFlag">0 selects getWordList (normal mode); any other value selects getWordListExt (extended mode).</param>
      public static void getFilesDir(string pathName, ref Hashtable wordList, int extFlag)
      {
          // Files directly inside this directory.
          foreach (string subFile in Directory.GetFiles(pathName))
          {
              string fileExt = Path.GetExtension(subFile);
              bool isTextFile = fileExt == ".txt" || fileExt == ".cpp" || fileExt == ".cs" || fileExt == ".h";
              if (!isTextFile)
              {
                  continue;
              }

              Console.WriteLine(subFile);
              try
              {
                  if (extFlag == 0)
                  {
                      getWordList(subFile, ref wordList);
                  }
                  else
                  {
                      getWordListExt(subFile, ref wordList);
                  }
              }
              catch (Exception e)
              {
                  // Best effort: report the unreadable file and keep going with the others.
                  Console.WriteLine(e.Message);
              }
          }

          // Then descend into every sub-directory.
          foreach (string subDir in Directory.GetDirectories(pathName))
          {
              getFilesDir(subDir, ref wordList, extFlag);
          }
      }


     /// <summary>
     /// Entry point. Usage: <c>program &lt;directory&gt;</c> for normal mode or
     /// <c>program -e &lt;directory&gt;</c> for extended mode.
     /// </summary>
     /// <remarks>
     /// Counts the words of all supported files below the directory, prints the words ordered
     /// by descending count (ties in alphabetical order) and writes the same list to
     /// "ypfei.txt" inside that directory.
     /// NOTE(review): the report is itself a .txt file inside the scanned directory, so a
     /// second run on the same directory will also count the previous report -- confirm
     /// whether that is intended.
     /// </remarks>
     /// <param name="args">Command-line arguments as described above.</param>
     public static void Main(string[] args)
     {
         string pathName;
         int extFlag;   // 0 = normal mode, 1 = extended mode
         if (args.Length == 1)
         {
             pathName = args[0];
             extFlag = 0;
         }
         else if (args.Length == 2 && args[0] == "-e")
         {
             pathName = args[1];
             extFlag = 1;
         }
         else
         {
             // Nothing sensible can be done without a directory, so stop here instead of
             // falling through and reporting a misleading "directory does not exist".
             Console.WriteLine("参数输入错误！");
             return;
         }

         try
         {
             if (!Directory.Exists(pathName))
             {
                 Console.WriteLine("目录不存在！");
                 return;
             }

             Hashtable wordList = new Hashtable();
             getFilesDir(pathName, ref wordList, extFlag);

             // Order by count (highest first); OrderBy/ThenBy is a stable sort, and ties are
             // broken by the default string ordering, i.e. alphabetically.
             var sortedWords = wordList.Keys.Cast<string>()
                 .OrderByDescending(word => (int)wordList[word])
                 .ThenBy(word => word)
                 .ToList();

             // 'using' flushes and closes the report even if writing fails part-way.
             using (StreamWriter sw = new StreamWriter(Path.Combine(pathName, "ypfei.txt")))
             {
                 foreach (string word in sortedWords)
                 {
                     Console.WriteLine("{0} {1}", word, wordList[word]);
                     sw.WriteLine("<{0}>:{1}", word, wordList[word]);
                 }
             }
         }
         catch (Exception e)
         {
             Console.WriteLine(e.Message);
         }
     }
posted @ 2013-04-23 13:40 东嘉CEO 阅读(1204) 评论(0) 收藏举报
刷新页面返回顶部
东嘉—自然语言处理

C#统计文本单词个数2

公告