TF-IDF(C#)

翻代码时看到以前写的TF-IDF的C#实现,共享一下..

 

ps: codeproject.com有一个泰国仔实现的版本,代码写得非常乱..

 

代码
using System;
using System.Collections.Generic;
using System.Text;

namespace Cluster
{
    
/// <summary>
    
/// A term (word) in the vocabulary.
    
/// </summary>
    class Term
    {
        
/// <summary>
        
/// 词在词表中的索引(在线性词表中的序号)
        
/// </summary>
        public int index;

        
/// <summary>
        
/// 词出现过的文档数(在多少篇文章出现过)
        
/// </summary>
        public int docNum;


        
/// <summary>
        
/// 
        
/// </summary>
        public Term(int index)
        {
            
this.index = index;
        }
    }
}

 

代码
using System;
using System.Collections.Generic;
using System.Text;

namespace Cluster
{
    
/// <summary>
    
/// term frequency–inverse document frequency
    
/// </summary>
    static class TFIDF
    {
        
/// <summary>
        
/// 计算tf-idf
        
/// </summary>
        
/// <param name="docs">待处理文档(已分词)</param>
        
/// <returns></returns>
        public static List<Dictionary<intdouble>> Calculate(string[][] docs)
        {
            List
<Dictionary<intdouble>> tfidfs = new List<Dictionary<intdouble>>();

            Dictionary
<string, Term> terms = new Dictionary<string, Term>(); //词表
            List<Dictionary<intdouble>> tfs = new List<Dictionary<intdouble>>(); //词频
            Dictionary<intdouble> idfs = new Dictionary<intdouble>(); //逆文档频率

            CalcTF(docs, terms, tfs);
            CalcIDF(docs, terms, idfs);
            CalcTFIDF(tfs, idfs, tfidfs);

            
return tfidfs;
        }

        
#region TF
        
/// <summary>
        
/// 计算词频(term frequency)
        
/// </summary>
        
/// <param name="docs">文档</param>
        
/// <param name="terms">词表</param>
        
/// <param name="tfs">词数</param>
        private static void CalcTF(string[][] docs, Dictionary<string, Term> terms, List<Dictionary<intdouble>> tfs)
        {
            
foreach (string[] doc in docs)
            {
                Dictionary
<intint> termNums = new Dictionary<intint>();
                
foreach (string term in doc)
                {
                    
int index = -1//词表索引
                    if (!terms.ContainsKey(term))
                    {
                        index 
= terms.Count;
                        terms.Add(term, 
new Term(index));
                    }
                    
else
                    {
                        index 
= terms[term].index;
                    }
                    
if (!termNums.ContainsKey(index))
                    {
                        termNums.Add(index, 
1);
                        terms[term].docNum
++//词的文档数
                    }
                    
else
                    {
                        termNums[index]
++;
                    }
                }
                
double len = (double)doc.Length;
                Dictionary
<intdouble> tf = new Dictionary<intdouble>(); //词频
                foreach (KeyValuePair<intint> kvp in termNums)
                {
                    tf.Add(kvp.Key, (
double)kvp.Value / len); //当前词的词数/总词数
                }
                tfs.Add(tf);
            }
        }
        
#endregion

        
#region IDF
        
/// <summary>
        
/// 计算逆文档频率(inverse document frequency)
        
/// </summary>
        
/// <param name="docs"></param>
        
/// <param name="terms"></param>
        
/// <param name="idfs"></param>
        private static void CalcIDF(string[][] docs, Dictionary<string, Term> terms, Dictionary<intdouble> idfs)
        {
            
double len = (double)docs.Length;
            
foreach (KeyValuePair<string, Term> kvp in terms)
            {
                
double idf = Math.Log(len / (double)kvp.Value.docNum, Math.E); //ln(总文档数/当前词出现过的文档数)
                idfs.Add(kvp.Value.index, idf);
            }
        }
        
#endregion

        
#region TF-IDF
        
/// <summary>
        
/// 
        
/// </summary>
        
/// <param name="tfs"></param>
        
/// <param name="idfs"></param>
        
/// <param name="tfidfs"></param>
        private static void CalcTFIDF(List<Dictionary<intdouble>> tfs, Dictionary<intdouble> idfs, List<Dictionary<intdouble>> tfidfs)
        {
            
foreach (Dictionary<intdouble> tf in tfs)
            {
                Dictionary
<intdouble> tfidf = new Dictionary<intdouble>();
                
foreach (KeyValuePair<intdouble> kvp in tf)
                {
                    tfidf.Add(kvp.Key, kvp.Value 
* idfs[kvp.Key]);
                }
                tfidfs.Add(tfidf);
            }
        }
        
#endregion

    }
}

 

posted @ 2010-12-29 17:15  Clotho_Lee  阅读(1882)  评论(0)  编辑  收藏  举报