The TF-IDF (term frequency–inverse document frequency) word weighting scheme is one of the most widely used schemes for normalizing document-term matrices in text mining and information retrieval.
See Wikipedia for details.
function Y = tfidf( X )
% TFIDF computes TF-IDF weighted word histograms.
%
%   Y = tfidf( X );
%
% INPUT :
%   X - document-term matrix (documents in columns)
%
% OUTPUT :
%   Y - TF-IDF weighted document-term matrix (same size as X)
%
% What is computed:
%   1. tf(X)  rescales every row of X so that it sums to 1
%             (all-zero rows are left as zeros).
%   2. idf(X) returns one weight per column j: log(m / nz_j), where m is
%             the number of rows and nz_j the number of non-zero entries
%             in column j (0 for an all-zero column).
%   3. Column j of the rescaled matrix is multiplied by weight j.
%
% NOTE(review): with documents in columns, as stated above, the row
% normalisation runs across documents and the log weight is per document.
% The same arithmetic equals the textbook TF-IDF when documents are stored
% in ROWS. Confirm the intended orientation with callers before relying on
% either reading.

% get term frequencies
X = tf(X);

% get inverse document frequencies (one weight per column)
I = idf(X);

% apply weights for each document
for j = 1:size(X, 2)
    X(:, j) = X(:, j) * I(j);
end

Y = X;
function X = tf(X)
% SUBFUNCTION computes word frequencies
%
% Every row of X is divided by its own sum, so each non-zero row sums
% to 1 on return. Rows whose sum is zero are set to all zeros instead of
% being divided, which would otherwise produce NaNs.

% for every word (row of X)
for i = 1:size(X, 1)

    % get word i counts for all documents
    x = X(i, :);

    % sum all word i occurrences in the whole collection
    sumX = sum( x );

    % compute frequency of the word i in the whole collection
    if sumX ~= 0
        X(i, :) = x / sumX;
    else
        % avoiding NaNs : set zero to never appearing words
        X(i, :) = 0;
    end

end
function I = idf(X)
% SUBFUNCTION computes inverse document frequencies
%
% Returns an n-by-1 column vector with one weight per column of X:
%   I(j) = log( m / nz_j )
% where m is the number of rows of X and nz_j is the number of non-zero
% entries in column j. Columns with no non-zero entries keep weight 0,
% which avoids a division by zero.

% m - number of terms or words (rows of X)
% n - number of documents (columns of X)
[m, n] = size(X);

% allocate space for document idf's
I = zeros(n, 1);

% for every document (column of X)
for j = 1:n

    % count non-zero frequency words
    nz = nnz( X(:, j) );

    % if not zero, assign a weight:
    if nz ~= 0
        I(j) = log( m / nz );
    end

end

浙公网安备 33010602011771号