为训练集和测试集生成向量空间模型

计算每个特征项的权重时使用公式(与代码中的实现一致):

$$w = \log_2(TF+1) \times \log_2\!\left(\frac{N}{DF}\right)$$
上式是对于训练集,而对于测试集,我直接使用(词频乘以该特征的信息增益):

$$w = TF \times IG$$
对于训练集,TF和DF都在已经生成的word-doc矩阵中;对于测试集,TF需要另外数一下。

对于文档中的一个词,我们首先要判断它是否是特征项,所以首先要把特征项放到一个HashSet中,这是可行的,因为所有特征项也就几千个。同时我们还要快速地从word-doc矩阵中找到该词对应的那一行。当特征项选定后,word-doc矩阵中那些非特征项对应的行就是没用的,所以我们可以对word-doc矩阵进行裁剪:

View Code
#!/usr/bin/perl
# Trim the word-doc matrix: keep only the rows whose leading term is one of
# the selected feature terms, and write them to a new matrix file.
use strict;
use warnings;

my $bt = time;

# Load the selected feature terms (term -> information gain) for O(1)
# membership tests; there are only a few thousand of them.
my %hash_all_features = ();
my $feature_file = "/home/orisun/master/fudan_corpus/4000_features";
open my $feature_fh, '<', $feature_file or die "Can't open $feature_file : $!";
while (<$feature_fh>) {
    chomp;
    my ($term, $ig) = split /\s+/;
    $hash_all_features{$term} = $ig;
}
close $feature_fh;

my $matrix_file     = "/home/orisun/master/fudan_corpus/matrix/part-r-00000";
my $new_matrix_file = "/home/orisun/master/fudan_corpus/matrix/4000_matrix";
open my $old_matrix, '<', $matrix_file     or die "Can't open $matrix_file : $!";
open my $new_matrix, '>', $new_matrix_file or die "Can't create $new_matrix_file : $!";
while (my $oriline = <$old_matrix>) {
    chomp $oriline;
    # The first whitespace-delimited field of each row is the term.
    my ($term) = split /\s+/, $oriline, 2;
    if (defined $term && exists $hash_all_features{$term}) {
        print {$new_matrix} "$oriline\n";
    }
}
# Fix: the original closed the wrong handle names (OLD_FILE/NEW_FILE),
# leaving the real handles open until interpreter exit.
close $old_matrix;
close $new_matrix or die "Can't close $new_matrix_file : $!";

my $et = time;
print "Time:", $et - $bt, " Seconds.\n";

生成新的裁剪后的矩阵用了6秒。

数一下测试文档中特征词出现的次数,Perl代码:

#!/usr/bin/perl
# Mirror the directory tree under answer_seg/ into answer_wc/: create a
# same-named directory for every directory and an empty UTF-8 file for every
# file (skipping dot-entries). Fix: the original shebang "#/usr/local/bin"
# pointed at a directory and lacked the "!".
use strict;
use warnings;
use File::Find;

my $bt = time;

# Load the selected feature terms (term -> information gain).
# NOTE(review): this hash is never read by this script; retained from the
# original, presumably copied from the counting script — confirm and drop.
my %hash_all_features = ();
my $feature_file = "/home/orisun/master/fudan_corpus/4000_features";
open my $feature_fh, '<', $feature_file or die "Can't open $feature_file : $!";
while (<$feature_fh>) {
    chomp;
    my ($term, $ig) = split /\s+/;
    $hash_all_features{$term} = $ig;
}
close $feature_fh;

my $answer_file    = "/home/orisun/master/fudan_corpus/answer_seg/";
my $answer_wc_file = "/home/orisun/master/fudan_corpus/answer_wc/";

# Called by File::Find for every entry under answer_seg/.
sub process {
    my $file = $File::Find::name;
    if (-d $file) {
        unless ($_ =~ /^\./) {        # skip dot-directories
            # NOTE(review): only the basename is appended, so all mirrored
            # directories land directly under answer_wc/ (no nesting).
            my $crefile = $answer_wc_file . $_;
            unless (-e $crefile) {    # create the same-named directory if missing
                mkdir $crefile, 0755 or die "Cannot create profile $crefile:$!";
            }
        }
    }
    elsif (-f $file) {
        unless ($_ =~ /^\./) {        # skip dot-files
            my $oldfile = $File::Find::name;
            my $newfile = $oldfile;
            $newfile =~ s/answer_seg/answer_wc/;
            unless (-e $newfile) {    # create the same-named empty file if missing
                open my $wc_fh, '>:encoding(UTF-8)', $newfile
                    or die "Cannot create wcfile:$!";
                close $wc_fh;
            }
        }
    }
}

find(\&process, $answer_file);
my $et = time;
print "Time:", $et - $bt, " Seconds.\n";

上面的代码只是单纯地在平行的目录下生成同名的空文件。

下面的代码才是向文件中写入特征词出现的次数,及其在文档中的权重。

#!/usr/bin/perl
# For every segmented test document under answer_seg/, count how often each
# selected feature term occurs, then write one "term  weight  TF  IG" line
# per term (weight = TF * IG) to the same-named file under answer_wc/.
# Fix: the original shebang "#/usr/local/bin" pointed at a directory.
use strict;
use warnings;
use File::Find;

my $bt = time;

# Load the selected feature terms (term -> information gain).
my %hash_all_features = ();
my $feature_file = "/home/orisun/master/fudan_corpus/4000_features";
open my $feature_fh, '<', $feature_file or die "Can't open $feature_file : $!";
while (<$feature_fh>) {
    chomp;
    my ($term, $ig) = split /\s+/;
    $hash_all_features{$term} = $ig;
}
close $feature_fh;

my $answer_file    = "/home/orisun/master/fudan_corpus/answer_seg/";
my $answer_wc_file = "/home/orisun/master/fudan_corpus/answer_wc/";

# Called by File::Find for every entry under answer_seg/.
sub process {
    my $file = $File::Find::name;
    if (-d $file) {
        unless ($_ =~ /^\./) {        # skip dot-directories
            my $crefile = $answer_wc_file . $_;
            unless (-e $crefile) {    # create the same-named directory if missing
                mkdir $crefile, 0755 or die "Cannot create profile $crefile:$!";
            }
        }
    }
    elsif (-f $file) {
        unless ($_ =~ /^\./) {        # skip dot-files
            my $oldfile = $File::Find::name;
            my $newfile = $oldfile;
            $newfile =~ s/answer_seg/answer_wc/;
            open my $ori_fh, '<', $oldfile or die "Cannot read orifile:$!";
            # Per-document term frequency of the feature terms only.
            my %hash_wc = ();
            while (<$ori_fh>) {
                chomp;
                foreach my $word (split /\s+/) {
                    # Autovivification replaces the original exists/else dance.
                    $hash_wc{$word}++ if exists $hash_all_features{$word};
                }
            }
            close $ori_fh;
            # NOTE(review): terms were read as raw bytes (no decode layer), so
            # the output is also written without an encode layer; adding
            # :encoding(UTF-8) here would double-encode and produce mojibake.
            open my $wc_fh, '>', $newfile or die "Cannot write wcfile:$!";
            foreach my $word (keys %hash_wc) {
                my $weight = $hash_wc{$word} * $hash_all_features{$word};
                print {$wc_fh} $word,
                    "\t$weight\t", $hash_wc{$word}, "\t", $hash_all_features{$word}, "\n";
            }
            close $wc_fh;
        }
    }
}

find(\&process, $answer_file);
my $et = time;
print "Time:", $et - $bt, " Seconds.\n";

 理论上完全可以把这些代码

foreach my $word (keys %hash_wc){
print WCFILE $word;
my $weight=$hash_wc{$word}*$hash_all_features{$word};
print WCFILE "\t$weight\t".$hash_wc{$word}."\t".$hash_all_features{$word}."\n";
}

插入到每一个程序中去,没必要分成两段程序,可实际如果那么做生成的文件中中文全部是乱码,不知道为什么。速度还行,总共运行了30秒左右。

Java代码计算每篇训练文档的向量空间模型:

/**
 * Author: Orisun
 * Date: Sep 4, 2011
 * FileName: AbstractBDB.java
 * Function: 封装对Berkely DB的通用操作
 */

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Vector;

import com.sleepycat.bind.EntryBinding;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.collections.StoredMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentConfig;

/**
 * Base class that encapsulates the common Berkeley DB JE plumbing: it opens a
 * transactional environment, a class-catalog database for object
 * serialization, and the primary data database ("TC") with its key/value
 * serial bindings.
 */
public abstract class AbstractBDB {
	private Environment env;
	protected Database database;
	protected Database catalogdatabase; // database backing the StoredClassCatalog instance
	private static final String CLASS_CATALOG = "java_class_catalog"; // name of catalogdatabase
	protected StoredClassCatalog javaCatalog; // catalog used to (de)serialize stored objects
	StoredMap<String, Vector> VectorDB = null;
	EntryBinding<String> keyBinding;
	EntryBinding<Vector> valueBinding;

	public AbstractBDB(String homeDirectory) throws DatabaseException,
			FileNotFoundException {
		// Environment: transactional, created on demand.
		EnvironmentConfig environmentConfig = new EnvironmentConfig();
		environmentConfig.setTransactional(true);
		environmentConfig.setAllowCreate(true);
		env = new Environment(new File(homeDirectory), environmentConfig);

		// One shared database configuration: transactional, created on demand.
		DatabaseConfig databaseConfig = new DatabaseConfig();
		databaseConfig.setTransactional(true);
		databaseConfig.setAllowCreate(true);

		// The class catalog must exist before any serial binding can be built.
		catalogdatabase = env.openDatabase(null, CLASS_CATALOG, databaseConfig);
		javaCatalog = new StoredClassCatalog(catalogdatabase);

		// Primary data database and its serial bindings.
		database = env.openDatabase(null, "TC", databaseConfig);
		keyBinding = new SerialBinding<String>(javaCatalog, String.class);
		valueBinding = new SerialBinding<Vector>(javaCatalog, Vector.class);
		VectorDB = new StoredMap<String, Vector>(database, keyBinding,
				valueBinding, true);
	}

	public void close() throws DatabaseException {
		database.close();    // close the data database first
		javaCatalog.close(); // then the serialization catalog
		env.close();         // the environment goes last
	}
}
View Code
/**
* Author: Orisun
* Date: Sep 4, 2011
* FileName: Train_Vector.java
* Function: 使用TF/IDF为训练集的每个文档建立向量空间模型,并把向量写入BerkelyDB
*/

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Vector;

import com.sleepycat.collections.StoredMap;
import com.sleepycat.je.DatabaseException;

/**
 * Builds a TF/IDF vector space model for every training document and stores
 * each vector both in Berkeley DB and in a file under a parallel
 * "train_vec" directory tree.
 */
public class Train_Vector extends AbstractBDB {
	final int doc_num = 7196; // total number of training documents
	final int vec_len = 1000; // dimensionality of each document vector
	StoredMap<String, Vector> MatrixDB = null; // term -> per-document TF counts
	public static HashMap<String, Double> features = new HashMap<String, Double>(); // selected feature terms -> score

	/**
	 * Loads the selected feature terms from file (one "term score" pair per
	 * line); at most vec_len lines are used.
	 */
	public void initFeatures(File file) {
		features.clear();
		try {
			FileReader fr = new FileReader(file);
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			int i = 0;
			while ((line = br.readLine()) != null && i++ < vec_len) {
				String[] words = line.split("\\s+");
				features.put(words[0], Double.valueOf(words[1]));
			}
			br.close(); // fix: the original leaked the reader
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public Train_Vector(String homeDirectory) throws DatabaseException,
			FileNotFoundException {
		super(homeDirectory);
		MatrixDB = new StoredMap<String, Vector>(database, keyBinding,
				valueBinding, true);
	}

	/**
	 * Reads the word-doc matrix from file into the database. Each line holds
	 * a term followed by its TF count in every document.
	 */
	public void readMatrix(File matrixFile) {
		try {
			FileReader fr = new FileReader(matrixFile);
			BufferedReader br = new BufferedReader(fr);
			String line = null;
			while ((line = br.readLine()) != null) {
				// Fix: allocate a fresh zero-filled vector for EVERY row. The
				// original reused a single shared vector for all rows (stale
				// counts could leak between rows of different lengths) and
				// sized it with the undefined Global.docnum instead of doc_num.
				Vector<Integer> v = new Vector<Integer>(doc_num);
				for (int i = 0; i < doc_num; i++)
					v.add(0);
				String[] words = line.split("\\s+");
				for (int i = 1; i < words.length; i++) {
					v.set(i - 1, Integer.parseInt(words[i]));
				}
				MatrixDB.put(words[0], v);
			}
			br.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Recursively builds a vector for every file under srcFile. Directories
	 * are mirrored into the parallel "train_vec" tree; each regular file
	 * becomes a vec_len-dimensional TF/IDF vector.
	 */
	public void buildVSM(File srcFile) {
		if (srcFile.isDirectory()) {
			String newpath = srcFile.getAbsolutePath().replaceFirst(
					"train_seg", "train_vec");
			File np = new File(newpath); // same-named directory in the parallel tree
			if (!np.exists()) {
				np.mkdir();
			}
			File[] children = srcFile.listFiles();
			for (File child : children) {
				buildVSM(child);
			}
		} else if (srcFile.isFile()) {
			// The array order of the feature terms defines the vector's
			// dimension order. NOTE(review): HashMap iteration order is not
			// guaranteed stable across JVM runs; every consumer must rely on
			// the same snapshot (cf. the term-sequence dump program).
			Object[] feature_array = features.keySet().toArray();
			// Feature terms occurring in this document -> TF/IDF weight.
			HashMap<String, Double> fea_wei = new HashMap<String, Double>();

			int filerank = -1; // global document number (first token of the file)
			Vector<Double> vsm = new Vector<Double>(vec_len);
			for (int i = 0; i < vec_len; i++)
				vsm.add(0.0);
			try {
				FileReader fr = new FileReader(srcFile);
				BufferedReader br = new BufferedReader(fr);
				String line = br.readLine(); // the whole document sits on one line
				br.close(); // fix: the original leaked the reader
				String[] words = line.split("\\s+");
				filerank = Integer.parseInt(words[0]);
				for (int i = 1; i < words.length; i++) {
					String word = words[i];
					if (!features.containsKey(word))
						continue;
					int TF = 0; // frequency of the term in this document
					int DF = 0; // number of documents containing the term
					int N = doc_num; // total number of documents
					Vector<Integer> vec = MatrixDB.get(word);
					// NOTE(review): assumes document numbers are 0-based
					// indices into the matrix row — confirm against the
					// matrix generator.
					TF = vec.get(filerank);
					for (int j = 0; j < N; j++) {
						if (vec.get(j) > 0)
							DF++;
					}
					double IDF = 1.0 * N / DF;
					// weight = log2(TF + 1) * log2(N / DF)
					double weight = Math.log(TF + 1.0) / Math.log(2)
							* Math.log(IDF) / Math.log(2);
					fea_wei.put(word, weight);
				}
				for (int i = 0; i < feature_array.length; i++) {
					String feat = feature_array[i].toString();
					double w = 0.0;
					if (fea_wei.containsKey(feat))
						w = fea_wei.get(feat);
					vsm.set(i, w);
				}
				unit(vsm); // normalize to unit Euclidean length
				// Store the vector in the database keyed by document number.
				VectorDB.put(String.valueOf(filerank), vsm);
				// Also write the vector to the same-named file in the parallel tree.
				String newfil = srcFile.getAbsolutePath().replaceFirst(
						"train_seg", "train_vec");
				File np = new File(newfil);
				if (!np.exists()) {
					np.createNewFile();
				}
				FileWriter fw = new FileWriter(np);
				BufferedWriter bw = new BufferedWriter(fw);
				for (int i = 0; i < vsm.size(); i++) {
					bw.write(String.valueOf(vsm.get(i)) + "\t");
				}
				bw.flush();
				bw.close();
			} catch (Exception e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Normalizes the vector to unit Euclidean length in place.
	 * Fix: the original divided by the SUM OF SQUARES rather than its square
	 * root, so the result did not have length 1; an all-zero vector is left
	 * unchanged to avoid division by zero. (The original's comment also
	 * claimed this method was never called, but buildVSM does call it.)
	 */
	public void unit(Vector<Double> vsm) {
		double powsum = 0.0;
		Iterator<Double> iter = vsm.iterator();
		while (iter.hasNext()) {
			powsum += Math.pow(iter.next(), 2.0);
		}
		double norm = Math.sqrt(powsum);
		if (norm == 0.0)
			return;
		for (int i = 0; i < vsm.size(); i++) {
			vsm.set(i, vsm.get(i) / norm);
		}
	}

	public static void main(String[] args) throws Exception {
		Train_Vector tvector = new Train_Vector(
				"/home/orisun/master/fudan_corpus/BDB");
		tvector.initFeatures(new File(
				"/home/orisun/master/fudan_corpus/4000_features"));
		tvector.readMatrix(new File(
				"/home/orisun/master/fudan_corpus/matrix/4000_matrix"));
		tvector.buildVSM(new File("/home/orisun/master/fudan_corpus/train_seg"));
		// Remember to close the database.
		tvector.close();
	}
}

打印输出向量空间模型中每个特征项出现的顺序:

View Code
/**
*
@author Orisun
* 打印输出向量空间模型中每个特征项出现的顺序
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;

public class OutFeatureHash {
final int vec_len = 1000;
public static HashMap<String, Double> features = new HashMap<String, Double>();// 存放最终选择的特征词

// 从文件中读入特征项。参数文件存储经过特征选择后剩下的特征项。
public void initFeatures(File file) {
features.clear();
try {
FileReader fr = new FileReader(file);
BufferedReader br = new BufferedReader(fr);
String line = null;
int i = 0;
while ((line = br.readLine()) != null && i++ < vec_len) {
String[] words = line.split("\\s+");
features.put(words[0], Double.valueOf(words[1]));
}
} catch (Exception e) {
e.printStackTrace();
}
}
public static void main(String[] args){
OutFeatureHash inst=new OutFeatureHash();
inst.initFeatures(new File("/home/orisun/master/fudan_corpus/4000_features"));
Object[] feature_array = features.keySet().toArray();
File seqfile=new File("/home/orisun/master/fudan_corpus/vsm_termsequence_1000");
try {
if(!seqfile.exists())
seqfile.createNewFile();
FileWriter fw=new FileWriter(seqfile);
BufferedWriter bw=new BufferedWriter(fw);
for (int i = 0; i < feature_array.length; i++) {
String feat = feature_array[i].toString();
bw.write(feat+" "); //用一个空格隔开
}
bw.flush();
bw.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}

为测试集生成向量空间模型:

跟上面一样,先在平行的目录下创建同名文件,用了一秒。

View Code
#!/usr/bin/perl
# Mirror the directory tree under answer_wc/ into answer_vec/: create a
# same-named directory for every directory and an empty UTF-8 file for every
# regular file (skipping dot-files). Fix: the original shebang lacked the "!".
use strict;
use warnings;
use File::Find;

my $bt = time;
my $answer_wc_files = "/home/orisun/master/fudan_corpus/answer_wc";

# Called by File::Find for every entry under answer_wc/.
sub process {
    my $file = $File::Find::name;
    if (-d $file) {    # directory: create the parallel directory
        my $newprofile = $file;
        $newprofile =~ s/answer_wc/answer_vec/;
        unless (-e $newprofile) {
            mkdir $newprofile, 0755 or die "Cannot create profile $newprofile:$!";
        }
    }
    elsif (-f $file) {
        unless ($_ =~ /^\./) {    # skip dot-files
            my $newprofile = $file;
            $newprofile =~ s/answer_wc/answer_vec/;
            unless (-e $newprofile) {
                # Create an empty placeholder file with a UTF-8 layer.
                open my $new_fh, '>:encoding(UTF-8)', $newprofile
                    or die "Can't create $newprofile:$!";
                close $new_fh;
            }
        }
    }
}

find(\&process, $answer_wc_files);
my $et = time;
print "Time:", $et - $bt, "seconds\n";

然后输出每个测试文档对应的向量。用了12秒。

View Code
#!/usr/bin/perl
# Emit the vector for each test document: read the global feature-term order
# from vsm_termsequence_1000, then for every word-count file under answer_wc/
# write the corresponding weights (or 0.0 for absent terms), space separated,
# to the same-named file under answer_vec/.
use strict;
use warnings;
use File::Find;

my $bt = time;

# Read the feature-term sequence that fixes the vector's dimension order.
my $term_seq_file = "/home/orisun/master/fudan_corpus/vsm_termsequence_1000";
my @vector = ();
open my $seq_fh, '<', $term_seq_file or die "Can't open $term_seq_file:$!";
while (<$seq_fh>) {
    chomp;
    @vector = split;
}
close $seq_fh;

my $answer_wc_files = "/home/orisun/master/fudan_corpus/answer_wc";

# Called by File::Find for every entry under answer_wc/.
sub process {
    my $file = $File::Find::name;
    if (-f $file) {
        unless ($_ =~ /^\./) {    # skip dot-files
            my $newprofile = $file;
            $newprofile =~ s/answer_wc/answer_vec/;
            if (-e $newprofile) {
                # Each line of the wc file is: term weight TF IG
                # Fix: the die message wrongly said "create" on a read open.
                open my $old_fh, '<', $file or die "Can't open $file:$!";
                my %hash_term_weight = ();
                while (<$old_fh>) {
                    chomp;
                    my @line = split;
                    $hash_term_weight{ $line[0] } = $line[1];
                }
                close $old_fh;

                # Sparse weights placed at their feature-sequence positions.
                my @vsm = ();
                foreach (0 .. $#vector) {
                    if (exists $hash_term_weight{ $vector[$_] }) {
                        $vsm[$_] = $hash_term_weight{ $vector[$_] };
                    }
                }

                open my $new_fh, '>', $newprofile or die "Can't create $newprofile:$!";
                # Fix: iterate over the FULL feature sequence — the original
                # looped 0..$#vsm, silently truncating trailing zero
                # dimensions — and follow every weight with a space; the
                # original printed non-zero weights with no separator, so
                # adjacent values ran together.
                foreach (0 .. $#vector) {
                    if ($vsm[$_]) {
                        print {$new_fh} $vsm[$_] . " ";
                    }
                    else {    # absent or zero dimension -> write 0.0
                        print {$new_fh} "0.0 ";
                    }
                }
                close $new_fh;
            }
        }
    }
}

find(\&process, $answer_wc_files);
my $et = time;
print "Time:", $et - $bt, "seconds\n";
posted @ 2011-11-09 19:23  张朝阳  阅读(2299)  评论(0编辑  收藏  举报