文章关键词提取算法

文本预处理部分

1.对于原始文档，我们要求是中文（包括标点符号），并且文档的第一句（即第一个全角句号之前的内容）应该是文章的标题。

2.采用ICTCLAS分词，并标注词性。

wordseg.cpp

#include <string>
#include <iostream>
#define OS_LINUX
#include "ICTCLAS50.h"
using namespace std;

/**
 * Entry point of the word-segmentation step.
 *
 * Takes the input file name from the command line, initialises ICTCLAS from
 * a fixed data directory, and asks it to process the file into
 * "<filename>_ws" using UTF-8.
 *
 * Returns 1 when no file name was given, -1 when ICTCLAS_Init fails and
 * 0 otherwise.
 */
int main(int argc, char *argv[])
{
	// The file to process must be named on the command line.
	if (argc < 2) {
		cout << "Usage:command filename" << endl;
		return 1;
	}

	const string inputPath(argv[1]);
	const string outputPath = inputPath + "_ws";
	const string dataDir("/home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API");

	if (!ICTCLAS_Init(dataDir.c_str())) {
		cout << "Init fails" << endl;
		return -1;
	}

	// NOTE(review): the trailing 1 presumably switches on part-of-speech
	// tagging (the later steps expect "word/tag" tokens) -- confirm against
	// the ICTCLAS API documentation.
	ICTCLAS_FileProcess(inputPath.c_str(), outputPath.c_str(),
			    CODE_TYPE_UTF8, 1);
	ICTCLAS_Exit();
	return 0;
}

注意编译时要指明头文件和动态库的路径：

g++ wordseg.cpp -o wordseg -I /home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -L /home/orisun/master/ICTCLAS50_Linux_RHAS_32_C/API -lICTCLAS50

3.保留特定词性的词语，其它的删掉。同时把文档合并成一行。

posfilter.cpp

#include<iostream>
#include<fstream>
#include<string>
#include<set>
#include<cstring>
using namespace std;

/**
 * Entry point of the part-of-speech filter.
 *
 * Reads whitespace-separated "word/tag" tokens from the file named on the
 * command line and writes to "<filename>_pos", tab-separated on a single
 * line, only those tokens whose tag is in the keep-list below.
 *
 * Tokens whose word part is exactly 3 bytes long (one UTF-8 Chinese
 * character) are dropped, except the full-width period which later steps
 * use as the sentence delimiter. Tokens without any '/' carry no tag and
 * are dropped as well (previously they made substr() throw).
 *
 * Returns 1 on a usage error or when a file cannot be opened, 0 otherwise.
 */
int main(int argc, char *argv[])
{
	if (argc < 2) {		// the file to process must be given on the command line
		cout << "Usage:command filename" << endl;
		return 1;
	}

	// Tags (including the leading slash) of the tokens that are kept.
	static const char *const kept_tags[] = {
		"/n",		// noun
		"/nr",		// person name
		"/nr1",		// Chinese surname
		"/nr2",		// Chinese given name
		"/nrj",		// Japanese person name
		"/ns",		// place name
		"/nt",		// organisation name
		"/wj",		// full-width period
		"/nl",		// noun idiom
		"/ng",		// noun morpheme
		"/v",		// verb
		"/vd",		// adverbial verb
		"/vn",		// nominal verb
		"/vl",		// verb idiom
		"/vg",		// verb morpheme
		"/a",		// adjective
		"/an",		// nominal adjective
		"/ag",		// adjective morpheme
		"/al"		// adjective idiom
	};
	const set < string > set_pos(kept_tags,
				     kept_tags +
				     sizeof(kept_tags) / sizeof(kept_tags[0]));

	string filename(argv[1]);
	string outfile = filename + "_pos";

	ifstream ifs(filename.c_str());	// input file
	ofstream ofs(outfile.c_str());	// output file
	if (!(ifs && ofs)) {
		cerr << "error:open file failed." << endl;
		return 1;
	}

	string word;
	while (ifs >> word) {
		const string::size_type slash = word.find('/');
		if (slash == string::npos)
			continue;	// untagged token: nothing to filter on
		// A 3-byte word is a single Chinese character; drop it unless
		// it is the full-width period.
		if (slash == 3 && strncmp(word.c_str(), "。", 3) != 0)
			continue;
		if (set_pos.find(word.substr(slash)) != set_pos.end())
			ofs << word << "\t";
	}

	ifs.close();
	ofs.close();
	return 0;
}

4.把文章分为四大部分：标题，段首，段中，段尾。各部分之间用一个空行分开。标题是第一句，紧接着后两句是段首，文章末两句是段尾，其余属段中。

section.cpp

#include<iostream>
#include<cassert>
#include<string>
#include<fstream>
using namespace std;

/**
 * Entry point of the sectioning step.
 *
 * Joins all tokens of the input file into one tab-separated line and then
 * splits it into four sections separated by an empty line: title (first
 * sentence), head (the next two sentences), body, and tail (the last two
 * sentences). The result is written to "<filename>_part".
 *
 * A sentence boundary is the token "。/wj". Every boundary search is
 * checked: a document with too few sentences now produces an error message
 * and exit code 1 instead of an unchecked replace() at npos.
 *
 * Returns 1 on usage error, open failure or too few sentences; 0 otherwise.
 */
int main(int argc, char *argv[])
{
	if (argc < 2) {		// the file to process must be given on the command line
		cout << "Usage:command filename" << endl;
		return 1;
	}
	string filename(argv[1]);	// input file name
	string outfile = filename + "_part";	// output file name
	ifstream ifs(filename.c_str());
	ofstream ofs(outfile.c_str());
	if (!(ifs && ofs)) {
		cerr << "error:open file failed." << endl;
		return 1;
	}

	// Read everything token by token; this drops the line breaks and so
	// merges the whole document into a single line.
	string content, word;
	while (ifs >> word) {
		content += word;
		content += "\t";
	}

	const string fullStop = "。";
	const string period = fullStop + "/wj";
	const string sectionBreak = "\t\t\t\t\n\n";
	const char *const tooShort =
	    "error:document has too few sentences to be split.";

	// First period: end of the title.
	string::size_type front = content.find(period);
	if (front == string::npos) {
		cerr << tooShort << endl;
		return 1;
	}
	content.replace(front, period.size(), sectionBreak);

	// Third period overall (second one after the title): end of the head.
	front = content.find(period, front + 1);
	if (front != string::npos)
		front = content.find(period, front + 1);
	if (front == string::npos) {
		cerr << tooShort << endl;
		return 1;
	}
	content.replace(front, period.size(), sectionBreak);

	// Turn the last two full-width periods into "." so that rfind() skips
	// them; the period found next marks the start of the tail.
	string::size_type back = string::npos;
	for (int skipped = 0; skipped < 2; ++skipped) {
		back = content.rfind(period);
		if (back == string::npos) {
			cerr << tooShort << endl;
			return 1;
		}
		content.replace(back, fullStop.size(), ".");
	}
	back = content.rfind(period);
	if (back == string::npos) {
		cerr << tooShort << endl;
		return 1;
	}
	content.replace(back, period.size(), sectionBreak);

	ofs << content;
	ifs.close();
	ofs.close();
	return 0;
}

5.构造元组（词语，词性，出现的次数，出现在标题，出现在段首，出现在段尾）

tuple.cpp

#include<iostream>
#include<fstream>
#include<sstream>
#include<string>
#include<map>
using namespace std;

/**
 * One output record of the tuple step: a word, its part of speech, how many
 * times it occurred, and three 0/1 flags telling whether it was seen in the
 * title, the head section and the tail section.
 */
struct Tuple {
	std::string word;	// the word
	std::string pos;	// part-of-speech tag
	int occurs;		// number of occurrences
	short local1;		// 1 if seen in the title
	short local2;		// 1 if seen in the head section
	short local3;		// 1 if seen in the tail section

	// A default constructor is required because Tuple is used as the
	// mapped type of a std::map. All counters start at zero so that a
	// default-constructed record never carries indeterminate values.
	Tuple() : occurs(0), local1(0), local2(0), local3(0) {
	}

	// Builds a record for word `w` with tag `p`; by default it counts one
	// occurrence and has no location flag set.
	Tuple(std::string w, std::string p, int o = 1, short l1 = 0,
	      short l2 = 0, short l3 = 0)
	: word(w), pos(p), occurs(o), local1(l1), local2(l2), local3(l3) {
	}
};

int main(int argc, char *argv[])
{
	if (argc < 2) {
		cout << "Usage:command filename." << endl;
		return 0;
	}

	string infile(argv[1]);
	string outfile = infile + "_tuple";
	ifstream ifs(infile.c_str());
	ofstream ofs(outfile.c_str());
	if (!(ifs && ofs)) {
		cerr << "Open file failed." << endl;
		return -1;
	}

	map < string, Tuple > tmap;
	map < string, Tuple >::const_iterator itr;
	string line;
	for (int i = 0; i < 7, getline(ifs, line); ++i) {
		istringstream stream(line);
		string word;
		while (stream >> word) {
			int index = word.find("/");	//斜杠之前是词语，斜杠之后是词性
			string front = word.substr(0, index);
			itr = tmap.find(front);
			if (itr == tmap.end()) {	//到目前为止没有出现过
				string post = word.substr(index + 1);
				if (post == "wj")
					continue;
				Tuple tuple(front, post);
				switch (i) {
				case 0:
					tuple.local1 = 1;
					break;
				case 2:
					tuple.local2 = 1;
					break;
				case 6:
					tuple.local3 = 1;
					break;
				default:
					break;
				}
				tmap[front] = tuple;
			} else {	//词语曾出现过
				Tuple tuple = tmap[front];
				tuple.occurs++;
				switch (i) {
				case 0:
					tuple.local1 = 1;
					break;
				case 2:
					tuple.local2 = 1;
					break;
				case 6:
					tuple.local3 = 1;
					break;
				default:
					break;
				}
				tmap[front] = tuple;
			}
		}
	}

	for (itr = tmap.begin(); itr != tmap.end(); ++itr) {
		//将（词语，词性，次数，位置）写入文件
		ofs << itr->second.word << "\t" << itr->
		    second.pos << "\t" << itr->second.
		    occurs << "\t" << itr->second.local1 << "\t" << itr->second.
		    local2 << "\t" << itr->second.local3 << endl;
	}

	ifs.close();
	ofs.close();
	return 0;
}

6.把同义词词林存入gdbm数据库

sy2db.cpp

#include<gdbm.h>
#include<iostream>
#include<fstream>
#include<sys/stat.h>
#include<sstream>
using namespace std;

/**
 * Entry point of the thesaurus import step.
 *
 * Reads the thesaurus file line by line and stores every line in the gdbm
 * database "sydb", keyed by the first whitespace-delimited column of the
 * line. Key and value are stored including their terminating NUL byte.
 *
 * Fixes over the previous version: a failed gdbm_open is reported instead
 * of being passed on to gdbm_store, and lines without a first column are
 * skipped instead of being stored under the previous line's key.
 *
 * Returns -1 when the input file or the database cannot be opened,
 * 0 otherwise.
 */
int main(int argc, char *argv[])
{
	string infile("同义词词林扩展版");
	ifstream ifs(infile.c_str());
	if (!ifs) {
		cerr << "open file failed!" << endl;
		return -1;
	}

	GDBM_FILE dbm_ptr =
	    gdbm_open("sydb", 0, GDBM_WRCREAT, S_IRUSR | S_IWUSR, NULL);
	if (dbm_ptr == NULL) {
		cerr << "open database failed!" << endl;
		return -1;
	}

	string line;
	while (getline(ifs, line)) {
		istringstream stream(line);
		string word;
		if (!(stream >> word))	// blank line: no key to store under
			continue;

		datum key, data;
		// gdbm takes non-const pointers but only reads from them here.
		key.dptr = const_cast < char *>(word.c_str());
		key.dsize = word.size() + 1;
		data.dptr = const_cast < char *>(line.c_str());
		data.dsize = line.size() + 1;
		if (gdbm_store(dbm_ptr, key, data, GDBM_REPLACE) != 0)
			cerr << "store failed for key: " << word << endl;
	}

	ifs.close();
	gdbm_close(dbm_ptr);
	return 0;
}

算法部分

7.计算词语之间的相似度

simMatrix.cpp

#include<iostream>
#include<fstream>
#include<sstream>
#include<gdbm.h>
#include<sys/stat.h>
#include<climits>
#include<cassert>
#include<cstdlib>
#include<vector>

using namespace std;

/** Parameters of the similarity computation. **/
const double init_dist = 10;	// base distance; scaled by weight[] in calDist
const double alpha = 5.0;	// calSim turns a distance into alpha / (alpha + distance)
const double beta = 0.66;	// main() only records pairs whose similarity exceeds this
const double weight[6] = { 1.0, 0.5, 0.25, 0.125, 0.06, 0.03 };	// distance factors; calDist picks one by how many leading code characters match

GDBM_FILE dbm_ptr;		// database handle
vector <string> words;	// the words that occur in the article

/**读出所有的词，存入vector**/
void initWords(string filename)
{
	ifstream ifs(filename.c_str());
	assert(ifs);
	string line;
	while (getline(ifs, line)) {
		istringstream stream(line);
		string word;
		stream >> word;	//读出一行中的首列词语即可
		words.push_back(word);
	}
	ifs.close();
}

/**
 * Distance between two thesaurus codes.
 *
 * A code consists of 7 code characters (indices 0..6) followed by a flag
 * character at index 7.
 *  - If either flag is '@' (the word is self-contained: no synonyms and no
 *    related words in the thesaurus) the distance is init_dist.
 *  - If the code characters differ, the distance is init_dist scaled by the
 *    weight chosen from the number of leading characters that match.
 *  - If all code characters match and the flags match, the distance is 0
 *    for '=' (synonyms), weight[5] * init_dist for '#' (same class) and
 *    -10 for any other flag.
 *  - If all code characters match but the flags differ (not expected to
 *    happen), a message is printed and -1 is returned.
 */
double calDist(string code1, string code2)
{
	const char flag1 = code1[7];
	const char flag2 = code2[7];
	if (flag1 == '@' || flag2 == '@')
		return init_dist;

	// Length of the common prefix within the 7 code characters.
	int same = 0;
	while (same < 7 && code1[same] == code2[same])
		++same;

	if (same < 7) {
		// Weight index for each possible prefix length 0..6.
		static const int weightIndex[7] = { 0, 1, 2, 2, 3, 4, 4 };
		return weight[weightIndex[same]] * init_dist;
	}

	if (flag1 != flag2) {
		// Identical code characters imply identical flags, so this
		// branch should be unreachable.
		cout << code1 << "和" << code2 <<
		    "code位相同，标志位居然不相同！" << endl;
		return -1;
	}
	if (flag1 == '=')	// synonyms
		return 0;
	if (flag1 == '#')	// same class
		return weight[5] * init_dist;
	return -10;		// unknown flag: the initial negative distance is kept
}

/**计算两个词的相似度**/
double calSim(string word1, string word2)
{
	if (word1 == word2)	//如果是同一个词，则相似度为1
		return 1;
	datum key1, data1, key2, data2;
	key1.dptr = (char *)word1.c_str();
	key1.dsize = word1.size() + 1;
	data1 = gdbm_fetch(dbm_ptr, key1);
	int size1 = data1.dsize;
	key2.dptr = (char *)word2.c_str();
	key2.dsize = word2.size() + 1;
	data2 = gdbm_fetch(dbm_ptr, key2);
	int size2 = data2.dsize;

	if (size1 != 0 && size2 != 0) {	//两个词都在词林中找得到
		int i, j;
		string word1;
		vector <string> vec1, vec2;
		string buffer1(data1.dptr);
		istringstream stream1(buffer1);
		stream1 >> word1;
		stream1 >> word1;	//路过前两列
		while (stream1 >> word1) {
			vec1.push_back(word1);	//把词对应的编码都存入vector中 
		}
		string word2;
		string buffer2(data2.dptr);
		istringstream stream2(buffer2);
		stream2 >> word2;
		stream2 >> word2;	//路过前两列
		while (stream2 >> word2) {
			vec2.push_back(word2);
		}
		double minDist = INT_MAX;	//初始距离为无穷大
		for (int i = 0; i != vec1.size(); ++i) {
			for (int j = 0; j != vec2.size(); ++j) {
				//cout<<vec1[i]<<"和"<<vec2[j]<<"的距离"<<endl;
				double dist = calDist(vec1[i], vec2[j]);
				if (dist < minDist)
					minDist = dist;	//两个词的距离是所有编码组合中距离的最小值
			}
		}
		return alpha / (alpha + minDist);	//从距离到相似度的转换
	} else			//只要有一个词不在词林中，则返回相似度为0
		return 0;
}

/**
 * Entry point of the similarity-matrix step.
 *
 * Loads the word list from the file named on the command line, opens the
 * thesaurus database "sydb" read-only and writes the file "simadj": the
 * first line holds the number of words, every following line holds a word
 * index followed by "j(similarity)" entries for each earlier word j whose
 * similarity exceeds beta (lower-triangular adjacency list).
 *
 * Failure to open the database or the output file is now reported and
 * ends the program with -1. Returns 0 on usage error or success.
 */
int main(int argc, char *argv[])
{
	if (argc < 2) {
		cout << "Usage:command filename." << endl;
		return 0;
	}
	string infile(argv[1]);
	initWords(infile);

	dbm_ptr = gdbm_open("sydb", 0, GDBM_READER, S_IRUSR | S_IWUSR, NULL);
	if (dbm_ptr == NULL) {
		cerr << "open database failed!" << endl;
		return -1;
	}
	ofstream ofs("simadj");
	if (!ofs) {
		cerr << "open output file failed!" << endl;
		gdbm_close(dbm_ptr);
		return -1;
	}

	ofs << words.size() << endl;	// first line: size of the adjacency matrix
	for (vector <string>::size_type i = 0; i != words.size(); ++i) {
		ofs << i << "\t";
		// Only the lower triangle is stored.
		for (vector <string>::size_type j = 0; j < i; ++j) {
			const double sim = calSim(words[i], words[j]);
			if (sim > beta)	// an edge exists only above the threshold
				ofs << j << "(" << sim << ")" << "\t";
		}
		ofs << endl;
	}
	ofs.close();
	gdbm_close(dbm_ptr);
	return 0;
}

8.根据词语的语义相似度矩阵，计算词语居间度

顶点V_i的居间度bc_i定义为：

\begin{equation}{bc}_{i}=\sum_{m,k=1}^{n}{\frac{{g}_{mk}({V}_{i})}{{g}_{mk}}}\end{equation}

n是顶点的个数，g_mk是顶点m和k之间的最短路径的个数，g_mk(V_i)是顶点m和k之间的最短路径中经过顶点V_i的条数。

对于无向图可以表示为

\begin{equation}{bc}_{i}=\sum_{m=1}^{n}{\sum_{k=1}^{m-1}{\frac{{g}_{mk}({V}_{i})}{{g}_{mk}}}}\end{equation}

Dijkstra算法可以找到单源节点的最短路径，但是只能找出一条，要想找到两顶点之间的所有最短路径只需对经典Dijkstra稍作修改（见下面的代码）。在Dijkstra中运用PairingHeap可以提高算法效率，见我的另外一篇博客《用PairingHeap改进Dijkstra算法》。分别指定不同的顶点作起点就可以找出图中所有的最短路径。

代码中使用一个全局数组PairDependencyArray [ num_of_vertex ] 来保存各个节点的居间度，数组初始化为0，随着新的最短路径的发现，数组元素不断增加。比如运行一次Dijkstra后我们发现了顶点V₁到其他顶点之间的最短路径：

V	known	d	p
V1	T	0	0
V2	T	1	V1
V3	T	4	V4
V4	T	2	V1,V2
V5	T	3	V4
V6	T	7	V7
V7	T	6	V4,V5

我们直观地画出V₁到V₆的最短路径（有多条）：

现在我们要更新每条路径上除两端点之外的中间节点的居间度，它们的居间度要增加一个值，这个值怎么计算呢？终点赋予1.0，它的前继节点平分这个值。

PairDependencyArray [7]增加1;

PairDependencyArray [4]增加0.5+0.5;

PairDependencyArray [5]增加0.5;

PairDependencyArray [2]增加0.25+0.25;

pairheap.h

#ifndef _PAIRHEAP_H
#define _PAIRHEAP_H

#include<iostream>
#include<cstdlib>
#include<vector>
#include<utility>
using namespace std;

/**
 * Node of the pairing heap.
 *
 * `nodeindex` identifies the graph vertex, `element` is the key the heap is
 * ordered by. `leftChild` is the first child, `nextSibling` the next node in
 * the sibling list, and `prev` is the previous sibling or, for a first
 * child, the parent.
 */
struct PairNode {
	int nodeindex;
	double element;
	PairNode *prev, *leftChild, *nextSibling;

	// Default construction yields a detached node with zeroed fields
	// (previously every member was left uninitialised).
	PairNode()
	: nodeindex(0), element(0.0), prev(NULL), leftChild(NULL),
	  nextSibling(NULL) {
	}

	// Node for vertex `i` with key `d`; the links default to NULL.
	PairNode(int i, double d, PairNode * p = NULL, PairNode * l = NULL,
		 PairNode * n = NULL)
	: nodeindex(i), element(d), prev(p), leftChild(l), nextSibling(n) {
	}
};

/**
 * Prints the heap rooted at `root` to standard output.
 *
 * For each node it prints "index(key)", then its sibling chain, and then --
 * only when the key is below INT_MAX -- its children.
 */
void printNode(PairNode * root)
{
	if (root == NULL)
		return;

	const int id = root->nodeindex;
	cout << id << "(" << root->element << ")" << "\t" << id << "'s next:";
	printNode(root->nextSibling);

	if (!(root->element < INT_MAX))
		return;
	cout << id << "'s leftChild:";
	printNode(root->leftChild);
}

/**
 * Merges the two trees rooted at first and second.
 * On entry first->nextSibling must be NULL.
 * second may be NULL (nothing happens in that case).
 * After the merge, first refers to the root of the combined tree.
 */
void compareAndLink(PairNode * &first, PairNode * second)
{
	if (second == NULL)
		return;
	if (second->element < first->element) {	// the smaller key becomes the parent
		// second becomes the root: it takes over first's prev link and
		// first is attached as second's new first child, in front of
		// second's former children.
		second->prev = first->prev;
		first->prev = second;
		first->nextSibling = second->leftChild;
		if (first->nextSibling != NULL)
			first->nextSibling->prev = first;
		second->leftChild = first;
		first = second;
	} else {
		// first stays the root: it takes over second's sibling link and
		// second is attached as first's new first child, in front of
		// first's former children.
		second->prev = first;
		first->nextSibling = second->nextSibling;
		if (first->nextSibling != NULL)
			first->nextSibling->prev = first;
		second->nextSibling = first->leftChild;
		if (second->nextSibling != NULL)
			second->nextSibling->prev = second;
		first->leftChild = second;
	}
}

/**
 * Inserts `newNode` into the heap rooted at `root` and returns `newNode`.
 * An empty heap simply adopts the node as its root; otherwise the node is
 * merged with the current root.
 */
PairNode *insert(PairNode * &root, PairNode * newNode)
{
	if (root != NULL) {
		compareAndLink(root, newNode);
		return newNode;
	}
	root = newNode;
	return newNode;
}

/**
 * Lowers the key of node `p` by `delta`.
 *
 * A non-positive `delta` is reported on stderr and terminates the program
 * with exit code 1. Unless `p` is the root, it is unlinked from its
 * sibling list and merged back with the root.
 */
void decreaseKey(PairNode * &root, PairNode * p, const double delta)
{
	if (delta <= 0) {
		cerr << "降低的值为非正数." << endl;
		exit(1);
	}

	p->element -= delta;
	if (p == root)
		return;

	PairNode *const before = p->prev;
	PairNode *const after = p->nextSibling;

	if (after != NULL)		// there is a right sibling
		after->prev = before;
	if (before->leftChild == p)	// p is the first child of `before`
		before->leftChild = after;
	else				// p follows `before` in the sibling list
		before->nextSibling = after;

	p->nextSibling = NULL;
	compareAndLink(root, p);
}

/**
 * Two-pass merge of firstSibling and all of its right siblings.
 * firstSibling must not be NULL on entry; the returned node is the root of
 * the merged tree.
 */
PairNode *combineSiblings(PairNode * firstSibling)
{
	if (firstSibling->nextSibling == NULL)
		return firstSibling;
	static vector < PairNode * >treeArray(200);	// starts with 200 slots and is grown with resize when needed
	int numSiblings = 0;
	// Copy the sibling list into treeArray, detaching each tree on the way.
	for (; firstSibling != NULL; numSiblings++) {
		if (numSiblings == treeArray.size())
			treeArray.resize(numSiblings * 2);
		treeArray[numSiblings] = firstSibling;
		firstSibling->prev->nextSibling = NULL;	// the list is doubly linked, but clearing the forward pointer (so every stored node ends up with nextSibling == NULL) is enough for compareAndLink to work
		firstSibling = firstSibling->nextSibling;
	}
	// Terminate the array with a NULL entry.
	if (numSiblings == treeArray.size())
		treeArray.resize(numSiblings + 1);
	treeArray[numSiblings] = NULL;
	// First pass: merge pairs from left to right.
	int i = 0;
	for (; i + 1 < numSiblings; i += 2)
		compareAndLink(treeArray[i], treeArray[i + 1]);
	// j is the slot of the last pair merged above; with an odd number of
	// trees the unpaired last tree is merged into it.
	int j = i - 2;
	if (j == numSiblings - 3)
		compareAndLink(treeArray[j], treeArray[j + 2]);
	// Second pass: merge from right to left, each result into the one before it.
	for (; j >= 2; j -= 2)
		compareAndLink(treeArray[j - 2], treeArray[j]);
	return treeArray[0];
}

/**
 * Removes the minimum node (the root) from the heap and frees it.
 * Calling it on an empty heap prints an error and exits with code -1.
 * Afterwards `root` is the merged tree of the old root's children, or NULL
 * when there were none.
 */
void deleteMin(PairNode * &root)
{
	if (root == NULL) {
		cerr << "PairingHeap has already been empty,can't deleteMin." <<
		    endl;
		exit(-1);
	}

	PairNode *const doomed = root;
	PairNode *const children = doomed->leftChild;
	if (children != NULL)
		root = combineSiblings(children);
	else
		root = NULL;
	delete doomed;		// release the old root
}

/**
 * Searches the heap for the node whose nodeindex equals node.first and
 * returns it, or NULL when there is none.
 *
 * node.second is the key the wanted node is expected to have; it is used
 * only to prune the search: the children of `root` are visited only when
 * node.second + 0.001 exceeds root's key, because a subtree root is not
 * supposed to be larger than its descendants. The 0.001 slack absorbs
 * accumulated floating-point error. Siblings are always searched.
 */
PairNode *findNode(PairNode * &root, const pair < int, double >&node)
{
	if (root == NULL)
		return NULL;
	// A matching index is sufficient: the key then matches as well.
	if (root->nodeindex == node.first)
		return root;

	const bool mayBeBelow = node.second + 0.001 > root->element;
	if (mayBeBelow) {
		// Children first ...
		PairNode *hit = findNode(root->leftChild, node);
		if (hit != NULL)
			return hit;
	}
	// ... then the siblings.
	return findNode(root->nextSibling, node);
}

/**
 * Frees every node of the heap rooted at `root` (children, then siblings,
 * then the node itself) and resets `root` to NULL so that the caller is not
 * left holding a dangling pointer. Does nothing for an empty heap.
 */
void deleteHeap(PairNode * &root)
{
	if (root == NULL)
		return;
	deleteHeap(root->leftChild);
	deleteHeap(root->nextSibling);
	delete root;
	root = NULL;
}

#endif

bc.cpp

#include<utility>
#include<climits>
#include<vector>
#include<fstream>
#include<iostream>
#include<sstream>
#include<cassert>
#include<cstdlib>
#include<list>
#include"pairheap.h"
using namespace std;
 
// One row of the table maintained by dijkstra(), one row per vertex.
struct Table {
    bool known;             // set once the vertex has been removed from the heap
    double dv;              // best distance from the start vertex found so far
    std::vector<int> pv;    // predecessor vertices on the shortest paths found so far
};

// Tree node that only knows its parent; used to rebuild shortest paths by
// walking from a leaf up to the root.
template <typename Printable>
struct Node {
    Printable data;     // payload written out for this step of a path
    Node *parent;       // next node towards the root, NULL at the root

    Node(Printable value, Node *up = NULL)
        : data(value), parent(up)
    {
    }
};
 
int vertextNum;         // number of vertices
double **adjMatrix;     // the graph, stored as an adjacency matrix (0 means no edge)
vector < Table > dijkTable;   // table maintained by dijkstra
double *pairDependency;	// per-vertex betweenness accumulator, allocated in main

vector <string> words;    // the words that occur in the article
 
/**读出所有的词，存入vector**/
void initWords(string filename)
{
    ifstream ifs(filename.c_str());
    assert(ifs);
    string line;
    while (getline(ifs, line)) {
        istringstream stream(line);
        string word;
        stream >> word;   //读出一行中的首列词语即可
        words.push_back(word);
    }
    ifs.close();
}
 
/**
 * Builds the global adjacency matrix from the similarity file.
 *
 * The first line of `filename` holds the number of vertices. Every further
 * line starts with a row index followed by cells of the form "col(value)";
 * each cell sets adjMatrix[row][col] and adjMatrix[col][row] to value. All
 * other entries are 0.
 *
 * Robustness fixes: an unreadable or negative vertex count is treated as 0,
 * and rows, columns or cells that are malformed or outside the matrix are
 * reported on stderr and skipped instead of writing out of bounds.
 * Asserts that the file could be opened.
 */
void initMatrix(string filename)
{
    ifstream ifs(filename.c_str());
    assert(ifs);

    // The first line stores the size of the matrix.
    string line;
    getline(ifs, line);
    istringstream stream(line);
    int scale = 0;
    if (!(stream >> scale) || scale < 0)
        scale = 0;
    vertextNum = scale;

    // Allocate the scale x scale matrix, zero-filled.
    adjMatrix = new double *[scale];
    for (int i = 0; i < scale; ++i) {
        adjMatrix[i] = new double[scale];
        for (int j = 0; j < scale; ++j)
            adjMatrix[i][j] = 0;
    }

    while (getline(ifs, line)) {
        istringstream strm(line);
        int row = 0;
        if (!(strm >> row))     // the first column is the vertex index
            continue;
        if (row < 0 || row >= scale) {
            cerr << "initMatrix: row index out of range: " << row << endl;
            continue;
        }
        string cell;
        while (strm >> cell) {
            const string::size_type open = cell.find('(');
            const string::size_type close = cell.find(')');
            if (open == string::npos || close == string::npos
                || close < open) {
                cerr << "initMatrix: malformed cell: " << cell << endl;
                continue;
            }
            const int col = atoi(cell.substr(0, open).c_str());
            if (col < 0 || col >= scale) {
                cerr << "initMatrix: column index out of range: " << col
                    << endl;
                continue;
            }
            const double dis =
                atof(cell.substr(open + 1, close - open - 1).c_str());
            adjMatrix[row][col] = adjMatrix[col][row] = dis;
        }
    }

    ifs.close();
}
 
/*
 * Releases the adjacency matrix.
 * Every row was allocated with new[], so it must be released with delete[]
 * (the previous plain delete was undefined behaviour). The global pointer
 * is reset so that it cannot be used or freed again by accident.
 */
void deleteMatrix()
{
    for (int i = 0; i < vertextNum; ++i)
        delete[] adjMatrix[i];
    delete[] adjMatrix;
    adjMatrix = NULL;
}
 
// Prints the adjacency matrix to standard output, one row per line with
// every entry followed by a tab.
void printMatrix()
{
    for (int r = 0; r < vertextNum; ++r) {
        const double *cells = adjMatrix[r];
        for (int c = 0; c < vertextNum; ++c)
            cout << cells[c] << "\t";
        cout << endl;
    }
}
 
void initDijkTable()
{
    dijkTable.clear();
    Table tb;
    tb.dv = INT_MAX;
    tb.known = false;
    for (int i = 0; i < vertextNum; ++i)
        dijkTable.push_back(tb);
}
 
void printDijkTable()
{
    for (int i = 0; i < vertextNum; ++i) {
        cout << i << "\t" << (dijkTable[i].
                      known ? "TRUE" : "FALSE") << "\t" <<
            dijkTable[i].dv << "\t";
        for (int j = 0; j < dijkTable[i].pv.size(); ++j) {
            cout << dijkTable[i].pv.at(j) << "\t";
        }
        cout << endl;
    }
}

// Grows the path tree below nodeP, which represents vertex `index`.
// When `index` is the start vertex, nodeP ends a complete path and is
// recorded in leafNodes. Otherwise one child node (holding the word of the
// predecessor) is allocated per predecessor of `index` in dijkTable, and
// the tree is extended recursively from each of them.
void addNode(Node<string> *nodeP,int index,int startindex,list<Node<string> *> *leafNodes){
    if (index != startindex) {
        const vector<int> &preds = dijkTable[index].pv;
        for (vector<int>::size_type k = 0; k < preds.size(); ++k) {
            const int pred = preds[k];
            Node<string> *child = new Node<string>(words[pred], nodeP);
            addNode(child, pred, startindex, leafNodes);
        }
        return;
    }
    leafNodes->push_back(nodeP);
}

/*根据DijkTable打印所有的最短路徑*/
void printSPathFromSource(int startindex){
    ofstream ofs("shortpath",ofstream::app);        //把所有的最短路径追加方式写入文件
    for (int endindex = 0; endindex < vertextNum; ++endindex) {
        list<Node<string> *> leafNodes;
        Node<string>* nodeP=new Node<string>(words[endindex]);
        addNode(nodeP,endindex,startindex,&leafNodes);
        
        list<Node<string> *>::iterator itr=leafNodes.begin();
        while(itr!=leafNodes.end()){
            Node<string>* down=*itr;
            while(down){
                ofs<<down->data<<"\t";
                down=down->parent;
            }
            ofs<<endl;
            itr++;
        }
    }
}
 
/*
 * Runs Dijkstra's algorithm from vertex `start`, using the pairing heap as
 * priority queue, and fills dijkTable. Unlike the classic algorithm it
 * keeps every predecessor that yields the same shortest distance, so that
 * all shortest paths can be reconstructed afterwards.
 */
void dijkstra(int start)
{
    initDijkTable();
    dijkTable[start].dv = 0;
    // Seed the heap: the start vertex with key 0, every other vertex with INT_MAX.
    PairNode *phroot = new PairNode(start, 0);
    for (int i = 0; i < vertextNum; ++i) {
        if (i == start)
            continue;
        insert(phroot, new PairNode(i, INT_MAX));
    }
    while (phroot != NULL) {
        // The heap root is the closest vertex that is not yet known.
        int index = phroot->nodeindex;
        dijkTable[index].known = true;
        deleteMin(phroot);
        for (int i = 0; i < vertextNum; ++i) {
            // A matrix entry of 0 means "no edge"; known vertices are final.
            if (adjMatrix[index][i] != 0
                && dijkTable[i].known == false) {
                double newdis =
                    dijkTable[index].dv + adjMatrix[index][i];
                double delta = dijkTable[i].dv - newdis;
                if (delta > 0) {
                    // Strictly shorter path: locate i's heap node by its
                    // current key and lower that key.
                    pair < int, double >pa;
                    pa.first = i;
                    pa.second = dijkTable[i].dv;
                    PairNode *fp = findNode(phroot, pa);
                    if (fp == NULL) {
                        cerr << "not found:" << pa.
                            first << "(" << pa.
                            second << ")" << endl;
                        cerr << "root=" << phroot->
                            nodeindex << endl;
                        exit(-1);
                    }
                    decreaseKey(phroot, fp, delta);
                    dijkTable[i].dv = newdis;
                    // The old predecessors belong to longer paths; start over.
                    dijkTable[i].pv.clear();
                    dijkTable[i].pv.push_back(index);
                } else if (delta == 0) {
                    // Another path of the same length: keep this predecessor too.
                    dijkTable[i].pv.push_back(index);
                }
            }
        }
    }
    // The loop only ends when phroot is NULL, so nothing is left to free here.
    deleteHeap(phroot);
}
 
/*
 * Distributes the value `base` held by vertex `index` among the vertices on
 * its shortest paths back to `start`.
 *
 * The predecessors of `index` split `base` evenly. Each predecessor other
 * than `start` has its share (divided by vertextNum, to keep the resulting
 * betweenness values small) added to pairDependency, and then passes the
 * same share on to its own predecessors.
 *
 * Fix: the recursion used to pass on the accumulated pairDependency[ind]
 * (which also contains contributions of other paths and is already scaled)
 * instead of the share of the current path.
 */
void ancRat(double base, int index, int start)
{
    if (index == start)
        return;
    const int len = dijkTable[index].pv.size();
    if (len == 0)       // unreachable vertex: nothing to distribute
        return;
    const double share = base / len;
    for (int i = 0; i < len; ++i) {
        const int ind = dijkTable[index].pv.at(i);
        if (ind == start)   // end points of a path get no credit
            continue;
        pairDependency[ind] += share / vertextNum;
        ancRat(share, ind, start);
    }
}
 
/*
 * Updates the pair-dependency of the vertices lying on the shortest paths
 * between `start` and `terminal`: the terminal vertex is given the value
 * 1.0, which ancRat() then hands down along the predecessor lists.
 */
void pairDepend(int start, int terminal)
{
    const double terminalValue = 1.0;
    ancRat(terminalValue, terminal, start);
}
 
int main(int argc, char *argv[])
{
    if (argc < 3) {
        cerr << "Usage:command sim_matrix_file tuple_file." << endl;
        return -1;
    }
    string filename(argv[1]);
    initWords(argv[2]);
    initMatrix(filename);   //初始华邻接矩阵
    pairDependency = new double[vertextNum];
    for (int i = 0; i < vertextNum; ++i)
        pairDependency[i] = 0;
    for (int i = 0; i < vertextNum; ++i) {
        dijkstra(i);
        printSPathFromSource(i);
        //printDijkTable();
        for (int j = 0; j < i; ++j) {    //因为是无向图，所以算一半就够了
            pairDepend(i, j);
        }
    }
    deleteMatrix();
 
    ofstream ofs("BetweenCencility");
    ofs << vertextNum << endl;
    for (int i = 0; i < vertextNum; ++i)
        ofs << pairDependency[i] << endl;
    ofs.close();
 
    return 0;
}

9.计算居间度密度

bcdensity.cpp

#include<vector>
#include<functional>
#include<algorithm>
#include<fstream>
#include<sstream>
#include<iostream>
#include<climits>
#include<cstdlib>
#include<cassert>

using namespace std;

// Pair of an integer and a double; its natural order is by the integer.
class myclass {
 public:
	int first;
	double second;

	myclass(int a, double b)
	: first(a), second(b) {
	}

	// operator< orders by the first data member.
	bool operator <(const myclass & other) const {
		return first < other.first;
	}
};
// Comparison function that orders by the second data member.
bool less_second(const myclass & m1, const myclass & m2)
{
	const double left = m1.second;
	const double right = m2.second;
	return left < right;
}

vector < myclass > bc_vec;	// (vertex index, betweenness), sorted by betweenness in init_bcvec
vector < myclass > bcdens_vec;	// (vertex index, betweenness density), filled by density()
int vec_len;	// number of words, read from the first line of the betweenness file
int s = 2;	// number of intervals; density() multiplies it by c on every pass
const int c = 5;	// growth factor applied to s
const float d = 0.8;	// density() stops refining once the largest interval density drops below this
int maxloop = 6;	// upper bound used in density()'s loop condition

// Loads the betweenness values from `filename` into bc_vec.
// The first line holds the total number of words (stored in vec_len); every
// following line holds one value, which is paired with its 0-based line
// position. Finally bc_vec is sorted by value. Asserts the file opens.
void init_bcvec(string filename)
{
	ifstream input(filename.c_str());
	assert(input);

	string row;
	getline(input, row);	// first line: total number of words
	stringstream header(row);
	header >> vec_len;

	for (int position = 0; getline(input, row); ++position) {
		stringstream cell(row);
		double value;
		cell >> value;
		bc_vec.push_back(myclass(position, value));
	}

	sort(bc_vec.begin(), bc_vec.end(), less_second);	// order by the second member
}

//计算居间度密度
void density()
{
	int loop = 0;
	double maxratio = INT_MAX;
	double *IntervalDensity = NULL;
	double block;
	vector < myclass >::const_iterator itr;
	while (maxratio >= d && loop++ <= maxloop) {
		s *= c;
		//把居间度平均分为s个区间
		block = (bc_vec[vec_len - 1].second - bc_vec[0].second) / s;
		if (IntervalDensity != NULL)
			delete[]IntervalDensity;
		IntervalDensity = new double[s];
		itr = bc_vec.begin();
		//计算各个区间上元素数目占总元素的数目
		for (int i = 0; i < s; ++i) {
			while (itr != bc_vec.end()
			       && itr->second <= block * (i + 1)) {
				IntervalDensity[i]++;
				itr++;
			}
			IntervalDensity[i] /= vec_len;

		}

		maxratio = *max_element(IntervalDensity, IntervalDensity + s);
	}
	//查找每个顶点对应的居间度密度
	itr = bc_vec.begin();
	for (int i = 0; i < s; ++i) {
		while (itr != bc_vec.end() && itr->second <= block * (i + 1) + 1e-8) {	//浮点运算存在误差，所以加上这个1e-8
			int index = itr->first;
			double dens = IntervalDensity[i];
			myclass mc(index, dens);
			bcdens_vec.push_back(mc);
			itr++;
		}
	}
	delete[]IntervalDensity;
}

//把居间度密度输出到文件
void printDensity(string infile, string outfile)
{
	//Vd按first排序
	sort(bcdens_vec.begin(), bcdens_vec.end());
	ifstream ifs(infile.c_str());
	ofstream ofs(outfile.c_str());
	assert(ifs && ofs);
	vector < myclass >::const_iterator itr = bcdens_vec.begin();
	string line;
	while (getline(ifs, line) && itr != bcdens_vec.end()) {
		ofs << line << "\t" << (itr++)->second << endl;
	}
	ifs.close();
	ofs.close();
}

// Entry point of the density step: reads the betweenness file (argv[2]),
// computes the densities and writes the tuple file (argv[1]) extended by a
// density column to the merge file (argv[3]). Prints the usage and returns
// 0 when fewer than three arguments are given.
int main(int argc, char *argv[])
{
	const bool enoughArgs = argc >= 4;
	if (!enoughArgs) {
		cout << "Usage:" << argv[0] << " tuplefile bcfile mergefile" <<
		    endl;
		return 0;
	}

	const char *tupleFile = argv[1];
	const char *bcFile = argv[2];
	const char *mergeFile = argv[3];

	init_bcvec(bcFile);
	density();
	printDensity(tupleFile, mergeFile);
	return 0;
}

10.计算词语的总得分。得分最高的K个为关键词

score.cpp

#include<fstream>
#include<iostream>
#include<cassert>
#include<sstream>
#include<cstdlib>
#include<queue>
#include<stack>
using namespace std;

// A word together with its score. operator< is deliberately reversed (an
// object is "less" when its score is greater), so a std::priority_queue of
// these objects keeps the lowest score on top.
class myclass {
 public:
	std::string word;
	double score;

	myclass(std::string w, double s)
	: word(w), score(s) {
	}

	bool operator <(const myclass & other) const {
		return other.score < score;
	}
};

priority_queue < myclass > maxPQ;	// the best candidates seen so far
int queue_size;			// capacity of the heap, i.e. the number of keywords to keep

// Weights used by getScore().
const float vdw = 0.6;		// weight of the betweenness density
const float tw = 0.4;		// weight of the sum of all statistical features below
const float locw1 = 0.5;	// word appears in the title
const float locw2 = 0.3;	// word appears in the head section
const float locw3 = 0.3;	// word appears in the tail section
const float lenw = 0.01;	// word length
const float posw = 0.5;		// part-of-speech score
const float tfw = 0.8;		// occurrence count

double getScore(int wordlen, string pos, int count, int title, int head,
		int tail, double bcdens)
{
	double score = 0.0;
	double pos_score;
	int len = pos.size();
	if (pos[len - 1] == 'g') {	//语素
		pos_score = 0.2;
	} else if (pos[0] == 'n'	//名词
		   || pos[len - 1] == 'n'	//名动词或名形词
		   || pos[len - 1] == 'l') {	//成语或习用语
		pos_score = 0.6;
	} else if (pos == "v") {	//动词
		pos_score = 0.3;
	} else if (pos == "vd") {	//副动词
		pos_score = 0.4;
	} else if (pos == "a") {	//形容词
		pos_score = 0.5;
	} else {
		cerr << "存在未知词性." << endl;
		pos_score = 0.0;
	}
	score =
	    vdw * bcdens + tw * (title * locw1 + head * locw2 + tail * locw3 +
				 wordlen * lenw + pos_score * posw +
				 count * tfw);
	return score;
}

int main(int argc, char *argv[])
{
	if (argc < 3) {
		cout << "Usage:" << argv[0] << " filename num_of_keywords" <<
		    endl;
		return 0;
	}
	queue_size = atoi(argv[2]);
	string outfile = "kw_score";
	ifstream ifs(argv[1]);
	ofstream ofs(outfile.c_str());
	assert(ifs && ofs);
	string line;
	while (getline(ifs, line)) {
		string word, pos;
		int count, title, head, tail;
		double bcdens;
		istringstream stream(line);
		stream >> word >> pos >> count >> title >> head >> tail >>
		    bcdens;
		double score =
		    getScore(word.size() / 3, pos, count, title, head, tail,
			     bcdens);
		ofs << word << "\t" << score << endl;
		myclass mc(word, score);
		if (maxPQ.size() == queue_size) {
			if (mc < maxPQ.top()) {
				maxPQ.pop();
				maxPQ.push(mc);
			}
		} else {
			maxPQ.push(mc);
		}
	}
	ifs.close();
	ofs.close();
	cout << "关键词:" << endl;
	stack<myclass> st;
	while (!maxPQ.empty()) {
		myclass mc = maxPQ.top();
		st.push(mc);
		maxPQ.pop();
	}
	while(!st.empty()){
		myclass mc = st.top();
		cout << mc.word << "\t" << mc.score << endl;
		st.pop();
	}
	return 0;
}

11.算法测试。

下面取了《人民日报》上的两篇文章，进行了关键词提取。结果如下：

《让雷锋精神代代相传》2012年3月5日见报

让雷锋精神代代相传。
雷锋具有崇高理想和坚定信念，是实践社会主义、共产主义思想道德的楷模，是全国人民学习的光辉榜样。党的十七届六中全会《决定》强调，深入开展学雷锋活动，采取措施推动学习活动常态化。贯彻落实这一要求，需要我们深刻理解雷锋精神的时代内涵，积极探索新形势下弘扬雷锋精神的有效途径，让雷锋精神代代相传。这对于推动社会主义核心价值体系建设、巩固全党全国人民团结奋斗的共同思想道德基础具有重要意义。本期“大家论道”刊发的这组文章，围绕这一问题进行了阐述。

雷锋精神的时代内涵

雷锋这个响亮的名字和以他的名字命名的雷锋精神，深深镌刻在一代又一代中国人的心中，影响和激励着一代又一代中国人为建设中国特色社会主义而奋勇前进。半个世纪过去了，在雷锋精神的感召下，我国涌现出无数雷锋式的先进人物，他们继承和弘扬雷锋精神，为其注入了新的内涵。雷锋精神的时代内涵，集中体现为服务人民、助人为乐的奉献精神，干一行爱一行、专一行精一行的敬业精神，锐意进取、自强不息的创新精神，艰苦奋斗、勤俭节约的创业精神。【详细】

新时期弘扬雷锋精神的着力点

雷锋精神内涵丰富，历久弥新。在新的历史条件下，弘扬雷锋精神应把握以下几个主要着力点。

不断坚定中国特色社会主义理想信念

雷锋是一个立场坚定的共产主义战士。雷锋说过：“我就是长着一个心眼，我一心向着党，向着社会主义，向着共产主义”；“有人说：人生在世，吃好、穿好、玩好是最幸福的。我觉得人生在世，只有勤劳，发愤图强，用自己的双手创造财富，为人类的解放事业——共产主义贡献自己的一切，这才是最幸福的”。这是雷锋对共产主义忠诚信仰的鲜明表达。坚定的共产主义信仰是雷锋精神的精髓。在坚定的共产主义信仰引领下，雷锋的一言一行、一举一动都表现了一个革命战士、gongchandang(写成汉字博客园不让发表)人为实现共产主义伟大理想而不懈奋斗的崇高精神。【详细】

以学雷锋推动社会主义核心价值体系建设

党的十七届六中全会《决定》强调，深入开展学雷锋活动，采取措施推动学习活动常态化。深入贯彻落实全会精神，一项重要任务就是引导广大干部群众积极开展学雷锋活动，推动社会主义核心价值体系建设，不断巩固全党全国各族人民团结奋斗的共同思想道德基础。

开展学雷锋活动对社会主义核心价值体系建设具有重要推动作用

雷锋是中华民族的道德标杆。深入开展学雷锋活动，充分发挥道德模范人物的示范效应，对于推动社会主义核心价值体系建设具有重要作用。

我提取的关键词：

雷锋 9.38646
精神 6.41262
社会主义 3.75446
共产主义 3.31446
推动 3.04646

凤凰网提取的关键词：

雷锋
精神
共产主义
社会主义

《宁要“不完美”的改革不要不改革的危机》2012年2月23日见报

宁要“不完美”的改革 不要不改革的危机。
无论方案多么周密、智慧多么高超，改革总会引起一些非议：既得利益者会用优势话语权阻碍改革，媒体公众会带着挑剔目光审视改革，一些人甚至还会以乌托邦思维苛求改革。对于改革者来说，认真听取民意，又不为流言所动，既需要智慧和审慎，更要有勇气与担当

自1978年至今，中国的改革已如舟至中流，有了更开阔的行进空间，也面临着“中流击水、浪遏飞舟”的挑战。

发展起来的问题、公平正义的焦虑、路径锁定的忧叹……在*南方谈话20周年、党的十八大即将召开之际，人们对改革的普遍关切，标注着30多年来以开放为先导的改革进入了新的历史方位。

冲破思想藩篱、触动现实利益，改革从一开始就挑战着既定格局，也无可避免地伴随着“不同声音”。无论当年的联产承包、物价闯关、工资闯关，还是今天的官员财产公示、垄断行业改革、事业单位改革，改革总是在争议乃至非议中前行。

所不同的是，从“摸着石头过河”到“改革顶层设计”，从经济领域到社会政治领域，改革越是向前推进，所触及的矛盾就越深，涉及的利益就越复杂，碰到的阻力也就越大。用一句通俗的话来讲，容易的都改得差不多了，剩下的全是难啃的“硬骨头”，不能回避也无法回避。

改革就会招惹是非，改革就是“自找麻烦”，改革也很难十全十美。30多年后，身处深水区和攻坚期，无论方案多么周密、智慧多么高超，改革总会引起一些非议：既得利益者会用优势话语权阻碍改革，媒体公众会带着挑剔目光审视改革，一些人甚至还会以乌托邦思维苛求改革。对于改革者来说，认真听取民意，又不为流言所动，既需要智慧和审慎，更要有勇气与担当。

在改革进程中，可怕的不是反对声音的出现，而是一出现不同声音，改革就戛然而止。现实中，或是囿于既得利益的阻力，或是担心不可掌控的风险，或是陷入“不稳定幻象”，在一些人那里，改革的“渐进”逐渐退化为“不进”，“积极稳妥”往往变成了“稳妥”有余而“积极”不足。这些年来，一些地方改革久议不决，一些部门改革决而难行，一些领域改革行而难破，莫不与此有关。

然而，“改革有风险，但不改革党就会有危险”。纵观世界一些大党大国的衰落，一个根本原因就是只有修修补补的机巧，没有大刀阔斧的魄力，最终因改革停滞而走入死胡同。对于当前各地各部门千头万绪的改革来说，面对“躲不开、绕不过”的体制机制障碍，如果怕这怕那、趑趄不前，抱着“多一事不如少一事”的消极态度，甚至将问题矛盾击鼓传花，固然可以求得一时轻松、周全某些利益，但只能把问题拖延成历史问题，让危机跑在了改革前面，最终引发更多矛盾、酿成更大危机，甚至落入所谓“转型期陷阱”。

小平同志在20多年前就曾告诫：“不要怕冒一点风险。我们已经形成了一种能力，承担风险的能力”，“改革开放越前进，承担和抵抗风险的能力就越强。我们处理问题，要完全没有风险不可能，冒点风险不怕”。事实上，从改革开放之初的崩溃边缘，到南方谈话前的历史徘徊，我们党正是着眼于国家和人民的未来，以“天变不足畏，祖宗不足法，人言不足恤”的改革精神，敢于抓住主要矛盾、勇于直面风险考验，才能化危为机，推动改革开放巨轮劈波斩浪，让中国成为了世界第二大经济体。

宁要微词，不要危机；宁要“不完美”的改革，不要不改革的危机。一个长期执政的大党，尤其要时刻警惕短期行为损害执政根基，防止局部利益左右发展方向，力避消极懈怠延误改革时机，所思所虑不独是当前社会的发展稳定，更有党和国家事业的长治久安。面对全新的改革历史方位，当以“不畏浮云遮望眼”的宽广视野，以无私无畏的责任担当，按照hujintao***所要求的，“不失时机地推进重要领域和关键环节改革”，“继续推进经济体制、政治体制、文化体制、社会体制改革创新”。如此，我们就一定能把风险化解在当下，让发展乘势而上，为党和国家赢得一个光明的未来。

我提取的关键词：

改革 14.7982
风险 3.59819
危机 2.39819
历史 1.99819
发展 1.93819

凤凰网提取的关键词：

改革
风险
危机
问题

12.最后我实事求是地宣告“基于语义”的关键词提取是失败的。

还是《让雷锋精神代代相传》这篇文章，按照score.cpp中的算法提取前20个关键词如下：

雷锋 9.35777
精神 6.4173
社会主义 3.72577
共产主义 3.28577
推动 3.01777
活动 2.61265
道德 2.43777
体系 2.31777
价值 2.31777
坚定 2.29777
内涵 2.19777
建设 2.17265
具有 2.05777
人民 1.99777
重要 1.97777
开展 1.91265
中国 1.87777
核心 1.85265
弘扬 1.81777
全国 1.67777

如果把“语义”的权值调整为0,即把score.cpp修改两行：

const float vdw = 0.0;

const float tw = 1.0;

这样得到的前20个关键词是：

雷锋 22.22
精神 16.02
社会主义 8.14
共产主义 7.04
活动 6.52
推动 6.37
建设 5.42
道德 4.92
开展 4.77
体系 4.62
核心 4.62
价值 4.62
坚定 4.57
内涵 4.32
具有 3.97
人民 3.82
重要 3.77
中国 3.52
弘扬 3.37
全国 3.02

可见“语义”在关键词提取中实际上没有发挥作用！

posted @ 2012-03-04 16:08 张朝阳阅读(33487) 评论(10) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

高性能golang

Technologies come and technologies go, but insight is forever.

文章关键词提取算法

文本预处理部分

算法部分

公告