综合版本的中科院分词器加斯坦福依存分析树

主要要做3件事：

第一件事：下载ICTCLAS50_Windows_64_JNI或者ICTCLAS50_Windows_32_JNI，并按照压缩包自带的系统文档进行调试（解压后DOC目录下有这个文档）。

第二件事下载stanford-parser（也就是斯坦福的依存分析树）越新越好，笔者用的是2012-07-09版本的。

最后一件事就是代码调试,代码如下:

import java.io.*;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreePrint;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;

import ICTCLAS.I3S.AC.ICTCLAS50;


/**
 * Demo that chains the ICTCLAS50 Chinese word segmenter with the Stanford
 * lexicalized parser: every line of an input file is segmented by ICTCLAS and
 * the resulting tokens are parsed by the Stanford parser, whose typed
 * dependencies are printed to standard output.
 */
public class ParserTest {

	/** Charset used for every byte[] exchanged with the ICTCLAS JNI layer. */
	private static final String ICTCLAS_CHARSET = "GB2312";

	/**
	 * Entry point: processes the questions stored in {@code question2.txt}.
	 *
	 * @param arg command-line arguments (unused)
	 */
	public static void main( String arg[] )
	{
		String filename = "question2.txt";
		getSentence(filename);
	}

	/**
	 * Reads {@code inFile} line by line, segments each line with
	 * {@link #segment(String)}, parses the tokens with the Stanford parser and
	 * prints the typed dependencies of every sentence.
	 *
	 * <p>Lines that are blank, or that the segmenter fails on, are reported on
	 * standard error and skipped instead of aborting the whole run.
	 *
	 * @param inFile path of the text file holding one sentence per line
	 */
	private static void getSentence(String inFile)
	{
		File file = new File(inFile);
		BufferedReader reader = null;
		try {
			System.out.println("以行为单位读取文件内容，一次读一整行：");
			// NOTE(review): FileReader decodes with the platform default charset;
			// confirm that this matches the encoding of the input file.
			reader = new BufferedReader(new FileReader(file));

			// The model and the dependency factory do not depend on the current
			// line, so they are created once rather than once per sentence.
			LexicalizedParser lp = LexicalizedParser.loadModel("chinesePCFG.ser.gz");
			TreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
			GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();

			String tempString;
			int line = 1;
			// readLine() returns null at end of file.
			while ((tempString = reader.readLine()) != null) {
				int lineNo = line++;
				// Echo the line together with its line number.
				System.out.println("line " + lineNo + ": " + tempString);

				// Strip the spaces hidden in front of (or inside) the question.
				tempString = tempString.replace(" ", "");
				if (tempString.isEmpty()) {
					System.err.println("line " + lineNo + ": empty, skipped");
					continue;
				}

				// segment() prints the segmentation and returns null on failure.
				String seg = segment(tempString);
				if (seg == null || seg.trim().isEmpty()) {
					System.err.println("line " + lineNo + ": segmentation failed, skipped");
					continue;
				}

				// Stanford parse; splitting on runs of whitespace avoids feeding
				// empty tokens to the parser.
				String[] sent = seg.trim().split("\\s+");
				List<CoreLabel> rawWords = Sentence.toCoreLabelList(sent);
				Tree parse = lp.apply(rawWords);

				GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
				List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
				System.out.println("依存的总数是："+tdl.size());
				for (TypedDependency td : tdl) {
					System.out.println(td.gov());   // first word of the dependency (governor)
					System.out.println(td.dep());   // second word of the dependency (dependent)
					System.out.println(td.reln());  // the dependency relation
					System.out.println("*********************");
				}
				System.out.println(tdl);  // print all dependencies at once

				// Alternative output that also carries the Stanford POS tags:
				// new TreePrint("wordsAndTags,typedDependenciesCollapsed", tlp).printTree(parse);
				System.out.println("\n");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Single place where the reader is closed, on success and on failure.
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing the input fails.
				}
			}
		}
	}

	/**
	 * Segments one sentence with ICTCLAS50 and prints both the plain and the
	 * POS-tagged segmentation to standard output.
	 *
	 * <p>The segmenter is initialised from the current directory ("."), the
	 * user dictionary {@code userdict.txt} is imported before segmenting, and
	 * the segmenter is released again before returning, including when an
	 * exception occurs after a successful initialisation.
	 *
	 * @param sentence the text to segment
	 * @return the segmented text without POS tags as returned by ICTCLAS, or
	 *         {@code null} if initialisation fails or any exception is raised
	 */
	public static String segment(String sentence)
	{
		ICTCLAS50 ictclas = null;
		boolean initialized = false;
		try
		{
			ictclas = new ICTCLAS50();
			// Initialise the segmenter.
			if (!ictclas.ICTCLAS_Init(".".getBytes(ICTCLAS_CHARSET)))
			{
				System.out.println("Init Fail!");
				return null;
			}
			initialized = true;

			// Select the POS tag set (0 = ICT second-level, 1 = ICT first-level,
			// 2 = PKU second-level, 3 = PKU first-level).
			ictclas.ICTCLAS_SetPOSmap(2);

			// Import the user dictionary: first argument is the dictionary path,
			// second is the encoding type of the dictionary. The call returns the
			// number of imported words, which this demo does not need.
			byte[] userDictPath = "userdict.txt".getBytes(ICTCLAS_CHARSET);
			ictclas.ICTCLAS_ImportUserDictFile(userDictPath, 0);

			byte[] input = sentence.getBytes(ICTCLAS_CHARSET);
			// Segment after the dictionary import; last argument 0 = no POS tags.
			// NOTE(review): the middle argument 2 is presumably the GB2312 code
			// type; confirm against the ICTCLAS50 API documentation.
			byte[] nativeBytes = ictclas.ICTCLAS_ParagraphProcess(input, 2, 0);
			String seg = new String(nativeBytes, 0, nativeBytes.length, ICTCLAS_CHARSET);
			// Last argument 1 = segmentation with POS tags.
			nativeBytes = ictclas.ICTCLAS_ParagraphProcess(input, 2, 1);
			String segPos = new String(nativeBytes, 0, nativeBytes.length, ICTCLAS_CHARSET);

			System.out.println("*********************");
			System.out.println("中科院的分词结果为： " + seg);
			System.out.println("*********************");
			System.out.println("%%%%%%%%%%%%%%%%%%%%%%");
			System.out.println("中科院的分词结果为（带词性标注）： " + segPos);
			System.out.println("%%%%%%%%%%%%%%%%%%%%%%");

			// Save the user dictionary.
			ictclas.ICTCLAS_SaveTheUsrDic();
			return seg;
		}
		catch (Exception ex)
		{
			// Report the failure instead of hiding it; callers still get null.
			ex.printStackTrace();
			return null;
		}
		finally
		{
			// Release the segmenter resources whenever initialisation succeeded.
			if (initialized)
			{
				ictclas.ICTCLAS_Exit();
			}
		}
	}

}

　　注意2点：第一，这里采用读取文件输入流的方式对自然语言问句进行测试，代码中的question2.txt就是测试文件的名字，最好直接放在当前项目目录之下。

第二，检查关键的3个文件是否存在，即模型文件chineseFactored.ser.gz、chinesePCFG.ser.gz（只要放在当前项目目录下就可以了），

还有stanford-parser.jar是否已经加入项目的构建路径。

运行结果如下：你会发现“函数”的词性标注怎么会是PPA？没错，这就是计算所分词器的一大好处——用户字典。

这只是自然语言处理句法分析的一部分，以后可能会忘记，所以在博客园留下记忆的痕迹，同时也分享给大家！

posted on 2013-03-26 16:09 犀利小胖阅读(1831) 评论(0) 收藏举报

刷新页面返回顶部

犀利小胖

综合版本的中科院分词器加斯坦福依存分析树

导航

公告