Stanford NLP语义分析

环境准备

Eclipse或者IDEA,JDK1.8,Apache Maven(注意,Stanford CoreNLP 3.5及以后的版本都需要Java8环境才能运行,如果不想在Java8运行的话,请使用以前的版本)。
建立好一个新的Maven工程,在pom文件中加入如下代码:

<!-- Single place to pin the CoreNLP version; all three artifacts below reference it. -->
<properties>  
    <corenlp.version>3.6.0</corenlp.version>  
</properties>  
  
<dependencies>  
    <!-- Main stanford-corenlp artifact (no classifier): the library code itself. -->
    <dependency>  
        <groupId>edu.stanford.nlp</groupId>  
        <artifactId>stanford-corenlp</artifactId>  
        <version>${corenlp.version}</version>  
    </dependency>  
  
    <!-- Same artifact with the "models" classifier: the default (English) model files. -->
    <dependency>  
        <groupId>edu.stanford.nlp</groupId>  
        <artifactId>stanford-corenlp</artifactId>  
        <version>${corenlp.version}</version>  
        <classifier>models</classifier>  
    </dependency>  
  
    <!-- Same artifact with the "models-chinese" classifier: the Chinese model files. -->
    <dependency>  
        <groupId>edu.stanford.nlp</groupId>  
        <artifactId>stanford-corenlp</artifactId>  
        <version>${corenlp.version}</version>  
        <classifier>models-chinese</classifier>  
    </dependency>  
</dependencies>  

三个依赖包分别是CoreNLP的算法包、英文语料包、中文语料包。由于Maven默认镜像在国外,而Stanford NLP的模型文件很大,因此对网络要求比较高,网速慢的话一不小心就会超时(time out)导致下载失败。解决方法是找一个包含Stanford NLP依赖库的国内镜像,修改Maven的settings.xml中的mirror配置。

英文文本的处理

英文的处理官网也给出了示例代码,我这里只做一下整合,代码如下:

https://stanfordnlp.github.io/CoreNLP/api.html

package edu.zju.cst.krselee.examples.english;  
  
import edu.stanford.nlp.dcoref.CorefChain;  
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;  
import edu.stanford.nlp.ling.CoreAnnotations;  
import edu.stanford.nlp.ling.CoreLabel;  
import edu.stanford.nlp.pipeline.Annotation;  
import edu.stanford.nlp.pipeline.StanfordCoreNLP;  
import edu.stanford.nlp.semgraph.SemanticGraph;  
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;  
import edu.stanford.nlp.trees.Tree;  
import edu.stanford.nlp.trees.TreeCoreAnnotations;  
import edu.stanford.nlp.util.CoreMap;  
  
import java.util.List;  
import java.util.Map;  
import java.util.Properties;  
  
/**
 * Minimal end-to-end demo of the Stanford CoreNLP pipeline on English text.
 *
 * <p>Builds a pipeline with the annotators
 * {@code tokenize, ssplit, pos, lemma, ner, parse, dcoref}, runs it on a
 * hard-coded sample sentence and prints, per sentence, the tokens (with POS
 * and NER tags), the parse tree and the dependency graph, followed by the
 * coreference chains of the whole document.
 *
 * Created by KrseLee on 2016/11/5.
 */
public class StanfordEnglishNlpExample {

    /**
     * Entry point: runs the demo once.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        StanfordEnglishNlpExample example = new StanfordEnglishNlpExample();
        example.runAllAnnotators();
    }

    /**
     * Creates a StanfordCoreNLP pipeline with POS tagging, lemmatization, NER,
     * parsing and coreference resolution, annotates a sample text and prints
     * the results via {@link #parserOutput(Annotation)}.
     */
    public void runAllAnnotators() {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // read some text in the text variable
        String text = "this is a simple text"; // Add your text here!

        // create an empty Annotation just with the given text, then run all annotators on it
        Annotation document = new Annotation(text);
        pipeline.annotate(document);

        parserOutput(document);
    }

    /**
     * Prints the annotations stored in {@code document}.
     *
     * <p>Every lookup is null-checked: an annotation is simply absent when the
     * annotator producing it was not part of the pipeline, and this method is
     * public, so it may be handed a document annotated by a different pipeline.
     * Missing sections are skipped instead of failing with a
     * {@code NullPointerException}.
     *
     * @param document an annotated document; must not be {@code null}
     */
    public void parserOutput(Annotation document) {
        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences != null) {
            for (CoreMap sentence : sentences) {
                printTokens(sentence);

                // this is the parse tree of the current sentence
                Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
                if (tree != null) {
                    System.out.println("语法树:");
                    System.out.println(tree.toString());
                }

                // this is the Stanford dependency graph of the current sentence
                SemanticGraph dependencies = sentence.get(
                        SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
                if (dependencies != null) {
                    System.out.println("依存句法:");
                    System.out.println(dependencies.toString());
                }
            }
        }

        // This is the coreference link graph
        // Each chain stores a set of mentions that link to each other,
        // along with a method for getting the most representative mention
        // Both sentence and token offsets start at 1!
        Map<Integer, CorefChain> graph =
                document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        if (graph != null && !graph.isEmpty()) {
            System.out.println("指代消解:");
            for (CorefChain chain : graph.values()) {
                System.out.println(chain);
            }
        }
    }

    /**
     * Prints one line per token of {@code sentence}: text, POS tag and NER
     * label separated by tabs (same format as the Chinese example).
     */
    private static void printTokens(CoreMap sentence) {
        // a CoreLabel is a CoreMap with additional token-specific methods
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (tokens == null) {
            return;
        }
        for (CoreLabel token : tokens) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            System.out.println(word + "\t" + pos + "\t" + ne);
        }
    }
}

值得注意的是,Stanford NLP采用的是pipeline的方式,给用户一个参数的设置接口,之后的过程全都被封装好了,使用起来非常方便。所有的返回结果都保存在一个Annotation对象中,根据需要去获取。The Stanford CoreNLP Natural Language Processing Toolkit (http://nlp.stanford.edu/pubs/StanfordCoreNlp2014.pdf)一文中对pipeline方式做了详细的介绍,这里就不多说了。需要提到的一点是,参数中后面的annotator往往依赖于前面的annotator(直观地讲,就是词性标注pos依赖于分词tokenize,语法分析parse依赖于词性标注,等等)。

中文文本的处理

相对于英文来说,中文文本的处理稍微麻烦一点,主要的地方在于一个配置文件。中文语料模型包中有一个默认的配置文件StanfordCoreNLP-chinese.properties,在引入的jar中可以找到。

主要是指定相应pipeline的操作步骤以及对应的语料文件的位置。实际使用中我们可能用不到所有的步骤,或者要使用不同的语料库,因此可以自定义配置文件,再引入代码中。

主要的Java程序代码如下:

package edu.zju.cst.krselee.examples.chinese;  
  
import edu.stanford.nlp.dcoref.CorefChain;  
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;  
import edu.stanford.nlp.ling.CoreAnnotations;  
import edu.stanford.nlp.ling.CoreLabel;  
import edu.stanford.nlp.pipeline.Annotation;  
import edu.stanford.nlp.pipeline.StanfordCoreNLP;  
import edu.stanford.nlp.semgraph.SemanticGraph;  
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;  
import edu.stanford.nlp.trees.Tree;  
import edu.stanford.nlp.trees.TreeCoreAnnotations;  
import edu.stanford.nlp.util.CoreMap;  
import edu.stanford.nlp.util.PropertiesUtils;  
import edu.zju.cst.krselee.examples.english.StanfordEnglishNlpExample;  
  
import java.util.List;  
import java.util.Map;  
import java.util.Properties;  
  
/**
 * Minimal end-to-end demo of the Stanford CoreNLP pipeline on Chinese text.
 *
 * <p>The pipeline is configured from the {@code StanfordCoreNLP-chinese.properties}
 * resource rather than from an in-code annotator list; a hard-coded sample text is
 * annotated and the tokens (with POS and NER tags), parse tree and dependency graph
 * of each sentence are printed, followed by any coreference chains that are present.
 *
 * Created by KrseLee on 2016/11/4.
 */
public class StanfordChineseNlpExample {

    /**
     * Entry point: runs the demo once.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        StanfordChineseNlpExample example = new StanfordChineseNlpExample();
        example.runChineseAnnotators();
    }

    /**
     * Annotates a sample Chinese text with the pipeline described by
     * {@code StanfordCoreNLP-chinese.properties} and prints the results via
     * {@link #parserOutput(Annotation)}.
     */
    public void runChineseAnnotators() {
        String text = "克林顿说,华盛顿将逐步落实对韩国的经济援助。"
                + "金大中对克林顿的讲话报以掌声:克林顿总统在会谈中重申,他坚定地支持韩国摆脱经济危机。";
        Annotation document = new Annotation(text);
        StanfordCoreNLP corenlp = new StanfordCoreNLP("StanfordCoreNLP-chinese.properties");
        corenlp.annotate(document);
        parserOutput(document);
    }

    /**
     * Prints the annotations stored in {@code document}.
     *
     * <p>Which annotations exist depends on the annotators enabled in the loaded
     * properties file, which is not visible from this class. Every lookup is
     * therefore null-checked and missing sections are skipped instead of failing
     * with a {@code NullPointerException}.
     *
     * @param document an annotated document; must not be {@code null}
     */
    public void parserOutput(Annotation document) {
        // these are all the sentences in this document
        // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences != null) {
            for (CoreMap sentence : sentences) {
                printTokens(sentence);

                // this is the parse tree of the current sentence
                Tree tree = sentence.get(TreeCoreAnnotations.TreeAnnotation.class);
                if (tree != null) {
                    System.out.println("语法树:");
                    System.out.println(tree.toString());
                }

                // this is the Stanford dependency graph of the current sentence
                SemanticGraph dependencies = sentence.get(
                        SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class);
                if (dependencies != null) {
                    System.out.println("依存句法:");
                    System.out.println(dependencies.toString());
                }
            }
        }

        // This is the coreference link graph
        // Each chain stores a set of mentions that link to each other,
        // along with a method for getting the most representative mention
        // Both sentence and token offsets start at 1!
        // NOTE(review): presumably absent unless the properties file enables a
        // coreference annotator that writes this key - confirm against the config.
        Map<Integer, CorefChain> graph =
                document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        if (graph != null && !graph.isEmpty()) {
            System.out.println("指代消解:");
            for (CorefChain chain : graph.values()) {
                System.out.println(chain);
            }
        }
    }

    /**
     * Prints one line per token of {@code sentence}: text, POS tag and NER
     * label separated by tabs.
     */
    private static void printTokens(CoreMap sentence) {
        // a CoreLabel is a CoreMap with additional token-specific methods
        List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        if (tokens == null) {
            return;
        }
        for (CoreLabel token : tokens) {
            // this is the text of the token
            String word = token.get(CoreAnnotations.TextAnnotation.class);
            // this is the POS tag of the token
            String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
            // this is the NER label of the token
            String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);

            System.out.println(word + "\t" + pos + "\t" + ne);
        }
    }
}

 

posted @ 2017-09-25 15:15  wbinbin  阅读(2350)  评论(0)    收藏  举报