PHP 搜索分词实现代码

<?php 
/**
 * Character trie used for dictionary-based word segmentation.
 *
 * Every node stores one UTF-8 character ($w), its child nodes ($subT) and a
 * flag ($isEnd) that is true when the path from the root down to this node
 * spells a complete dictionary word. The root node keeps an empty $w.
 *
 * @author: xiaojiang 2014-01-08
 * */
class Tree{

    /** Character held by this node ('' for the root). */
    public $w = '';
    /** Child nodes, in insertion order. */
    public $subT = array();
    /** True when a dictionary word ends at this node. */
    public $isEnd = false;

    /**
     * @param string $w     character stored in the node; leave empty for a root
     * @param bool   $isEnd whether a word ends here (ignored when $w is empty)
     */
    public function __construct($w= '' , $isEnd = false){
        // empty('0') is true in PHP, so the digit "0" must be accepted explicitly;
        // otherwise its node would be stored with an empty $w and never be found.
        if(!empty($w) || $w === '0'){
            $this->w = $w;
            $this->isEnd = $isEnd;
        }
    }

    /**
     * Adds a word to the trie, one UTF-8 character per level.
     *
     * @param string $str word to insert; an empty string is ignored
     */
    public function insert( $str ){

        $len = strlen($str);
        if(!$len) return ;
        $scope = $this;
        for( $i = 0; $i < $len; $i += $width ){
            // Width comes from the lead byte, so 2-, 3- and 4-byte sequences
            // are all kept together as one character.
            $width = self::charWidth( $str[$i] );
            $scope = $scope->insertNode( substr($str, $i, $width) );
        }
        $scope->isEnd = true;
    }

    /**
     * Returns the child holding $w, creating it first when it does not exist.
     *
     * @param string $w a single character
     * @return Tree
     */
    private function insertNode(  $w ){
        $t = $this->hasTree( $w );
        if( $t === false ){
            $t =  new Tree( $w );
            $this->subT[] = $t;
        }
        return $t;
    }

    /**
     * Looks up the direct child that stores the given character.
     *
     * @param string $w a single character (compared strictly, so pass a string)
     * @return Tree|false the child node, or false when there is none
     */
    public function &hasTree($w){
        // A return-by-reference function must return a variable, not a literal.
        $found = false;
        foreach ($this->subT as $t){
            if($t->w === $w){
                $found = $t;
                break;
            }
        }
        return $found;
    }

    /**
     * Byte length of the UTF-8 sequence introduced by the given lead byte.
     * Stray continuation bytes and invalid lead bytes count as one byte.
     *
     * @param string $byte one byte
     * @return int 1 to 4
     */
    private static function charWidth( $byte ){
        $code = ord($byte);
        if( $code < 0xC0 ) return 1;
        if( $code < 0xE0 ) return 2;
        if( $code < 0xF0 ) return 3;
        if( $code < 0xF8 ) return 4;
        return 1;
    }

}


/**
 * Splits a UTF-8 string into characters so they can be read by position.
 */
class myStr{

    /** The original string as passed to the constructor. */
    private $str = '';
    /** The string's characters, one UTF-8 sequence per element. */
    private $arr = array();
    /** Number of characters in $arr. */
    private $len = 0;

    /**
     * @param string $str UTF-8 text to split into characters
     */
    public function __construct( $str){
        $this->str = $str;
        $len = strlen($str);
        for ($i = 0; $i < $len; $i += $width ){
            // Width comes from the lead byte, so 2-, 3- and 4-byte sequences
            // are all kept together as one character.
            $width = self::charWidth( $str[$i] );
            $this->arr[] = substr($str, $i, $width);
        }
        $this->len = count($this->arr);
    }

    /**
     * Returns the character at the given position.
     *
     * @param int $idx zero-based character index
     * @return string|null the character, or null when $idx is out of range
     */
    public function getIndex( $idx ){
        // isset() avoids the undefined-offset notice on out-of-range reads.
        return isset($this->arr[$idx]) ? $this->arr[$idx] : null;
    }

    /**
     * @return int number of characters (not bytes) in the string
     */
    public function getLength(){
        return $this->len;
    }

    /**
     * Byte length of the UTF-8 sequence introduced by the given lead byte.
     * Stray continuation bytes and invalid lead bytes count as one byte.
     *
     * @param string $byte one byte
     * @return int 1 to 4
     */
    private static function charWidth( $byte ){
        $code = ord($byte);
        if( $code < 0xC0 ) return 1;
        if( $code < 0xE0 ) return 2;
        if( $code < 0xF0 ) return 3;
        if( $code < 0xF8 ) return 4;
        return 1;
    }
}

// Build the dictionary trie.
$tIns = new Tree();
$tIns->insert('中华');
$tIns->insert('人民');
$tIns->insert('共和国');
$tIns->insert('baidu');

// Text to segment, split into characters.
$strIns = new myStr("cc中华的人民共和国和中国啊啊www.baidua.com");
$total = $strIns->getLength();

// Try to match a dictionary word starting at every character position.
for ($i = 0; $i < $total; $i++ ){

    $stIns = $tIns->hasTree( $strIns->getIndex($i) );
    if(!$stIns) continue;

    $sw = '';     // characters matched so far along the trie path
    $word = '';   // longest complete dictionary word seen on that path
    $j = $i;
    while ( $stIns ){
        $sw .= $stIns->w;
        if( $stIns->isEnd )
            $word = $sw;
        // Stop at the end of the text instead of reading past the last character.
        if( ++$j >= $total ) break;
        $stIns = $stIns->hasTree( $strIns->getIndex( $j ) );
    }
    // Compare against '' so that a matched word such as "0" is still printed.
    if($word !== '')
        echo $word."<br>";
}


?>

 输出:

中华
人民
共和国
baidu
posted @ 2014-01-08 00:41  ﹏Sakura  阅读(6484)  评论(0)  编辑  收藏  举报