Fork me on GitHub

AC自动机过滤敏感词

<?php
class AcAutomation { private $root; public function __construct($keywords = array()) { $this->root = $this->createNode(); foreach ($keywords as $keyword) { $this->addKeyword($keyword); } $this->buildFailIndex(); } /** * 创建节点 * @param string $value * @return stdClass */ private function createNode($value = "") { $node = new stdClass(); $node->value = $value; $node->next = array(); $node->fail = NULL; $node->len = 0; // keyword长度,0表示非末尾,非0表示keyword的长度 return $node; } /** * 添加关键词 * @param $keyword */ private function addKeyword($keyword) { $keyword = trim($keyword); if (!$keyword) { return; } $cur = $this->root; $matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($keyword))); for ($i = 1; isset($matches[$i]); $i++) { $v = $matches[$i]; if (!isset($cur->next[$v])) { $node = $this->createNode($v); $cur->next[$v] = $node; } if (!isset($matches[$i+1])) { $cur->next[$v]->len = $i; } $cur = $cur->next[$v]; } } /** * 构造fail指针 */ private function buildFailIndex() { $queue = array(); foreach ($this->root->next as $node) { $node->fail = $this->root; $queue[] = $node; } while ($queue) { $node = array_shift($queue); foreach ($node->next as $child_node) { $val = $child_node->value; $p = $node->fail; while ($p != NULL) { if (isset($p->next[$val])) { $child_node->fail = $p->next[$val]; break; } $p = $p->fail; } if ($p === NULL) { $child_node->fail = $this->root; } $queue[] = $child_node; } } } /** * 搜索 * @param $content * @return array */ public function search($content) { $pos_list = array(); $p = $this->root; $matches = unpack('N*',iconv('UTF-8', 'UCS-4', strtolower($content))); for ($i = 1; isset($matches[$i]); $i++) { $val = $matches[$i]; while (!isset($p->next[$val]) && $p != $this->root) { $p = $p->fail; } $p = isset($p->next[$val]) ? $p->next[$val] : $this->root; $temp = $p; while ($temp != $this->root) { if ($temp->len) { $pos_list[] = array($i - $temp->len, $i); } $temp = $temp->fail; } } return $pos_list; } }
$keywords = array('复杂度','个数','返回','代表','性能','数据','GET','安全性','耗时','优化','文档','并发','类型');
$ac = new AcAutomation($keywords);
$res = $ac->search($content);
foreach ($res as $pos) {
    echo mb_substr($content,$pos[0],$pos[1] - $pos[0],'UTF-8'),PHP_EOL;
}
posted @ 2022-09-03 11:45  v_nice  阅读(66)  评论(0)    收藏  举报
1