结构:首字散列表、trie索引树结点
优点:分词中,不需预知待查询词的长度,沿树链逐字匹配。
缺点:构造和维护比较复杂,单词树枝多,浪费了一定的空间
* @version 0.1
* @todo 构造通用的字典算法,并写了一个简易的分词
* @author shjuto@gmail.com
* trie字典树
*
*/
代码如下 |
复制代码 |
class trie
{
private $trie;
function __construct()
{
$trie = array('children' => array(),'isword'=>false);
}
/**
* 把词加入词典
*
* @param string $key
*/
function &setword($word='')
{
$trienode = &$this->trie;
for($i = 0;$i < strlen($word);$i++)
{
$character = $word[$i];
if(!isset($trienode['children'][$character]))
{
$trienode['children'][$character] = array('isword'=>false);
}
if($i == strlen($word)-1)
{
$trienode['children'][$character] = array('isword'=>true);
}
$trienode = &$trienode['children'][$character];
}
}
/**
* 判断是否为词典词
*
* @param string $word
* @return bool true/false
*/
function & isword($word)
{
$trienode = &$this->trie;
for($i = 0;$i < strlen($word);$i++)
{
$character = $word[$i];
if(!isset($trienode['children'][$character]))
& |