关于我们┊AboutMe

昵称:Hopol(■童■)

联系:QQ:18883

邮箱:info(at)18883.com

主页:www.18883.com

手机浏览 日志归档 RSS 2.0 订阅
Register | Login

一个用PHP写的中文分词函数

<?php
/**
 * Dictionary-based word segmenter for text in a double-byte encoding.
 *
 * Every byte with a value >= 129 is treated as the first byte of a two-byte
 * character (NOTE(review): this matches GBK/GB2312-style input; the class is
 * byte-oriented and is not suitable for UTF-8 — confirm against callers).
 * Runs of bytes below 129 are kept together as a single "ASCII run".
 *
 * Segmentation walks each line from right to left and, at every double-byte
 * character, prefers the longest dictionary word (2 to 4 characters) that
 * ends at that character. Output tokens are separated by single spaces.
 */
class Segmentation {
    /** Behaviour switches; see setLowercase() and setSegmentEnglish(). */
    var $options = array('lowercase' => TRUE,
                         'segment_english' => FALSE);

    /** Dictionary name taken from the header line of the loaded file. */
    var $dict_name = 'Unknown';

    /** Dictionary words stored as array keys (word => 1) for fast lookup. */
    var $dict_words = array();

    /**
     * Enables or disables lower-casing of ASCII runs in the output.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always TRUE.
     */
    function setLowercase($value) {
        $this->options['lowercase'] = $value ? TRUE : FALSE;
        return TRUE;
    }

    /**
     * Enables or disables splitting of ASCII runs around punctuation.
     *
     * @param mixed $value Any truthy value enables the option.
     * @return bool Always TRUE.
     */
    function setSegmentEnglish($value) {
        $this->options['segment_english'] = $value ? TRUE : FALSE;
        return TRUE;
    }

    /**
     * Loads a dictionary file and merges its words into the current set.
     *
     * The first line is a header of the form "TYPE" or "TYPE<TAB>NAME";
     * only the type 'DICT_WORD_W' is accepted. Every following line holds
     * one word. The dictionary name is updated even when the type is
     * rejected.
     *
     * @param string $dict_file Path of the dictionary file.
     * @return bool TRUE on success; FALSE if the file is missing, cannot be
     *              opened, is empty, or has an unsupported type.
     */
    function load($dict_file) {
        if (!file_exists($dict_file)) {
            return FALSE;
        }
        $fp = fopen($dict_file, 'r');
        if ($fp === FALSE) {
            return FALSE;
        }

        $header = fgets($fp, 1024);
        if ($header === FALSE) {
            fclose($fp);
            return FALSE;
        }

        // Trim before looking for the tab so that a header ending in a bare
        // tab is treated as "type only" instead of yielding a missing name.
        $header = trim($header);
        if (strpos($header, "\t") !== FALSE) {
            list($dict_type, $dict_name) = explode("\t", $header);
        } else {
            $dict_type = $header;
            $dict_name = 'Unknown';
        }
        $this->dict_name = $dict_name;

        if ($dict_type !== 'DICT_WORD_W') {
            fclose($fp);
            return FALSE;
        }

        // Read until fgets() itself reports end-of-file; testing feof()
        // first would run one extra iteration and register an empty word.
        while (($line = fgets($fp, 32)) !== FALSE) {
            $word = rtrim($line);
            if ($word !== '') {
                $this->dict_words[$word] = 1;
            }
        }
        fclose($fp);
        return TRUE;
    }

    /**
     * Returns the name of the most recently loaded dictionary.
     *
     * @return string 'Unknown' when no name has been read.
     */
    function getDictName() {
        return $this->dict_name;
    }

    /**
     * Segments a string, handling each "\n"-separated line independently.
     *
     * @param string $str Text to segment.
     * @return string|bool Segmented text, or FALSE when no dictionary words
     *                     are loaded.
     */
    function segmentString($str) {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        return $this->_segmentLines(explode("\n", $str));
    }

    /**
     * Segments the contents of a file line by line.
     *
     * @param string $filename Path of the file to read.
     * @return string|bool Segmented text, or FALSE when no dictionary words
     *                     are loaded or the file cannot be read.
     */
    function segmentFile($filename) {
        if (count($this->dict_words) === 0) {
            return FALSE;
        }
        $lines = file($filename);
        if ($lines === FALSE) {
            return FALSE;
        }
        return $this->_segmentLines($lines);
    }

    /**
     * Segments every line and joins the results.
     *
     * Each segmented line is followed by " \n"; afterwards every run of
     * consecutive spaces is collapsed to a single space.
     *
     * @param array $lines Lines of text; trailing whitespace is stripped.
     * @return string
     */
    function _segmentLines($lines) {
        $contents_segmented = '';
        foreach ($lines as $line) {
            $contents_segmented .= $this->_segmentLine(rtrim($line)) . " \n";
        }
        return preg_replace('/ {2,}/', ' ', $contents_segmented);
    }

    /**
     * Segments a single line.
     *
     * @param string $str One line of text without its line terminator.
     * @return string The tokens, each preceded by a single space.
     */
    function _segmentLine($str) {
        $str_final = '';
        $str_array = array();
        $str_length = strlen($str);

        // Pad the string so that reading the second byte of a trailing
        // double-byte character can never run past the end. The padding is
        // not counted in $str_length and is therefore never tokenised.
        if ($str_length > 0 && ord($str[$str_length - 1]) >= 129) {
            $str .= ' ';
        }

        // Pass 1: split into units. A double-byte character is stored as a
        // string, an ASCII run as a one-element array so that the two kinds
        // can be told apart in pass 2.
        for ($i = 0; $i < $str_length; $i++) {
            if (ord($str[$i]) >= 129) {
                $str_array[] = $str[$i] . $str[$i + 1];
                $i++;
            } else {
                $str_tmp = $str[$i];
                for ($j = $i + 1; $j < $str_length; $j++) {
                    if (ord($str[$j]) >= 129) {
                        break;
                    }
                    $str_tmp .= $str[$j];
                }
                $str_array[] = array($str_tmp);
                $i = $j - 1;
            }
        }

        // Character class of the ASCII punctuation marks plus tab and form
        // feed, used when segment_english is enabled.
        $punct = '[!"#$%&\'()*+,\\-.\\/:;<=>?@\\[\\\\\\]^_`{|}~\\t\\f]';

        // Pass 2: walk the units from right to left.
        $pos = count($str_array);
        while ($pos > 0) {
            $char = $str_array[$pos - 1];

            if (is_array($char)) {
                $str_final_tmp = $char[0];
                if ($this->options['segment_english']) {
                    // Surround punctuation runs with spaces, then separate
                    // adjacent punctuation marks from each other.
                    $str_final_tmp = preg_replace('/(' . $punct . '+)/', ' $1 ', $str_final_tmp);
                    $str_final_tmp = preg_replace('/(' . $punct . ')(' . $punct . ')/', ' $1 $2 ', $str_final_tmp);
                }
                if ($this->options['lowercase']) {
                    $str_final_tmp = strtolower($str_final_tmp);
                }
                $str_final = ' ' . $str_final_tmp . $str_final;
                $pos--;
                continue;
            }

            // Build the candidates of 1 to 4 characters that end at the
            // current position. Stop at an ASCII run: it cannot be part of
            // a dictionary word and must not be concatenated as a string.
            $max_length = ($pos < 4) ? $pos : 4;
            $word_array = array(0 => '');
            $built = 0;
            for ($i = 1; $i <= $max_length; $i++) {
                $unit = $str_array[$pos - $i];
                if (is_array($unit)) {
                    break;
                }
                $word_array[$i] = $unit . $word_array[$i - 1];
                $built = $i;
            }

            // Longest match first; a single character needs no lookup.
            $word_found = 0;
            for ($i = $built; $i > 1; $i--) {
                if (array_key_exists($word_array[$i], $this->dict_words)) {
                    $word_found = $i;
                    break;
                }
            }

            if ($word_found) {
                $str_final = ' ' . $word_array[$word_found] . $str_final;
                $pos = $pos - $word_found;
            } else {
                $str_final = ' ' . $char . $str_final;
                $pos--;
            }
        }
        return $str_final;
    }
}
?>

Tags: php, 中文分词, 函数

« 上一篇 | 下一篇 »

只显示10条记录相关文章

dede5.1 UTF8商业版 (浏览: 231, 评论: 1)
dede5.1 BGK商业版 (浏览: 255, 评论: 0)
MyPic图片管理系统 (浏览: 161, 评论: 0)
PHP 美女写真 Cms (浏览: 207, 评论: 0)
Google translator API ( PHP 5 class ) (浏览: 192, 评论: 0)
MyPic v1.0 (MyPic图片管理系统) (浏览: 288, 评论: 0)
PHP在图片上写上中文[备份] (浏览: 203, 评论: 0)
DEDECMS 模板收藏 【极限资讯网】 (浏览: 494, 评论: 0)
DEDECMS 模板收藏 【CK女性站】 (浏览: 493, 评论: 0)
DEDECMS 模板收藏 【淡蓝色完整风格V1.1】 (浏览: 499, 评论: 0)

发表评论