/**
 * A candidate word (or a raw phrase) together with the shorter strings it
 * is made of.
 */
class Token
{
    /** Occurrence counter; Cloud increments it while building its keyword list and decrements it when pruning. */
    public $count = 0;

    /** The text of this token. */
    public $key = null;

    /** Map of sub-token string => number of times it was added. */
    private $subTokens = array();

    /** Type could be "object" or "category". */
    public $type = null;

    /**
     * Builds a token and registers its sub-tokens.
     *
     * A key longer than $MIN_LEN characters gets every substring that is one
     * character shorter than the key as sub-tokens; for keys longer than
     * $MAX_LEN the sub-token length is capped at $MAX_LEN.
     *
     * @param string $key     Token text (measured as UTF-8).
     * @param string $type    "object" or "category".
     * @param int    $MIN_LEN Keys of this length or shorter get no sub-tokens.
     * @param int    $MAX_LEN Upper bound for the sub-token length.
     * @return Token
     */
    public static function create($key, $type = "object", $MIN_LEN = 2, $MAX_LEN = 6)
    {
        $t = new Token;
        $t->key = $key;
        $t->type = $type;

        $len = mb_strlen($key, "UTF-8");
        if ($len > $MIN_LEN) {
            if ($len > $MAX_LEN) {
                $len = $MAX_LEN + 1;
            }
            // Only substrings of exactly ($len - 1) characters become sub-tokens.
            $strs = self::extractTokenStrs($key, $len - 1, $len - 1);
            foreach ($strs as $str) {
                $t->addSubToken($str);
            }
        }

        return $t;
    }

    /**
     * Registers a sub-token, counting repeated additions.
     *
     * @param string $key Sub-token text.
     */
    public function addSubToken($key)
    {
        if (!isset($this->subTokens[$key])) {
            $this->subTokens[$key] = 1;
        } else {
            $this->subTokens[$key]++;
        }
    }

    /**
     * @return array The distinct sub-token strings, in insertion order.
     */
    public function getSubTokens()
    {
        return array_keys($this->subTokens);
    }

    /**
     * Lists every substring of $string whose length is between $MIN_LEN and
     * $MAX_LEN characters, longest first and left to right within a length.
     * Substrings that occur more than once are listed more than once.
     *
     * @param string $string  Text to slice (treated as UTF-8).
     * @param int    $MIN_LEN Shortest substring length to emit.
     * @param int    $MAX_LEN Longest substring length to emit.
     * @return array List of substrings; empty when $MAX_LEN < $MIN_LEN.
     */
    public static function extractTokenStrs($string, $MIN_LEN = 2, $MAX_LEN = 6)
    {
        $tokenStrs = array();
        $len = mb_strlen($string, "UTF-8");

        for ($k = $MAX_LEN; $k >= $MIN_LEN; $k--) {
            for ($j = 0; $j <= $len - $MIN_LEN; $j++) {
                $subStr = mb_substr($string, $j, $k, "UTF-8");
                // A shorter result means the window ran past the end of the
                // string, so no later offset can yield a full window either.
                if (mb_strlen($subStr, "UTF-8") !== $k) {
                    break;
                }
                $tokenStrs[] = $subStr;
            }
        }

        return $tokenStrs;
    }
}

/**
 * Collects phrases ("categories") from raw text and extracts the longest
 * substrings that occur in at least a given number of distinct phrases.
 */
class Cloud
{
    /** Map of substring => Token, rebuilt by buildKeyWordsList(). */
    private $keywords = array();

    /** Map of phrase => Token for the phrases currently being analysed. */
    private $categories = array();

    /**
     * Counts one occurrence of a candidate keyword, creating it on first use.
     *
     * @param string $string Candidate keyword.
     * @return Token|false The token, or false when $string is empty.
     */
    private function addToken($string)
    {
        if (empty($string)) {
            return false;
        }
        if (!isset($this->keywords[$string])) {
            $this->keywords[$string] = Token::create($string, "object");
        }
        $this->keywords[$string]->count++;

        return $this->keywords[$string];
    }

    /**
     * Registers a phrase; repeated phrases are stored only once.
     *
     * @param string $string Phrase text.
     * @return Token|false The category token, or false when $string is empty.
     */
    private function addCategory($string)
    {
        if (empty($string)) {
            return false;
        }
        if (!isset($this->categories[$string])) {
            $this->categories[$string] = Token::create($string, "category");
        }

        return $this->categories[$string];
    }

    /** Forgets every registered phrase. */
    private function emptyCategories()
    {
        $this->categories = array();
    }

    /**
     * Splits raw text into phrases and registers those of two or more
     * characters. The text is cut at the listed Chinese particles, at runs
     * of punctuation/symbol/separator/control/mark characters and at runs of
     * ASCII letters and digits.
     *
     * @param string $string Raw UTF-8 text.
     */
    public function addTokens($string)
    {
        $strs = self::split($string, array("的","了","然后","[\pP\pS\pZ\pC\pM]+","[0-9a-zA-Z]+"));
        if (!is_array($strs)) {
            // preg_split() failed (for example on invalid UTF-8): nothing to add.
            return;
        }
        foreach ($strs as $str) {
            if (mb_strlen($str, "UTF-8") >= 2) {
                $this->addCategory($str);
            }
        }
    }

    /** Forgets every candidate keyword. */
    private function emptyKeywordsList()
    {
        $this->keywords = array();
    }

    /**
     * Rebuilds the keyword list from scratch: every 2- to 6-character
     * substring of every registered phrase is counted once per occurrence.
     */
    public function buildKeyWordsList()
    {
        $this->emptyKeywordsList();
        foreach ($this->categories as $c) {
            $ss = Token::extractTokenStrs($c->key, 2, 6);
            foreach ($ss as $s) {
                $this->addToken($s);
            }
        }
    }

    /**
     * Returns the entries of $array that have the greatest UTF-8 length.
     *
     * @param array $array List of strings.
     * @return array The longest entries (several when tied); empty for empty input.
     */
    public static function longest_string_in_array($array)
    {
        if (count($array) === 0) {
            // max() rejects an empty array, so answer the degenerate case here.
            return array();
        }

        $lengths = array();
        foreach ($array as $str) {
            $lengths[$str] = mb_strlen($str, "UTF-8");
        }

        return array_keys($lengths, max($lengths), true);
    }

    /**
     * Finds the longest keywords reaching the threshold in the current phrases.
     *
     * @param int $threshold Minimum count a keyword must have.
     * @return array|false Map of keyword => Token for the longest matches,
     *                     or false when nothing reaches the threshold.
     */
    public function findBestMatchingWords($threshold)
    {
        $keywords = array();
        foreach ($this->categories as $k) {
            $this->extractCategoryKeywords($k, $threshold, $keywords);
        }
        if (count($keywords) === 0) {
            return false;
        }

        $lkeys = self::longest_string_in_array(array_keys($keywords));
        $ret = array();
        foreach ($lkeys as $k) {
            $ret[$k] = $keywords[$k];
        }

        return $ret;
    }

    /**
     * Subtracts an accepted keyword's count from every 2- to 6-character
     * substring of it (the keyword itself included) so that its fragments
     * are not reported again on the strength of the same occurrences.
     *
     * @param Token $token The accepted keyword.
     */
    private function removeInfluence(Token $token)
    {
        $num = $token->count;
        $ss = Token::extractTokenStrs($token->key, 2, 6);
        foreach ($ss as $s) {
            if (isset($this->keywords[$s])) {
                $this->keywords[$s]->count -= $num;
            }
        }
    }

    /**
     * Extracts keywords from the registered phrases.
     *
     * Repeatedly takes the longest substrings whose count reaches
     * $threshold, records them, then cuts them out of the phrases and looks
     * again in what is left. The registered phrases are restored afterwards.
     *
     * @param int $threshold Minimum count a keyword must have.
     * @return array Map of keyword => Token; each Token is a snapshot taken
     *               when the keyword was accepted, so its count is the count
     *               it was accepted with.
     */
    public function getKeywords($threshold = 3)
    {
        $this->buildKeyWordsList();
        $stopList = array();
        $keywords = array();
        $categoriesToRestore = $this->categories;

        while (true) {
            $bestWords = $this->findBestMatchingWords($threshold);
            if ($bestWords === false) {
                break;
            }

            foreach ($bestWords as $w) {
                // Snapshot first: removeInfluence() decrements the live token too.
                $keywords[$w->key] = clone $w;
                $this->removeInfluence($w);
                // Stop words are literal text, so escape them for the regex.
                $stopList[] = preg_quote($w->key, '/');
            }

            // Use the stop list to rebuild the categories.
            $oldCategories = $this->categories;
            $this->emptyCategories();
            foreach ($oldCategories as $c) {
                $list = self::split($c->key, $stopList);
                if (!is_array($list)) {
                    continue;
                }
                foreach ($list as $nc) {
                    $this->addCategory($nc);
                }
            }
        }

        $this->categories = $categoriesToRestore;

        return $keywords;
    }

    /**
     * Splits $string at every match of any of the given delimiters.
     *
     * @param string $string     UTF-8 text to split.
     * @param array  $delimiters Regular-expression fragments (not escaped
     *                           here); they are joined as alternatives and
     *                           matched with the "isu" modifiers.
     * @return array|false The pieces (possibly empty strings), or false when
     *                     preg_split() fails.
     */
    public static function split($string, $delimiters)
    {
        $pattern = "/" . implode("|", $delimiters) . "/isu";

        return preg_split($pattern, $string);
    }

    /**
     * Collects into $keywords the keywords found under $root: $root itself
     * when its count reaches $threshold, otherwise whatever is found by
     * descending into its sub-tokens.
     *
     * @param Token      $root      Token to examine.
     * @param int        $threshold Minimum count a keyword must have.
     * @param array|null $keywords  Accumulator (keyword => Token), filled in place.
     */
    public function extractCategoryKeywords($root, $threshold, &$keywords = null)
    {
        $key = $root->key;
        if ($keywords === null) {
            $keywords = array();
        }
        if (isset($keywords[$key])) {
            return;
        }

        if (isset($this->keywords[$key]) && $this->keywords[$key]->count >= $threshold) {
            $keywords[$key] = $this->keywords[$key];
            return;
        }

        foreach ($root->getSubTokens() as $s) {
            // Skip sub-tokens that were never counted instead of recursing into null.
            if (isset($this->keywords[$s])) {
                $this->extractCategoryKeywords($this->keywords[$s], $threshold, $keywords);
            }
        }
    }
}// Cloud ends

set_time_limit(1000);

$string = <<<EOT
meta http-equiv="Content-Type" content="text/html; charset=UTF-8" meta name="description" content="[回到部落格首頁]"
EOT;

$cloud = new Cloud();
$cloud->addTokens($string);
print_r(array_keys($cloud->getKeywords(3)));
echo "hello";
Reference
No comments:
Post a Comment