/**
 * A piece of text (a keyword or a category) with an occurrence counter and
 * the set of shorter sub-strings ("sub-tokens") it can be broken down into.
 */
class Token{
    // Occurrence counter; starts at 0 and is maintained by the code that owns the token.
    public $count = 0;
    // The text this token stands for.
    public $key = null;
    // Map of sub-token text => number of times it was added via addSubToken().
    private $subTokens = array();
    // type could be "object" or "category"
    public $type = null;

    /**
     * Build a token for $key and register its sub-tokens.
     *
     * When $key is longer than $MIN_LEN characters, every substring that is
     * exactly one character shorter than $key is registered as a sub-token.
     * Keys longer than $MAX_LEN characters are treated as if they were
     * $MAX_LEN + 1 characters long, so their sub-tokens are $MAX_LEN long.
     *
     * @param string $key     UTF-8 text of the token
     * @param string $type    "object" or "category"
     * @param int    $MIN_LEN keys of this length or shorter get no sub-tokens
     * @param int    $MAX_LEN upper bound for the sub-token length
     * @return Token
     */
    public static function create($key, $type="object", $MIN_LEN=2, $MAX_LEN=6){
        $token = new Token;
        $token->key = $key;
        $token->type = $type;

        $len = mb_strlen($key, "UTF-8");
        if($len <= $MIN_LEN){
            return $token;
        }

        // Sub-tokens are one character shorter than the key, capped at $MAX_LEN.
        $subLen = min($len, $MAX_LEN + 1) - 1;
        foreach(self::extractTokenStrs($key, $subLen, $subLen) as $str){
            $token->addSubToken($str);
        }
        return $token;
    }

    /**
     * Register one occurrence of the sub-token $key.
     *
     * @param string $key sub-token text
     */
    public function addSubToken($key){
        if(isset($this->subTokens[$key])){
            $this->subTokens[$key]++;
        }
        else{
            $this->subTokens[$key] = 1;
        }
    }

    /**
     * Return the distinct sub-token texts in the order they were first added.
     *
     * @return string[]
     */
    public function getSubTokens(){
        // PHP casts numeric-string array keys ("1234") to integers; cast them
        // back so callers always receive the sub-tokens as strings.
        return array_map('strval', array_keys($this->subTokens));
    }

    /**
     * List every substring of $string whose length (in UTF-8 characters) lies
     * between $MIN_LEN and $MAX_LEN, longest first and, within one length,
     * from left to right. Duplicates are kept.
     *
     * @param string $string  UTF-8 text to slice
     * @param int    $MIN_LEN shortest substring length (values below 1 are treated as 1)
     * @param int    $MAX_LEN longest substring length
     * @return string[]
     */
    public static function extractTokenStrs($string, $MIN_LEN=2, $MAX_LEN=6){
        $tokenStrs = array();

        // A zero-length window would only produce empty strings.
        if($MIN_LEN < 1){
            $MIN_LEN = 1;
        }

        $len = mb_strlen($string, "UTF-8");
        for($k = $MAX_LEN; $k >= $MIN_LEN; $k--){
            // Slide a window of $k characters over the string; stop as soon
            // as the window would run past the end.
            for($j = 0; $j + $k <= $len; $j++){
                $tokenStrs[] = mb_substr($string, $j, $k, "UTF-8");
            }
        }
        return $tokenStrs;
    }
}
/**
 * Extracts frequently repeated character sequences ("keywords") from text.
 *
 * Text is first cut into fragments ("categories") at punctuation, symbols,
 * whitespace, ASCII alphanumerics and a few stop characters. Every 2-6
 * character substring of a fragment is a keyword candidate; getKeywords()
 * repeatedly picks the longest candidates whose count reaches a threshold.
 */
class Cloud{
    // Keyword candidates (Token objects of type "object"), keyed by their text.
    private $keywords = array();
    // Text fragments (Token objects of type "category"), keyed by their text.
    private $categories = array();

    /**
     * Count one occurrence of the keyword candidate $string, creating its
     * token on first sight.
     *
     * @param string $string candidate text
     * @return Token|false the token, or false when $string is empty
     */
    private function addToken($string){
        if(empty($string)){
            return false;
        }
        if(!isset($this->keywords[$string])){
            $this->keywords[$string] = Token::create($string, "object");
        }
        $this->keywords[$string]->count++;
        return $this->keywords[$string];
    }

    /**
     * Register the fragment $string as a category (no counting is done).
     *
     * @param string $string fragment text
     * @return Token|false the category token, or false when $string is empty
     */
    private function addCategory($string){
        if(empty($string)){
            return false;
        }
        if(!isset($this->categories[$string])){
            $this->categories[$string] = Token::create($string, "category");
        }
        return $this->categories[$string];
    }

    /** Forget all categories. */
    private function emptyCategories(){
        $this->categories = array();
    }

    /**
     * Cut $string into fragments and register every fragment of at least two
     * characters as a category.
     *
     * @param string $string UTF-8 text to analyse
     */
    public function addTokens($string){
        $strs = self::split($string, array("的","了","然后","[\pP\pS\pZ\pC\pM]+","[0-9a-zA-Z]+"));
        if(!is_array($strs)){
            // preg_split() failed (e.g. malformed UTF-8); nothing to add.
            return;
        }
        foreach($strs as $str){
            if(mb_strlen($str, "UTF-8") >= 2){
                $this->addCategory($str);
            }
        }
    }

    /** Forget all keyword candidates. */
    private function emptyKeywordsList(){
        $this->keywords = array();
    }

    /**
     * Rebuild the keyword candidates: every 2-6 character substring of every
     * category is counted once per occurrence.
     */
    public function buildKeyWordsList(){
        $this->emptyKeywordsList();
        foreach($this->categories as $category){
            foreach(Token::extractTokenStrs($category->key, 2, 6) as $candidate){
                $this->addToken($candidate);
            }
        }
    }

    /**
     * Return the distinct entries of $array that have the greatest length in
     * UTF-8 characters, in order of first appearance.
     *
     * @param array $array strings to compare
     * @return array the longest entries; empty when $array is empty
     */
    public static function longest_string_in_array($array) {
        $longest = array();
        $maxLen = -1;
        foreach($array as $candidate){
            // Explicit encoding, consistent with every other mb_* call here.
            $len = mb_strlen($candidate, "UTF-8");
            if($len > $maxLen){
                $maxLen = $len;
                $longest = array($candidate);
            }
            elseif($len === $maxLen && !in_array($candidate, $longest, true)){
                $longest[] = $candidate;
            }
        }
        return $longest;
    }

    /**
     * Find the longest keyword candidates reachable from the current
     * categories whose count is at least $threshold.
     *
     * @param int $threshold minimum count
     * @return Token[]|false tokens keyed by their text, or false when none qualifies
     */
    public function findBestMatchingWords($threshold){
        $matches = array();
        foreach($this->categories as $category){
            $this->extractCategoryKeywords($category, $threshold, $matches);
        }
        if(count($matches) === 0){
            return false;
        }

        $ret = array();
        foreach(self::longest_string_in_array(array_keys($matches)) as $k){
            $ret[$k] = $matches[$k];
        }
        return $ret;
    }

    /**
     * Subtract the count of $token from every keyword candidate that is a
     * 2-6 character substring of its text (which includes $token itself when
     * it is one of the candidates).
     *
     * @param Token $token the keyword that has just been selected
     */
    private function removeInfluence(Token $token){
        // Read the count first: the loop below may modify $token itself.
        $num = $token->count;
        foreach(Token::extractTokenStrs($token->key, 2, 6) as $s){
            if(isset($this->keywords[$s])){
                $this->keywords[$s]->count -= $num;
            }
        }
    }

    /**
     * Extract the keywords of the text added so far.
     *
     * Repeatedly takes the longest candidates reaching $threshold, removes
     * their contribution from the shorter candidates they contain and cuts
     * them out of the categories, until no candidate qualifies any more.
     * The categories are restored before returning.
     *
     * @param int $threshold minimum count for a candidate to become a keyword
     * @return Token[] selected keywords keyed by their text; each token is a
     *                 snapshot carrying the count it had when it was selected
     */
    public function getKeywords($threshold = 3){
        $this->buildKeyWordsList();
        $stopList = array();
        $keywords = array();
        $categoriesToRestore = $this->categories;

        while(true){
            $bestWords = $this->findBestMatchingWords($threshold);
            if($bestWords === false){
                break;
            }

            foreach($bestWords as $w){
                // Store a copy: removeInfluence() subtracts the count from the
                // live token too, which would leave the result with count 0.
                $keywords[$w->key] = clone $w;
                $this->removeInfluence($w);
                // Stop words are literal text, so escape them for the regex.
                $stopList[] = preg_quote($w->key, "/");
            }

            // Cut the selected words out of the categories.
            $oldCategories = $this->categories;
            $this->emptyCategories();
            foreach($oldCategories as $c){
                $pieces = self::split($c->key, $stopList);
                if(!is_array($pieces)){
                    continue;
                }
                foreach($pieces as $piece){
                    $this->addCategory($piece);
                }
            }
        }

        $this->categories = $categoriesToRestore;
        return $keywords;
    }

    /**
     * Split $string at every match of any of the given delimiters.
     *
     * Each delimiter is a PCRE fragment; the fragments are joined with "|"
     * and matched case-insensitively in UTF-8 mode. Callers that pass
     * literal text must escape it themselves (see preg_quote()).
     *
     * @param string   $string     text to split
     * @param string[] $delimiters PCRE fragments; array keys are ignored
     * @return string[]|false the pieces (possibly including empty strings),
     *                        the whole string as single piece when there are
     *                        no delimiters, or false when preg_split() fails
     */
    public static function split($string, $delimiters){
        if(count($delimiters) === 0){
            // An empty pattern would match between every character.
            return array($string);
        }
        $pattern = "/".implode("|", $delimiters)."/isu";
        return preg_split($pattern, $string);
    }

    /**
     * Collect into $keywords the keyword candidates reachable from $root
     * whose count is at least $threshold.
     *
     * If the text of $root itself qualifies it is collected and the search
     * stops there; otherwise the search continues through its sub-tokens.
     *
     * @param Token      $root      token to start from
     * @param int        $threshold minimum count
     * @param array|null $keywords  result map (text => Token), filled in place
     */
    public function extractCategoryKeywords($root, $threshold, &$keywords=null){
        if($keywords === null){
            $keywords = array();
        }

        $key = $root->key;
        if(isset($keywords[$key])){
            return;
        }

        if(isset($this->keywords[$key]) && $this->keywords[$key]->count >= $threshold){
            $keywords[$key] = $this->keywords[$key];
            return;
        }

        foreach($root->getSubTokens() as $s){
            // A sub-token has no candidate when the keyword list has not been
            // built for the current categories; skip it instead of failing.
            if(isset($this->keywords[$s])){
                $this->extractCategoryKeywords($this->keywords[$s], $threshold, $keywords);
            }
        }
    }
}// Cloud ends
// Demo run: extract the keywords of a small sample text and print them.

// Allow the script to run for up to 1000 seconds.
set_time_limit(1000);

$sampleText = <<<EOT
meta http-equiv="Content-Type" content="text/html; charset=UTF-8"
meta name="description" content="[回到部落格首頁]"
EOT;

$keywordCloud = new Cloud();
$keywordCloud->addTokens($sampleText);

// Only the keyword texts are printed, not the Token objects.
$foundKeywords = $keywordCloud->getKeywords(3);
print_r(array_keys($foundKeywords));
echo "hello";
Reference