/**
 * A string fragment together with an occurrence counter and the set of
 * shorter fragments ("sub-tokens") that were registered for it.
 */
class Token{
 // Occurrence counter; starts at zero and is maintained by the code using the token.
 public $count = 0;
 // The text of this token.
 public $key = null;
 // Map of sub-token string => number of times it was added via addSubToken().
 private $subTokens = array();
 // type could be "object" or "category"
 public $type = null;

 /**
  * Builds a token for $key and registers its direct sub-tokens.
  *
  * When the key is longer than $MIN_LEN characters, every substring that is
  * one character shorter than the key is registered as a sub-token. Keys
  * longer than $MAX_LEN are treated as if they were $MAX_LEN + 1 characters
  * long, so their sub-tokens are the substrings of exactly $MAX_LEN characters.
  *
  * @param string $key     Token text (lengths are measured in UTF-8 characters).
  * @param string $type    "object" or "category".
  * @param int    $MIN_LEN Keys of this length or shorter get no sub-tokens.
  * @param int    $MAX_LEN Upper bound for the sub-token length.
  * @return Token
  */
 public static function create($key, $type="object", $MIN_LEN=2, $MAX_LEN=6){
  $t = new Token;
  $t->key = $key;
  $t->type = $type;
  $len = mb_strlen($key, "UTF-8");
  if($len > $MIN_LEN){
   // One character shorter than the key, capped at $MAX_LEN.
   $subLen = min($len, $MAX_LEN + 1) - 1;
   foreach(self::extractTokenStrs($key, $subLen, $subLen) as $str){
    $t->addSubToken($str);
   }
  }
  return $t;
 }

 /**
  * Registers one occurrence of the sub-token $key.
  *
  * @param string $key
  */
 public function addSubToken($key){
  if(!isset($this->subTokens[$key])){
   $this->subTokens[$key]=1;
  }
  else{
   $this->subTokens[$key]++;
  }
 }

 /**
  * Returns the distinct sub-token strings in the order they were first added.
  *
  * @return string[]
  */
 public function getSubTokens(){
  // PHP stores numeric-string array keys as integers; cast them back so
  // callers always receive strings.
  return array_map('strval', array_keys($this->subTokens));
 }

 /**
  * Lists every substring of $string whose length is between $MIN_LEN and
  * $MAX_LEN characters, longest first and, within one length, left to right.
  * Repeated substrings are listed once per occurrence.
  *
  * @param string $string  Text to slice (UTF-8).
  * @param int    $MIN_LEN Shortest substring length; values below 1 are treated as 1.
  * @param int    $MAX_LEN Longest substring length.
  * @return string[]
  */
 public static function extractTokenStrs($string, $MIN_LEN=2, $MAX_LEN=6){
  $tokenStrs = array();
  $len = mb_strlen($string, "UTF-8");
  // A zero or negative length could only ever produce empty strings.
  $shortest = max(1, $MIN_LEN);
  // Nothing longer than the string itself can be extracted.
  $longest = min($MAX_LEN, $len);
  for($k = $longest; $k >= $shortest; $k--){
   for($j = 0; $j <= $len - $k; $j++){
    $tokenStrs[] = mb_substr($string, $j, $k, "UTF-8");
   }
  }
  return $tokenStrs;
 }
}
/**
 * Collects text fragments ("categories") and mines them for substrings
 * ("keywords") that occur in at least a given number of distinct fragments.
 */
class Cloud{
 // Keyword string => Token of type "object"; rebuilt by buildKeyWordsList().
 private $keywords = array();
 // Fragment string => Token of type "category".
 private $categories = array();

 /**
  * Counts one occurrence of the keyword $string, creating its Token on first use.
  *
  * @param string $string
  * @return Token|false The keyword token, or false when $string is empty.
  */
 private function addToken($string){
  if(empty($string)){
   return false;
  }
  if(!isset($this->keywords[$string])){
   $this->keywords[$string] = Token::create($string, "object");
  }
  $this->keywords[$string]->count++;
  return $this->keywords[$string];
 }

 /**
  * Registers the fragment $string as a category. A fragment that is already
  * registered is left as it is; its count is not changed.
  *
  * @param string $string
  * @return Token|false The category token, or false when $string is empty.
  */
 private function addCategory($string){
  if(empty($string)){
   return false;
  }
  if(!isset($this->categories[$string])){
   $this->categories[$string] = Token::create($string, "category");
  }
  return $this->categories[$string];
 }

 /** Forgets every registered category. */
 private function emptyCategories(){
  $this->categories = array();
 }

 /**
  * Cuts $string into fragments at a few fixed words, at runs of
  * punctuation/symbol/separator/control/mark characters and at runs of ASCII
  * letters and digits, then registers every fragment of at least two
  * characters as a category.
  *
  * @param string $string Raw input text (UTF-8).
  */
 public function addTokens($string){
  $strs = self::split($string, array("的","了","然后","[\pP\pS\pZ\pC\pM]+","[0-9a-zA-Z]+"));
  foreach($strs as $str){
   if(mb_strlen($str, "UTF-8") >= 2){
    $this->addCategory($str);
   }
  }
 }

 /** Forgets every counted keyword. */
 private function emptyKeywordsList(){
  $this->keywords = array();
 }

 /**
  * Rebuilds the keyword index from scratch: every 2- to 6-character substring
  * of every category is counted once per occurrence.
  */
 public function buildKeyWordsList(){
  $this->emptyKeywordsList();
  foreach($this->categories as $c){
   $ss = Token::extractTokenStrs($c->key, 2, 6);
   foreach($ss as $s){
    $this->addToken($s);
   }
  }
 }

 /**
  * Returns the distinct entries of $array that have the greatest length in
  * UTF-8 characters, in order of first appearance.
  *
  * @param array $array List of strings.
  * @return array Empty when $array is empty.
  */
 public static function longest_string_in_array($array) {
  $longest = array();
  $maxLen = -1;
  foreach($array as $candidate){
   $len = mb_strlen($candidate, "UTF-8");
   if($len > $maxLen){
    $maxLen = $len;
    $longest = array($candidate);
   }
   elseif($len === $maxLen){
    $longest[] = $candidate;
   }
  }
  return array_values(array_unique($longest));
 }

 /**
  * Finds, across all current categories, the keywords whose count reaches
  * $threshold and keeps only the longest of them.
  *
  * @param int $threshold Minimum count.
  * @return array|false Keyword string => Token, or false when nothing qualifies.
  */
 public function findBestMatchingWords($threshold){
  /* find the longest words satisfies the threshold */
  $keywords = array();
  foreach($this->categories as $k){
   $this->extractCategoryKeywords($k, $threshold, $keywords);
  }
  $keys = array_keys($keywords);
  if(count($keywords)===0){
   return false;
  }
  $lkeys = self::longest_string_in_array($keys);
  $ret = array();
  foreach($lkeys as $k){
   $ret[$k] = $keywords[$k];
  }
  return $ret;
 }

 /**
  * Subtracts the count of $token from every indexed keyword that is a 2- to
  * 6-character substring of its key. The token's own key is one of those
  * substrings whenever it is 2 to 6 characters long.
  *
  * @param Token $token
  */
 private function removeInfluence(Token $token){
  // Read the count first: the loop below may modify $token itself.
  $num = $token->count;
  $ss = Token::extractTokenStrs($token->key, 2, 6);
  foreach($ss as $s){
   if(isset($this->keywords[$s])){
    $this->keywords[$s]->count-=$num;
   }
  }
 }

 /**
  * Extracts keywords from the registered categories.
  *
  * Repeatedly takes the longest keywords that reach $threshold, removes their
  * contribution from the counts of their substrings, and cuts them out of the
  * categories, until no keyword reaches the threshold any more. The original
  * categories are restored before returning.
  *
  * @param int $threshold Minimum count for a keyword to be accepted.
  * @return array Keyword string => Token. Each Token is a copy whose count is
  *               the value it had at the moment the keyword was selected.
  */
 public function getKeywords($threshold = 3){
  $this->buildKeyWordsList();
  $stopList = array();
  $keywords = array();
  $categoriesToRestore = $this->categories;
  while(true){
   $bestWords = $this->findBestMatchingWords($threshold);
   if($bestWords === false){
    break;
   }
   foreach($bestWords as $w){
    // Copy before removeInfluence(): that call subtracts the token's count
    // from the token itself, which would leave the returned count at zero.
    $keywords[$w->key] = clone $w;
    $this->removeInfluence($w);
    // Stop words are literal text, so escape them for use as split patterns.
    $stopList[] = preg_quote($w->key, "/");
   }
   // use the stop list to rebuild the categories
   $oldCategories = $this->categories;
   $this->emptyCategories();
   foreach($oldCategories as $c){
    $list = self::split($c->key, $stopList);
    foreach( $list as $nc){
     $this->addCategory($nc);
    }
   }
  }
  $this->categories = $categoriesToRestore;
  return $keywords;
 }

 /**
  * Splits $string wherever any of $delimiters matches.
  *
  * Each delimiter is a PCRE fragment (not a literal); the fragments are joined
  * with "|" and applied case-insensitively in UTF-8 mode with "/" as the
  * pattern delimiter.
  *
  * @param string   $string
  * @param string[] $delimiters Regular-expression fragments; array keys are ignored.
  * @return array|false Result of preg_split(), empty strings included.
  */
 public static function split($string, $delimiters){
  // implode() ignores the array keys, so the alternation never starts with
  // an empty branch, whatever keys the caller used.
  $pattern = "/".implode("|", $delimiters)."/isu";
  return preg_split($pattern, $string);
 }

 /**
  * Collects into $keywords the keywords reachable from $root that reach
  * $threshold.
  *
  * If $root's own key is an indexed keyword with a sufficient count it is
  * taken; otherwise the search continues through $root's sub-tokens, so
  * shorter candidates are only considered when the longer one does not qualify.
  *
  * @param Token      $root      Category or keyword token to start from.
  * @param int        $threshold Minimum count.
  * @param array|null $keywords  Accumulator (keyword string => Token), passed by reference.
  */
 public function extractCategoryKeywords($root, $threshold, &$keywords=null){
  $key = $root->key;
  if($keywords === null){
   $keywords = array();
  }
  if(isset($keywords[$key])){
   return;
  }
  if(isset($this->keywords[$key]) && $this->keywords[$key]->count >= $threshold){
   $keywords[$key]=$this->keywords[$key];
  }
  else{
   $subTokens = $root->getSubTokens();
   foreach( $subTokens as $s){
    // A sub-token missing from the index (for example because the index
    // has not been built yet) cannot qualify; skip it.
    if(!isset($this->keywords[$s])){
     continue;
    }
    $this->extractCategoryKeywords($this->keywords[$s], $threshold, $keywords);
   }
  }
 }
}// Cloud ends
// Demo driver: run the keyword extraction on a small sample text.
// Raise the script execution time limit to 1000 seconds.
set_time_limit(1000);

// Sample input.
$sampleText = <<<EOT
meta http-equiv="Content-Type" content="text/html; charset=UTF-8"
meta name="description" content="[回到部落格首頁]"
EOT;

$wordCloud = new Cloud();
$wordCloud->addTokens($sampleText);

// Print only the keyword strings found with a threshold of 3.
$foundKeywords = $wordCloud->getKeywords(3);
print_r(array_keys($foundKeywords));
echo "hello";
Reference