Thursday, August 4, 2011

关键字云

/**
 * A piece of text (a candidate keyword or a raw text segment) together with
 * an occurrence counter and the set of its next-shorter substrings.
 */
class Token{

    /** Occurrence counter. Starts at 0; this class never changes it itself. */
    public $count = 0;

    /** The text this token stands for. */
    public $key = null;

    /** Map of sub-token string => number of times it was added. */
    private $subTokens = array();

    /** Type of the token; could be "object" or "category". */
    public $type = null;

    /**
     * Builds a token for $key and registers its sub-tokens.
     *
     * When the key is longer than $MIN_LEN characters, the sub-tokens are all
     * substrings that are exactly one character shorter than the key; for keys
     * longer than $MAX_LEN they are the substrings of exactly $MAX_LEN
     * characters instead. Shorter keys get no sub-tokens.
     *
     * @param string $key     Token text (measured as UTF-8).
     * @param string $type    "object" or "category".
     * @param int    $MIN_LEN Keys must be longer than this to get sub-tokens.
     * @param int    $MAX_LEN Upper bound for the sub-token length.
     * @return Token
     */
    public static function create($key, $type="object", $MIN_LEN=2, $MAX_LEN=6){
        $t = new Token;
        $t->key = $key;
        $t->type = $type;

        $len = mb_strlen($key, "UTF-8");
        if($len > $MIN_LEN){
            // One character shorter than the key, but never longer than $MAX_LEN.
            $subLen = min($len, $MAX_LEN + 1) - 1;
            foreach(self::extractTokenStrs($key, $subLen, $subLen) as $str){
                $t->addSubToken($str);
            }
        }
        return $t;
    }

    /**
     * Registers one occurrence of the sub-token $key.
     *
     * @param string $key Sub-token text.
     */
    public function addSubToken($key){
        if(isset($this->subTokens[$key])){
            $this->subTokens[$key]++;
        }
        else{
            $this->subTokens[$key] = 1;
        }
    }

    /**
     * Returns the distinct sub-token strings in the order they were first added.
     *
     * @return array List of strings.
     */
    public function getSubTokens(){
        // PHP turns numeric-looking array keys ("123") into integers, so cast
        // them back to hand out strings consistently.
        return array_map('strval', array_keys($this->subTokens));
    }

    /**
     * Lists every substring of $string whose length (in UTF-8 characters) lies
     * between $MIN_LEN and $MAX_LEN, longest substrings first and, within one
     * length, from left to right. Repeated substrings are listed repeatedly.
     *
     * @param string $string  Text to slice.
     * @param int    $MIN_LEN Shortest substring length; values below 1 are treated as 1.
     * @param int    $MAX_LEN Longest substring length.
     * @return array List of substrings; empty when $MAX_LEN < $MIN_LEN or the
     *               string is shorter than $MIN_LEN.
     */
    public static function extractTokenStrs($string, $MIN_LEN=2, $MAX_LEN=6){
        $tokenStrs = array();
        $len = mb_strlen($string, "UTF-8");

        // A zero-length window would only produce empty strings.
        $shortest = max(1, $MIN_LEN);
        // No window can be longer than the string itself.
        $longest = min($MAX_LEN, $len);

        for($k = $longest; $k >= $shortest; $k--){
            // Slide a window of $k characters over the string.
            for($j = 0; $j + $k <= $len; $j++){
                $tokenStrs[] = mb_substr($string, $j, $k, "UTF-8");
            }
        }
        return $tokenStrs;
    }

}

/**
 * Keyword extractor.
 *
 * Text is first cut into segments ("categories") with addTokens(). From those
 * segments getKeywords() counts every 2- to 6-character substring and then
 * repeatedly picks the longest substrings whose count reaches a threshold.
 */
class Cloud{

    /** Map of substring => Token (type "object") carrying the occurrence count. */
    private $keywords = array();

    /** Map of text segment => Token (type "category"). */
    private $categories = array();

    /**
     * Counts one occurrence of the substring $string, creating its token on first use.
     *
     * @param string $string
     * @return Token|false The token, or false when $string is empty.
     */
    private function addToken($string){
        if(empty($string)){
            return false;
        }

        if(!isset($this->keywords[$string])){
            $this->keywords[$string] = Token::create($string, "object");
        }
        $this->keywords[$string]->count++;
        return $this->keywords[$string];
    }

    /**
     * Registers the segment $string as a category if it is not known yet.
     *
     * @param string $string
     * @return Token|false The category token, or false when $string is empty.
     */
    private function addCategory($string){
        if(empty($string)){
            return false;
        }

        if(!isset($this->categories[$string])){
            $this->categories[$string] = Token::create($string, "category");
        }
        return $this->categories[$string];
    }

    /** Forgets all categories. */
    private function emptyCategories(){
        $this->categories = array();
    }

    /**
     * Cuts $string into segments and registers every segment of at least two
     * characters as a category. Categories accumulate across calls.
     *
     * Segments are separated by the literal words "的", "了" and "然后", by runs
     * of punctuation/symbol/separator/control/mark characters, and by runs of
     * ASCII letters and digits.
     *
     * @param string $string UTF-8 text.
     */
    public function addTokens($string){
        $strs = self::split($string, array("的","了","然后","[\pP\pS\pZ\pC\pM]+","[0-9a-zA-Z]+"));
        foreach($strs as $str){
            if(mb_strlen($str, "UTF-8") >= 2){
                $this->addCategory($str);
            }
        }
    }

    /** Forgets all counted substrings. */
    private function emptyKeywordsList(){
        $this->keywords = array();
    }

    /**
     * Rebuilds the substring counts from scratch: every 2- to 6-character
     * substring of every category is counted once per position it occurs at.
     */
    public function buildKeyWordsList(){
        $this->emptyKeywordsList();
        foreach($this->categories as $c){
            foreach(Token::extractTokenStrs($c->key, 2, 6) as $s){
                $this->addToken($s);
            }
        }
    }

    /**
     * Returns the distinct entries of $array that have the greatest length,
     * measured in UTF-8 characters.
     *
     * @param array $array List of strings.
     * @return array The longest entries; empty when $array is empty.
     */
    public static function longest_string_in_array($array) {
        $lengths = array();
        foreach($array as $s){
            // Measure explicitly as UTF-8, like the rest of this file, instead
            // of relying on the configured mbstring internal encoding.
            $lengths[$s] = mb_strlen((string)$s, "UTF-8");
        }
        if(count($lengths) === 0){
            // max() cannot work on an empty array.
            return array();
        }
        return array_keys($lengths, max($lengths));
    }

    /**
     * Finds the longest words that satisfy the threshold, looking through all
     * current categories.
     *
     * @param int $threshold Minimum count a substring needs.
     * @return array|false Map of word => Token, or false when nothing qualifies.
     */
    public function findBestMatchingWords($threshold){
        $keywords = array();
        foreach($this->categories as $k){
            $this->extractCategoryKeywords($k, $threshold, $keywords);
        }

        if(count($keywords) === 0){
            return false;
        }

        $ret = array();
        foreach(self::longest_string_in_array(array_keys($keywords)) as $k){
            $ret[$k] = $keywords[$k];
        }
        return $ret;
    }

    /**
     * Subtracts the count of $token from every 2- to 6-character substring of
     * its key (which includes the key itself, so its own count drops to 0).
     *
     * @param Token $token A token taken from the keyword list.
     */
    private function removeInfluence(Token $token){
        // Read the count first: the loop below also modifies $token itself.
        $num = $token->count;
        foreach(Token::extractTokenStrs($token->key, 2, 6) as $s){
            if(isset($this->keywords[$s])){
                $this->keywords[$s]->count -= $num;
            }
        }
    }

    /**
     * Extracts the keywords of the registered categories.
     *
     * Repeats until nothing qualifies any more: take the longest substrings
     * whose count is at least $threshold, subtract their counts from their own
     * substrings, and cut them out of the categories. The categories are
     * restored before returning; the substring counts are not.
     *
     * @param int $threshold Minimum count a substring needs to become a keyword.
     * @return array Map of keyword => Token. Each token is a copy taken at the
     *               moment the keyword was selected, so its count is the count
     *               that made it qualify.
     */
    public function getKeywords($threshold = 3){
        $this->buildKeyWordsList();

        $stopList = array();
        $keywords = array();

        $categoriesToRestore = $this->categories;
        while(true){
            $bestWords = $this->findBestMatchingWords($threshold);
            if($bestWords === false){
                break;
            }

            foreach($bestWords as $w){
                // Copy before removeInfluence() zeroes the count of $w.
                $keywords[$w->key] = clone $w;
                $this->removeInfluence($w);
                // split() treats its delimiters as regex fragments, so the
                // literal keyword has to be escaped.
                $stopList[] = preg_quote($w->key, "/");
            }

            // Use the stop list to rebuild the categories.
            $oldCategories = $this->categories;
            $this->emptyCategories();
            foreach($oldCategories as $c){
                foreach(self::split($c->key, $stopList) as $nc){
                    $this->addCategory($nc);
                }
            }
        }

        $this->categories = $categoriesToRestore;
        return $keywords;
    }

    /**
     * Splits $string wherever one of $delimiters matches.
     *
     * Each delimiter is a regular-expression fragment (callers must escape
     * literal text, including "/"); the fragments are combined as alternatives
     * and matched case-insensitively in UTF-8 mode.
     *
     * @param string $string     Text to split.
     * @param array  $delimiters Regex fragments; array keys are ignored.
     * @return array The pieces (possibly empty strings). array($string) when
     *               no delimiters are given, array() when matching fails.
     */
    public static function split($string, $delimiters){
        if(count($delimiters) === 0){
            // An empty pattern would match between every character.
            return array($string);
        }

        $pattern = "/".implode("|", $delimiters)."/isu";
        $rawtokens = preg_split($pattern, $string);

        // preg_split() yields false on failure; keep callers' foreach loops safe.
        return $rawtokens === false ? array() : $rawtokens;
    }

    /**
     * Collects into $keywords the qualifying substrings reachable from $root.
     *
     * If the key of $root is a counted substring with at least $threshold
     * occurrences it is collected; otherwise the search descends into the
     * sub-tokens of $root.
     *
     * @param Token      $root      Token to start from.
     * @param int        $threshold Minimum count a substring needs.
     * @param array|null $keywords  Result map of word => Token, filled in place.
     */
    public function extractCategoryKeywords($root, $threshold, &$keywords=null){
        $key = $root->key;

        if($keywords === null){
            $keywords = array();
        }

        // Already collected.
        if(isset($keywords[$key])){
            return;
        }

        if(isset($this->keywords[$key]) && $this->keywords[$key]->count >= $threshold){
            $keywords[$key] = $this->keywords[$key];
            return;
        }

        foreach($root->getSubTokens() as $s){
            // Skip sub-tokens that were never counted instead of passing null on.
            if(isset($this->keywords[$s])){
                $this->extractCategoryKeywords($this->keywords[$s], $threshold, $keywords);
            }
        }
    }

}// Cloud ends

// Demo driver: feed a small sample text into the cloud and print the
// keywords that occur at least three times, followed by a marker line.
set_time_limit(1000);

$string = <<<EOT
meta http-equiv="Content-Type" content="text/html; charset=UTF-8"
meta name="description" content="[回到部落格首頁]"
EOT;

$cloud = new Cloud();
$cloud->addTokens($string);

$foundKeywords = $cloud->getKeywords(3);
print_r(array_keys($foundKeywords));

echo "hello";


Reference 

No comments:

Post a Comment