Tuesday, July 26, 2011

Check Chinese, Japanese and Korean (CJK) blocks in UTF-8

/**
 * Count the characters of $text whose Unicode code point lies in an inclusive range.
 *
 * @param string $text      UTF-8 encoded text to scan.
 * @param string $startCode Lower bound of the range as a hexadecimal string, e.g. "4e00".
 * @param string $endCode   Upper bound of the range as a hexadecimal string, e.g. "9fa5".
 * @return int Number of characters with $startCode <= code point <= $endCode.
 */
function countChars($text, $startCode, $endCode){
 $len = mb_strlen($text, "UTF-8");
 $s = hexdec($startCode);
 $e = hexdec($endCode);
 $cnt = 0;
 for($k = 0; $k < $len; $k++){
  // Decode each character once and reuse the value for both bound checks.
  $code = getUnicodeFromOneUTF8(mb_substr($text, $k, 1, "UTF-8"));
  if($code >= $s && $code <= $e){
   $cnt++;
  }
 }
 return $cnt;
}

/**
 * Count the characters of $text in the range U+4E00..U+9FA5 (inclusive).
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of characters in that range.
 */
function countChinese($text){
 $rangeStart = "4e00";
 $rangeEnd = "9fa5";
 $total = countChars($text, $rangeStart, $rangeEnd);
 return $total;
}

/**
 * Count the Japanese kana characters in $text.
 *
 * The ranges counted are Hiragana and Katakana (U+3040..U+30FF, two adjacent
 * blocks) and Katakana Phonetic Extensions (U+31F0..U+31FF).
 *
 * The previous range U+0800..U+4E00 was far too wide: it also matched Thai,
 * Tibetan, CJK punctuation and Hangul Compatibility Jamo, and it included
 * U+4E00 itself, which countChinese() counts as well.
 *
 * Kanji are not counted here: they share code points with Chinese ideographs
 * and cannot be told apart by code point alone.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of kana characters.
 */
function countJapanese($text){
 return countChars($text, "3040", "30ff") + countChars($text, "31f0", "31ff");
}

/**
 * Count the characters of $text that fall in either of two inclusive ranges:
 * U+3130..U+318F or U+AC00..U+D7A3.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Sum of the counts for both ranges.
 */
function countKorean($text){
 $firstRangeCount = countChars($text, "3130", "318f");
 $secondRangeCount = countChars($text, "ac00", "d7a3");
 return $firstRangeCount + $secondRangeCount;
}

/**
 * Count the ASCII space characters (U+0020) in $text.
 *
 * In well-formed UTF-8 the byte 0x20 never occurs inside a multi-byte
 * sequence, so a plain byte-level count is exact and avoids decoding every
 * character individually.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of U+0020 characters.
 */
function countSpaces($text){
 return substr_count((string) $text, " ");
}

/**
 * Decode a single UTF-8 encoded character into its Unicode code point.
 *
 * The previous implementation only handled 3-byte sequences. A 2-byte
 * character came back as its raw 16-bit byte value (e.g. "é" => 0xC3A9), which
 * lands inside the Hangul range U+AC00..U+D7A3, and a 4-byte character came
 * back as a raw 32-bit value. This version decodes 1- to 4-byte sequences.
 *
 * @param string|array $word One UTF-8 character, or an array of its bytes
 *                           given either as integers (e.g. array(228, 189, 160))
 *                           or as single-byte strings.
 * @return int The code point (e.g. 20320 for U+4F60; pass it to dechex() for
 *             the hexadecimal form). Returns 0 for empty input and 0xFFFD
 *             (REPLACEMENT CHARACTER) for a malformed or truncated sequence.
 *             Bytes after the first complete sequence are ignored.
 */
function getUnicodeFromOneUTF8($word) {
  // Normalise the input to a list of byte values (0..255).
  $bytes = array();
  if (is_array($word)) {
    foreach ($word as $value)
      $bytes[] = is_int($value) ? ($value & 0xFF) : ord($value);
  } else {
    $word = (string) $word;
    // Guard the empty string explicitly: str_split('') differs across PHP versions.
    if ($word !== '') {
      foreach (str_split($word) as $value)
        $bytes[] = ord($value);
    }
  }

  $count = count($bytes);
  if ($count === 0)
    return 0;

  // The lead byte determines the sequence length and carries the top payload bits.
  $lead = $bytes[0];
  if ($lead < 0x80)
    return $lead; // 0xxxxxxx: plain ASCII

  if (($lead & 0xE0) === 0xC0) {        // 110xxxxx 10xxxxxx
    $need = 2;
    $code = $lead & 0x1F;
  } elseif (($lead & 0xF0) === 0xE0) {  // 1110xxxx 10xxxxxx 10xxxxxx
    $need = 3;
    $code = $lead & 0x0F;
  } elseif (($lead & 0xF8) === 0xF0) {  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    $need = 4;
    $code = $lead & 0x07;
  } else {
    return 0xFFFD; // stray continuation byte or invalid lead byte
  }

  if ($count < $need)
    return 0xFFFD; // truncated sequence

  // Each continuation byte must look like 10xxxxxx and contributes 6 payload bits.
  for ($i = 1; $i < $need; $i++) {
    if (($bytes[$i] & 0xC0) !== 0x80)
      return 0xFFFD;
    $code = ($code << 6) | ($bytes[$i] & 0x3F);
  }

  return $code;
}
Reference 

No comments:

Post a Comment