/**
 * Count the characters of a UTF-8 string whose Unicode code point lies in
 * the inclusive range [$startCode, $endCode].
 *
 * @param string $text      UTF-8 encoded text to scan.
 * @param string $startCode Lower bound of the range as a hexadecimal string (e.g. "4e00").
 * @param string $endCode   Upper bound of the range as a hexadecimal string (e.g. "9fa5").
 * @return int Number of characters whose code point falls inside the range.
 */
function countChars($text, $startCode, $endCode){
    $low = hexdec($startCode);
    $high = hexdec($endCode);

    // Split into characters in a single pass; calling mb_substr() with a
    // growing offset rescans the string from the start on every iteration.
    $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
    if ($chars === false) {
        // preg_split() rejects invalid UTF-8; fall back to the mbstring
        // based extraction so such input is still processed as before.
        $chars = array();
        $len = mb_strlen($text, "UTF-8");
        for ($k = 0; $k < $len; $k++) {
            $chars[] = mb_substr($text, $k, 1, "UTF-8");
        }
    }

    $cnt = 0;
    foreach ($chars as $ch) {
        // Decode once per character instead of once per comparison.
        $code = getUnicodeFromOneUTF8($ch);
        if ($code >= $low && $code <= $high) {
            $cnt++;
        }
    }
    return $cnt;
}
/**
 * Count the characters of $text that fall in U+4E00..U+9FA5, the range this
 * file treats as Chinese ideographs.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of characters inside that range.
 */
function countChinese($text){
    $rangeStart = "4e00";
    $rangeEnd = "9fa5";
    return countChars($text, $rangeStart, $rangeEnd);
}
/**
 * Count the Japanese kana characters in a UTF-8 string.
 *
 * Counted ranges:
 *   - U+3040..U+30FF  Hiragana and Katakana
 *   - U+31F0..U+31FF  Katakana Phonetic Extensions
 *   - U+FF66..U+FF9F  half-width Katakana
 *
 * Kanji are not counted here: they share code points with Chinese
 * ideographs (U+4E00 and above) and cannot be told apart by code point.
 *
 * A single U+0800..U+4E00 range is deliberately not used: it also contains
 * Thai, Indic scripts and Hangul jamo (U+3130..U+318F), and it includes
 * U+4E00 itself, so such characters would be reported as Japanese.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of kana characters.
 */
function countJapanese($text){
    return countChars($text, "3040", "30ff")
        + countChars($text, "31f0", "31ff")
        + countChars($text, "ff66", "ff9f");
}
/**
 * Count the Korean characters of $text: U+3130..U+318F (Hangul compatibility
 * jamo) plus U+AC00..U+D7A3 (Hangul syllables).
 *
 * @param string $text UTF-8 encoded text.
 * @return int Sum of the characters found in both ranges.
 */
function countKorean($text){
    $jamoCount = countChars($text, "3130", "318f");
    $syllableCount = countChars($text, "ac00", "d7a3");
    return $jamoCount + $syllableCount;
}
/**
 * Count the ASCII space characters (U+0020) in $text.
 *
 * @param string $text UTF-8 encoded text.
 * @return int Number of U+0020 characters.
 */
function countSpaces($text){
    $spaceCode = "20";
    return countChars($text, $spaceCode, $spaceCode);
}
/**
 * Decode one UTF-8 encoded character into its Unicode code point.
 *
 * Handles 1- to 4-byte sequences. Only the first character of the input is
 * decoded; any bytes after it are ignored.
 *
 * @param string|array $word The character as a UTF-8 string, or its bytes as an
 *                           array of one-byte strings or of integers (0-255),
 *                           e.g. array(228, 189, 160).
 * @return int The code point, e.g. 20320 (0x4F60) for the three bytes
 *             E4 BD A0; 0 for empty input; 0xFFFD (REPLACEMENT CHARACTER)
 *             when the bytes are not a well-formed sequence. Wrap the result
 *             in dechex() to obtain the hexadecimal form.
 */
function getUnicodeFromOneUTF8($word) {
    // Normalise the input into a list of byte values.
    $units = is_array($word) ? array_values($word) : str_split((string) $word);
    $bytes = array();
    foreach ($units as $unit) {
        $bytes[] = is_int($unit) ? $unit : ord((string) $unit);
    }

    if (count($bytes) === 0) {
        return 0;
    }

    $lead = $bytes[0];
    if ($lead >= 0 && $lead < 0x80) {
        // Single-byte (ASCII) character: the byte is the code point.
        return $lead;
    }

    // The lead byte tells how many continuation bytes follow and carries
    // the most significant payload bits.
    if ($lead >= 0xC0 && $lead < 0xE0) {
        $codePoint = $lead & 0x1F;   // 110xxxxx
        $extra = 1;
    } elseif ($lead >= 0xE0 && $lead < 0xF0) {
        $codePoint = $lead & 0x0F;   // 1110xxxx
        $extra = 2;
    } elseif ($lead >= 0xF0 && $lead < 0xF8) {
        $codePoint = $lead & 0x07;   // 11110xxx
        $extra = 3;
    } else {
        // Stray continuation byte or a value that is not a UTF-8 lead byte.
        return 0xFFFD;
    }

    if (count($bytes) <= $extra) {
        // Truncated sequence.
        return 0xFFFD;
    }

    for ($i = 1; $i <= $extra; $i++) {
        $byte = $bytes[$i];
        if ($byte < 0x80 || $byte > 0xBF) {
            // Continuation bytes must look like 10xxxxxx.
            return 0xFFFD;
        }
        // Each continuation byte contributes its low six bits.
        $codePoint = ($codePoint << 6) | ($byte & 0x3F);
    }

    return $codePoint;
}
Reference
Tuesday, July 26, 2011
Check Chinese, Japanese and Korean (CJK) blocks in UTF-8
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment