/**
 * Count the characters of a UTF-8 string whose code point lies in a range.
 *
 * The text is walked one UTF-8 character at a time; each character is decoded
 * with getUnicodeFromOneUTF8() and compared against the inclusive range
 * [$startCode, $endCode].
 *
 * @param string $text      UTF-8 encoded text to scan.
 * @param string $startCode Lower bound of the range as a hexadecimal string, e.g. "4e00".
 * @param string $endCode   Upper bound of the range as a hexadecimal string, e.g. "9fa5".
 *
 * @return int Number of characters inside the range (0 when the range is empty or reversed).
 */
function countChars($text, $startCode, $endCode)
{
    $text = (string) $text;
    $low = hexdec($startCode);
    $high = hexdec($endCode);
    $length = mb_strlen($text, "UTF-8");

    $count = 0;
    for ($k = 0; $k < $length; $k++) {
        // Decode once per character; malformed input decodes to -1 and so
        // can never fall inside a (non-negative) range.
        $code = getUnicodeFromOneUTF8(mb_substr($text, $k, 1, "UTF-8"));
        if ($code >= $low && $code <= $high) {
            $count++;
        }
    }

    return $count;
}

/**
 * Count characters in U+4E00..U+9FA5, the range this file treats as Chinese.
 *
 * @param string $text UTF-8 encoded text.
 *
 * @return int
 */
function countChinese($text)
{
    return countChars($text, "4e00", "9fa5");
}

/**
 * Count characters in U+0800..U+4E00, the range this file treats as Japanese.
 *
 * NOTE: this span is far wider than the kana blocks (U+3040..U+30FF). It also
 * contains U+3130..U+318F, which countKorean() counts as well, and its upper
 * bound U+4E00 is counted by countChinese() too. The bounds are kept as they
 * were so existing callers see the same numbers.
 *
 * @param string $text UTF-8 encoded text.
 *
 * @return int
 */
function countJapanese($text)
{
    return countChars($text, "0800", "4e00");
}

/**
 * Count characters in U+3130..U+318F plus U+AC00..U+D7A3, the ranges this
 * file treats as Korean.
 *
 * @param string $text UTF-8 encoded text.
 *
 * @return int
 */
function countKorean($text)
{
    return countChars($text, "3130", "318f") + countChars($text, "ac00", "d7a3");
}

/**
 * Count ASCII space characters (U+0020) only; tabs, newlines and other
 * Unicode spaces are not counted.
 *
 * @param string $text UTF-8 encoded text.
 *
 * @return int
 */
function countSpaces($text)
{
    return countChars($text, "20", "20");
}

/**
 * Decode the first UTF-8 character of the input into its Unicode code point.
 *
 * Example: the character U+4F60 is encoded as the bytes 228, 189, 160
 * (0xE4 0xBD 0xA0) and decodes to 20320 (0x4F60).
 *
 * The sequence length is taken from the lead byte, so 1-, 2-, 3- and 4-byte
 * characters are all decoded. Bytes after the first complete character are
 * ignored.
 *
 * @param string|array $word A string holding one UTF-8 character, or an array
 *                           of its bytes given either as one-byte strings or
 *                           as integers 0..255.
 *
 * @return int The code point; 0 for empty input; -1 when the bytes do not
 *             start with a complete, well-formed UTF-8 sequence.
 */
function getUnicodeFromOneUTF8($word)
{
    // Normalise the input to a list of integer byte values.
    if (is_array($word)) {
        $units = $word;
    } else {
        $word = (string) $word;
        $units = ($word === '') ? array() : str_split($word);
    }

    $bytes = array();
    foreach ($units as $unit) {
        $bytes[] = is_int($unit) ? ($unit & 0xFF) : ord((string) $unit);
    }

    $available = count($bytes);
    if ($available === 0) {
        return 0;
    }

    // The lead byte tells how many bytes the character occupies and carries
    // the most significant payload bits.
    $lead = $bytes[0];
    if ($lead < 0x80) {
        return $lead; // 0xxxxxxx: plain ASCII
    } elseif (($lead & 0xE0) === 0xC0) {
        $needed = 2;  // 110xxxxx
        $code = $lead & 0x1F;
    } elseif (($lead & 0xF0) === 0xE0) {
        $needed = 3;  // 1110xxxx
        $code = $lead & 0x0F;
    } elseif (($lead & 0xF8) === 0xF0) {
        $needed = 4;  // 11110xxx
        $code = $lead & 0x07;
    } else {
        return -1;    // stray continuation byte or invalid lead byte
    }

    if ($available < $needed) {
        return -1;    // truncated sequence
    }

    // Every continuation byte must look like 10xxxxxx and adds six bits.
    for ($i = 1; $i < $needed; $i++) {
        if (($bytes[$i] & 0xC0) !== 0x80) {
            return -1;
        }
        $code = ($code << 6) | ($bytes[$i] & 0x3F);
    }

    // Use dechex($code) on the result if a hexadecimal string such as "4f60" is wanted.
    return $code;
}

// Reference
Tuesday, July 26, 2011
Check Chinese, Japanese and Korean (CJK) blocks in UTF-8
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment