[pLog-svn] r2373 - plog/branches/plog-1.0.2/class/data/utf8

oscar at devel.plogworld.net oscar at devel.plogworld.net
Thu Jul 28 09:31:56 GMT 2005


Author: oscar
Date: 2005-07-28 09:31:55 +0000 (Thu, 28 Jul 2005)
New Revision: 2373

Modified:
   plog/branches/plog-1.0.2/class/data/utf8/utf8_funcs.php
Log:
updated the library for dealing with utf-8 as requested in http://bugs.plogworld.net/view.php?id=652


Modified: plog/branches/plog-1.0.2/class/data/utf8/utf8_funcs.php
===================================================================
--- plog/branches/plog-1.0.2/class/data/utf8/utf8_funcs.php	2005-07-27 21:55:18 UTC (rev 2372)
+++ plog/branches/plog-1.0.2/class/data/utf8/utf8_funcs.php	2005-07-28 09:31:55 UTC (rev 2373)
@@ -9,20 +9,21 @@
  */
 
 /**
- * int utf8_isValidChar(array $inArray)
+ * int utf8_isValidChar(string $inputStr, $start = 0)
  * Is it a valid utf8 character
- * @param $inArr input ascii characters array
+ * @param $inputStr input string
+ * @param $start start index
  * @return the ascii bytes of the utf8 char if it is a valid utf8 char. 0 if input array is empty, or -1 if it's invalid 
  * @note don't use pass-by-reference for $inArr here, otherwise efficiency will decreased significantly 
+ * @note changed param $inArr from a char array to a string ($inputStr), for performance purposes.
+ * @note preg_split consumes too much memory and cpu when splitting a big string into a char array
  */
-function utf8_isValidChar($inArr, $start = 0)
+function utf8_isValidChar($inputStr, $start = 0)
 {
-	if(empty($inArr) || $start < 0)
-		return 0;
-	$size = count($inArr);
-	if($size <= $start)
-		return 0;
-	$inOrd = ord($inArr[$start]);
+	$size = strlen($inputStr);
+	if($size <=0 || $start < 0 || $size <= $start) return 0;
+
+	$inOrd = ord($inputStr{$start});
 	$us = 0;
 	if($inOrd <= 0x7F) { //0xxxxxxx
 		return 1;
@@ -44,7 +45,7 @@
 
 	for($i=1; $i<$us; $i++)
 	{
-		$od = ord($inArr[$start+$i]); 
+		$od = ord($inputStr{$start+$i}); 
 		if($od <0x80 || $od > 0xBF)
 			return -1;
 	}
@@ -63,13 +64,14 @@
 {
 	if($start<0 || $length == 0)
 		return false;
-	$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
+	//discard preg_split function. it consumes too much system resource when it tries to split a big string to pieces
+	//$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
 	//find start
 	$si = 0;
 	$si_single = 0;
 	while($si < $start)
 	{
-		$hm = utf8_isValidChar($rawArr, $si_single);
+		$hm = utf8_isValidChar($inputStr, $si_single);
 		if($hm == -1)
 		{
 			//ignore invalid character?
@@ -99,7 +101,7 @@
 	$li = 0;
 	while($li < $length)
 	{
-		$hm = utf8_isValidChar($rawArr, $si_single);
+		$hm = utf8_isValidChar($inputStr, $si_single);
 		if($hm == -1)
 		{
 			if(!$ignore_error)
@@ -117,7 +119,7 @@
 		else
 		{
 			//for($i=0; $i<$hm; $i++) $retArr[] = array_shift($rawArr);
-			for($i=0; $i<$hm; $i++) $retArr[] = $rawArr[$si_single++];
+			for($i=0; $i<$hm; $i++) $retArr[] = $inputStr{$si_single++};
 			$li++;
 		}
 	}
@@ -132,10 +134,10 @@
  */
 function utf8_strlen($inputStr, $ignore_error = true)
 {
-	$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
+	//$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
 	$len = 0;
 	$si_single = 0;
-	while(($hm = utf8_isValidChar($rawArr, $si_single)) != 0)
+	while(($hm = utf8_isValidChar($inputStr, $si_single)) != 0)
 	{
 		if($hm == -1)
 		{
@@ -160,13 +162,14 @@
  */ 
 function utf8_proportion($inputStr)
 {
-	$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
-	$rawLen = count($rawArr);
+	//$rawArr = preg_split('//',$inputStr,-1, PREG_SPLIT_NO_EMPTY); 
+	//$rawLen = count($rawArr);
+	$rawLen = strlen($inputStr);
 	if($rawLen == 0)
 		return 100;
 	$validChars = 0;
 	$si_single = 0;
-	while(($hm = utf8_isValidChar($rawArr, $si_single)) != 0)
+	while(($hm = utf8_isValidChar($inputStr, $si_single)) != 0)
 	{
 		if($hm == -1)
 		{




More information about the pLog-svn mailing list