[pLog-svn] r5948 - plugins/branches/lifetype-1.2/tagcloud

jondaley at devel.lifetype.net jondaley at devel.lifetype.net
Mon Sep 24 14:50:43 EDT 2007


Author: jondaley
Date: 2007-09-24 14:50:43 -0400 (Mon, 24 Sep 2007)
New Revision: 5948

Modified:
   plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php
Log:
What do folks think about this?  I just wrote a post in which I repeated a couple of words many times, and even though the tagcloud is set to look at the last 60 posts, one of those words ended up as the most frequent word because I had repeated it so often.  I think the tagcloud should take into account where the words came from, and lower the weight of words repeated within the same post.  I arbitrarily picked '2' as the maximum number of occurrences counted per post; that should probably be larger or, even better, a configuration option.

Modified: plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php
===================================================================
--- plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php	2007-09-23 20:46:56 UTC (rev 5947)
+++ plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php	2007-09-24 18:50:43 UTC (rev 5948)
@@ -2,7 +2,8 @@
 
     lt_include( PLOG_CLASS_PATH."class/plugin/pluginbase.class.php" );
 
-    
+define('TAGCLOUD_FILTER_REGEXP', '/\s*[\s+\.|\?|,|(|)|\-+|\'|\"|!|=|;|×|\$|\/|:|{|}]\s*/i');
+
     /*
      * This plugin generate TagCloud for a specific Blog
      */
@@ -28,7 +29,7 @@
             lt_include( PLOG_CLASS_PATH."class/database/db.class.php" );
             $this->db =& Db::getDb();
             $this->id = "tagcloud";
-            $this->version = "20070725";
+            $this->version = "20070924";
             
             $this->locales = Array("en_UK");
                                 
@@ -162,15 +163,37 @@
 		$result = $this->db->Execute($query);
         if(!$result || ($result->RecordCount() == 0))
 		    return false;
-	
+
+        $data = array();
 		// get the articles content
 		while ($row = $result->FetchRow()) {
-			$data[] = $row['normalized_topic'].' '.$row['normalized_text'];
+			$data[] = $row['normalized_topic'] . " ";
+
+                // Limit the amount of occurrences in one post, to prevent
+                // one post from taking over all of the statistics.
+                // TODO: 1. don't switch back and forth between strings and
+                //          arrays so many times
+                //       2. Make the number of occurrences allowed a configuration
+                //          option
+            $text_limited = array();
+            $words = preg_split(FILTER_REGEXP, strtolower($row['normalized_text']));
+            foreach($words as $word){
+                if(empty($text_limited[$word])){
+                    $text_limited[$word] = 1;
+                }
+                else if($text_limited[$word] == 1){
+                    $text_limited[$word] = 2;
+                }
+            }
+            foreach($text_limited as $word => $frequency){
+                for($i = 0; $i < $frequency; $i++)
+                    $data[] = $word . " ";
+            }
 		}	
 		$data = implode(' ',$data);
-		
+        
 		// Split keywords
-		$words = preg_split('/\s*[\s+\.|\?|,|(|)|\-+|\'|\"|!|=|;|&#0215;|\$|\/|:|{|}]\s*/i', strtolower($data));
+        $words = preg_split(FILTER_REGEXP, strtolower($data));
 		$acv = array_count_values( $words );
 
 		// Remove unwanted keywords



More information about the pLog-svn mailing list