[pLog-svn] r5948 - plugins/branches/lifetype-1.2/tagcloud
jondaley at devel.lifetype.net
jondaley at devel.lifetype.net
Mon Sep 24 14:50:43 EDT 2007
Author: jondaley
Date: 2007-09-24 14:50:43 -0400 (Mon, 24 Sep 2007)
New Revision: 5948
Modified:
plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php
Log:
What do folks think about this? I just wrote a post in which I repeated a couple of words many times, and even though the tagcloud is set to look at the last 60 posts, because I had repeated the word so many times in that one post, it ended up as the most frequent word overall. I think the tagcloud should take into account which post each word came from, and lower the weight of words that are repeated within the same post. I arbitrarily picked '2' as the maximum number of occurrences counted per post; that should probably be larger or, even better, a configuration option.
Modified: plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php
===================================================================
--- plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php 2007-09-23 20:46:56 UTC (rev 5947)
+++ plugins/branches/lifetype-1.2/tagcloud/plugintagcloud.class.php 2007-09-24 18:50:43 UTC (rev 5948)
@@ -2,7 +2,8 @@
lt_include( PLOG_CLASS_PATH."class/plugin/pluginbase.class.php" );
-
+define('TAGCLOUD_FILTER_REGEXP', '/\s*[\s+\.|\?|,|(|)|\-+|\'|\"|!|=|;|×|\$|\/|:|{|}]\s*/i');
+
/*
* This plugin generate TagCloud for a specific Blog
*/
@@ -28,7 +29,7 @@
lt_include( PLOG_CLASS_PATH."class/database/db.class.php" );
$this->db =& Db::getDb();
$this->id = "tagcloud";
- $this->version = "20070725";
+ $this->version = "20070924";
$this->locales = Array("en_UK");
@@ -162,15 +163,37 @@
$result = $this->db->Execute($query);
if(!$result || ($result->RecordCount() == 0))
return false;
-
+
+ $data = array();
// get the articles content
while ($row = $result->FetchRow()) {
- $data[] = $row['normalized_topic'].' '.$row['normalized_text'];
+ $data[] = $row['normalized_topic'] . " ";
+
+ // Limit the amount of occurrences in one post, to prevent
+ // one post from taking over all of the statistics.
+ // TODO: 1. don't switch back and forth between strings and
+ // arrays so many times
+ // 2. Make the number of occurrences allowed a configuration
+ // option
+ $text_limited = array();
+ $words = preg_split(FILTER_REGEXP, strtolower($row['normalized_text']));
+ foreach($words as $word){
+ if(empty($text_limited[$word])){
+ $text_limited[$word] = 1;
+ }
+ else if($text_limited[$word] == 1){
+ $text_limited[$word] = 2;
+ }
+ }
+ foreach($text_limited as $word => $frequency){
+ for($i = 0; $i < $frequency; $i++)
+ $data[] = $word . " ";
+ }
}
$data = implode(' ',$data);
-
+
// Split keywords
- $words = preg_split('/\s*[\s+\.|\?|,|(|)|\-+|\'|\"|!|=|;|×|\$|\/|:|{|}]\s*/i', strtolower($data));
+ $words = preg_split(FILTER_REGEXP, strtolower($data));
$acv = array_count_values( $words );
// Remove unwanted keywords
More information about the pLog-svn
mailing list