[pLog-svn] r1633 - in plog/trunk/class: . bayesian

Tue Mar 29 17:27:55 GMT 2005

Author: oscar
Date: 2005-03-29 17:27:55 +0000 (Tue, 29 Mar 2005)
New Revision: 1633

Modified:
   plog/trunk/class/Doxyfile
   plog/trunk/class/bayesian/bayesianfiltercore.class.php
   plog/trunk/class/bayesian/bayesianfilterutils.class.php
   plog/trunk/class/bayesian/bayesiantokenizer.class.php
   plog/trunk/class/bayesian/tokenizer.class.php
Log:
added more documentation


Modified: plog/trunk/class/Doxyfile
===================================================================

--- plog/trunk/class/Doxyfile	2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/Doxyfile	2005-03-29 17:27:55 UTC (rev 1633)
@@ -416,7 +416,7 @@
 # directories like "/usr/src/myproject". Separate the files or directories 
 # with spaces.
 
-INPUT                  = object/ dao/ logger/ gallery/ locale/ config/ file/ database/ data/forms security/ xml/ controller/ mail/
+INPUT                  = object/ dao/ logger/ gallery/ locale/ config/ file/ database/ data/forms security/ xml/ controller/ mail/ bayesian/
 
 # If the value of the INPUT tag contains directories, you can use the 
 # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 

Modified: plog/trunk/class/bayesian/bayesianfiltercore.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesianfiltercore.class.php	2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesianfiltercore.class.php	2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,7 +1,12 @@
 <?php
 
     /**
-     * @package bayesian
+     * \defgroup Bayesian
+     *
+     * The Bayesian module provides the logic that implements a spam filter based
+     * on Bayesian filtering techniques. The filter is trained via the user's own posts (they are
+     * considered "safe" content) and the negative examples are provided via the posts marked as spam by
+     * users via the admin interface.
      */
 
 
@@ -11,8 +16,21 @@
     include_once( PLOG_CLASS_PATH."class/dao/bayesiantokens.class.php" );
 
     /**
-     * provides the 'train' and 'untrain' method that are crucial to get our
-     * filter to differentiate between spam and non-spam...
+     * \ingroup Bayesian
+     *
+     * This class provides the 'train' and 'untrain' methods that are crucial to get our
+     * filter to differentiate between spam and non-spam. 
+     *
+     * The filter needs a bit of training before it can recognize spam content so please use the 
+     * BayesianFilterCore::train() method for giving "bad" content to the filter and the 
+     * BayesianFilterCore::untrain() for giving "good" content to the filter.
+     *
+     * This class is the base for the BayesianFilter object that works as a PipelineFilter in the
+     * Pipeline.
+     *
+     * Client classes will rarely need to use this class.
+     *
+     * @see BayesianFilter
      */
 	class BayesianFilterCore extends Object
     {
@@ -25,13 +43,14 @@
         /**
          * Trains the filter to recognize comments like this as spam
          *
-         * @param blogId
-         * @param topic
-         * @param text
-         * @param userName
-         * @param userEmail
-         * @param userUrl
-         * @param spam Wether we should set this message as spam or not
+         * @param blogId The blog id
+         * @param topic The topic of the comment/article that we're using to train the filter
+         * @param text The text of the comment/article that we're using to train the filter
+         * @param userName Name of the user posting this comment/article
+         * @param userEmail Email address of the user posting this comment/article
+         * @param userUrl URL of the user posting this comment/article
+         * @param spam Whether we should set this message as spam or not. The content will be marked
+         * as non-spam by default
          * @static
          * @return true
          */
@@ -72,7 +91,7 @@
         }
         
         /**
-         * given an Article object, trains the filter based on the article data
+         * Given an Article object, trains the filter based on the article data
          *
          * @param article An Article object
          * @return true
@@ -92,13 +111,14 @@
         /**
          * untrains the filter
          *
-         * @param blogId
-         * @param topic
-         * @param text
-         * @param userName
-         * @param userEmail
-         * @param userUrl,
-         * @param spam
+         * @param blogId The blog id
+         * @param topic The topic of the comment/article that we're using to untrain the filter
+         * @param text The text of the comment/article that we're using to untrain the filter
+         * @param userName Name of the user posting this comment/article
+         * @param userEmail Email address of the user posting this comment/article
+         * @param userUrl URL of the user posting this comment/article
+         * @param spam Whether we should unmark this content as spam or not. The content will be unmarked
+         * as non-spam by default
          * @static
          * @see train
          */

Modified: plog/trunk/class/bayesian/bayesianfilterutils.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesianfilterutils.class.php	2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesianfilterutils.class.php	2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,20 +1,17 @@
 <?php
 
-    /**
-     * @package bayesian
-     */
-
-
 	include_once( PLOG_CLASS_PATH."class/dao/model.class.php" );
     include_once( PLOG_CLASS_PATH."class/bayesian/bayesianfiltercore.class.php" );
     include_once( PLOG_CLASS_PATH."class/dao/articles.class.php" );
     include_once( PLOG_CLASS_PATH."class/dao/articlecomments.class.php" );
 
     /**
+     * \ingroup Bayesian
+     *
      * some commodity functions have been included here such as making the filter
      * take all the comments from the blog and train the filter using those comments
      * as "good" raw material and so on. These functions do not really belong to the
-     * core so I've put them here
+     * core so I've put them here.
      */
     class BayesianFilterUtils extends Model
     {

Modified: plog/trunk/class/bayesian/bayesiantokenizer.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesiantokenizer.class.php	2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesiantokenizer.class.php	2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,37 +1,39 @@
 <?php
 
-    /**
-     * @package bayesian
-     */
-
-
     include_once( PLOG_CLASS_PATH."class/dao/bayesiantoken.class.php" );
 	include_once( PLOG_CLASS_PATH."class/bayesian/tokenizer.class.php" );
 
 	define( "SPLIT_REG_EXP", "[^a-zA-Z0-9àáèéíïòóúüÀÁÈÉÍÏÒÓÚÜ'$!,.^-]+");
 
     /**
-     * Filters the text posted in a comment by a user, to prevent spam-bots. This
-     * filter only works if the incoming request has the "op" parameter as
-     * "AddComment", because then it means that we're posting a comment. If it's not
-     * like that, then we'll quit. Otherwise, the process will continue as normally.
-     */
-     
-	class BayesianTokenizer extends Tokenizer {
+     * \ingroup Bayesian
+     *
+     * This class takes care of splitting a valid HTML source into the different words that
+     * make it up, taking tags into account. The main public method is BayesianTokenizer::tokenize()
+     */     
+	class BayesianTokenizer extends Tokenizer 
+	{
     
         var $_htmlTags = array();
         
         /**
-        * -- Add function info here --
-        */        
+         * constructor, it only calls the parent constructor.
+         * @see Tokenizer
+         */        
         function BayesianTokenizer()
         {            
             $this->Tokenizer();
         }
         
         /**
-        * -- Add function info here --
-        */
+         * given an input text, possibly containing HTML tags, it will split it into
+         * all the different words that make it up.
+         *
+         * @param text The text to split
+         * @param unique Whether the return array should contain unique items or if the same
+         * word is allowed more than once.
+         * @return An array where each item is a word from the text
+         */
         function tokenize($text, $unique = false)
         {
         	$this->_htmlTags = array();
@@ -49,8 +51,8 @@
         }
         
         /**
-        * -- Add function info here --
-        */
+         * @private
+         */
         function _tokenize($text)
         {
             $tokensTemp = split(SPLIT_REG_EXP, $text);            
@@ -97,8 +99,8 @@
         }
         
         /**
-        * -- Add function info here --
-        */
+         * @private
+         */
     	function _getValidHtmlTags($tags)
     	{
     		$validTags = array();
@@ -114,9 +116,9 @@
     		return $validTags;
     	}
     	
-    	/**
-        * -- Add function info here --
-        */
+        /**
+         * @private
+         */
     	function _stripHtmlTags($text)
     	{    	    
             preg_match_all("/(<[^>]+>)/", $text, $regs);
@@ -131,9 +133,9 @@
     		return preg_replace("/<[^>]+>/", "", $text);
     	}
     	
-    	/**
-        * -- Add function info here --
-        */
+        /**
+         * @private
+         */
     	function _tokenizeHtmlTags($tags)
     	{   
     	    $tokens = array();
@@ -146,9 +148,9 @@
             return $tokens;
     	}
     	
-    	/**
-        * -- Add function info here --
-        */
+        /**
+         * @private
+         */
     	function _tokenizeHtmlTag($tag)
     	{   
     	    $tokens = array();
@@ -182,9 +184,9 @@
             return $tokens;
     	}
     	
-    	/**
-        * -- Add function info here --
-        */
+        /**
+         * @private
+         */
     	function _unquoteToken($token)
     	{   
     	    if (ereg("^['\"](.+)['\"]$", $token, $regs))
@@ -197,9 +199,9 @@
             }
     	}    	
     	
-    	/**
-        * -- Add function info here --
-        */
+        /**
+         * @private
+         */
         function addContextMark($tokens, $mark)
         {
             $count = count($tokens);

Modified: plog/trunk/class/bayesian/tokenizer.class.php
===================================================================
--- plog/trunk/class/bayesian/tokenizer.class.php	2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/tokenizer.class.php	2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,26 +1,32 @@
 <?php
 
-    /**
-     * @package bayesian
-     */
-
-
 	include_once( PLOG_CLASS_PATH."class/object/object.class.php" );
 
     /**
-     * Interface class that defines the methods that should be implemented
-     * by child classes wishing to implement a configuratino settings storage backend.
+     * \ingroup Bayesian
+     * 
+     * Class that defines the interface for classes wishing to implement a tokenizer
      */
-    class Tokenizer extends Object {
+    class Tokenizer extends Object 
+    {
 
         /**
-        * -- Add function info here --
-        */
+         * constructor, takes no parameters
+         */
     	function Tokenizer()
         {
         	$this->Object();
         }
 
+        /**
+         * given an input text, possibly containing HTML tags, it will split it into
+         * all the different words that make it up.
+         *
+         * @param text The text to split
+         * @param unique Whether the return array should contain unique items or if the same
+         * word is allowed more than once.
+         * @return An array where each item is a word from the text
+         */
         function tokenize($text, $unique = false)
         {
         	throw(new Exception("Tokenizer::tokenize: This method must be implemented by child classes."));