[pLog-svn] r1633 - in plog/trunk/class: . bayesian
oscar at devel.plogworld.net
oscar at devel.plogworld.net
Tue Mar 29 17:27:55 GMT 2005
Author: oscar
Date: 2005-03-29 17:27:55 +0000 (Tue, 29 Mar 2005)
New Revision: 1633
Modified:
plog/trunk/class/Doxyfile
plog/trunk/class/bayesian/bayesianfiltercore.class.php
plog/trunk/class/bayesian/bayesianfilterutils.class.php
plog/trunk/class/bayesian/bayesiantokenizer.class.php
plog/trunk/class/bayesian/tokenizer.class.php
Log:
added more documentation
Modified: plog/trunk/class/Doxyfile
===================================================================
--- plog/trunk/class/Doxyfile 2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/Doxyfile 2005-03-29 17:27:55 UTC (rev 1633)
@@ -416,7 +416,7 @@
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.
-INPUT = object/ dao/ logger/ gallery/ locale/ config/ file/ database/ data/forms security/ xml/ controller/ mail/
+INPUT = object/ dao/ logger/ gallery/ locale/ config/ file/ database/ data/forms security/ xml/ controller/ mail/ bayesian/
# If the value of the INPUT tag contains directories, you can use the
# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
Modified: plog/trunk/class/bayesian/bayesianfiltercore.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesianfiltercore.class.php 2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesianfiltercore.class.php 2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,7 +1,12 @@
<?php
/**
- * @package bayesian
+ * \defgroup Bayesian
+ *
+ * The Bayesian module provides the logic that implements a spam filter based
+ * on Bayesian filtering techniques. The filter is trained via the user's own posts (they are
+ * considered "safe" content) and the negative examples are provided via the posts marked as spam by
+ * users via the admin interface.
*/
@@ -11,8 +16,21 @@
include_once( PLOG_CLASS_PATH."class/dao/bayesiantokens.class.php" );
/**
- * provides the 'train' and 'untrain' method that are crucial to get our
- * filter to differentiate between spam and non-spam...
+ * \ingroup Bayesian
+ *
+ * This class provides the 'train' and 'untrain' methods that are crucial to get our
+ * filter to differentiate between spam and non-spam.
+ *
+ * The filter needs a bit of training before it can recognize spam content so please use the
+ * BayesianFilterCore::train() method for giving "bad" content to the filter and the
+ * BayesianFilterCore::untrain() for giving "good" content to the filter.
+ *
+ * This class is the base for the BayesianFilter object that works as a PipelineFilter in the
+ * Pipeline.
+ *
+ * Client classes will rarely need to use this class.
+ *
+ * @see BayesianFilter
*/
class BayesianFilterCore extends Object
{
@@ -25,13 +43,14 @@
/**
* Trains the filter to recognize comments like this as spam
*
- * @param blogId
- * @param topic
- * @param text
- * @param userName
- * @param userEmail
- * @param userUrl
- * @param spam Wether we should set this message as spam or not
+ * @param blogId The blog id
+ * @param topic The topic of the comment/article that we're using to train the filter
+ * @param text The text of the comment/article that we're using to train the filter
+ * @param userName Name of the user posting this comment/article
+ * @param userEmail Email address of the user posting this comment/article
+ * @param userUrl URL of the user posting this comment/article
+ * @param spam Whether we should set this message as spam or not. The content will be marked
+ * as non-spam by default
* @static
* @return true
*/
@@ -72,7 +91,7 @@
}
/**
- * given an Article object, trains the filter based on the article data
+ * Given an Article object, trains the filter based on the article data
*
* @param article An Article object
* @return true
@@ -92,13 +111,14 @@
/**
* untrains the filter
*
- * @param blogId
- * @param topic
- * @param text
- * @param userName
- * @param userEmail
- * @param userUrl,
- * @param spam
+ * @param blogId The blog id
+ * @param topic The topic of the comment/article that we're using to untrain the filter
+ * @param text The text of the comment/article that we're using to untrain the filter
+ * @param userName Name of the user posting this comment/article
+ * @param userEmail Email address of the user posting this comment/article
+ * @param userUrl URL of the user posting this comment/article
+ * @param spam Whether we should unmark these contents as spam or not. The content will be unmarked
+ * as non-spam by default
* @static
* @see train
*/
Modified: plog/trunk/class/bayesian/bayesianfilterutils.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesianfilterutils.class.php 2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesianfilterutils.class.php 2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,20 +1,17 @@
<?php
- /**
- * @package bayesian
- */
-
-
include_once( PLOG_CLASS_PATH."class/dao/model.class.php" );
include_once( PLOG_CLASS_PATH."class/bayesian/bayesianfiltercore.class.php" );
include_once( PLOG_CLASS_PATH."class/dao/articles.class.php" );
include_once( PLOG_CLASS_PATH."class/dao/articlecomments.class.php" );
/**
+ * \ingroup Bayesian
+ *
* some convenience functions have been included here such as making the filter
* take all the comments from the blog and train the filter using those comments
* as "good" raw material and so on. These functions do not really belong to the
- * core so I've put them here
+ * core so I've put them here.
*/
class BayesianFilterUtils extends Model
{
Modified: plog/trunk/class/bayesian/bayesiantokenizer.class.php
===================================================================
--- plog/trunk/class/bayesian/bayesiantokenizer.class.php 2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/bayesiantokenizer.class.php 2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,37 +1,39 @@
<?php
- /**
- * @package bayesian
- */
-
-
include_once( PLOG_CLASS_PATH."class/dao/bayesiantoken.class.php" );
include_once( PLOG_CLASS_PATH."class/bayesian/tokenizer.class.php" );
define( "SPLIT_REG_EXP", "[^a-zA-Z0-9àáèéíïòóúüÀÁÈÉÍÏÒÓÚÜ'$!,.^-]+");
/**
- * Filters the text posted in a comment by a user, to prevent spam-bots. This
- * filter only works if the incoming request has the "op" parameter as
- * "AddComment", because then it means that we're posting a comment. If it's not
- * like that, then we'll quit. Otherwise, the process will continue as normally.
- */
-
- class BayesianTokenizer extends Tokenizer {
+ * \ingroup Bayesian
+ *
+ * This class takes care of splitting a valid html source into the different words that
+ * make it up, taking tags into account. The main public method is BayesianTokenizer::tokenize()
+ */
+ class BayesianTokenizer extends Tokenizer
+ {
var $_htmlTags = array();
/**
- * -- Add function info here --
- */
+ * constructor, it only calls the parent constructor.
+ * @see Tokenizer
+ */
function BayesianTokenizer()
{
$this->Tokenizer();
}
/**
- * -- Add function info here --
- */
+ * given an input text, possibly containing HTML tags, it will split it into
+ * all the different words that make it up.
+ *
+ * @param text The text to split
+ * @param unique Whether the return array should contain unique items or if the same
+ * word is allowed more than once.
+ * @return An array where each item is a word from the text
+ */
function tokenize($text, $unique = false)
{
$this->_htmlTags = array();
@@ -49,8 +51,8 @@
}
/**
- * -- Add function info here --
- */
+ * @private
+ */
function _tokenize($text)
{
$tokensTemp = split(SPLIT_REG_EXP, $text);
@@ -97,8 +99,8 @@
}
/**
- * -- Add function info here --
- */
+ * @private
+ */
function _getValidHtmlTags($tags)
{
$validTags = array();
@@ -114,9 +116,9 @@
return $validTags;
}
- /**
- * -- Add function info here --
- */
+ /**
+ * @private
+ */
function _stripHtmlTags($text)
{
preg_match_all("/(<[^>]+>)/", $text, $regs);
@@ -131,9 +133,9 @@
return preg_replace("/<[^>]+>/", "", $text);
}
- /**
- * -- Add function info here --
- */
+ /**
+ * @private
+ */
function _tokenizeHtmlTags($tags)
{
$tokens = array();
@@ -146,9 +148,9 @@
return $tokens;
}
- /**
- * -- Add function info here --
- */
+ /**
+ * @private
+ */
function _tokenizeHtmlTag($tag)
{
$tokens = array();
@@ -182,9 +184,9 @@
return $tokens;
}
- /**
- * -- Add function info here --
- */
+ /**
+ * @private
+ */
function _unquoteToken($token)
{
if (ereg("^['\"](.+)['\"]$", $token, $regs))
@@ -197,9 +199,9 @@
}
}
- /**
- * -- Add function info here --
- */
+ /**
+ * @private
+ */
function addContextMark($tokens, $mark)
{
$count = count($tokens);
Modified: plog/trunk/class/bayesian/tokenizer.class.php
===================================================================
--- plog/trunk/class/bayesian/tokenizer.class.php 2005-03-29 15:14:36 UTC (rev 1632)
+++ plog/trunk/class/bayesian/tokenizer.class.php 2005-03-29 17:27:55 UTC (rev 1633)
@@ -1,26 +1,32 @@
<?php
- /**
- * @package bayesian
- */
-
-
include_once( PLOG_CLASS_PATH."class/object/object.class.php" );
/**
- * Interface class that defines the methods that should be implemented
- * by child classes wishing to implement a configuratino settings storage backend.
+ * \ingroup Bayesian
+ *
+ * Class that defines the interface for classes wishing to implement a tokenizer
*/
- class Tokenizer extends Object {
+ class Tokenizer extends Object
+ {
/**
- * -- Add function info here --
- */
+ * constructor, takes no parameters
+ */
function Tokenizer()
{
$this->Object();
}
+ /**
+ * given an input text, possibly containing HTML tags, it will split it into
+ * all the different words that make it up.
+ *
+ * @param text The text to split
+ * @param unique Whether the return array should contain unique items or if the same
+ * word is allowed more than once.
+ * @return An array where each item is a word from the text
+ */
function tokenize($text, $unique = false)
{
throw(new Exception("Tokenizer::tokenize: This method must be implemented by child classes."));
More information about the pLog-svn
mailing list