[pLog-svn] r2348 - plog/trunk/class/security

pwestbro at devel.plogworld.net pwestbro at devel.plogworld.net
Tue Jul 19 05:55:13 GMT 2005


Author: pwestbro
Date: 2005-07-19 05:55:13 +0000 (Tue, 19 Jul 2005)
New Revision: 2348

Modified:
   plog/trunk/class/security/bayesianfilter.class.php
   plog/trunk/class/security/commentfilter.class.php
   plog/trunk/class/security/pipeline.class.php
   plog/trunk/class/security/pipelinerequest.class.php
Log:
Fixed bug http://bugs.plogworld.net/view.php?id=638

Now when a pipeline filter rejects a message, all of the registered pipeline
filters are run again, with the request flagged as rejected, so they have a
chance to do something with this new information.  The Bayesian spam filter,
if it had thought that the message was not spam, will untrain the message and
then train it as spam.

The DNS Anti Spam filter could submit the IP address and the URLs to the DNS
blacklists.

Note: when merging the 1.0.2 branch to the trunk, r2305 should not be
merged.

I will write up a bug for modifying the plugins.


Modified: plog/trunk/class/security/bayesianfilter.class.php
===================================================================
--- plog/trunk/class/security/bayesianfilter.class.php	2005-07-19 04:57:46 UTC (rev 2347)
+++ plog/trunk/class/security/bayesianfilter.class.php	2005-07-19 05:55:13 UTC (rev 2348)
@@ -1,6 +1,6 @@
 <?php
 
-	include_once( PLOG_CLASS_PATH."class/security/pipelinefilter.class.php" );
+    include_once( PLOG_CLASS_PATH."class/security/pipelinefilter.class.php" );
 
     define("HIGH_SPAM_PROBABILITY", 1000);
 
@@ -25,12 +25,12 @@
      * in order to filter spam comments out. The filter needs some training but after that it should
      * be fairly reliable.
      */
-	class BayesianFilter extends PipelineFilter 
-	{
+    class BayesianFilter extends PipelineFilter 
+    {
     
         function BayesianFilter( $pipelineRequest )
         {
-        	$this->PipelineFilter( $pipelineRequest );
+            $this->PipelineFilter( $pipelineRequest );
         }
         
         /**
@@ -43,51 +43,53 @@
         {
             include_once( PLOG_CLASS_PATH."class/dao/articlecomments.class.php" );
 
-        	$config	=& Config::getConfig();
-        	
-        	if (!$config->getValue("bayesian_filter_enabled"))
-        	{
-        		return new PipelineResult(true);
-			}
-		
-        	// get some info        	
+            $config =& Config::getConfig();
+            
+            if (!$config->getValue("bayesian_filter_enabled"))
+            {
+                return new PipelineResult(true);
+            }
+        
+            // get some info            
             $blogInfo = $this->_pipelineRequest->getBlogInfo();
             $request  = $this->_pipelineRequest->getHttpRequest();
-			
-            // we only have to filter the contents if the user is posting a comment or we're receiving
+            $previoslyRejected = $this->_pipelineRequest->getRejectedState();
+            
+            // we only have to filter the contents if the user is posting a comment 
+            // or we're receiving a trackback
             // so there's no point in doing anything else if that's not the case
             if( $request->getValue( "op" ) != "AddComment" && $request->getValue( "op" ) != "AddTrackback" ) {
-            	$result = new PipelineResult();
+                $result = new PipelineResult();
                 return $result;
             }
             
             // if it's a trackback, the data is in another place...
-			$isTrackback = ($request->getValue( "op" ) == "AddTrackback");
-			if( $isTrackback ) {
-				$commentText = $request->getValue( "excerpt" );
-				$commentTopic = $request->getValue( "title" );
-				$articleId = $request->getValue( "id" );
-				// that's all we can get from a trackback...
-				$userName = $request->getValue( "blog_name" );
-				$userUrl = $request->getValue( "url" );
-				$userEmail = $request->getValue( "" );
-			}
-			else {
-				// or else let's assume that we're dealing with a comment
-				$commentText = $request->getValue( "commentText" );
-				$commentTopic = $request->getValue( "commentTopic" );
-				$userName = $request->getValue( "userName" );
-				$userEmail = $request->getValue( "userEmail" );
-				$userUrl = $request->getValue( "userUrl" );
-				$articleId = $request->getValue( "articleId" );
-				$parentId  = $request->getValue( "parentId" );			
-			}
-			
+            $isTrackback = ($request->getValue( "op" ) == "AddTrackback");
+            if( $isTrackback ) {
+                $commentText = $request->getValue( "excerpt" );
+                $commentTopic = $request->getValue( "title" );
+                $articleId = $request->getValue( "id" );
+                // that's all we can get from a trackback...
+                $userName = $request->getValue( "blog_name" );
+                $userUrl = $request->getValue( "url" );
+                $userEmail = $request->getValue( "" );
+            }
+            else {
+                // or else let's assume that we're dealing with a comment
+                $commentText = $request->getValue( "commentText" );
+                $commentTopic = $request->getValue( "commentTopic" );
+                $userName = $request->getValue( "userName" );
+                $userEmail = $request->getValue( "userEmail" );
+                $userUrl = $request->getValue( "userUrl" );
+                $articleId = $request->getValue( "articleId" );
+                $parentId  = $request->getValue( "parentId" );          
+            }
+            
             if( $parentId == "" )
-            	$parentId = 0;
+                $parentId = 0;
 
             $spamicity = $this->getSpamProbability($blogInfo->getId(), $commentTopic, $commentText, $userName, $userEmail, $userUrl);
-			
+            
             if ($spamicity >= $config->getValue("bayesian_filter_spam_probability_treshold"))
             {
                 $result = new PipelineResult(false, HIGH_SPAM_PROBABILITY, "You cannot post this message. Anti-spam filter has blocked it.");
@@ -99,35 +101,53 @@
                 // still be added but marked as spam and so on... sometimes breaking a few
                 // rules makes things easier :)
                 if( $config->getValue( "bayesian_filter_spam_comments_action" ) == BAYESIAN_FILTER_KEEP_COMMENT_ACTION ) {
-                	$comments = new ArticleComments();
-                	$clientIp = Client::getIp();
-					$comment = new UserComment( $articleId, $parentId, $commentTopic, $commentText,
-					                               null, $userName, $userEmail, $userUrl, $clientIp,
-												   0, COMMENT_STATUS_SPAM );
-					// mark it as a trackback instead of a user comment...
-					
-					if( $isTrackback ) {
-						$this->log->debug("saving the trackback but saving it as spam...");
-						$comment->setType( COMMENT_TYPE_TRACKBACK );
-					}
-						
-					// add the comment to the db
+                    $comments = new ArticleComments();
+                    $clientIp = Client::getIp();
+                    $comment = new UserComment( $articleId, $parentId, $commentTopic, $commentText,
+                                                   null, $userName, $userEmail, $userUrl, $clientIp,
+                                                   0, COMMENT_STATUS_SPAM );
+                    // mark it as a trackback instead of a user comment...
+                    
+                    if( $isTrackback ) {
+                        $this->log->debug("saving the trackback but saving it as spam...");
+                        $comment->setType( COMMENT_TYPE_TRACKBACK );
+                    }
+                        
+                    // add the comment to the db
                     $comments->addComment( $comment );
                 }
                 else {
-                	// nothing to do here, simply throw the comment away
+                    // nothing to do here, simply throw the comment away
                 }
-				$spam = true;
+                $spam = true;
             }
             else
             {
                 $result = new PipelineResult(true);
-				$spam = false;
+                $spam = false;
             }
-			
-			// train the filter with the message, be it spam or not...
-			BayesianFilterCore::train( $blogInfo->getId(), $commentTopic, $commentText, $userName, $userEmail,
-			                           $userUrl, $spam );
+            
+            if ( !$previoslyRejected )
+            {
+                // train the filter with the message, be it spam or not...
+                BayesianFilterCore::train( $blogInfo->getId(), $commentTopic, $commentText, $userName, $userEmail,
+                                           $userUrl, $spam );
+            }
+            else
+            {
+            	// This is a rejected message. If we think that this is non-spam, 
+            	// we want to untrain it and then retrain it as spam
+            	if ( !$spam )
+            	{
+            		// Un-train this non-spam
+					BayesianFilterCore::untrain( $blogInfo->getId(), $commentTopic, $commentText, $userName, $userEmail,
+											   $userUrl, $spam );
+											   
+					// train this as spam
+					BayesianFilterCore::train( $blogInfo->getId(), $commentTopic, $commentText, $userName, $userEmail,
+											   $userUrl, true );
+            	}
+            }
 
             //print "<h1>" . number_format($spamicity * 100, 0) . "% of spamicity</h1>";
             return $result;
@@ -138,7 +158,7 @@
         */        
         function getSpamProbability($blogId, $topic, $text, $userName, $userEmail, $userUrl)
         {
-	        include_once( PLOG_CLASS_PATH."class/bayesian/bayesiantokenizer.class.php" );
+            include_once( PLOG_CLASS_PATH."class/bayesian/bayesiantokenizer.class.php" );
 
             $tokenizer = new BayesianTokenizer();
 
@@ -158,14 +178,14 @@
         /**
         * @private
         */
-		function _getMostSignificantTokens($blogId, $tokens)
+        function _getMostSignificantTokens($blogId, $tokens)
         {       
             include_once( PLOG_CLASS_PATH."class/dao/bayesiantokens.class.php" );
             include_once( PLOG_CLASS_PATH."class/dao/bayesianfilterinfos.class.php" );
 
-        	$config	=& Config::getConfig(); 
-        	
-        	$bayesianFilterInfos = new BayesianFilterInfos();
+            $config =& Config::getConfig(); 
+            
+            $bayesianFilterInfos = new BayesianFilterInfos();
             $bayesianFilterInfo  = $bayesianFilterInfos->getBlogBayesianFilterInfo($blogId);
             
             $totalSpam = $bayesianFilterInfo->getTotalSpam();
@@ -175,7 +195,7 @@
             
             foreach ($tokens as $token)
             {
-            	$bayesianTokens->updateOccurrences($blogId, $token, 0, 0, $totalSpam, $totalNonSpam, false);
+                $bayesianTokens->updateOccurrences($blogId, $token, 0, 0, $totalSpam, $totalNonSpam, false);
             }
             
             $tokens = $bayesianTokens->getBayesianTokensFromArray($blogId, $tokens);                                                
@@ -183,9 +203,9 @@
                         
             foreach ($tokens as $token)
             {
-            	if ($token->isSignificant() && $token->isValid())
-                {                	
-					array_push($tempArray, abs($token->getProb() - 0.5));
+                if ($token->isSignificant() && $token->isValid())
+                {                   
+                    array_push($tempArray, abs($token->getProb() - 0.5));
                 }
             }
 
@@ -220,7 +240,7 @@
                 $productProb   *= $token->getProb();                
                 $productNoProb *= (1 - $token->getProb());                
             }
-                            	
+                                
             return $productProb / ($productProb + $productNoProb);
         }
     }

Modified: plog/trunk/class/security/commentfilter.class.php
===================================================================
--- plog/trunk/class/security/commentfilter.class.php	2005-07-19 04:57:46 UTC (rev 2347)
+++ plog/trunk/class/security/commentfilter.class.php	2005-07-19 05:55:13 UTC (rev 2348)
@@ -40,6 +40,10 @@
             if( $request->getValue( "op" ) != "AddComment" )
             	return new PipelineResult();
 
+			// if this is already rejected, there is no reason to do anything here
+			if ( $this->_pipelineRequest->getRejectedState() )
+            	return new PipelineResult();
+            	
         	// get the value of the maximum size of a comment, in bytes
         	$config =& Config::getConfig();
             $maxSize = $config->getValue( "maximum_comment_size" );

Modified: plog/trunk/class/security/pipeline.class.php
===================================================================
--- plog/trunk/class/security/pipeline.class.php	2005-07-19 04:57:46 UTC (rev 2347)
+++ plog/trunk/class/security/pipeline.class.php	2005-07-19 05:55:13 UTC (rev 2348)
@@ -9,7 +9,7 @@
      */
 
 
-	include_once( PLOG_CLASS_PATH."class/object/object.class.php" );
+    include_once( PLOG_CLASS_PATH."class/object/object.class.php" );
     include_once( PLOG_CLASS_PATH."class/config/config.class.php" );
     
     /**
@@ -35,14 +35,14 @@
      * The out of the box implementation of the Pipeline comes with a null filter (a filter that does nothing -- go figure :))
      * and a filter that implements a Bayesian filter for advanced spam protection. See the BayesianFilter class for more information.
      */
-	class Pipeline extends Object 
-	{
+    class Pipeline extends Object 
+    {
 
-    	/**
+        /**
          * HTTP request that will be used if the filter is doing
          * some content filtering
          */
-    	var $_httpRequest;
+        var $_httpRequest;
 
         /**
          * the BlogInfo object that has information about the blog
@@ -62,9 +62,9 @@
          * @param blogInfo The BlogInfo object with information about the blog
          * that is currently executing this pipeline
          */
-    	function Pipeline( $httpRequest, $blogInfo = null )
+        function Pipeline( $httpRequest, $blogInfo = null )
         {
-        	$this->Object();
+            $this->Object();
 
             $this->_httpRequest = $httpRequest;
             $this->_blogInfo    = $blogInfo;
@@ -121,33 +121,59 @@
         function process()
         {
             require_once( PLOG_CLASS_PATH . 'class/security/pipelinerequest.class.php' );
+            require_once( PLOG_CLASS_PATH . 'class/security/pipelineresult.class.php' );
             global $_pLogPipelineRegisteredFilters;        
         
-        	// check if the pipeline is enabled
+            // check if the pipeline is enabled
             $config =& Config::getConfig();
             if( $config->getValue( "security_pipeline_enabled" ) == false ) {
-            	// pipeline is disabled, so everything's fine
-            	return new PipelineResult( true );
+                // pipeline is disabled, so everything's fine
+                return new PipelineResult( true );
             }
 
+            // Assume that this will be successful
+            $this->_result = new PipelineResult( true );
+
             // if enabled, then check all the filters
-        	foreach( $_pLogPipelineRegisteredFilters as $filterClass ) {
-        	    // create an instance of the filter
-        	    $pipelineRequest = new PipelineRequest( $this->_httpRequest, $this->_blogInfo );
-        	    $filter = new $filterClass( $pipelineRequest );
-        	    // and execute it...       	    
-            	$result = $filter->filter();
+            foreach( $_pLogPipelineRegisteredFilters as $filterClass ) {
+                // create an instance of the filter
+                $pipelineRequest = new PipelineRequest( $this->_httpRequest, $this->_blogInfo );
+                $filter = new $filterClass( $pipelineRequest );
+                // and execute it...            
+                $result = $filter->filter();
                 // if there was an error, we better say so now
                 // and quite, making sure that we're keeping the
                 // error code
-                if( !$result->isValid()) {                
-                	$this->_result = $result;
-                	return $result;
+                 
+                // Save off the result
+				$this->_result = $result;
+                    
+                if( !$result->isValid()) { 
+                    // break out of this loop
+                    break;
                 }
             }
+    
+            // If one of the filters returns that this was not a valid result
+            if ( !$this->_result->isValid() ) {
+                // Now rerun through all of the filters so they can clean up 
+                // if they have saved anything persistantly
+                // This also gives filters a chance to do anything else they 
+                // want to do (i.e. report ip address to dns blacklist)
+    
+                foreach( $_pLogPipelineRegisteredFilters as $filterClass ) {
+                    // create an instance of the filter
+                    $pipelineRequest = new PipelineRequest( $this->_httpRequest, 
+                                                            $this->_blogInfo, 
+                                                            true );
+                    $filter = new $filterClass( $pipelineRequest );
+                    // and execute it...            
+                    $result = $filter->filter();
+                    // if there was an error, we want to keep going
+                }
+            }
 
-            $this->_result = $result;
-            return $result;
+            return $this->_result ;
         }
     }
 ?>

Modified: plog/trunk/class/security/pipelinerequest.class.php
===================================================================
--- plog/trunk/class/security/pipelinerequest.class.php	2005-07-19 04:57:46 UTC (rev 2347)
+++ plog/trunk/class/security/pipelinerequest.class.php	2005-07-19 05:55:13 UTC (rev 2348)
@@ -16,6 +16,7 @@
 
     	var $_httpRequest;
         var $_blogInfo;
+        var $_requestRejected;
 
         /**
          * Constructor.
@@ -24,16 +25,17 @@
          * @param blogInfo A BlogInfo object with information about the blog
          * currently executing the request
          */
-        function PipelineRequest( $httpRequest, $blogInfo )
+        function PipelineRequest( $httpRequest, $blogInfo, $rejected = false )
         {
         	$this->Object();
 
             if( is_array($httpRequest))
             	$this->_httpRequest = new Properties( $httpRequest );
             else
-        		$this->_httpRequest = $httpRequest;
+        		$this->_httpRequest  = $httpRequest;
                 
-            $this->_blogInfo    = $blogInfo;
+            $this->_blogInfo         = $blogInfo;
+            $this->_requestRejected  = $rejected;
         }
 
         /**
@@ -52,5 +54,15 @@
         {
         	return $this->_httpRequest;
         }
+        
+        /**
+        * @return Returns a boolean that indicates if this pipeline request has 
+        * already been rejected
+        */
+        function getRejectedState()
+        {
+        	return $this->_requestRejected;
+        }
+        
     }
 ?>




More information about the pLog-svn mailing list