#!/usr/bin/perl
# $Rev: 1219 $
# rebuildspamdb 2
# rebuilds bayesian spam database
# (c) John Hanna 2003 under the terms of the GPL
# Updated July 2004 for simple proxy support.
# (c) Fritz Borgstedt 2006 under the terms of the GPL
# Updated Feb 2008 refactoring and rewrites
# (c) Kevin 2008 under the terms of the GPL
use bytes;    # get rid of anoying 'Malformed UTF-8' messages
use Digest::MD5 qw(md5_hex);
use English '-no_match_vars';
use File::Copy;
use IO::Handle;
use IO::Socket();
use strict;

#use warnings;
our $VERSION = "2.1_3";

#no output buffering to screen
*STDOUT->autoflush();

#holy predeclarations Batman!
use vars qw(
    $base $correctednotspam $correctednotspamcount $correctedspam
    $correctedspamcount $DoNotCollectRed $EmailAdrRe $EmailDomainRe $HamWordCount
    $KeepWhitelistedSpam $Log $maillogExt $MaxBytes $MaxFiles $MaxWhitelistDays
    $MaxWhitelistLength $mydb $myhost $mypassword $myuser $notspamlog $processTime
    $notspamlogcount $npRe $OrderedTieHashSize $pbdbfile $proxyserver $noGripListUpload
    $RebuildLog $rebuildrun $redlistdb $redRe $redReRE $spamdb $spamdbFile $RegExLength
    $spam $spamdbFname $spamlog $spamlogcount $SpamWordCount $starttime
    $usesubject $WhitelistCleanFreq $whitelistdb $WhitelistObject $RedlistObject $whiteRe $whiteReRE $wildcardUser
    %HamHash %Helo %Redlist %spam %SpamHash %Whitelist $asspLog $DoNotCollectRedList $DoNotCollectRedRe
);

#load configuration options from assp.cfg file
&loadconfig();
my $AvailTieRDBM  = eval "use Tie::RDBM; 1";    # Is the required module installed?
my $CanUseTieRDBM = $AvailTieRDBM;              # this looks wierd but it's the only way it works
undef $AvailTieRDBM;
$EmailAdrRe    = "[^()<>@,;:\\\"\\[\\]\000-\040]+";
$EmailDomainRe = '(?:\w[\w\.\-]*\.\w+|\[[\d\.]*\.\d+\])';

# set counts
$HamWordCount          = $SpamWordCount = $correctedspamcount = 0;
$correctednotspamcount = $spamlogcount  = $notspamlogcount    = 0;

#deprecated
if ( $ARGV[0] ) {
    my $notick = $ARGV[0] eq '-notick';

    #&getmaxtick('rebuild');
}


# open log file
if ( -e "$rebuildrun.bak" ) {
    unlink("$rebuildrun.bak") or die "unable to remove file: $!";
}
if ( -e $rebuildrun ) {
    copy( $rebuildrun, "$rebuildrun.bak" ) or die "unable to copy file for: $!";
}
open( $RebuildLog, '>', "$rebuildrun" ) or die "unable to open file for logging: $!";
$starttime = time;
&printlog("\n");
for ( my $c = 54; $c >= 1; $c-- ) { &printlog(q{*}); }
&printlog( "\nRebuildSpamDB $VERSION started - " . localtime(time) . "\n" );
&printlog("\n---ASSP Settings---\n");
if ($DoNotCollectRed) {
    &printlog(
        "Do Not Collect Redlisted Messages: Enabled\n**Redlisted Messages will be removed from the corpus!**\n\n");
}
if ($DoNotCollectRedList) {
    &printlog(
        "Do Not Collect Messages with RedListed address: Enabled\n**Messages with RedListed addresses will be removed from the corpus!**\n\n"
    );
}
if ($DoNotCollectRedRe) {
    &printlog(
        "Do Not Collect RedRe Messages: Enabled\n**Messages matching the RedRe will be removed from the corpus!**\n\n");
}
if ($usesubject) {
    &printlog(
        "Use Subject as Maillog Names: True\n Recommendation: You should switch to False and run move2num.pl!\n\n");
}
else { &printlog("Use Subject as Maillog Names: False\n"); }
&printlog("Maxbytes: $MaxBytes \n");
&printlog("Maxfiles: $MaxFiles \n");

#rebuild various cache files and lists
&repair();

# randomly, let's clean the whitelist of old entries
if ( rand() < $WhitelistCleanFreq ) { &cleanwhite(); }

# name, contents, refrence to "compiled" object
&compileregex( "whiteRe", $whiteRe, \$whiteReRE );
&compileregex( "redRe",   $redRe,   \$redReRE );

# redlist,whitelist
&createlistobjects();

			 # isspam?, path, filter, weight, processing sub
$correctedspamcount    = &processfolder( 1, $correctedspam,    ".rpt",      2, \&dospamhash );
$correctednotspamcount = &processfolder( 0, $correctednotspam, ".rpt",      4, \&dohamhash );
$spamlogcount          = &processfolder( 1, $spamlog,          $maillogExt, 1, \&checkspam );
$notspamlogcount       = &processfolder( 0, $notspamlog,       $maillogExt, 1, \&checkham );
our $norm = $HamWordCount ? ( $SpamWordCount / $HamWordCount ) : 1;
open( my $normFile, '>', "$base/normfile" ) || warn "unable to open $base/normfile: $!\n";
if ($normFile) {
    print { $normFile } "$norm $correctedspamcount $correctednotspamcount $spamlogcount $notspamlogcount";
    close $normFile;
}

# Create Bayesian DB
&generatescores();

# Create HELo blacklist
&createheloblacklist();
&printlog(
    "\nSpam Weight:\t   " . commify($SpamWordCount) . "\nNot-Spam Weight:   " . commify($HamWordCount) . "\n\n" );
if ( !($norm) ) {    #invalid norm
    &printlog("Warning: Corpus insufficent to calculate normality!\n");
}
else {               #norm exists, print it
    &printlog( "Corpus norm:\t%.4f\n", $norm );
}
if ( $norm < 0.6 ) {
    &printlog("Corpus norm should be between 0.6 and 1.4\n\nRecommendation: You need more spam messages in the corpus.\n");
}
if ( $norm > 1.4 ) {
    &printlog("Corpus norm should be between 0.6 and 1.4\n\nRecommendation: You need more not-spam messages in the corpus.\n");
}
if ( $spamlogcount >= $MaxFiles || $notspamlogcount >= $MaxFiles ) {
    &printlog(
        "Recommendation: You should run move2num.pl to reduce the number of messages in your corpus. Excess files will be ingored by this script.\n"
    );
}
if ( $MaxBytes > 8000 && $norm > 1.3 ) {
    &printlog( "\nRecommendation: You should reduce Maxbytes to " . ( ( $MaxBytes + 2000 ) / 2 ) . "!  \n" );
}
if   ( time - $starttime != 0 ) { $processTime = time - $starttime; }
else                            { $processTime = 1; }
&printlog( "\nTotal processing time: %d second(s)\n\n", $processTime );


if ( !$noGripListUpload || !$asspLog ) { &uploadgriplist(); }
&printlog( "\n" . localtime(time) . ": RebuildSpamDB $VERSION ended\n" );
close $RebuildLog;
##########################################
#           script ends here
##########################################
sub createlistobjects {

    if ( $CanUseTieRDBM && $whitelistdb =~ /mysql/ && !$KeepWhitelistedSpam ) {
        eval {
            $WhitelistObject = tie %Whitelist, 'Tie::RDBM', "dbi:mysql:database=$mydb;host=$myhost",
                { user => "$myuser", password => "$mypassword", table => 'whitelist', create => 0 };
        };
        if ($EVAL_ERROR) {
            &printlog("whitelist mysql error: $@");
            $CanUseTieRDBM = 0;
            $whitelistdb   = "whitelist";
        }
    }
    elsif ( !$KeepWhitelistedSpam ) {
        if ( -e $whitelistdb ) { $WhitelistObject = tie( %Whitelist, 'orderedtie', "$whitelistdb" ); }
    }
    if ( $CanUseTieRDBM && $redlistdb =~ /mysql/ && ( $DoNotCollectRed || $DoNotCollectRedList ) ) {
        eval {
            $RedlistObject = tie %Redlist, 'Tie::RDBM', "dbi:mysql:database=$mydb;host=$myhost",
                { user => "$myuser", password => "$mypassword", table => 'redlist', create => 0 };
        };
        if ($EVAL_ERROR) {
            &printlog("redlist mysql error: $@");
            $CanUseTieRDBM = 0;
            $redlistdb     = "whitelist";
        }
    }
    elsif ($DoNotCollectRed) {
        if ( -e $redlistdb ) { $RedlistObject = tie( %Redlist, 'orderedtie', "$redlistdb" ); }
    }
    return;
} ## end sub createlistobjects

sub generatescores {
    my ( $t, $s, @result, $pair, $v );
    &printlog("\nGenerating weighted Bayesian tuplets...");
    open( $spamdbFile, '>', "$spamdb.tmp" ) || die "unable to open $spamdb.tmp: $!\n";
    binmode $spamdbFile;
    print { $spamdbFile } "\n";
    while ( ( $pair, $v ) = each(%spam) ) {
        ( $s, $t ) = split( q{ }, $v );
        $t = ( $t - $s ) * $norm + $s;    # normalize t
        if ( $t < 5 ) {

            #$unknowns+=$s; $unknownt+=$t;
            next;
        }

        # if token represents all spam or all ham then square its value
        if ( $s == $t || $s == 0 ) {
            $s = $s * $s;
            $t = $t * $t;
        }
        $v = ( 1 + $s ) / ( $t + 2 );
        $v = sprintf( "%.7f", $v );
        $v = '0.9999999' if $v >= 1;
        $v = '0.0000001' if $v <= 0;
        push( @result, "$pair\002$v\n" ) if abs( $v - .5 ) > .09;
    }
    &printlog("done\n");
    undef %spam;    # free some memory
    &printlog("\nSaving rebuilt SPAM database...");
    for ( sort @result ) { print { $spamdbFile } $_; }
    close $spamdbFile;
    if ( -e "$spamdb.bak" ) { unlink("$spamdb.bak") || &printlog("unable to remove '$spamdb.bak' $!\n") }
    if ( -e $spamdb ) {
        rename( $spamdb, "$spamdb.bak" ) || &printlog("unable to rename '$spamdb' to '$spamdb.bak' $!\n");
    }
    rename( "$spamdb.tmp", $spamdb ) || &printlog("unable to rename '$spamdb.tmp' to '$spamdb' $!\n");
    &printlog("done\n");
    my $filesize = -s "$spamdb";
    &printlog( "\nResulting file '$spamdbFname' is " . commify($filesize) . " bytes\n" );
    my $pairs = scalar @result;
    print "Bayesian Pairs: " . commify($pairs) . "\n";
    return;
} ## end sub generatescores

sub createheloblacklist {
    my (@Helo);
    open( my $FheloBlack, '>', "$spamdb.helo.tmp" ) || &printlog("unable to open '$spamdb.helo.tmp' $!\n");
    binmode $FheloBlack;
    print { $FheloBlack } "\n";
    while ( my ( $helostr, $weights ) = each(%Helo) ) {
        $weights->[1] = 0 if ( !defined $weights->[1] );
        $weights->[0] = 0 if ( !defined $weights->[0] );
        if ( $weights->[1] / ( $weights->[0] + $weights->[1] + .1 ) > .98 ) { push( @Helo, "$helostr\0021\n" ); }
    }
    print { $FheloBlack } sort @Helo;
    close $FheloBlack;
    &printlog( "\nHELO Blacklist: " . scalar(@Helo) . " HELOs\n" );
    if ( -e "$spamdb.helo.bak" ) {
        unlink("$spamdb.helo.bak") || &printlog("unable to remove '$spamdb.helo.bak' $!\n");
    }
    if ( -e "$spamdb.helo" ) {
        rename( "$spamdb.helo", "$spamdb.helo.bak" )
            || &printlog("unable to rename '$spamdb.helo' to '$spamdb.helo.bak' $!\n");
    }
    rename( "$spamdb.helo.tmp", "$spamdb.helo" )
        || &printlog("unable to rename '$spamdb.helo.tmp' to '$spamdb.helo' $!\n");
    return;
}

sub loadconfig {
    open( my $confFile, '<', "assp.cfg" ) || die "cannot open \"assp.cfg\": $!";
    local $/;
    my %Config = split( /:=|\n/, <$confFile> );
    close $confFile or die "unable to close: $!";
    $base                = $Config{ base };
    $correctednotspam    = $Config{ correctednotspam } && "$Config{base}/$Config{correctednotspam}" || 'errors/notspam';
    $correctedspam       = $Config{ correctedspam } && "$Config{base}/$Config{correctedspam}" || 'errors/spam';
    $DoNotCollectRed     = $Config{ DoNotCollectRed };
    $DoNotCollectRedRe   = $Config{ DoNotCollectRedRe };
    $DoNotCollectRedList = $Config{ DoNotCollectRedList };
    $KeepWhitelistedSpam = $Config{ KeepWhitelistedSpam };
    $Log                 = $Config{ logfile } && "$Config{base}/$Config{logfile}" || 'maillog.txt';
    $maillogExt          = $Config{ maillogExt };
    $MaxBytes            = $Config{ MaxBytes };
    $MaxFiles            = $Config{ MaxFiles };
    $MaxWhitelistDays    = $Config{ MaxWhitelistDays } || 90;
    $MaxWhitelistLength  = $Config{ MaxWhitelistLength } || 60;
    $notspamlog          = $Config{ notspamlog } && "$Config{base}/$Config{notspamlog}" || 'notspam';
    $npRe                = $Config{ npRe };
    $OrderedTieHashSize  = $Config{ OrderedTieHashSize } || 10_000;
    $pbdbfile            = $Config{ pbdb };
    $proxyserver         = $Config{ proxyserver };
    $redlistdb           = $Config{ redlistdb } && "$Config{base}/$Config{redlistdb}" || 'redlist';
    $redRe               = $Config{ redRe };
    $spamdb              = $Config{ spamdb } && "$Config{base}/$Config{spamdb}" || 'spamdb';
    $spamdbFname         = $Config{ spamdb } || 'spamdb';
    $spamlog             = $Config{ spamlog } && "$Config{base}/$Config{spamlog}" || 'spam';
    $usesubject          = $Config{ UseSubjectsAsMaillogNames };
    $WhitelistCleanFreq  = $Config{ WhitelistCleanFreq } || 0.5;
    $whitelistdb         = $Config{ whitelistdb } && "$Config{base}/$Config{whitelistdb}" || 'whitelist';
    $noGripListUpload    = $Config{ noGripListUpload };
    $asspLog             = $Config{ asspLog };
    $whiteRe             = $Config{ whiteRe };
    $wildcardUser        = $Config{ wildcardUser };
    $mydb                = $Config{ mydb };
    $myhost              = $Config{ myhost };
    $myuser              = $Config{ myuser };
    $mypassword          = $Config{ mypassword };
    $rebuildrun          = &fixPath($base) . "/rebuildrun.txt";
    $RegExLength         = $Config{ RegExLength };
    return;
} ## end sub loadconfig

sub processfolder {
    my ( $fldrType, $fldrpath, $filter, $weight, $sub ) = @_;
    my ( $count, $processFolderTime, $folderStartTime, $fileCount, @files );
    our ( $WhiteCount, $RedCount );
    $folderStartTime = time;
    $fldrpath        = &fixPath($fldrpath);
    &printlog( "\n" . $fldrpath . "\n" );
    $fldrpath .= "/*$filter";
    $fileCount = &countfiles($fldrpath);
    &printlog( "File Count:\t" . commify($fileCount) );
    &printlog("\nProcessing...");
    $count = $RedCount = $WhiteCount = 0;
    @files = glob($fldrpath);

    #while( glob($fldrpath) && $count <= $MaxFiles ) {
    foreach my $file (@files) {
        &add( $fldrType, $file, $weight, $sub );
        $count++;
        last if $count >= $MaxFiles;    #too many files
    }
    if   ( time - $folderStartTime != 0 ) { $processFolderTime = time - $folderStartTime; }
    else                                  { $processFolderTime = 1; }
    $count = $count - ( $RedCount + $WhiteCount );
    if ($RedCount) {
        &printlog( "\nRemoved Red:\t" . commify($RedCount) );
    }

    if ($WhiteCount) {
        &printlog( "\nRemoved White:\t" . commify($WhiteCount) );
    }

    &printlog( "\nImported Files:\t" . commify($count) );

    if ( $count >= $MaxFiles ) {
        &printlog("\nFolder contents exceeded \`MaxFiles\`($MaxFiles). ");

        if ($usesubject) {
            &printlog("\nDisable \`UseSubjectsAsMaillogNames\` and run \`move2num.pl\`. ");
        }
    }

    #&printlog( "\n " . commify($SpamWordCount) . " spam weight \n " . commify($HamWordCount) . " non-spam weight." );
    &printlog("\nFinished in $processFolderTime second(s)\n");
    
    return $count;
} ## end sub processfolder

sub countfiles {
    my ($fldrpath) = @_;
    my @fileCount = glob("$fldrpath");
    return scalar(@fileCount);
}

sub commify {
    local $_ = shift;
    1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
    return $_;
}

sub hash {
    my ($msgText) = @_;

    my ( $head, $body );

    # creates a md5 hash of $msg body
    if ( $msgText =~ /^(.*?)\n\r?\n(.*)/s ) {

        $head = $1;
        $body = $2;

        return md5_hex($body);
    }
    else {

        #return q;
        #There is no split, the message has no valid body
        return md5_hex($msgText);
    }

    #return $value;
    return;
}

sub dospamhash {
    my ( $FileName, $msgText ) = @_;
    $SpamHash{ &hash($msgText) } = '1';
    return;
}

sub dohamhash {
    my ( $FileName, $msgText ) = @_;
    $HamHash{ &hash($msgText) } = q{};
    return;
}

sub checkspam {
    my ( $FileName, $msgText ) = @_;
    our $HamHash;
    $msgText = &hash($msgText);
    my ( $return, $reason );
    if ( defined( $HamHash{ $msgText } ) ) {

        # we've found a message in the spam database that is the same as one in the corrected Ham group
        my $fn = shift;
        &deletefile( $fn, "corrected spam" );
        return 1;
    }
    elsif ( $reason = &redlisted( $_[1] ) ) {
        my $fn = shift;
        &deletefile( $fn, $reason );
        return 1;
    }
    elsif ( $reason = &whitelisted( $_[1] ) ) {
        my $fn = shift;
        &deletefile( $fn, $reason );
        return 1;
    }
    return 0;
}

sub checkham {
    my ( $FileName, $msgText ) = @_;
    our $SpamHash;
    my ( $return, $reason );
    $msgText = &hash($msgText);
    if ( defined( $SpamHash{ $msgText } ) ) {

        # we've found a message in the ham database that is the same as one in the corrected spam group
        my $fn = shift;
        &deletefile( $fn, "corrected spam" );
        return 1;
    }
    elsif ( $reason = &redlisted( $_[1] ) ) {
        my $fn = shift;
        &deletefile( $fn, "$reason" );
        return 1;
    }
    return 0;
}

sub getrecontent {
    my ( $value, $name ) = @_;
    my $fromfile = 0;
    if ( $value =~ /^ *file: *(.+)/i ) {

        # the option list is actually saved in a file.
        $fromfile = 1;
        my $fil = $1;
        $fil = "$base/$fil" if $fil !~ /^\Q$base\E/i;
        local $/;
        if ( open( my $File, '<', $fil ) ) {
            $value = <$File>;

            # clean off comments
            $value =~ s/#.*//g;

            # replace newlines (and the whitespace that surrounds them) with a |(pipe character)
            $value =~ s/\s*\n\s*/|/g;
            close $File;
        }
        else { $value = q{}; }
    }
    $value =~ s/\|\|/\|/g;
    $value =~ s/\s*\|/\|/g;
    $value =~ s/\|\s*/\|/g;
    $value =~ s/\|\|+/\|/g;
    $value =~ s/^\s*\|?//;
    $value =~ s/\|?\s*$//;
    $value =~ s/\|$//;
    return $value;
} ## end sub getrecontent

sub whitelisted {
    return 0 if $KeepWhitelistedSpam;
    my $m = shift;
    my ( $curaddr, %seen );

    # test against expression to recognize whitelisted mail
    if ( $whiteRe && $m =~ ( '(' . $whiteReRE . ')' ) ) {
        my $reason = $1;
        $reason =~ s/\s+$/ /g;
        $reason =~ s/[\r\n]/ /g;
        if ( length($reason) >= $RegExLength ) { $reason = substr( $reason, 0, ( $RegExLength - 4 ) ) . "..." }
        our $WhiteCount++;
        return ( "Regex:White '" . $reason . q{'} );
    }
    $m =~ s/\n\r?\n.*//s;    # remove body
    while ( $m =~ /($EmailAdrRe\@$EmailDomainRe)/igo ) {
        my $curaddr = lc($1);    #

        #my $curaddr = lc( $1 . $2 );
        if ( exists $seen{ $curaddr } ) {
            next;                #we already checked this address
        }
        else { $seen{ $curaddr } = 1; }
        if ( $Whitelist{ $curaddr } ) {
            my $reason = $curaddr;
            $reason =~ s/\s+$/ /g;
            $reason =~ s/[\r\n]/ /g;
            our $WhiteCount++;
            return ( "WhiteList: '" . $reason . q{'} );
        }
        if ($wildcardUser) {
            my ( $mfdd, $alldd, $reason );
            $mfdd = $1 if $curaddr =~ /(\@.*)/;
            $alldd = "$wildcardUser$mfdd";
            if ( $Whitelist{ lc $alldd } ) {
                $reason = $curaddr;
                $reason =~ s/\s+$/ /g;
                $reason =~ s/[\r\n]/ /g;
                our $WhiteCount++;
                return ( "WhiteList-Wild: '" . $reason . q{'} );
            }
        }
    } ## end while ( $m =~ /($EmailAdrRe\@$EmailDomainRe)/igo)
    return 0;
} ## end sub whitelisted

sub redlisted {
    my $m = shift;
    my (%seen);

    # test against expression to recognize redlisted mail
    if ( $DoNotCollectRedRe || $DoNotCollectRed ) {    #skip Redre check, 1.3.5 and higher
        if ( $redRe && $m =~ ( '(' . $redReRE . ')' ) ) {
            my $reason = $1;
            $reason =~ s/\s+$/ /g;
            $reason =~ s/[\r\n]/ /g;
            if ( length($reason) >= $RegExLength ) { $reason = substr( $reason, 0, ( $RegExLength - 4 ) ) . "..." }
            our $RedCount++;
            return ( "Regex:Red '" . $reason . q{'} );
        }
    }
    if ( $DoNotCollectRedList || $DoNotCollectRed ) {    #skip Redlist check, 1.3.5 and higher
        $m =~ s/\n\r?\n.*//s;                            # remove body
        while ( $m =~ /($EmailAdrRe\@$EmailDomainRe)/igo ) {
            my $curaddr = lc($1);

            #$curaddr = lc( $1 . $2 );
            if ( exists $seen{ $curaddr } ) {
                next;                                    #we already checked this address
            }
            else { $seen{ $curaddr } = 1; }
            if ( $Redlist{ $curaddr } ) {
                my $reason = $curaddr;
                $reason =~ s/\s+$/ /g;
                $reason =~ s/[\r\n]/ /g;
                our $RedCount++;
                return ( "redlist: '" . $reason . q{'} );
            }
        }
    }
    return 0;
} ## end sub redlisted

sub deletefile {
    my ( $fn, $reason ) = @_;
    if ( -e $fn ) {
        if ( -w $fn || -W $fn ) {
            &printlog( "\nremove " . $fn . q{ } . $reason );
            unlink($fn);
        }
        else { printlog( "\ncannot delete " . $reason . " message " . $fn . ": file is not writable: $!" ); }
    }
    else { printlog( "\ncannot delete " . $reason . " message " . $fn . ": $!" ); }
    return;
}

sub get {
    my ( $fn, $sub ) = @_;
    my $message;
    my $count;
    my $numreadchars;
    open( my $file, '<', "$fn" ) || return;

    # Maxbytes or 10000, whichever is less
    $numreadchars = $MaxBytes <= 10_000 ? $MaxBytes : 10_000;
    $count = read( $file, $message, $numreadchars );    # read characters into memory
    close $file;
    return if $sub->( $fn, $message );                  # have i read this before?

    return $message;
}

sub add {
    my ( $isspam, $fn, $factor, $sub ) = @_;
    return if -d $fn;
    my ( $curHelo, $CurWord, $PrevWord, $sfac, $tfac );
    $PrevWord = $CurWord = q{};
    my $content = &get( $fn, $sub );
    return unless $content;
    if ( $content =~ /helo=(.*?)\)/i ) {
        $curHelo = lc($1);
        if ( $Helo{ $curHelo } ) { $Helo{ $curHelo }->[$isspam] += $factor; }
        else {    #it doesn't seem to exist. create it.
            $Helo{ $curHelo }->[$isspam] = $factor;
        }
    }
    $content = &clean($content);
    while ( $content =~ /([-\$A-Za-z0-9\'\.!\240-\377]{2,})/g ) {
        if ( length($1) > 20 || length($1) < 2 ) { next }
        $PrevWord = $CurWord;
        $CurWord  = lc($1);

        #next if $text=~/^\d/;      # ignore numbers
        $CurWord =~ s/[,.']+$//;    # remove commas and periods at the end of strings
        $CurWord =~ s/!!!+/!!/g;    # replace excessive exclamation points
        $CurWord =~ s/--+/-/g;      # replace excessive dashes
        if ( !$PrevWord ) { next }  # We only want word pairs
        if ( length($CurWord) < 2 || length($PrevWord) < 2 ) { next }    # too short after cleaning

        # increment global weights, they are not really word counts
        if   ($isspam) { $SpamWordCount += $factor; }
        else           { $HamWordCount  += $factor; }
        if ( exists( $spam{ "$PrevWord $CurWord" } ) ) {
            ( $sfac, $tfac ) = split( q{ }, $spam{ "$PrevWord $CurWord" } );
        }
        else {

            # the pair does not exist, create it
            $spam{ "$PrevWord $CurWord" } = "0 0";
            ( $sfac, $tfac ) = split( q{ }, $spam{ "$PrevWord $CurWord" } );
        }
        $sfac += $isspam ? $factor : 0;
        $tfac += $factor;
        $spam{ "$PrevWord $CurWord" } = "$sfac $tfac";
    } ## end while ( $content =~ /([-\$A-Za-z0-9\'\.!\240-\377]{2,})/g)
    return;
} ## end sub add

# clean up source email
sub clean {
    local $_ = "\n" . shift;
    my $helo;
    if ( $helo = /helo=([^)]+)\)/i ) {

        # if the helo string is long, break it up
        if ( length($helo) > 19 ) { $helo =~ s/(\w+)/ hlo $1 /g }
    }
    else { $helo = q{}; }
    my $rcpt = "rcpt " . join( " rcpt ", /($EmailAdrRe\@$EmailDomainRe)/go );

    # replace &#ddd encoding
    s/&#(\d{1,3});?/chr($1)/ge;

    #s/base64.{0,99}\n\n([a-zA-Z0-9+\/\n=]+)/base64decode($1)/gse;
    # replace base64 encoding
    s/\n([a-zA-Z0-9+\/=]{40,}\r?\n[a-zA-Z0-9+\/=\r\n]+)/&base64decode($1)/gse;

    # clean up quoted-printable references
    s/(Subject: .*)=\r?\n/$1\n/;

    #if(/quoted-printable/) {
    s/=\r?\n//g;
    s/=([0-9a-fA-F]{2})/pack("C",hex($1))/gei;

    #}
    #s/(http:\/\/\S+)/&fixurl($1)/ige;
    s/%([0-9a-fA-F][0-9a-fA-F])/pack('C',hex($1))/ge;    # replace url encoding

    # strip out mime continuation
    s/.*---=_NextPart_.*\n//g;

    # mark the subject
    s/\nsubject: (.*)/&fixsub($1)/ige;

    # remove received lines
    s/\n(received|Content-Type): .*(\n[\t ].*)*//ig;

    # remove other header lines
    s/(\n[a-zA-Z\-]{2,40}: .*(\n[\t ].*)*){2,}//g;

    # clean up &nbsp; and &amp;
    s/&nbsp;?/ /gi;
    s/&amp;?/and/gi;
    s/(\d),(\d)/$1$2/g;
    s/\r//g;
    s/ *\n/\n/g;
    s/\n\n\n\n\n+/\nblines blines\n/g;

    # clean up html stuff
    s/<script.*?>\s*(<!\S*)?/ jscripttag jscripttag /ig;
    while (s/(\w+)(<[^>]*>)((<[^>]*>)*\w+)/$2$1$3/g) { }    # move html out of words
    s/<([biu]|strong)>/ boldifytext boldifytext /gi;

    # remove some tags that are not informative
    s/<\/?(p|br|div|t[dr])[^>]*>/\n/gi;
    s/<\/([biu]|font|strong)>//gi;
    s/<\/?(html|meta|head|body|span|o)[^>]*>//ig;
    s/(<a\s[^>]*>)(.*?)(<\s*\/a\s*>)/$1.fixlinktext($2).$3/igse;
    s/<\s*\/a\s*>//gi;

    # treat titles like subjects
    s/<title[^>]*>(.*?)<\/title>/&fixsub($1)/ige;

    # remove style sheets
    s/<style[^>]*>.*?<\/style>//igs;

    # remove html comments
    s/<!.*?-->//gs;
    s/<![^>]*>//g;

    # look for random words
    s/[ a-z0-9][ghjklmnpqrstvwxz_]{2}[bcdfghjklmnpqrstvwxz_0-9]{3}\S*/ randword randword /gi;

    # remove mime seperators
    s/\n--.*randword.*//g;

    # look for linked images
    s/(<a[^>]*>[^<]*<img)/ linkedimage linkedimage $1/gis;
    s/<[^>]*href\s*=\s*("[^"]*"|\S*)/&fixhref($1)/isge;
    s/http:\/\/(\S*)/&fixhref($1)/isge;
    s/(\S+\@\S*\.\w{2,3})\b/&fixhref($1)/ge;

    #clean MSHTML shit
    s/=3D/=/gs;
    s/=20\n//gs;
    s/src=\"cid\:[\w\W]+?\"//gs;
    return "helo: $helo\n$rcpt\n$_";
} ## end sub clean

sub cleanwhite {
    &printlog("\n---Cleaning whitelist---\n");
    my $calcTime = time - 24 * 3600 * $MaxWhitelistDays;
    my ( $wlbefore, $wlafter );
    if ( !( $whitelistdb =~ /mysql/ ) ) {
        if ( open( F, "<", $whitelistdb ) ) {
            binmode(F);
            my $nwhite;
            local $/ = "\n";
            $nwhite = "\n";
            while (<F>) {
                $wlbefore++;
                my ( $a, $time ) = split( "\002", $_ );
                if (m/^\'/) { next }    #skip addresses with leading ' chars
                if ( $time eq q{} || $a eq q{} ) { next }
                chomp($time);
                chomp($a);
                next if $calcTime > $time || length($a) > $MaxWhitelistLength;
                $nwhite .= $_;
                $wlafter++;
            }
            close F;
            copy( $whitelistdb, "$whitelistdb.bak" );
            open( O, ">", $whitelistdb );
            binmode(O);
            print O$nwhite;
            close O;
        }
    } ## end if ( !( $whitelistdb =~...
    else {
        my %Whitelist;
        my $WhitelistObject;
        eval {
            $WhitelistObject = tie %Whitelist, 'Tie::RDBM', "dbi:mysql:database=$mydb;host=$myhost",
                { user => "$myuser", password => "$mypassword", table => 'whitelist', create => 0 };
        };
        if ($EVAL_ERROR) {
            &printlog("whitelist mysql error: $@");
            $CanUseTieRDBM = 0;
            $whitelistdb   = "whitelist";
        }
        $wlbefore = scalar keys %Whitelist;
        $wlafter  = $wlbefore;
        while ( my ( $key, $value ) = each %Whitelist ) {

            #my $date1 = localtime($value); #debugging stuff
            #my $date2 = localtime($calcTime);
            #print "$key=$value\n";
            if ( $value < $calcTime || length($key) > $MaxWhitelistLength ) {
                if ( $Whitelist{ $key } ) {
                    delete $Whitelist{ $key };
                    $wlafter--;
                }
            }
        }
        $WhitelistObject->flush() if $WhitelistObject && $whitelistdb !~ /mysql/;

        #untie %Whitelist;
    } ## end else [ if ( !( $whitelistdb =~...
    &printlog( "whitelist before: " . commify($wlbefore) . "\n" );
    &printlog( "whitelist after:  " . commify($wlafter) . "\n" );
    return;
} ## end sub cleanwhite

sub dayofweek {

    # this is mercilessly hacked from John Von Essen's Date::Day
    my ( $d, $m, $y ) = $_[0] =~ /(\S+) +(\S+) +(\S+)/;

    # data for DayOfWeek function
    my %Months = (
        'Jan', 1, 'Feb', 2, 'Mar', 3, 'Apr', 4,  'May', 5,  'Jun', 6,
        'Jul', 7, 'Aug', 8, 'Sep', 9, 'Oct', 10, 'Nov', 11, 'Dec', 12,
    );
    my %Month = ( 1, 0, 2, 3, 3, 2, 4, 5, 5, 0, 6, 3, 7, 5, 8, 1, 9, 4, 10, 6, 11, 2, 12, 4, );
    my %Weekday = ( 0, 'srdSUN', 1, 'srdMON', 2, 'srdTUE', 3, 'srdWED', 4, 'srdTHU', 5, 'srdFRI', 6, 'srdSAT', );
    $y += 2000;
    $m = $Months{ $m };
    if ( $m <= 2 ) { $y--; }
    my $wday = ( ( $d + $Month{ $m } + $y + ( int( $y / 4 ) ) - ( int( $y / 100 ) ) + ( int( $y / 400 ) ) ) % 7 );
    return $Weekday{ $wday };
}
sub fixhref     { my $t = shift; $t =~ s/(\w+)/ href $1 /g; return $t; }
sub fixlinktext { my $t = shift; $t =~ s/(\w+)/atxt $1/g;   return $t; }

sub fixurl {
    my $a = shift;
    $a =~ s/%([0-9a-fA-F][0-9a-fA-F])/pack('C',hex($1))/ge;
    return $a;
}

sub fixsub {
    my $s = shift;

    #print "$s=>";
    $s =~ s/ {3,}/ lotsaspaces /g;
    $s =~ s/(\S+)/ssub $1/g;

    #print "$s\n";
    return "\n$s ssub";
}

sub base64decode {
    my $str = shift;
    my $res = "\n\n";
    $str =~ tr|A-Za-z0-9+/||cd;
    $str =~ tr|A-Za-z0-9+/| -_|;
    while ( $str =~ /(.{1,60})/gs ) {
        my $len = chr( 32 + length($1) * 3 / 4 );
        $res .= unpack( "u", $len . $1 );
    }
    return $res;
}

sub printlog {
    my ( $text, $format ) = @_;
    if ( !$format ) {
        print "$text";
        print { $RebuildLog } "$text";
    }
    if ($format) {
        printf "$text", $format;
        printf $RebuildLog "$text", $format;
    }
    return;
}

sub uploadgriplist {
    local $/ = "\n";

    #&printlog("Start building Griplist \n");
    open( my $FLogFile, '<', "$Log" ) || &printlog("Unable to create Griplist.\n unable to open logfile '$Log': $!\n");
    my ( $date, $ip, $i1, $i2, %m, %ok, %locals, $match, $peeraddress, $connect, $day, $gooddays, $st );

    #build list of the last 4 days
    $day = localtime();
    $day =~ s/^... (...) +(\d+) (\S+) ..(..)/$1-$2-$4/;
    $gooddays .= "$day|";
    $day = localtime( time - 24 * 3600 );
    $day =~ s/^... (...) +(\d+) (\S+) ..(..)/$1-$2-$4/;
    $gooddays .= "$day|";
    $day = localtime( time - 48 * 3600 );
    $day =~ s/^... (...) +(\d+) (\S+) ..(..)/$1-$2-$4/;
    $gooddays .= "$day|";
    $day = localtime( time - 72 * 3600 );
    $day =~ s/^... (...) +(\d+) (\S+) ..(..)/$1-$2-$4/;
    $gooddays .= "$day";
    undef $day;
    %locals = ( '127', 1, '10', 1, '192.168', 1, '169.254', 1 );    #RFC 1918
    for ( 16 .. 31 ) { $locals{ "172.$_" } = 1 }                    #RFC 1918

    while (<$FLogFile>) {
        next unless ( $date, $ip, $i1, $i2, $match ) = /($gooddays) .*\s((\d+)\.(\d+)\.\d+)\.\d+ .* to: \S+ (.*)/io;
        next if $locals{ ("$i1|$i1.$i2") };                         # ignore private IP ranges
        if (m/(local\sor\swhitelisted|message\sok)/i) {

            #Good IP
            $m{ $ip }  += 1;
            $ok{ $ip } += 1;
            next;
        }
        if (m/(\sspam|bad\sattachment|virus\sdetected)/i) {

            #Bad IP
            $m{ $ip }  += 1;
            $ok{ $ip } += 0;
            next;
        }
    }
    close $FLogFile;
    if ( !%m ) {
        print "Skipping GrIPlist upload. Not enough messages processed.\n";
        return;
    }
    for ( sort keys %m ) { $st .= "$_\001$m{$_}\002$ok{$_}\003"; }
    if ($proxyserver) {
        &printlog("Uploading Griplist via Proxy: $proxyserver\n");
        $peeraddress = $proxyserver;
        $connect     = "POST http://scripts.asspsmtp.org/grey/uploadGrip.pl HTTP/1.0";
    }
    else {
        &printlog("Uploading Griplist via Direct Connection\n");
        $peeraddress = "scripts.asspsmtp.org:80";
        $connect     = <<"EOF";
POST /grey/uploadGrip.pl HTTP/1.1
User-Agent: rebuildspamdb/$VERSION ($^O; Perl/$];)
Host: scripts.asspsmtp.org
EOF
    }
    my $socket = new IO::Socket::INET( Proto => 'tcp', PeerAddr => $peeraddress, Timeout => 2 );
    if ( defined $socket ) {
        my $len = length($st);
        $connect .= <<"EOF";
Content-Type: application/x-www-form-urlencoded
Content-Length: $len

$st
EOF
        print { $socket } $connect;
        $socket->close;
        &printlog("submitted $len bytes\n");
    }
    else {
        &printlog("unable to connect to scripts.asspsmtp.org to upload griplist\n");
        return;
    }
    return;
} ## end sub uploadgriplist

sub compileregex {
    use re 'eval';
    my ( $name, $contents, $REname ) = @_;
    $contents = getrecontent( $contents, $name );
    $contents ||= '^(?!)';    # regexp that never matches

    # trim long matches to 32 chars including '...' at the end
    eval { $$REname = qr/(?si)$contents/ };
    if ($EVAL_ERROR) { &printlog("regular expression error in '$contents' for $name: $@"); }
    return q{};
}

sub optionList {

    # this converts a | separated list into a RE
    my ( $d, $configname ) = @_;
    $d = getrecontent( $d, $configname );
    $d =~ s/([\.\[\]\-\(\)\*\+\\])/\\$1/g;
    return $d;
}

sub fixPath {
    my ($path) = @_;
    my $len = length($path);
    if   ( !substr( $path, ( $len - 1 ), 1 ) eq q{/} ) { return $path . q{/}; }
    else                                               { return $path; }
    return;
}

sub repair {
    $/ = "\n";

    # mxa ptr rbl spf uribl white black
    my $pbdb = "$base/$pbdbfile";
    my ( @files, %w );
    my ( $k,     $v );
    if ( !( $pbdbfile =~ /mysql/ ) ) {
        foreach ( glob("$pbdb.*.db") ) { push( @files, $_ ); }
    }
    if ( !( $whitelistdb =~ /mysql/ ) ) { push( @files, $whitelistdb ); }
    if ( !( $redlistdb   =~ /mysql/ ) ) { push( @files, $redlistdb ); }
    foreach my $f (@files) {
        if ( !-e $f ) { next }
        open( my $curfile, "<", $f );

        #<$curfile>;
        while (<$curfile>) {
            ( $k, $v ) = split( /[\001\002\n]/, $_ );
            if ( $k eq q{} || $v eq q{} ) { next }

            #print "$k=$v\n";
            $w{ $k } = $v;
        }
        close $curfile;
        open( my $newfile, ">", "$f.new" );
        binmode $newfile;
        print { $newfile } "\n";
        for ( sort keys %w ) { print { $newfile } "$_\002$w{$_}\n"; }
        close $newfile;
        rename( $f, "$f.bak" );
        rename( "$f.new", $f );
        undef %w;
    }
    return;
} ## end sub repair
#####################################################################################
#                orderedtie
{

    package orderedtie;

    # This is a tied value that caches lookups from a sorted file; \n separates records,
    # \002 separates the key from the value. After main::OrderedTieHashSize lookups the cache is
    # cleared. This give us most of the speed of the hash without the huge memory overhead of storing
    # the entire hash and should be totally portable. Picking the best value for n requires some
    # tuning. A \n is required to start the file.
    # if you're updating entries it behoves you to call flush every so often to make sure that your
    # changes are saved. This also frees the memory used to remember updated values.
    # for my purposes a value of undef and a nonexistant key are the same
    # Obviosly if your keys or values contain \n or \002 it will totally goof things up.
    sub TIEHASH {
        my ( $c, $fn ) = @_;
        my $self = { fn => $fn, age => mtime($fn), cnt => 0, cache => {}, updated => {}, ptr => 1, };
        bless $self, $c;
        return $self;
    }
    sub DESTROY { $_[0]->flush(); }
    sub mtime { my @s = stat( $_[0] ); $s[9]; }

    sub flush {
        my $this = shift;
        return unless %{ $this->{ updated } };
        my $f = $this->{ fn };
        open( O, '>', "$f.tmp" ) || return;
        binmode(O);
        open( I, '<', "$f" ) || print O"\n";
        binmode(I);
        local $/ = "\n";
        my @l = ( sort keys %{ $this->{ updated } } );
        my ( $k, $d, $r, $v );

        while ( $r = <I> ) {
            ( $k, $d ) = split( "\002", $r );
            while ( @l && $l[0] lt $k ) {
                $v = $this->{ updated }{ $l[0] };
                print O"$l[0]\002$v\n" if $v;
                shift(@l);
            }
            if ( $l[0] eq $k ) {
                $v = $this->{ updated }{ $l[0] };
                print O"$l[0]\002$v\n" if $v;
                shift(@l);
            }
            else { print O$r; }
        }
        while (@l) {
            $v = $this->{ updated }{ $l[0] };
            print O"$l[0]\002$v\n" if $v;
            shift(@l);
        }
        close I;
        close O;
        unlink($f);
        rename( "$f.tmp", $f );
        $this->{ updated } = {};
    } ## end sub flush

    sub STORE {
        my ( $this, $key, $value ) = @_;
        $this->{ cache }{ $key } = $this->{ updated }{ $key } = $value;
    }

    sub FETCH {
        my ( $this, $key ) = @_;
        return $this->{ cache }{ $key } if exists $this->{ cache }{ $key };
        $this->resetCache()
            if ( $this->{ cnt }++ > $main::OrderedTieHashSize
            || ( $this->{ cnt } & 0x1f ) == 0 && mtime( $this->{ fn } ) != $this->{ age } );
        return $this->{ cache }{ $key } = binsearch( $this->{ fn }, $key );
    }

    sub resetCache {
        my $this = shift;
        $this->{ cnt }   = 0;
        $this->{ age }   = mtime( $this->{ fn } );
        $this->{ cache } = { %{ $this->{ updated } } };
    }

    sub binsearch {
        my ( $f, $k ) = @_;
        open( F, '<', "$f" ) || return;
        binmode(F);
        my $siz = my $h = -s $f;
        $siz -= 1024;
        my $l  = 0;
        my $k0 = $k;
        $k =~ s/([\[\]\(\)\*\^\!\|\+\.\\\/\?\`\$\@\{\}])/\\$1/g;    # make sure there's no re chars unqutoed in the key

        #print "k=$k ($_[1])\n";
        while (1) {
            my $m = ( ( $l + $h ) >> 1 ) - 1024;
            $m = 0 if $m < 0;

            #print "($l $m $h) ";
            seek( F, $m, 0 );
            my $d;
            my $read = read( F, $d, 2048 );
            if ( $d =~ /\n$k\002([^\n]*)\n/ ) {
                close F;

                #print "got $1\n";
                return $1;
            }
            my ( $pre, $first, $last, $post ) = $d =~ /^(.*?)\n(.*?)\002.*\n(.*?)\002.*?\n(.*?)$/s;

            #print "f=$first ";
            last unless defined $first;
            if ( $k0 gt $first && $k0 lt $last ) {

                #print "got miss\n";
                last;
            }
            if ( $k0 lt $first ) {
                last if $m == 0;
                $h = $m - 1024 + length($pre);
                $h = 0 if $h < 0;
            }
            if ( $k0 gt $last ) {
                last if $m >= $siz;
                $l = $m + $read - length($post);
            }

            #print "l=$l h=$h ";
        } ## end while (1)
        close F;
        return;
    } ## end sub binsearch

    sub FIRSTKEY {
        my $this = shift;
        $this->flush();
        $this->{ ptr } = 1;
        $this->NEXTKEY();
    }

    sub NEXTKEY {
        my ( $this, $lastkey ) = @_;
        local $/ = "\n";
        open( F, '<', "$this->{fn}" ) || return;
        binmode(F);
        seek( F, $this->{ ptr }, 0 );
        my $r = <F>;
        return unless $r;
        $this->{ ptr } = tell F;
        close F;
        my ( $k, $v ) = $r =~ /(.*?)\002(.*?)\n/s;

        if ( !exists( $this->{ cache }{ $k } ) && $this->{ cnt }++ > $main::OrderedTieHashSize ) {
            $this->{ cnt }   = 0;
            $this->{ cache } = { %{ $this->{ updated } } };
        }
        $this->{ cache }{ $k } = $v;
        $k;
    }

    sub EXISTS {
        my ( $this, $key ) = @_;
        return FETCH( $this, $key );
    }

    sub DELETE {
        my ( $this, $key ) = @_;
        $this->{ cache }{ $key } = $this->{ updated }{ $key } = undef;
    }

    sub CLEAR {
        my ($this) = @_;
        open( F, '>', "$this->{fn}" );
        binmode(F);
        print "\n";
        close F;
        $this->{ cache }   = {};
        $this->{ updated } = {};
        $this->{ cnt }     = 0;
    }
}