diff --git a/bin/gatherstats.pl b/bin/gatherstats.pl index d360541..c741da7 100755 --- a/bin/gatherstats.pl +++ b/bin/gatherstats.pl @@ -20,7 +20,7 @@ BEGIN { use strict; use warnings; -use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList); +use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList ParseHeaders); use DBI; use Getopt::Long qw(GetOptions); @@ -31,7 +31,7 @@ Getopt::Long::config ('bundling'); # define types of information that can be gathered # all / groups (/ clients / hosts) my %LegalStats; -@LegalStats{('all','groups')} = (); +@LegalStats{('all','groups','hosts')} = (); ################################# Main program ################################# @@ -110,6 +110,7 @@ if ($Conf{'TLH'}) { my $DBHandle = InitDB(\%Conf,1); my $DBRaw = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'}); my $DBGrps = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'}); +my $DBHosts = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'}); ### get data for each month &Bleat(1,'Test mode. Database is not updated.') if $OptTest; @@ -121,6 +122,15 @@ foreach my $Month (&ListMonth($Period)) { if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') { &GroupStats($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$OptCheckgroupsFile,$OptTest,$OptDebug); }; + + ### HostStats + if ($OptStatsType eq 'all' or $OptStatsType eq 'hosts') { + # define known hosts using subdomains + my @KnownHosts = qw(aioe.org arcor-online.net arcor-ip.de news.astraweb.com read.cnntp.org easynews.com eternal-september.org + fernuni-hagen.de free.fr newsread.freenet.ag googlegroups.com + news.neostrada.pl newsdawg.com newscene.com news-service.com octanews.com xsnews.nl news.xs4all.nl); + &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTest,$OptDebug,@KnownHosts); + }; }; ### close handles @@ -215,6 +225,126 @@ sub GroupStats { }; ### ---------------------------------------------------------------------------- +sub HostStats { +### ---------------------------------------------------------------------------- +### collect number of postings per server +### IN : $DBHandle : database handle +### $DBRaw : database table for raw data (to read from) +### $DBHosts : database table for hosts data (to write to) +### $Month : current month to do +### $Test : test mode +### $Debug : debug mode +### @KnownHosts : list of known hosts with subdomains +### OUT: (nothing) + my ($DBHandle,$DBRaw,$DBHosts,$Month,$Test,$Debug,@KnownHosts) = @_; + + # define known hosts using subdomains + my %Postings; + + # get raw header data from raw table for given month + my $DBQuery = $DBHandle->prepare(sprintf("SELECT headers FROM %s ". + "WHERE day LIKE ? AND NOT disregard", + $DBRaw)); + $DBQuery->execute($Month.'-%') + or &Bleat(2,sprintf("Can't get hosts data for %s from %s.%s: ". + "$DBI::errstr\n",$Month,$DBRaw)); + + ### ---------------------------------------------- + print "----- HostStats -----\n" if $Debug; + ### parse headers + while (($_) = $DBQuery->fetchrow_array) { + my $Host; + my %Header = ParseHeaders(split(/\n/,$_)); + + # ([a-z0-9-_]+\.[a-z0-9-_.]+) tries to match a hostname + # Injection-Info + if($Header{'injection-info'}) { + ($Host) = $Header{'injection-info'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+);/i; + # reset if IP address + undef($Host) if $Host && $Host !~ /[g-z]/i; + } + # X-Trace + if (!$Host && $Header{'x-trace'}) { + (undef, $Host) = $Header{'x-trace'} =~ /^(\s|\d)*([a-z0-9-_]+\.[a-z0-9-_.]+)/i; + # reset if IP address + undef($Host) if $Host && $Host !~ /[g-z]/i; + } + # NNTP-Posting-Host + if (!$Host && $Header{'nntp-posting-host'}) { + ($Host) = $Header{'nntp-posting-host'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+)/i; + # reset if IP address + undef($Host) if $Host && $Host !~ /[g-z]/i; + } + # Path + if (!$Host) { + if ($Header{'path'} =~ /!([^!]+)!.POSTED!/) { + $Host = "$1"; + } elsif ($Header{'path'} =~ /!.POSTED.([^!]+)!/) { + $Host = "$1"; + } else { + # iterate on the Path: header until we have a host name or no more + # path elements + while (!$Host && $Header{'path'} =~ /!/) { + ($Host) = $Header{'path'} =~ /!?([a-z0-9-_]+\.[a-z0-9-_.]+)![^!]+$/i; + undef($Host) if $Host && $Host =~ /\.MISMATCH/; + # remove last path element + $Header{'path'} =~ s/![^!]+$//; + }; + } + } + + # special cases + $Host = 'news.highwinds-media.com' if $Host =~ /fx\d\d\.\S{3}\.POSTED/ + or $Host =~ /newsfe\d+\.(iad|ams2)/; + $Host = 'newshosting.com' if $Host =~ /post\d*\.iad/; + + # trailing .POSTED + ($Host) = $Host =~ /(\S+)\.POSTED$/ if $Host =~ /\.POSTED$/; + + # normalize hosts + foreach (@KnownHosts) { + if ($Host =~ /\.$_$/) { + ($Host) = $_ ; + last; + } + } + + # lowercase + $Host = lc($Host); + + # count host + if ($Host) { + $Postings{$Host}++; + } else { + &Bleat(2,sprintf("%s FAILED", $Header{'message-id'})) if !$Host; + } + + # !DEBUG! printf("%s: %s\n", $Header{'message-id'}, $Host); + }; + + # delete old data for that month + if (!$Test) { + $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?", + $DBHosts),undef,$Month) + or &Bleat(2,sprintf("Can't delete old hosts data for %s from %s: ". + "$DBI::errstr\n",$Month,$DBHosts)); + }; + + foreach my $Host (sort keys %Postings) { + print "$Host => $Postings{$Host}\n" if $Debug; + if (!$Test) { + # write to database + $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ". + "(month,host,postings) ". + "VALUES (?, ?, ?)",$DBHosts)); + $DBQuery->execute($Month, $Host, $Postings{$Host}) + or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ". + "$DBI::errstr\n",$Month,$Host,$DBHosts)); + $DBQuery->finish; + }; + }; +}; + __END__ ################################ Documentation #################################