First version of HostStats.

Signed-off-by: Thomas Hochstein <thh@thh.name>
This commit is contained in:
Thomas Hochstein 2025-05-10 21:39:08 +02:00
parent 73a2d70f16
commit 9ccb915d77

View file

@ -20,7 +20,7 @@ BEGIN {
use strict;
use warnings;
use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList);
use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList ParseHeaders);
use DBI;
use Getopt::Long qw(GetOptions);
@ -31,7 +31,7 @@ Getopt::Long::config ('bundling');
# define types of information that can be gathered
# all / groups (/ clients / hosts)
my %LegalStats;
@LegalStats{('all','groups')} = ();
@LegalStats{('all','groups','hosts')} = ();
################################# Main program #################################
@ -110,6 +110,7 @@ if ($Conf{'TLH'}) {
my $DBHandle = InitDB(\%Conf,1);
my $DBRaw = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
my $DBGrps = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
my $DBHosts = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'});
### get data for each month
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
@ -121,6 +122,15 @@ foreach my $Month (&ListMonth($Period)) {
if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
&GroupStats($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$OptCheckgroupsFile,$OptTest,$OptDebug);
};
### HostStats
if ($OptStatsType eq 'all' or $OptStatsType eq 'hosts') {
# define known hosts using subdomains
my @KnownHosts = qw(aioe.org arcor-online.net arcor-ip.de news.astraweb.com read.cnntp.org easynews.com eternal-september.org
fernuni-hagen.de free.fr newsread.freenet.ag googlegroups.com
news.neostrada.pl newsdawg.com newscene.com news-service.com octanews.com xsnews.nl news.xs4all.nl);
&HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTest,$OptDebug,@KnownHosts);
};
};
### close handles
@ -215,6 +225,126 @@ sub GroupStats {
};
### ----------------------------------------------------------------------------
sub HostStats {
### ----------------------------------------------------------------------------
### collect number of postings per server
### IN : $DBHandle : database handle
### $DBRaw : database table for raw data (to read from)
### $DBHosts : database table for hosts data (to write to)
### $Month : current month to do
### $Test : test mode
### $Debug : debug mode
### @KnownHosts : list of known hosts with subdomains
### OUT: (nothing)
my ($DBHandle,$DBRaw,$DBHosts,$Month,$Test,$Debug,@KnownHosts) = @_;
# define known hosts using subdomains
my %Postings;
# get raw header data from raw table for given month
my $DBQuery = $DBHandle->prepare(sprintf("SELECT headers FROM %s ".
"WHERE day LIKE ? AND NOT disregard",
$DBRaw));
$DBQuery->execute($Month.'-%')
or &Bleat(2,sprintf("Can't get hosts data for %s from %s.%s: ".
"$DBI::errstr\n",$Month,$DBRaw));
### ----------------------------------------------
print "----- HostStats -----\n" if $Debug;
### parse headers
while (($_) = $DBQuery->fetchrow_array) {
my $Host;
my %Header = ParseHeaders(split(/\n/,$_));
# ([a-z0-9-_]+\.[a-z0-9-_.]+) tries to match a hostname
# Injection-Info
if($Header{'injection-info'}) {
($Host) = $Header{'injection-info'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+);/i;
# reset if IP address
undef($Host) if $Host && $Host !~ /[g-z]/i;
}
# X-Trace
if (!$Host && $Header{'x-trace'}) {
(undef, $Host) = $Header{'x-trace'} =~ /^(\s|\d)*([a-z0-9-_]+\.[a-z0-9-_.]+)/i;
# reset if IP address
undef($Host) if $Host && $Host !~ /[g-z]/i;
}
# NNTP-Posting-Host
if (!$Host && $Header{'nntp-posting-host'}) {
($Host) = $Header{'nntp-posting-host'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+)/i;
# reset if IP address
undef($Host) if $Host && $Host !~ /[g-z]/i;
}
# Path
if (!$Host) {
if ($Header{'path'} =~ /!([^!]+)!.POSTED!/) {
$Host = "$1";
} elsif ($Header{'path'} =~ /!.POSTED.([^!]+)!/) {
$Host = "$1";
} else {
# iterate on the Path: header until we have a host name or no more
# path elements
while (!$Host && $Header{'path'} =~ /!/) {
($Host) = $Header{'path'} =~ /!?([a-z0-9-_]+\.[a-z0-9-_.]+)![^!]+$/i;
undef($Host) if $Host && $Host =~ /\.MISMATCH/;
# remove last path element
$Header{'path'} =~ s/![^!]+$//;
};
}
}
# special cases
$Host = 'news.highwinds-media.com' if $Host =~ /fx\d\d\.\S{3}\.POSTED/
or $Host =~ /newsfe\d+\.(iad|ams2)/;
$Host = 'newshosting.com' if $Host =~ /post\d*\.iad/;
# trailing .POSTED
($Host) = $Host =~ /(\S+)\.POSTED$/ if $Host =~ /\.POSTED$/;
# normalize hosts
foreach (@KnownHosts) {
if ($Host =~ /\.$_$/) {
($Host) = $_ ;
last;
}
}
# lowercase
$Host = lc($Host);
# count host
if ($Host) {
$Postings{$Host}++;
} else {
&Bleat(2,sprintf("%s FAILED", $Header{'message-id'})) if !$Host;
}
# !DEBUG! printf("%s: %s\n", $Header{'message-id'}, $Host);
};
# delete old data for that month
if (!$Test) {
$DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
$DBHosts),undef,$Month)
or &Bleat(2,sprintf("Can't delete old hosts data for %s from %s: ".
"$DBI::errstr\n",$Month,$DBHosts));
};
foreach my $Host (sort keys %Postings) {
print "$Host => $Postings{$Host}\n" if $Debug;
if (!$Test) {
# write to database
$DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
"(month,host,postings) ".
"VALUES (?, ?, ?)",$DBHosts));
$DBQuery->execute($Month, $Host, $Postings{$Host})
or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ".
"$DBI::errstr\n",$Month,$Host,$DBHosts));
$DBQuery->finish;
};
};
};
__END__
################################ Documentation #################################