Just warn if host or client can't be identified. Signed-off-by: Thomas Hochstein <thh@thh.name>
965 lines
32 KiB
Perl
Executable file
965 lines
32 KiB
Perl
Executable file
#! /usr/bin/perl
|
|
#
|
|
# gatherstats.pl
|
|
#
|
|
# This script will gather statistical information from a database
|
|
# containing headers and other information from a INN feed.
|
|
#
|
|
# It is part of the NewsStats package.
|
|
#
|
|
# Copyright (c) 2010-2013, 2025 Thomas Hochstein <thh@thh.name>
|
|
#
|
|
# It can be redistributed and/or modified under the same terms under
|
|
# which Perl itself is published.
|
|
|
|
BEGIN {
|
|
use File::Basename;
|
|
# we're in .../bin, so our module is in ../lib
|
|
push(@INC, dirname($0).'/../lib');
|
|
}
|
|
use strict;
|
|
use warnings;
|
|
|
|
use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList ParseHeaders);
|
|
|
|
use DBI;
|
|
use Data::Dumper;
|
|
use Encode qw(decode encode);
|
|
use Getopt::Long qw(GetOptions);
|
|
Getopt::Long::config ('bundling');
|
|
|
|
################################# Definitions ##################################
|
|
|
|
# define types of information that can be gathered
|
|
# all / groups (/ clients / hosts)
|
|
my %LegalStats;
|
|
@LegalStats{('all','groups','hosts','clients')} = ();
|
|
|
|
################################# Main program #################################
|
|
|
|
### read commandline options
|
|
my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
|
|
$OptHostsDB,$OptMID,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,
|
|
$OptConfFile);
|
|
GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
|
|
'clientsdb=s' => \$OptClientsDB,
|
|
'd|debug+' => \$OptDebug,
|
|
'groupsdb=s' => \$OptGroupsDB,
|
|
'hierarchy=s' => \$OptTLH,
|
|
'hostsdb=s' => \$OptHostsDB,
|
|
'mid=s' => \$OptMID,
|
|
'm|month=s' => \$OptMonth,
|
|
'rawdb=s' => \$OptRawDB,
|
|
's|stats=s' => \$OptStatsType,
|
|
't|test!' => \$OptTest,
|
|
'conffile=s' => \$OptConfFile,
|
|
'h|help' => \&ShowPOD,
|
|
'V|version' => \&ShowVersion) or exit 1;
|
|
|
|
### read configuration
|
|
my %Conf = %{ReadConfig($OptConfFile)};
|
|
|
|
### override configuration via commandline options
|
|
my %ConfOverride;
|
|
$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
|
|
$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
|
|
$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
|
|
$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
|
|
$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
|
|
&OverrideConfig(\%Conf,\%ConfOverride);
|
|
|
|
# set --debug and --test if --mid is set
|
|
if ($OptMID) {
|
|
$OptDebug = 1; $OptTest = 1;
|
|
}
|
|
|
|
### get type of information to gather, defaulting to 'all'
|
|
$OptStatsType = 'all' if !$OptStatsType;
|
|
&Bleat(2, sprintf("Unknown type '%s'!", $OptStatsType))
|
|
if !exists($LegalStats{$OptStatsType});
|
|
|
|
### get time period from --month
|
|
# get verbal description of time period, drop SQL code
|
|
my ($Period) = &GetTimePeriod($OptMonth);
|
|
# bail out if --month is invalid or set to 'ALL';
|
|
# we don't support the latter
|
|
&Bleat(2,"--month option has an invalid format - please use 'YYYY-MM' or ".
|
|
"'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
|
|
|
|
### init database
|
|
my $DBHandle = InitDB(\%Conf,1);
|
|
my $DBRaw = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
|
|
my $DBGrps = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
|
|
my $DBHosts = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'});
|
|
my $DBClients = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableClnts'});
|
|
|
|
### get data for each month
|
|
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
|
|
foreach my $Month (&ListMonth($Period)) {
|
|
|
|
print "---------- $Month ----------\n" if $OptDebug;
|
|
|
|
### GroupStats
|
|
if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
|
|
### reformat $Conf{'TLH'}
|
|
my $TLH;
|
|
if ($Conf{'TLH'}) {
|
|
# $Conf{'TLH'} is parsed as an array by Config::Auto;
|
|
# make a flat list again, separated by :
|
|
if (ref($Conf{'TLH'}) eq 'ARRAY') {
|
|
$TLH = join(':',@{$Conf{'TLH'}});
|
|
} else {
|
|
$TLH = $Conf{'TLH'};
|
|
}
|
|
# strip whitespace
|
|
$TLH =~ s/\s//g;
|
|
# add trailing dots if none are present yet
|
|
# (using negative look-behind assertions)
|
|
$TLH =~ s/(?<!\.):/.:/g;
|
|
$TLH =~ s/(?<!\.)$/./;
|
|
# check for illegal characters
|
|
&Bleat(2,'Config error - illegal characters in TLH definition!')
|
|
if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
|
|
# escape dots
|
|
$TLH =~ s/\./\\./g;
|
|
if ($TLH =~ /:/) {
|
|
# reformat $TLH from a:b to (a)|(b),
|
|
# e.g. replace ':' by ')|('
|
|
$TLH =~ s/:/)|(/g;
|
|
$TLH = '(' . $TLH . ')';
|
|
};
|
|
};
|
|
&GroupStats($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$OptCheckgroupsFile,$OptMID,$OptTest,$OptDebug);
|
|
};
|
|
|
|
### HostStats
|
|
if ($OptStatsType eq 'all' or $OptStatsType eq 'hosts') {
|
|
# define known hosts using subdomains
|
|
my @KnownHosts = qw(abavia.com aioe.org arcor-online.net arcor-ip.de astraweb.com read.cnntp.org
|
|
easynews.com eternal-september.org euro.net fernuni-hagen.de free.fr newsread.freenet.ag
|
|
googlegroups.com heirich.name news.neostrada.pl netcologne.de newsdawg.com newscene.com
|
|
news-service.com octanews.com readnews.com wieslauf.sub.de highway.telekom.at
|
|
united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl);
|
|
&HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@KnownHosts);
|
|
};
|
|
|
|
### ClientStats
|
|
if ($OptStatsType eq 'all' or $OptStatsType eq 'clients') {
|
|
# define agents/clients that shouldn't be counted
|
|
my @DropAgents = qw(debian fedora firefox gecko gentoo lightning mandriva mnenhy mozilla
|
|
pclinuxos perl php presto suse suse/opensuse thunderbrowse ubuntu version);
|
|
push(@DropAgents, 'red hat');
|
|
&ClientStats($DBHandle,$DBRaw,$DBClients,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@DropAgents);
|
|
};
|
|
};
|
|
|
|
### close handles
|
|
$DBHandle->disconnect;
|
|
|
|
################################# Subroutines ##################################
|
|
|
|
sub GroupStats {
|
|
### ----------------------------------------------------------------------------
|
|
### collect number of postings per group
|
|
### IN : $DBHandle : database handle
|
|
### $DBRaw : database table for raw data (to read from)
|
|
### $DBGrps : database table for groups data (to write to)
|
|
### $Month : current month to do
|
|
### $TLH : TLHs to collect
|
|
### $Checkgroupsfile : filename template for checkgroups file
|
|
### (expanded to $Checkgroupsfile-$Month)
|
|
### $MID : specific Message-ID to fetch (testing purposes)
|
|
### $Test : test mode
|
|
### $Debug : debug mode
|
|
### OUT: (nothing)
|
|
my ($DBHandle,$DBRaw,$DBGrps,$Month,$TLH,$CheckgroupsFile,$MID,$Test,$Debug) = @_;
|
|
|
|
# read list of newsgroups from --checkgroups
|
|
# into a hash
|
|
my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$CheckgroupsFile,$Month))}
|
|
if $CheckgroupsFile;
|
|
|
|
my $DBQuery;
|
|
if (!$MID) {
|
|
### ----------------------------------------------
|
|
### get groups data (number of postings per group)
|
|
# get groups data from raw table for given month
|
|
$DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s ".
|
|
"WHERE day LIKE ? AND NOT disregard",
|
|
$DBRaw));
|
|
$DBQuery->execute($Month.'-%')
|
|
or &Bleat(2,sprintf("Can't get groups data for %s from %s: ".
|
|
"$DBI::errstr\n",$Month,
|
|
$DBRaw));
|
|
} else {
|
|
$DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s ".
|
|
"WHERE mid = ?", $DBRaw));
|
|
$DBQuery->execute($MID)
|
|
or &Bleat(2,sprintf("Can't get groups data for %s from %s: ".
|
|
"$DBI::errstr\n",$MID,
|
|
$DBRaw));
|
|
}
|
|
|
|
# count postings per group
|
|
my %Postings;
|
|
while (($_) = $DBQuery->fetchrow_array) {
|
|
# get list of newsgroups and hierarchies from Newsgroups:
|
|
my %Newsgroups = ListNewsgroups($_,$TLH,
|
|
$CheckgroupsFile ? \%ValidGroups : '');
|
|
# count each newsgroup and hierarchy once
|
|
foreach (sort keys %Newsgroups) {
|
|
$Postings{$_}++;
|
|
};
|
|
};
|
|
|
|
# add valid but empty groups if --checkgroups is set
|
|
if (%ValidGroups) {
|
|
foreach (sort keys %ValidGroups) {
|
|
if (!defined($Postings{$_})) {
|
|
# add current newsgroup as empty group
|
|
$Postings{$_} = 0;
|
|
warn (sprintf("ADDED: %s as empty group\n",$_));
|
|
# add empty hierarchies for current newsgroup as needed
|
|
foreach (ParseHierarchies($_)) {
|
|
my $Hierarchy = $_ . '.ALL';
|
|
if (!defined($Postings{$Hierarchy})) {
|
|
$Postings{$Hierarchy} = 0;
|
|
warn (sprintf("ADDED: %s as empty group\n",$Hierarchy));
|
|
};
|
|
};
|
|
}
|
|
};
|
|
};
|
|
|
|
# delete old data for that month
|
|
if (!$Test) {
|
|
$DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
|
|
$DBGrps), undef,$Month)
|
|
or &Bleat(2,sprintf("Can't delete old groups data for %s from %s: ".
|
|
"$DBI::errstr\n",$Month,$DBGrps));
|
|
};
|
|
|
|
print "----- GroupStats -----\n" if $Debug;
|
|
foreach my $Newsgroup (sort keys %Postings) {
|
|
print "$Newsgroup => $Postings{$Newsgroup}\n" if $Debug;
|
|
if (!$Test) {
|
|
# write to database
|
|
$DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
|
|
"(month,newsgroup,postings) ".
|
|
"VALUES (?, ?, ?)",$DBGrps));
|
|
$DBQuery->execute($Month, $Newsgroup, $Postings{$Newsgroup})
|
|
or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ".
|
|
"$DBI::errstr\n",$Month,$Newsgroup,$DBGrps));
|
|
$DBQuery->finish;
|
|
};
|
|
};
|
|
};
|
|
### ----------------------------------------------------------------------------
|
|
|
|
sub HostStats {
|
|
### ----------------------------------------------------------------------------
|
|
### collect number of postings per server
|
|
### IN : $DBHandle : database handle
|
|
### $DBRaw : database table for raw data (to read from)
|
|
### $DBHosts : database table for hosts data (to write to)
|
|
### $Month : current month to do
|
|
### $TLH : TLHs to collect
|
|
### $MID : specific Message-ID to fetch (testing purposes)
|
|
### $Test : test mode
|
|
### $Debug : debug mode
|
|
### @KnownHosts : list of known hosts with subdomains
|
|
### OUT: (nothing)
|
|
my ($DBHandle,$DBRaw,$DBHosts,$Month,$TLH,$MID,$Test,$Debug,@KnownHosts) = @_;
|
|
|
|
my (%Postings,$DBQuery);
|
|
|
|
$DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID);
|
|
|
|
### ----------------------------------------------
|
|
print "----- HostStats -----\n" if $Debug;
|
|
### parse headers
|
|
while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
|
|
### skip postings with wrong TLH
|
|
next if ($TLH && !CheckTLH($Newsgroups,$TLH));
|
|
|
|
my $Host;
|
|
my %Header = ParseHeaders(split(/\n/,$Headers));
|
|
|
|
# ([a-z0-9-_]+\.[a-z0-9-_.]+) tries to match a hostname
|
|
# Injection-Info
|
|
if($Header{'injection-info'}) {
|
|
($Host) = $Header{'injection-info'} =~ /^\s*([a-z0-9-_]+\.[a-z0-9-_.]+);/i;
|
|
# reset if IP address
|
|
undef($Host) if $Host && $Host !~ /[g-z]/i;
|
|
}
|
|
# X-Trace
|
|
if (!$Host && $Header{'x-trace'}) {
|
|
(undef, $Host) = $Header{'x-trace'} =~ /^(\s|\d)*([a-z0-9-_]+\.[a-z0-9-_.]+)/i;
|
|
# reset if IP address
|
|
undef($Host) if $Host && $Host !~ /[g-z]/i;
|
|
}
|
|
# Path
|
|
if (!$Host) {
|
|
if ($Header{'path'} =~ /!([^!]+)!.POSTED!/) {
|
|
$Host = "$1";
|
|
} elsif ($Header{'path'} =~ /([^!]+)!.POSTED.[^!]+!?/) {
|
|
$Host = "$1";
|
|
} else {
|
|
# iterate on the Path: header until we have a host name or no more
|
|
# path elements
|
|
while (!$Host && $Header{'path'} =~ /!/) {
|
|
($Host) = $Header{'path'} =~ /!?([a-z0-9-_]+\.[a-z0-9-_.]+)!!?[^!]+!?$/i;
|
|
undef($Host) if $Host && $Host =~ /\.MISMATCH/;
|
|
# remove last path element
|
|
$Header{'path'} =~ s/!!?[^!]+$//;
|
|
};
|
|
}
|
|
}
|
|
|
|
# trailing .POSTED
|
|
($Host) = $Host =~ /(\S+)\.POSTED$/ if $Host =~ /\.POSTED$/;
|
|
|
|
# special cases
|
|
$Host = 'news.highwinds-media.com' if $Host =~ /f(e|x)\d\d\.\S{3}\d?$/
|
|
or $Host =~ /(newsfe|fed)\d+\.(iad|ams2)$/;
|
|
$Host = 'newshosting.com' if $Host =~ /post\d*\.iad$/;
|
|
$Host = 'eternal-september.org' if $Host =~ /dont-email\.me$/;
|
|
|
|
# normalize hosts
|
|
foreach (@KnownHosts) {
|
|
if ($Host =~ /\.$_$/) {
|
|
($Host) = $_ ;
|
|
last;
|
|
}
|
|
}
|
|
|
|
# count host
|
|
if ($Host) {
|
|
$Host = lc($Host);
|
|
$Postings{$Host}++;
|
|
$Postings{'ALL'}++;
|
|
} else {
|
|
&Bleat(1,sprintf("%s FAILED", $Header{'message-id'})) if !$Host;
|
|
}
|
|
|
|
printf("%s: %s\n", $Header{'message-id'}, $Host) if ($MID or $Debug && $Debug >1);
|
|
};
|
|
|
|
# delete old data for that month
|
|
if (!$Test) {
|
|
$DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
|
|
$DBHosts),undef,$Month)
|
|
or &Bleat(2,sprintf("Can't delete old hosts data for %s from %s: ".
|
|
"$DBI::errstr\n",$Month,$DBHosts));
|
|
};
|
|
|
|
foreach my $Host (sort keys %Postings) {
|
|
print "$Host => $Postings{$Host}\n" if $Debug;
|
|
if (!$Test) {
|
|
# write to database
|
|
$DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
|
|
"(month,host,postings) ".
|
|
"VALUES (?, ?, ?)",$DBHosts));
|
|
$DBQuery->execute($Month, $Host, $Postings{$Host})
|
|
or &Bleat(2,sprintf("Can't write groups data for %s/%s to %s: ".
|
|
"$DBI::errstr\n",$Month,$Host,$DBHosts));
|
|
$DBQuery->finish;
|
|
};
|
|
};
|
|
};
|
|
|
|
sub ClientStats {
|
|
### ----------------------------------------------------------------------------
|
|
### collect number of postings per client (and version)
|
|
### IN : $DBHandle : database handle
|
|
### $DBRaw : database table for raw data (to read from)
|
|
### $DBClients : database table for clients data (to write to)
|
|
### $Month : current month to do
|
|
### $TLH : TLHs to collect
|
|
### $MID : specific Message-ID to fetch (testing purposes)
|
|
### $Test : test mode
|
|
### $Debug : debug mode
|
|
### @DropAgents : list of UserAgent "agents" that won't be counted
|
|
### OUT: (nothing)
|
|
my ($DBHandle,$DBRaw,$DBClients,$Month,$TLH,$MID,$Test,$Debug,@DropAgents) = @_;
|
|
|
|
my (%Postings,$DBQuery);
|
|
my %DropAgent = map { $_ => 1 } @DropAgents;
|
|
|
|
$DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID);
|
|
|
|
### ----------------------------------------------
|
|
print "----- ClientStats -----\n" if $Debug;
|
|
### parse headers
|
|
while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
|
|
### skip postings with wrong TLH
|
|
next if ($TLH && !CheckTLH($Newsgroups,$TLH));
|
|
|
|
my (@Clients, $Client, $Version);
|
|
my %Header = ParseHeaders(split(/\n/,$Headers));
|
|
|
|
### X-Mailer
|
|
if ($Header{'x-mailer'}) {
|
|
# transfer to x-newsreader and parse from there
|
|
$Header{'x-newsreader'} = $Header{'x-mailer'};
|
|
}
|
|
### X-Newsreader
|
|
if ($Header{'x-newsreader'}) {
|
|
$Header{'x-newsreader'} = RemoveComments($Header{'x-newsreader'});
|
|
# remove 'http://' and 'via' (CrossPoint)
|
|
$Header{'x-newsreader'} =~ s/https?:\/\///;
|
|
$Header{'x-newsreader'} =~ s/ ?via(.+)?$//;
|
|
# parse header
|
|
# User-Agent style
|
|
if ($Header{'x-newsreader'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
|
|
# transfer to user-agent and parse from there
|
|
$Header{'user-agent'} = $Header{'x-newsreader'};
|
|
# "client name version"
|
|
} elsif ($Header{'x-newsreader'} =~ / /) {
|
|
($Client, $Version) = ParseXNewsreader($Header{'x-newsreader'});
|
|
} else {
|
|
$Client = $Header{'x-newsreader'};
|
|
$Version = '';
|
|
}
|
|
if ($Client) {
|
|
# special cases
|
|
$Client = 'CrossPoint' if $Client =~ /^CrossPoint\//;
|
|
$Client = 'Virtual Access' if $Client =~ /^Virtual Access/;
|
|
my %UserAgent = (agent => $Client,
|
|
version => $Version);
|
|
push @Clients, { %UserAgent };
|
|
} else {
|
|
$Header{'user-agent'} = $Header{'x-newsreader'};
|
|
}
|
|
}
|
|
### User-Agent
|
|
if(!@Clients && $Header{'user-agent'}) {
|
|
$Header{'user-agent'} = RemoveComments($Header{'user-agent'});
|
|
### well-formed?
|
|
if ($Header{'user-agent'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
|
|
@Clients = ParseUserAgent($Header{'user-agent'});
|
|
} else {
|
|
# snip and add known well-formed agents from the trailing end
|
|
while ($Header{'user-agent'} =~ /(((Hamster)|(Hamster-Pg)|(KorrNews)|(OE-Tools)|(Mime-proxy))(\/[^\/ ]+))$/) {
|
|
push @Clients, ParseUserAgent($1);
|
|
$Header{'user-agent'} =~ s/ [^\/ ]+\/[^\/ ]+$//;
|
|
}
|
|
### special cases
|
|
# remove 'http://open-news-network.org'
|
|
$Header{'user-agent'} =~ s/^https?:\/\/open-news-network.org(\S+)?//;
|
|
# Thunderbird
|
|
if ($Header{'user-agent'} =~ /((Mozilla[- ])?Thunderbird) ?([0-9.]+)?/) {
|
|
$Client = 'Thunderbird';
|
|
$Version = $3;
|
|
# XP
|
|
} elsif ($Header{'user-agent'} =~ /((TrueXP|FreeXP|XP2(\/Agent)?)) \/(.+)$/) {
|
|
$Client = $1;
|
|
$Version = $4;
|
|
$Client = 'XP2' if $Client eq 'XP2/Agent';
|
|
### most general case
|
|
# client version
|
|
# client/version
|
|
# client/32 version
|
|
# - version may end in one non-numeric character
|
|
# - including trailing beta/pre/...
|
|
# 1) client: (([^0-9]+)|(\D+\/\d+))
|
|
# 2) version: (\S+\d\D?)
|
|
# 3) trailing: (( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?
|
|
} elsif ($Header{'user-agent'} =~ /^(([^0-9]+)|(\D+\/\d+))[\/ ]((\S+\d\D?)(( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?)$/) {
|
|
$Client = $1;
|
|
$Version = $4;
|
|
### some very special cases
|
|
# SeaMonkey/nn
|
|
} elsif ($Header{'user-agent'} =~ /SeaMonkey\/([0-9.]+)/) {
|
|
$Client = 'Seamonkey';
|
|
$Version = $1;
|
|
# Emacs nn/Gnus nn
|
|
} elsif ($Header{'user-agent'} =~ /Emacs [0-9.]+\/Gnus ([0-9.]+)/) {
|
|
$Client = 'Gnus';
|
|
$Version = $1;
|
|
# failed to parse
|
|
} else {
|
|
$Client = $Header{'user-agent'};
|
|
}
|
|
# count client, if found
|
|
if ($Client) {
|
|
my %UserAgent = (agent => $Client,
|
|
version => $Version);
|
|
push @Clients, { %UserAgent };
|
|
} else {
|
|
&Bleat(1,sprintf("%s FAILED", $Header{'message-id'})) if !@Clients;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (@Clients) {
|
|
$Postings{'ALL'}{'ALL'}++;
|
|
foreach (@Clients) {
|
|
# filter agents for User-Agent with multiple agents
|
|
next if $#Clients && exists($DropAgent{lc($_->{'agent'})});
|
|
# encode to utf-8, if necessary
|
|
$_->{'agent'} = encode('UTF-8', $_->{'agent'}) if $_->{'agent'} =~ /[\x80-\x{ffff}]/;
|
|
$_->{'version'} = encode('UTF-8', $_->{'version'}) if $_->{'version'} and $_->{'version'} =~ /[\x80-\x{ffff}]/;
|
|
# special cases
|
|
# Mozilla
|
|
$_->{'agent'} = 'Mozilla' if $_->{'agent'} eq '•Mozilla';
|
|
$_->{'agent'} =~ s/^Mozilla //;
|
|
# Forte Agent
|
|
$_->{'agent'} = 'Forte Agent' if $_->{'agent'} eq 'ForteAgent';
|
|
if ($_->{'agent'} eq 'Forte Agent') {
|
|
$_->{'version'} =~ s/-/\//;
|
|
$_->{'version'} = '' if $_->{'version'} eq '32Bit';
|
|
}
|
|
# count client ('ALL') and client/version (if version is present)
|
|
$Postings{$_->{'agent'}}{'ALL'}++;
|
|
$Postings{$_->{'agent'}}{$_->{'version'}}++ if $_->{'version'};
|
|
|
|
printf("%s: %s {%s}\n", $Header{'message-id'}, $_->{'agent'},
|
|
$_->{'version'} ? $Postings{$_->{'agent'}}{$_->{'version'}} : '')
|
|
if ($MID or $Debug && $Debug >1);
|
|
}
|
|
}
|
|
};
|
|
|
|
# delete old data for that month
|
|
if (!$Test) {
|
|
$DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
|
|
$DBClients),undef,$Month)
|
|
or &Bleat(2,sprintf("Can't delete old client data for %s from %s: ".
|
|
"$DBI::errstr\n",$Month,$DBClients));
|
|
};
|
|
|
|
foreach my $Client (sort keys %Postings) {
|
|
foreach my $Version (sort keys %{$Postings{$Client}}) {
|
|
printf ("%s {%s}: %d\n",$Client,$Version,$Postings{$Client}{$Version}) if $Debug;
|
|
|
|
if (!$Test) {
|
|
# write to database
|
|
$DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
|
|
"(month,client,version,postings) ".
|
|
"VALUES (?, ?, ?, ?)",$DBClients));
|
|
$DBQuery->execute($Month, $Client, $Version, $Postings{$Client}{$Version})
|
|
or &Bleat(2,sprintf("Can't write groups data for %s/%s/%s to %s: ".
|
|
"$DBI::errstr\n",$Month,$Client,$Version,$DBClients));
|
|
$DBQuery->finish;
|
|
};
|
|
}
|
|
};
|
|
|
|
};
|
|
|
|
sub GetHeaders {
|
|
### ----------------------------------------------------------------------------
|
|
### get (newsgroups and) raw headers from database
|
|
### IN : $DBHandle: database handle
|
|
### $DBRaw : database table for raw data (to read from)
|
|
### $Month : current month to do
|
|
### $MID : specific Message-ID to fetch (testing purposes)
|
|
### OUT: DBI statement handle
|
|
my ($DBHandle,$DBRaw,$Month,$MID) = @_;
|
|
|
|
my $DBQuery;
|
|
|
|
if (!$MID) {
|
|
# get raw header data from raw table for given month
|
|
$DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups,headers FROM %s ".
|
|
"WHERE day LIKE ? AND NOT disregard",
|
|
$DBRaw));
|
|
$DBQuery->execute($Month.'-%')
|
|
or &Bleat(2,sprintf("Can't get header data for %s from %s: ".
|
|
"$DBI::errstr\n",$Month,$DBRaw));
|
|
} else {
|
|
$DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups,headers FROM %s ".
|
|
"WHERE mid = ?", $DBRaw));
|
|
$DBQuery->execute($MID)
|
|
or &Bleat(2,sprintf("Can't get header data for %s from %s: ".
|
|
"$DBI::errstr\n",$MID,$DBRaw));
|
|
}
|
|
return $DBQuery;
|
|
}
|
|
|
|
sub CheckTLH {
|
|
### ----------------------------------------------------------------------------
|
|
### count newsgroups from legal TLH(s)
|
|
### IN : $Newsgroups: comma separated list of newsgroups
|
|
### $TLH : (reference to an array of) legal TLH(s)
|
|
### OUT: number of newsgroups from legal TLH(s)
|
|
my ($Newsgroups,$TLH) = @_;
|
|
|
|
my (@TLH,$GroupCount);
|
|
|
|
# fill @TLH from $TLH, which can be an array reference or a scalar value
|
|
if (ref($TLH) eq 'ARRAY') {
|
|
@TLH = @{$TLH};
|
|
} else {
|
|
push @TLH, $TLH;
|
|
}
|
|
|
|
# remove whitespace from contents of Newsgroups:
|
|
chomp($Newsgroups);
|
|
$Newsgroups =~ s/\s//;
|
|
for (split /,/, $Newsgroups) {
|
|
my $Newsgroup = $_;
|
|
foreach (@TLH) {
|
|
# increment $GroupCount if $Newsgroup starts with $TLH
|
|
$GroupCount++ if $Newsgroup =~ /^$_/;
|
|
}
|
|
};
|
|
|
|
return $GroupCount;
|
|
}
|
|
|
|
sub RemoveComments {
|
|
### ----------------------------------------------------------------------------
|
|
### remove comments and other junk from header
|
|
### IN : $Header: a header
|
|
### OUT: the header, with comments and other junk removed
|
|
my $Header = shift;
|
|
|
|
# decode MIME encoded words
|
|
if ($Header =~ /=\?\S+\?[BQ]\?/) {
|
|
$Header = decode("MIME-Header",$Header);
|
|
}
|
|
|
|
# remove nested comments from '(' to first ')'
|
|
while ($Header =~ /\([^)]+\)/) {
|
|
$Header =~ s/\([^()]+?\)//;
|
|
}
|
|
|
|
# remove dangling ')'
|
|
$Header =~ s/\S+\)//;
|
|
|
|
# remove from dangling '(' to end of header
|
|
$Header =~ s/\(.+$//;
|
|
|
|
# remove from '[' to first ']'
|
|
$Header =~ s/\[[^\[\]]+?\]//;
|
|
|
|
# remove 'Nr. ... lebt'
|
|
$Header =~ s/Nr\. \d+ lebt//;
|
|
|
|
# remove nn:nn:nn
|
|
$Header =~ s/\d\d:\d\d:\d\d//;
|
|
|
|
# remove 'mm/... '
|
|
$Header =~ s/\/mm\/\S+//;
|
|
|
|
# remove ' DE' / _DE'
|
|
$Header =~ s/[ _]DE//;
|
|
|
|
# remove trailing 'eol' or '-shl'
|
|
$Header =~ s/(eol)|(-shl)$//;
|
|
|
|
# remove from ';' or ',' (CrossPoint)
|
|
# or '&' to end of header
|
|
$Header =~ s/[;,&].+$//;
|
|
|
|
# remove from 'by ' or 'unter Windows' or '@ Windows'
|
|
# to end of header
|
|
$Header =~ s/((by )|(unter +Windows)|(@ Windows)).+$//;
|
|
|
|
# remove superfluous whitespace in header
|
|
# and whitespace around header
|
|
$Header =~ s/\s+/ /g;
|
|
$Header =~ s/^\s+//;
|
|
$Header =~ s/\s+$//;
|
|
|
|
return $Header;
|
|
}
|
|
|
|
sub ParseXNewsreader {
|
|
### ----------------------------------------------------------------------------
|
|
### parse X-Newsreader header (client and version, if present)
|
|
### IN : $XNR: a X-Newsreader header
|
|
### OUT: client and version, if present
|
|
my $XNR = shift;
|
|
|
|
my ($Client, $Version);
|
|
|
|
foreach (split(/ /,$XNR)) {
|
|
# add to client name if no digit present
|
|
if (!/\d[0-9.]/ or /\/\d$/) {
|
|
$Client .= $_ . ' ' ;
|
|
# otherwise, use as version and terminate parsing
|
|
} else {
|
|
$Version = $_;
|
|
last;
|
|
}
|
|
}
|
|
|
|
# remove trailing whitespace
|
|
$Client =~ s/\s+$// if $Client;
|
|
|
|
# set $Version
|
|
$Version = '' if !$Version;
|
|
|
|
return $Client, $Version;
|
|
}
|
|
|
|
|
|
sub ParseUserAgent {
|
|
### ----------------------------------------------------------------------------
|
|
### parse User-Agent header (agent and version)
|
|
### IN : $UserAgent: a User-Agent header
|
|
### OUT: array of hashes (agent/version)
|
|
my $UserAgent = shift;
|
|
|
|
my @UserAgents;
|
|
|
|
# a well-formed User-Agent header will contain pairs of
|
|
# client/version, i.e. 'slrn/0.9.7.3'
|
|
foreach (split(/ /,$UserAgent)) {
|
|
my %UserAgent;
|
|
/^(.+)\/(.+)$/;
|
|
$UserAgent{'agent'} = $1;
|
|
$UserAgent{'version'} = $2;
|
|
push @UserAgents, { %UserAgent };
|
|
}
|
|
|
|
return @UserAgents;
|
|
}
|
|
|
|
__END__
|
|
|
|
################################ Documentation #################################
|
|
|
|
=head1 NAME
|
|
|
|
gatherstats - process statistical data from a raw source
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--conffile> I<filename>]
|
|
|
|
=head1 REQUIREMENTS
|
|
|
|
See L<doc/README>.
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This script will extract and process statistical information from a
|
|
database table which is fed from F<feedlog.pl> for a given time period
|
|
and write its results to (an)other database table(s). Entries marked
|
|
with I<'disregard'> in the database will be ignored; currently, you
|
|
have to set this flag yourself, using your database management tools.
|
|
You can exclude erroneous entries that way (e.g. automatic reposts
|
|
(think of cancels flood and resurrectors); spam; ...).
|
|
|
|
The time period to act on defaults to last month; you can assign
|
|
another time period or a single month via the B<--month> option (see
|
|
below).
|
|
|
|
By default B<gatherstats> will process all types of information; you
|
|
can change that using the B<--stats> option and assigning the type of
|
|
information to process.
|
|
|
|
Possible information types include:
|
|
|
|
=over 3
|
|
|
|
=item B<groups> (postings per group per month)
|
|
|
|
B<gatherstats> will examine Newsgroups: headers. Crosspostings will be
|
|
counted for each single group they appear in. Groups not in I<TLH>
|
|
will be ignored.
|
|
|
|
B<gatherstats> will also add up the number of postings for each
|
|
hierarchy level, but only count each posting once. A posting to
|
|
de.alt.test will be counted for de.alt.test, de.alt.ALL and de.ALL,
|
|
respectively. A crossposting to de.alt.test and de.alt.admin, on the
|
|
other hand, will be counted for de.alt.test and de.alt.admin each, but
|
|
only once for de.alt.ALL and de.ALL.
|
|
|
|
Data is written to I<DBTableGrps> (see L<doc/INSTALL>); you can
|
|
override that default through the B<--groupsdb> option.
|
|
|
|
=item B<hosts> (postings from host per month)
|
|
|
|
B<gatherstats> will examine Injection-Info:, X-Trace: and Path:
|
|
headers and try to normalize them. The sum of all detected hosts will
|
|
also be saved for each month. Groups not in I<TLH> will be ignored.
|
|
|
|
Data is written to I<DBTableHosts> (see L<doc/INSTALL>); you can
|
|
override that default through the B<--hostsdb> option.
|
|
|
|
=item B<clients> (postings by client per month)
|
|
|
|
B<gatherstats> will examine User-Agent:, X-Newsreader: and X-Mailer:
|
|
headers and try to remove comments and non-standard contents. Clients
|
|
and client versions are counted separately. The sum of all detected
|
|
clients will also be saved for each month. Groups not in I<TLH> will
|
|
be ignored.
|
|
|
|
Data is written to I<DBTableClnts> (see L<doc/INSTALL>); you can
|
|
override that default through the B<--clientsdb> option.
|
|
|
|
=back
|
|
|
|
=head2 Configuration
|
|
|
|
B<gatherstats> will read its configuration from F<newsstats.conf>
|
|
which should be present in etc/ via Config::Auto or from a configuration file
|
|
submitted by the B<--conffile> option.
|
|
|
|
See L<doc/INSTALL> for an overview of possible configuration options.
|
|
|
|
You can override configuration options via the B<--hierarchy>,
|
|
B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
|
|
respectively.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 3
|
|
|
|
=item B<-V>, B<--version>
|
|
|
|
Print out version and copyright information and exit.
|
|
|
|
=item B<-h>, B<--help>
|
|
|
|
Print this man page and exit.
|
|
|
|
=item B<-d>, B<--debug>
|
|
|
|
Output debugging information to STDOUT while processing (number of
|
|
postings per group).
|
|
|
|
=item B<-t>, B<--test>
|
|
|
|
Do not write results to database. You should use B<--debug> in
|
|
conjunction with B<--test> ... everything else seems a bit pointless.
|
|
|
|
=item B<-m>, B<--month> I<YYYY-MM[:YYYY-MM]>
|
|
|
|
Set processing period to a single month in YYYY-MM format or to a time
|
|
period between two month in YYYY-MM:YYYY-MM format (two month, separated
|
|
by a colon).
|
|
|
|
=item B<-s>, B<--stats> I<type>
|
|
|
|
Set processing type to one of I<all>, I<groups> or I<hosts>. Defaults
|
|
to all.
|
|
|
|
=item B<-c>, B<--checkgroups> I<filename template>
|
|
|
|
Check each group against a list of valid newsgroups read from a file,
|
|
one group on each line and ignoring everything after the first
|
|
whitespace (so you can use a file in checkgroups format or (part of)
|
|
your INN active file).
|
|
|
|
The filename is taken from I<filename template>, amended by each
|
|
B<--month> B<gatherstats> is processing in the form of I<template-YYYY-MM>,
|
|
so that
|
|
|
|
gatherstats -m 2010-01:2010-12 -c checkgroups
|
|
|
|
will check against F<checkgroups-2010-01> for January 2010, against
|
|
F<checkgroups-2010-02> for February 2010 and so on.
|
|
|
|
Newsgroups not found in the checkgroups file will be dropped (and
|
|
logged to STDERR), and newsgroups found there but having no postings
|
|
will be added with a count of 0 (and logged to STDERR).
|
|
|
|
=item B<--hierarchy> I<TLH> (newsgroup hierarchy/hierarchies)
|
|
|
|
Override I<TLH> from F<newsstats.conf>.
|
|
|
|
I<TLH> can be a single word or a comma-separated list.
|
|
|
|
=item B<--rawdb> I<table> (raw data table)
|
|
|
|
Override I<DBTableRaw> from F<newsstats.conf>.
|
|
|
|
=item B<--groupsdb> I<table> (postings per group table)
|
|
|
|
Override I<DBTableGrps> from F<newsstats.conf>.
|
|
|
|
=item B<--hostsdb> I<table> (host data table)
|
|
|
|
Override I<DBTableHosts> from F<newsstats.conf>.
|
|
|
|
=item B<--clientsdb> I<table> (client data table)
|
|
|
|
Override I<DBTableClnts> from F<newsstats.conf>.
|
|
|
|
=item B<--conffile> I<filename>
|
|
|
|
Load configuration from I<filename> instead of F<newsstats.conf>.
|
|
|
|
=back
|
|
|
|
=head1 INSTALLATION
|
|
|
|
See L<doc/INSTALL>.
|
|
|
|
=head1 EXAMPLES
|
|
|
|
Process all types of information for lasth month:
|
|
|
|
gatherstats
|
|
|
|
Do a dry run, showing results of processing:
|
|
|
|
gatherstats --debug --test
|
|
|
|
Process all types of information for January of 2010:
|
|
|
|
gatherstats --month 2010-01
|
|
|
|
Process only number of postings for the year of 2010,
|
|
checking against checkgroups-*:
|
|
|
|
gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
|
|
|
|
=head1 FILES
|
|
|
|
=over 4
|
|
|
|
=item F<bin/gatherstats.pl>
|
|
|
|
The script itself.
|
|
|
|
=item F<lib/NewsStats.pm>
|
|
|
|
Library functions for the NewsStats package.
|
|
|
|
=item F<etc/newsstats.conf>
|
|
|
|
Runtime configuration file.
|
|
|
|
=back
|
|
|
|
=head1 BUGS
|
|
|
|
Please report any bugs or feature requests to the author or use the
|
|
bug tracker at L<https://code.virtcomm.de/thh/newsstats/issues>!
|
|
|
|
=head1 SEE ALSO
|
|
|
|
=over 2
|
|
|
|
=item -
|
|
|
|
L<doc/README>
|
|
|
|
=item -
|
|
|
|
L<doc/INSTALL>
|
|
|
|
=back
|
|
|
|
This script is part of the B<NewsStats> package.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Thomas Hochstein <thh@thh.name>
|
|
|
|
=head1 COPYRIGHT AND LICENSE
|
|
|
|
Copyright (c) 2010-2013, 2025 Thomas Hochstein <thh@thh.name>
|
|
|
|
This program is free software; you may redistribute it and/or modify it
|
|
under the same terms as Perl itself.
|
|
|
|
=cut
|