diff --git a/bin/dbcreate.pl b/bin/dbcreate.pl index ea0fd6c..fee67ab 100755 --- a/bin/dbcreate.pl +++ b/bin/dbcreate.pl @@ -46,7 +46,7 @@ my $DBCreate = < < < < < < < < 1 } @DropAgents; + + $DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID); + + ### ---------------------------------------------- + print "----- ClientStats -----\n" if $Debug; + ### parse headers + while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) { + ### skip postings with wrong TLH + next if ($TLH && !CheckTLH($Newsgroups,$TLH)); + + my (@Clients, $Client, $Version); + my %Header = ParseHeaders(split(/\n/,$Headers)); + + ### X-Mailer + if ($Header{'x-mailer'}) { + # transfer to x-newsreader and parse from there + $Header{'x-newsreader'} = $Header{'x-mailer'}; + } + ### X-Newsreader + if ($Header{'x-newsreader'}) { + $Header{'x-newsreader'} = RemoveComments($Header{'x-newsreader'}); + # remove 'http://' and 'via' (CrossPoint) + $Header{'x-newsreader'} =~ s/https?:\/\///; + $Header{'x-newsreader'} =~ s/ ?via(.+)?$//; + # parse header + # User-Agent style + if ($Header{'x-newsreader'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) { + # transfer to user-agent and parse from there + $Header{'user-agent'} = $Header{'x-newsreader'}; + # "client name version" + } elsif ($Header{'x-newsreader'} =~ / /) { + ($Client, $Version) = ParseXNewsreader($Header{'x-newsreader'}); + } else { + $Client = $Header{'x-newsreader'}; + $Version = ''; + } + if ($Client) { + # special cases + $Client = 'CrossPoint' if $Client =~ /^CrossPoint\//; + $Client = 'Virtual Access' if $Client =~ /^Virtual Access/; + my %UserAgent = (agent => $Client, + version => $Version); + push @Clients, { %UserAgent }; + } else { + $Header{'user-agent'} = $Header{'x-newsreader'}; + } + } + ### User-Agent + if(!@Clients && $Header{'user-agent'}) { + $Header{'user-agent'} = RemoveComments($Header{'user-agent'}); + ### well-formed? + if ($Header{'user-agent'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) { + @Clients = ParseUserAgent($Header{'user-agent'}); + } else { + # snip and add known well-formed agents from the trailing end + while ($Header{'user-agent'} =~ /(((Hamster)|(Hamster-Pg)|(KorrNews)|(OE-Tools)|(Mime-proxy))(\/[^\/ ]+))$/) { + push @Clients, ParseUserAgent($1); + $Header{'user-agent'} =~ s/ [^\/ ]+\/[^\/ ]+$//; + } + ### special cases + # remove 'http://open-news-network.org' + $Header{'user-agent'} =~ s/^https?:\/\/open-news-network.org(\S+)?//; + # Thunderbird + if ($Header{'user-agent'} =~ /((Mozilla[- ])?Thunderbird) ?([0-9.]+)?/) { + $Client = 'Thunderbird'; + $Version = $3; + # XP + } elsif ($Header{'user-agent'} =~ /((TrueXP|FreeXP|XP2(\/Agent)?)) \/(.+)$/) { + $Client = $1; + $Version = $4; + $Client = 'XP2' if $Client eq 'XP2/Agent'; + ### most general case + # client version + # client/version + # client/32 version + # - version may end in one non-numeric character + # - including trailing beta/pre/... + # 1) client: (([^0-9]+)|(\D+\/\d+)) + # 2) version: (\S+\d\D?) + # 3) trailing: (( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)? + } elsif ($Header{'user-agent'} =~ /^(([^0-9]+)|(\D+\/\d+))[\/ ]((\S+\d\D?)(( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?)$/) { + $Client = $1; + $Version = $4; + ### some very special cases + # SeaMonkey/nn + } elsif ($Header{'user-agent'} =~ /SeaMonkey\/([0-9.]+)/) { + $Client = 'Seamonkey'; + $Version = $1; + # Emacs nn/Gnus nn + } elsif ($Header{'user-agent'} =~ /Emacs [0-9.]+\/Gnus ([0-9.]+)/) { + $Client = 'Gnus'; + $Version = $1; + # failed to parse + } else { + $Client = $Header{'user-agent'}; + } + # count client, if found + if ($Client) { + my %UserAgent = (agent => $Client, + version => $Version); + push @Clients, { %UserAgent }; + } else { + &Bleat(2,sprintf("%s FAILED", $Header{'message-id'})) if !@Clients; + } + } + } + + if (@Clients) { + $Postings{'ALL'}{'ALL'}++; + foreach (@Clients) { + # filter agents for User-Agent with multiple agents + next if $#Clients && exists($DropAgent{lc($_->{'agent'})}); + # encode to utf-8, if necessary + $_->{'agent'} = encode('UTF-8', $_->{'agent'}) if $_->{'agent'} =~ /[\x80-\x{ffff}]/; + $_->{'version'} = encode('UTF-8', $_->{'version'}) if $_->{'version'} and $_->{'version'} =~ /[\x80-\x{ffff}]/; + # special cases + # Mozilla + $_->{'agent'} = 'Mozilla' if $_->{'agent'} eq '•Mozilla'; + $_->{'agent'} =~ s/^Mozilla //; + # Forte Agent + $_->{'agent'} = 'Forte Agent' if $_->{'agent'} eq 'ForteAgent'; + if ($_->{'agent'} eq 'Forte Agent') { + $_->{'version'} =~ s/-/\//; + $_->{'version'} = '' if $_->{'version'} eq '32Bit'; + } + # count client ('ALL') and client/version (if version is present) + $Postings{$_->{'agent'}}{'ALL'}++; + $Postings{$_->{'agent'}}{$_->{'version'}}++ if $_->{'version'}; + + printf("%s: %s {%s}\n", $Header{'message-id'}, $_->{'agent'}, + $_->{'version'} ? $Postings{$_->{'agent'}}{$_->{'version'}} : '') + if ($MID or $Debug && $Debug >1); + } + } + }; + + # delete old data for that month + if (!$Test) { + $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?", + $DBClients),undef,$Month) + or &Bleat(2,sprintf("Can't delete old client data for %s from %s: ". + "$DBI::errstr\n",$Month,$DBClients)); + }; + + foreach my $Client (sort keys %Postings) { + foreach my $Version (sort keys %{$Postings{$Client}}) { + printf ("%s {%s}: %d\n",$Client,$Version,$Postings{$Client}{$Version}) if $Debug; + + if (!$Test) { + # write to database + $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ". + "(month,client,version,postings) ". + "VALUES (?, ?, ?, ?)",$DBClients)); + $DBQuery->execute($Month, $Client, $Version, $Postings{$Client}{$Version}) + or &Bleat(2,sprintf("Can't write groups data for %s/%s/%s to %s: ". + "$DBI::errstr\n",$Month,$Client,$Version,$DBClients)); + $DBQuery->finish; + }; + } + }; + +}; + sub GetHeaders { ### ---------------------------------------------------------------------------- ### get (newsgroups and) raw headers from database -### IN : $DBHandle : database handle -### $DBRaw : database table for raw data (to read from) -### $Month : current month to do -### $MID : specific Message-ID to fetch (testing purposes) +### IN : $DBHandle: database handle +### $DBRaw : database table for raw data (to read from) +### $Month : current month to do +### $MID : specific Message-ID to fetch (testing purposes) ### OUT: DBI statement handle my ($DBHandle,$DBRaw,$Month,$MID) = @_; @@ -389,8 +581,8 @@ sub GetHeaders { sub CheckTLH { ### ---------------------------------------------------------------------------- ### count newsgroups from legal TLH(s) -### IN : $Newsgroups : comma separated list of newsgroups -### $TLH : (reference to an array of) legal TLH(s) +### IN : $Newsgroups: comma separated list of newsgroups +### $TLH : (reference to an array of) legal TLH(s) ### OUT: number of newsgroups from legal TLH(s) my ($Newsgroups,$TLH) = @_; @@ -417,6 +609,116 @@ sub CheckTLH { return $GroupCount; } +sub RemoveComments { +### ---------------------------------------------------------------------------- +### remove comments and other junk from header +### IN : $Header: a header +### OUT: the header, with comments and other junk removed + my $Header = shift; + + # decode MIME encoded words + if ($Header =~ /=\?\S+\?[BQ]\?/) { + $Header = decode("MIME-Header",$Header); + } + + # remove nested comments from '(' to first ')' + while ($Header =~ /\([^)]+\)/) { + $Header =~ s/\([^()]+?\)//; + } + + # remove dangling ')' + $Header =~ s/\S+\)//; + + # remove from dangling '(' to end of header + $Header =~ s/\(.+$//; + + # remove from '[' to first ']' + $Header =~ s/\[[^\[\]]+?\]//; + + # remove 'Nr. ... lebt' + $Header =~ s/Nr\. \d+ lebt//; + + # remove nn:nn:nn + $Header =~ s/\d\d:\d\d:\d\d//; + + # remove 'mm/... ' + $Header =~ s/\/mm\/\S+//; + + # remove ' DE' / _DE' + $Header =~ s/[ _]DE//; + + # remove trailing 'eol' or '-shl' + $Header =~ s/(eol)|(-shl)$//; + + # remove from ';' or ',' (CrossPoint) + # or '&' to end of header + $Header =~ s/[;,&].+$//; + + # remove from 'by ' or 'unter Windows' or '@ Windows' + # to end of header + $Header =~ s/((by )|(unter +Windows)|(@ Windows)).+$//; + + # remove superfluous whitespace in header + # and whitespace around header + $Header =~ s/\s+/ /g; + $Header =~ s/^\s+//; + $Header =~ s/\s+$//; + + return $Header; +} + +sub ParseXNewsreader { +### ---------------------------------------------------------------------------- +### parse X-Newsreader header (client and version, if present) +### IN : $XNR: a X-Newsreader header +### OUT: client and version, if present + my $XNR = shift; + + my ($Client, $Version); + + foreach (split(/ /,$XNR)) { + # add to client name if no digit present + if (!/\d[0-9.]/ or /\/\d$/) { + $Client .= $_ . ' ' ; + # otherwise, use as version and terminate parsing + } else { + $Version = $_; + last; + } + } + + # remove trailing whitespace + $Client =~ s/\s+$// if $Client; + + # set $Version + $Version = '' if !$Version; + + return $Client, $Version; +} + + +sub ParseUserAgent { +### ---------------------------------------------------------------------------- +### parse User-Agent header (agent and version) +### IN : $UserAgent: a User-Agent header +### OUT: array of hashes (agent/version) + my $UserAgent = shift; + + my @UserAgents; + + # a well-formed User-Agent header will contain pairs of + # client/version, i.e. 'slrn/0.9.7.3' + foreach (split(/ /,$UserAgent)) { + my %UserAgent; + /^(.+)\/(.+)$/; + $UserAgent{'agent'} = $1; + $UserAgent{'version'} = $2; + push @UserAgents, { %UserAgent }; + } + + return @UserAgents; +} + __END__ ################################ Documentation ################################# @@ -427,7 +729,7 @@ gatherstats - process statistical data from a raw source =head1 SYNOPSIS -B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] [B<--conffile> I] +B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--hostsdb> I] [B<--clientsdb> I] [B<--conffile> I] =head1 REQUIREMENTS @@ -474,12 +776,23 @@ override that default through the B<--groupsdb> option. =item B (postings from host per month) B will examine Injection-Info:, X-Trace: and Path: -headers and try to normalize them. Groups not in I will be -ignored. The sum of all detected hosts will also saved for each month. +headers and try to normalize them. The sum of all detected hosts will +also be saved for each month. Groups not in I will be ignored. Data is written to I (see L); you can override that default through the B<--hostsdb> option. +=item B (postings by client per month) + +B will examine User-Agent:, X-Newsreader: and X-Mailer: +headers and try to remove comments and non-standard contents. Clients +and client versions are counted separately. The sum of all detected +clients will also be saved for each month. Groups not in I will +be ignored. + +Data is written to I (see L); you can +override that default through the B<--clientsdb> option. + =back =head2 Configuration @@ -561,14 +874,14 @@ Override I from F. Override I from F. -=item B<--clientsdb> I (client data table) - -Override I from F. - =item B<--hostsdb> I
(host data table) Override I from F. +=item B<--clientsdb> I
(client data table) + +Override I from F. + =item B<--conffile> I Load configuration from I instead of F. diff --git a/doc/ChangeLog b/doc/ChangeLog index dfa6de7..6792cc6 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -4,6 +4,7 @@ NewsStats 0.4.0 (unreleased) * Extract getting raw headers from HostStats to subroutine. * Improve documentation for config file. * ParseHeader: re-merge continuation lines. + * Add ClientStats to gatherstats. NewsStats 0.3.0 (2025-05-18) * Extract GroupStats (in gatherstats) to subroutine. diff --git a/etc/newsstats.conf.sample b/etc/newsstats.conf.sample index 1ea3430..a68ee6a 100644 --- a/etc/newsstats.conf.sample +++ b/etc/newsstats.conf.sample @@ -13,7 +13,7 @@ DBDatabase = newsstats DBTableRaw = raw_de DBTableGrps = groups_de DBTableHosts = hosts_de -#DBTableClnts = +DBTableClnts = clnts_de ### hierarchy configuration # comma-separated list of TLHs to parse