Add ClientStats to gatherstats.
Signed-off-by: Thomas Hochstein <thh@thh.name>
This commit is contained in:
		
							parent
							
								
									3e73346b20
								
							
						
					
					
						commit
						a553b374ce
					
				
					 4 changed files with 352 additions and 21 deletions
				
			
		| 
						 | 
				
			
			@ -46,7 +46,7 @@ my $DBCreate = <<SQLDB;
 | 
			
		|||
CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8;
 | 
			
		||||
SQLDB
 | 
			
		||||
 | 
			
		||||
my %DBCreate = ('DBTableRaw'  => <<RAW, 'DBTableGrps' => <<GRPS, 'DBTableHosts' => <<HOSTS);
 | 
			
		||||
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableGrps' => <<GRPS, 'DBTableHosts' => <<HOSTS, 'DBTableClnts' => <<CLIENTS);
 | 
			
		||||
--
 | 
			
		||||
-- Table structure for table DBTableRaw
 | 
			
		||||
--
 | 
			
		||||
| 
						 | 
				
			
			@ -102,6 +102,23 @@ CREATE TABLE IF NOT EXISTS `$Conf{'DBTableHosts'}` (
 | 
			
		|||
  KEY `host` (`host`)
 | 
			
		||||
) ENGINE=MyISAM  DEFAULT CHARSET=utf8 COMMENT='Postings per server';
 | 
			
		||||
HOSTS
 | 
			
		||||
--
 | 
			
		||||
-- Table structure for table DBTableClnts
 | 
			
		||||
--
 | 
			
		||||
 | 
			
		||||
CREATE TABLE IF NOT EXISTS `$Conf{'DBTableClnts'}` (
 | 
			
		||||
  `id` bigint(20) unsigned NOT NULL auto_increment,
 | 
			
		||||
  `month` varchar(7) character set ascii NOT NULL,
 | 
			
		||||
  `client` varchar(150) NOT NULL,
 | 
			
		||||
  `version` varchar(20) NOT NULL,
 | 
			
		||||
  `postings` int(11) NOT NULL,
 | 
			
		||||
  `revision` timestamp NOT NULL default CURRENT_TIMESTAMP on update CURRENT_TIMESTAMP,
 | 
			
		||||
  PRIMARY KEY  (`id`),
 | 
			
		||||
  UNIQUE KEY `month_client_version` (`month`,`client`,`version`),
 | 
			
		||||
  KEY `month` (`month`),
 | 
			
		||||
  KEY `client` (`client`)
 | 
			
		||||
) ENGINE=MyISAM  DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='Postings per client';
 | 
			
		||||
CLIENTS
 | 
			
		||||
 | 
			
		||||
##### --------------------------------------------------------------------------
 | 
			
		||||
##### Installation / upgrade instructions
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -23,6 +23,8 @@ use warnings;
 | 
			
		|||
use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ParseHierarchies ReadGroupList ParseHeaders);
 | 
			
		||||
 | 
			
		||||
use DBI;
 | 
			
		||||
use Data::Dumper;
 | 
			
		||||
use Encode qw(decode encode);
 | 
			
		||||
use Getopt::Long qw(GetOptions);
 | 
			
		||||
Getopt::Long::config ('bundling');
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -31,7 +33,7 @@ Getopt::Long::config ('bundling');
 | 
			
		|||
# define types of information that can be gathered
 | 
			
		||||
# all / groups (/ clients / hosts)
 | 
			
		||||
my %LegalStats;
 | 
			
		||||
@LegalStats{('all','groups','hosts')} = ();
 | 
			
		||||
@LegalStats{('all','groups','hosts','clients')} = ();
 | 
			
		||||
 | 
			
		||||
################################# Main program #################################
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -61,8 +63,8 @@ my %Conf = %{ReadConfig($OptConfFile)};
 | 
			
		|||
my %ConfOverride;
 | 
			
		||||
$ConfOverride{'DBTableRaw'}   = $OptRawDB if $OptRawDB;
 | 
			
		||||
$ConfOverride{'DBTableGrps'}  = $OptGroupsDB if $OptGroupsDB;
 | 
			
		||||
$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
 | 
			
		||||
$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
 | 
			
		||||
$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
 | 
			
		||||
$ConfOverride{'TLH'} = $OptTLH if $OptTLH;
 | 
			
		||||
&OverrideConfig(\%Conf,\%ConfOverride);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -85,10 +87,11 @@ my ($Period) = &GetTimePeriod($OptMonth);
 | 
			
		|||
         "'YYYY-MM:YYYY-MM'!") if (!$Period or $Period eq 'all time');
 | 
			
		||||
 | 
			
		||||
### init database
 | 
			
		||||
my $DBHandle = InitDB(\%Conf,1);
 | 
			
		||||
my $DBRaw    = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
 | 
			
		||||
my $DBGrps   = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
 | 
			
		||||
my $DBHosts  = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'});
 | 
			
		||||
my $DBHandle   = InitDB(\%Conf,1);
 | 
			
		||||
my $DBRaw      = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableRaw'});
 | 
			
		||||
my $DBGrps     = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableGrps'});
 | 
			
		||||
my $DBHosts    = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableHosts'});
 | 
			
		||||
my $DBClients  = sprintf('%s.%s',$Conf{'DBDatabase'},$Conf{'DBTableClnts'});
 | 
			
		||||
 | 
			
		||||
### get data for each month
 | 
			
		||||
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
 | 
			
		||||
| 
						 | 
				
			
			@ -139,6 +142,15 @@ foreach my $Month (&ListMonth($Period)) {
 | 
			
		|||
                        united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl);
 | 
			
		||||
    &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@KnownHosts);
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  ### ClientStats
 | 
			
		||||
  if ($OptStatsType eq 'all' or $OptStatsType eq 'clients') {
 | 
			
		||||
    # define agents/clients that shouldn't be counted
 | 
			
		||||
    my @DropAgents = qw(debian fedora firefox gecko gentoo lightning mandriva mnenhy mozilla
 | 
			
		||||
                        pclinuxos perl php presto suse suse/opensuse thunderbrowse ubuntu version);
 | 
			
		||||
    push(@DropAgents, 'red hat');
 | 
			
		||||
    &ClientStats($DBHandle,$DBRaw,$DBClients,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@DropAgents);
 | 
			
		||||
  };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
### close handles
 | 
			
		||||
| 
						 | 
				
			
			@ -356,13 +368,193 @@ sub HostStats {
 | 
			
		|||
  };
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
sub ClientStats {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### collect number of postings per client (and version)
 | 
			
		||||
### IN : $DBHandle         : database handle
 | 
			
		||||
###      $DBRaw            : database table for raw data (to read from)
 | 
			
		||||
###      $DBClients        : database table for clients data (to write to)
 | 
			
		||||
###      $Month            : current month to do
 | 
			
		||||
###      $TLH              : TLHs to collect
 | 
			
		||||
###      $MID              : specific Message-ID to fetch (testing purposes)
 | 
			
		||||
###      $Test             : test mode
 | 
			
		||||
###      $Debug            : debug mode
 | 
			
		||||
###      @DropAgents       : list of UserAgent "agents" that won't be counted
 | 
			
		||||
### OUT: (nothing)
 | 
			
		||||
  my ($DBHandle,$DBRaw,$DBClients,$Month,$TLH,$MID,$Test,$Debug,@DropAgents) = @_;
 | 
			
		||||
 | 
			
		||||
  my (%Postings,$DBQuery);
 | 
			
		||||
  my %DropAgent = map { $_ => 1 } @DropAgents;
 | 
			
		||||
 | 
			
		||||
  $DBQuery = GetHeaders($DBHandle,$DBRaw,$Month,$MID);
 | 
			
		||||
 | 
			
		||||
  ### ----------------------------------------------
 | 
			
		||||
  print "----- ClientStats -----\n" if $Debug;
 | 
			
		||||
  ### parse headers
 | 
			
		||||
  while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
 | 
			
		||||
    ### skip postings with wrong TLH
 | 
			
		||||
    next if ($TLH && !CheckTLH($Newsgroups,$TLH));
 | 
			
		||||
 | 
			
		||||
    my (@Clients, $Client, $Version);
 | 
			
		||||
    my %Header = ParseHeaders(split(/\n/,$Headers));
 | 
			
		||||
 | 
			
		||||
    ### X-Mailer
 | 
			
		||||
    if ($Header{'x-mailer'}) {
 | 
			
		||||
      # transfer to x-newsreader and parse from there
 | 
			
		||||
      $Header{'x-newsreader'} = $Header{'x-mailer'};
 | 
			
		||||
    }
 | 
			
		||||
    ### X-Newsreader
 | 
			
		||||
    if ($Header{'x-newsreader'}) {
 | 
			
		||||
      $Header{'x-newsreader'} = RemoveComments($Header{'x-newsreader'});
 | 
			
		||||
      # remove 'http://' and 'via' (CrossPoint)
 | 
			
		||||
      $Header{'x-newsreader'} =~ s/https?:\/\///;
 | 
			
		||||
      $Header{'x-newsreader'} =~ s/ ?via(.+)?$//;
 | 
			
		||||
      # parse header
 | 
			
		||||
      # User-Agent style
 | 
			
		||||
      if ($Header{'x-newsreader'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
 | 
			
		||||
        # transfer to user-agent and parse from there
 | 
			
		||||
        $Header{'user-agent'} = $Header{'x-newsreader'};
 | 
			
		||||
      # "client name version"
 | 
			
		||||
      } elsif ($Header{'x-newsreader'} =~ / /) {
 | 
			
		||||
        ($Client, $Version) = ParseXNewsreader($Header{'x-newsreader'});
 | 
			
		||||
      } else {
 | 
			
		||||
        $Client = $Header{'x-newsreader'};
 | 
			
		||||
        $Version = '';
 | 
			
		||||
      }
 | 
			
		||||
      if ($Client) {
 | 
			
		||||
        # special cases
 | 
			
		||||
        $Client  = 'CrossPoint'     if $Client =~ /^CrossPoint\//;
 | 
			
		||||
        $Client  = 'Virtual Access' if $Client =~ /^Virtual Access/;
 | 
			
		||||
        my %UserAgent = (agent   => $Client,
 | 
			
		||||
                         version => $Version);
 | 
			
		||||
        push @Clients, { %UserAgent };
 | 
			
		||||
      } else {
 | 
			
		||||
        $Header{'user-agent'} = $Header{'x-newsreader'};
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    ### User-Agent
 | 
			
		||||
    if(!@Clients && $Header{'user-agent'}) {
 | 
			
		||||
      $Header{'user-agent'} = RemoveComments($Header{'user-agent'});
 | 
			
		||||
      ### well-formed?
 | 
			
		||||
      if ($Header{'user-agent'} =~ /^([^\/ ]+\/[^\/ ]+ ?)+$/) {
 | 
			
		||||
        @Clients = ParseUserAgent($Header{'user-agent'});
 | 
			
		||||
      } else {
 | 
			
		||||
        # snip and add known well-formed agents from the trailing end
 | 
			
		||||
        while ($Header{'user-agent'} =~ /(((Hamster)|(Hamster-Pg)|(KorrNews)|(OE-Tools)|(Mime-proxy))(\/[^\/ ]+))$/) {
 | 
			
		||||
          push @Clients, ParseUserAgent($1);
 | 
			
		||||
          $Header{'user-agent'} =~ s/ [^\/ ]+\/[^\/ ]+$//;
 | 
			
		||||
        }
 | 
			
		||||
        ### special cases
 | 
			
		||||
        # remove 'http://open-news-network.org'
 | 
			
		||||
        $Header{'user-agent'} =~ s/^https?:\/\/open-news-network.org(\S+)?//;
 | 
			
		||||
        # Thunderbird
 | 
			
		||||
        if ($Header{'user-agent'} =~ /((Mozilla[- ])?Thunderbird) ?([0-9.]+)?/) {
 | 
			
		||||
          $Client  = 'Thunderbird';
 | 
			
		||||
          $Version = $3;
 | 
			
		||||
        # XP
 | 
			
		||||
        } elsif ($Header{'user-agent'} =~ /((TrueXP|FreeXP|XP2(\/Agent)?)) \/(.+)$/) {
 | 
			
		||||
          $Client  = $1;
 | 
			
		||||
          $Version = $4;
 | 
			
		||||
          $Client  = 'XP2' if $Client eq 'XP2/Agent';
 | 
			
		||||
        ### most general case
 | 
			
		||||
        # client version
 | 
			
		||||
        # client/version
 | 
			
		||||
        # client/32 version
 | 
			
		||||
        # - version may end in one non-numeric character
 | 
			
		||||
        # - including trailing beta/pre/...
 | 
			
		||||
        # 1) client:   (([^0-9]+)|(\D+\/\d+))
 | 
			
		||||
        # 2) version:  (\S+\d\D?)
 | 
			
		||||
        # 3) trailing: (( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?
 | 
			
		||||
        } elsif ($Header{'user-agent'} =~ /^(([^0-9]+)|(\D+\/\d+))[\/ ]((\S+\d\D?)(( alpha\d?)|( beta\d?)|( rc\d)| pre| trialware)?)$/) {
 | 
			
		||||
          $Client  = $1;
 | 
			
		||||
          $Version = $4;
 | 
			
		||||
        ### some very special cases
 | 
			
		||||
        # SeaMonkey/nn
 | 
			
		||||
        } elsif ($Header{'user-agent'} =~ /SeaMonkey\/([0-9.]+)/) {
 | 
			
		||||
          $Client  = 'Seamonkey';
 | 
			
		||||
          $Version = $1;
 | 
			
		||||
        # Emacs nn/Gnus nn
 | 
			
		||||
        } elsif ($Header{'user-agent'} =~ /Emacs [0-9.]+\/Gnus ([0-9.]+)/) {
 | 
			
		||||
          $Client  = 'Gnus';
 | 
			
		||||
          $Version = $1;
 | 
			
		||||
        # failed to parse
 | 
			
		||||
        } else {
 | 
			
		||||
          $Client = $Header{'user-agent'};
 | 
			
		||||
        }
 | 
			
		||||
        # count client, if found
 | 
			
		||||
        if ($Client) {
 | 
			
		||||
          my %UserAgent = (agent   => $Client,
 | 
			
		||||
                           version => $Version);
 | 
			
		||||
          push @Clients, { %UserAgent };
 | 
			
		||||
        } else {
 | 
			
		||||
          &Bleat(2,sprintf("%s FAILED", $Header{'message-id'})) if !@Clients;
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if (@Clients) {
 | 
			
		||||
      $Postings{'ALL'}{'ALL'}++;
 | 
			
		||||
      foreach (@Clients) {
 | 
			
		||||
        # filter agents for User-Agent with multiple agents
 | 
			
		||||
        next if $#Clients && exists($DropAgent{lc($_->{'agent'})});
 | 
			
		||||
        # encode to utf-8, if necessary
 | 
			
		||||
        $_->{'agent'}   = encode('UTF-8', $_->{'agent'})   if $_->{'agent'} =~ /[\x80-\x{ffff}]/;
 | 
			
		||||
        $_->{'version'} = encode('UTF-8', $_->{'version'}) if $_->{'version'} and $_->{'version'} =~ /[\x80-\x{ffff}]/;
 | 
			
		||||
        # special cases
 | 
			
		||||
        # Mozilla
 | 
			
		||||
        $_->{'agent'} = 'Mozilla' if $_->{'agent'} eq '•Mozilla';
 | 
			
		||||
        $_->{'agent'} =~ s/^Mozilla //;
 | 
			
		||||
        # Forte Agent
 | 
			
		||||
        $_->{'agent'} = 'Forte Agent' if $_->{'agent'} eq 'ForteAgent';
 | 
			
		||||
        if ($_->{'agent'} eq 'Forte Agent') {
 | 
			
		||||
          $_->{'version'} =~ s/-/\//;
 | 
			
		||||
          $_->{'version'} = '' if $_->{'version'} eq '32Bit';
 | 
			
		||||
        }
 | 
			
		||||
        # count client ('ALL') and client/version (if version is present)
 | 
			
		||||
        $Postings{$_->{'agent'}}{'ALL'}++;
 | 
			
		||||
        $Postings{$_->{'agent'}}{$_->{'version'}}++ if $_->{'version'};
 | 
			
		||||
 | 
			
		||||
        printf("%s: %s {%s}\n", $Header{'message-id'}, $_->{'agent'},
 | 
			
		||||
                                $_->{'version'} ? $Postings{$_->{'agent'}}{$_->{'version'}} : '')
 | 
			
		||||
                                if ($MID or $Debug && $Debug >1);
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  # delete old data for that month
 | 
			
		||||
  if (!$Test) {
 | 
			
		||||
    $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s WHERE month = ?",
 | 
			
		||||
                                     $DBClients),undef,$Month)
 | 
			
		||||
      or &Bleat(2,sprintf("Can't delete old client data for %s from %s: ".
 | 
			
		||||
                          "$DBI::errstr\n",$Month,$DBClients));
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
  foreach my $Client (sort keys %Postings) {
 | 
			
		||||
    foreach my $Version (sort keys %{$Postings{$Client}}) {
 | 
			
		||||
      printf ("%s {%s}: %d\n",$Client,$Version,$Postings{$Client}{$Version}) if $Debug;
 | 
			
		||||
 | 
			
		||||
      if (!$Test) {
 | 
			
		||||
        # write to database
 | 
			
		||||
        $DBQuery = $DBHandle->prepare(sprintf("INSERT INTO %s ".
 | 
			
		||||
                                              "(month,client,version,postings) ".
 | 
			
		||||
                                              "VALUES (?, ?, ?, ?)",$DBClients));
 | 
			
		||||
        $DBQuery->execute($Month, $Client, $Version, $Postings{$Client}{$Version})
 | 
			
		||||
          or &Bleat(2,sprintf("Can't write groups data for %s/%s/%s to %s: ".
 | 
			
		||||
                              "$DBI::errstr\n",$Month,$Client,$Version,$DBClients));
 | 
			
		||||
        $DBQuery->finish;
 | 
			
		||||
      };
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
sub GetHeaders {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### get (newsgroups and) raw headers from database
 | 
			
		||||
### IN : $DBHandle : database handle
 | 
			
		||||
###      $DBRaw    : database table for raw data (to read from)
 | 
			
		||||
###      $Month    : current month to do
 | 
			
		||||
###      $MID      : specific Message-ID to fetch (testing purposes)
 | 
			
		||||
### IN : $DBHandle: database handle
 | 
			
		||||
###      $DBRaw   : database table for raw data (to read from)
 | 
			
		||||
###      $Month   : current month to do
 | 
			
		||||
###      $MID     : specific Message-ID to fetch (testing purposes)
 | 
			
		||||
### OUT: DBI statement handle
 | 
			
		||||
  my ($DBHandle,$DBRaw,$Month,$MID) = @_;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -389,8 +581,8 @@ sub GetHeaders {
 | 
			
		|||
sub CheckTLH {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### count newsgroups from legal TLH(s)
 | 
			
		||||
### IN : $Newsgroups : comma separated list of newsgroups
 | 
			
		||||
###      $TLH        : (reference to an array of) legal TLH(s)
 | 
			
		||||
### IN : $Newsgroups: comma separated list of newsgroups
 | 
			
		||||
###      $TLH       : (reference to an array of) legal TLH(s)
 | 
			
		||||
### OUT: number of newsgroups from legal TLH(s)
 | 
			
		||||
  my ($Newsgroups,$TLH) = @_;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -417,6 +609,116 @@ sub CheckTLH {
 | 
			
		|||
  return $GroupCount;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub RemoveComments {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### remove comments and other junk from header
 | 
			
		||||
### IN : $Header: a header
 | 
			
		||||
### OUT: the header, with comments and other junk removed
 | 
			
		||||
  my $Header = shift;
 | 
			
		||||
 | 
			
		||||
  # decode MIME encoded words
 | 
			
		||||
  if ($Header =~ /=\?\S+\?[BQ]\?/) {
 | 
			
		||||
    $Header = decode("MIME-Header",$Header);
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  # remove nested comments from '(' to first ')'
 | 
			
		||||
  while ($Header =~ /\([^)]+\)/) {
 | 
			
		||||
    $Header =~ s/\([^()]+?\)//;
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # remove dangling ')'
 | 
			
		||||
  $Header =~ s/\S+\)//;
 | 
			
		||||
 | 
			
		||||
  # remove from dangling '(' to end of header
 | 
			
		||||
  $Header =~ s/\(.+$//;
 | 
			
		||||
 | 
			
		||||
  # remove from '[' to first ']'
 | 
			
		||||
  $Header =~ s/\[[^\[\]]+?\]//;
 | 
			
		||||
 | 
			
		||||
  # remove 'Nr. ... lebt'
 | 
			
		||||
  $Header =~ s/Nr\. \d+ lebt//;
 | 
			
		||||
 | 
			
		||||
  # remove nn:nn:nn
 | 
			
		||||
  $Header =~ s/\d\d:\d\d:\d\d//;
 | 
			
		||||
 | 
			
		||||
  # remove 'mm/... '
 | 
			
		||||
  $Header =~ s/\/mm\/\S+//;
 | 
			
		||||
 | 
			
		||||
  # remove ' DE' / _DE'
 | 
			
		||||
  $Header =~ s/[ _]DE//;
 | 
			
		||||
 | 
			
		||||
  # remove trailing 'eol' or '-shl'
 | 
			
		||||
  $Header =~ s/(eol)|(-shl)$//;
 | 
			
		||||
 | 
			
		||||
  # remove from ';' or ',' (CrossPoint)
 | 
			
		||||
  # or '&' to end of header
 | 
			
		||||
  $Header =~ s/[;,&].+$//;
 | 
			
		||||
 | 
			
		||||
  # remove from 'by ' or 'unter Windows' or '@ Windows'
 | 
			
		||||
  # to end of header
 | 
			
		||||
  $Header =~ s/((by )|(unter +Windows)|(@ Windows)).+$//;
 | 
			
		||||
 | 
			
		||||
  # remove superfluous whitespace in header
 | 
			
		||||
  # and whitespace around header
 | 
			
		||||
  $Header =~ s/\s+/ /g;
 | 
			
		||||
  $Header =~ s/^\s+//;
 | 
			
		||||
  $Header =~ s/\s+$//;
 | 
			
		||||
 | 
			
		||||
  return $Header;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
sub ParseXNewsreader {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### parse X-Newsreader header (client and version, if present)
 | 
			
		||||
### IN : $XNR: a X-Newsreader header
 | 
			
		||||
### OUT: client and version, if present
 | 
			
		||||
  my $XNR = shift;
 | 
			
		||||
 | 
			
		||||
  my ($Client, $Version);
 | 
			
		||||
 | 
			
		||||
  foreach (split(/ /,$XNR)) {
 | 
			
		||||
    # add to client name if no digit present
 | 
			
		||||
    if (!/\d[0-9.]/ or /\/\d$/) {
 | 
			
		||||
      $Client .= $_ . ' ' ;
 | 
			
		||||
    # otherwise, use as version and terminate parsing
 | 
			
		||||
    } else {
 | 
			
		||||
      $Version = $_;
 | 
			
		||||
      last;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  # remove trailing whitespace
 | 
			
		||||
  $Client =~ s/\s+$// if $Client;
 | 
			
		||||
 | 
			
		||||
  # set $Version
 | 
			
		||||
  $Version = '' if !$Version;
 | 
			
		||||
   
 | 
			
		||||
  return $Client, $Version;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
sub ParseUserAgent {
 | 
			
		||||
### ----------------------------------------------------------------------------
 | 
			
		||||
### parse User-Agent header (agent and version)
 | 
			
		||||
### IN : $UserAgent: a User-Agent header
 | 
			
		||||
### OUT: array of hashes (agent/version)
 | 
			
		||||
  my $UserAgent = shift;
 | 
			
		||||
 | 
			
		||||
  my @UserAgents;
 | 
			
		||||
 | 
			
		||||
  # a well-formed User-Agent header will contain pairs of
 | 
			
		||||
  # client/version, i.e. 'slrn/0.9.7.3'
 | 
			
		||||
  foreach (split(/ /,$UserAgent)) {
 | 
			
		||||
    my %UserAgent;
 | 
			
		||||
    /^(.+)\/(.+)$/;
 | 
			
		||||
    $UserAgent{'agent'}   = $1;
 | 
			
		||||
    $UserAgent{'version'} = $2;
 | 
			
		||||
    push @UserAgents, { %UserAgent };
 | 
			
		||||
  }
 | 
			
		||||
  
 | 
			
		||||
  return @UserAgents;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
__END__
 | 
			
		||||
 | 
			
		||||
################################ Documentation #################################
 | 
			
		||||
| 
						 | 
				
			
			@ -427,7 +729,7 @@ gatherstats - process statistical data from a raw source
 | 
			
		|||
 | 
			
		||||
=head1 SYNOPSIS
 | 
			
		||||
 | 
			
		||||
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
 | 
			
		||||
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--conffile> I<filename>]
 | 
			
		||||
 | 
			
		||||
=head1 REQUIREMENTS
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -474,12 +776,23 @@ override that default through the B<--groupsdb> option.
 | 
			
		|||
=item B<hosts> (postings from host per month)
 | 
			
		||||
 | 
			
		||||
B<gatherstats> will examine Injection-Info:, X-Trace: and Path:
 | 
			
		||||
headers and try to normalize them. Groups not in I<TLH> will be
 | 
			
		||||
ignored. The sum of all detected hosts will also saved for each month.
 | 
			
		||||
headers and try to normalize them. The sum of all detected hosts will
 | 
			
		||||
also be saved for each month. Groups not in I<TLH> will be ignored.
 | 
			
		||||
 | 
			
		||||
Data is written to I<DBTableHosts> (see L<doc/INSTALL>); you can
 | 
			
		||||
override that default through the B<--hostsdb> option.
 | 
			
		||||
 | 
			
		||||
=item B<clients> (postings by client per month)
 | 
			
		||||
 | 
			
		||||
B<gatherstats> will examine User-Agent:, X-Newsreader: and X-Mailer:
 | 
			
		||||
headers and try to remove comments and non-standard contents. Clients
 | 
			
		||||
and client versions are counted separately. The sum of all detected
 | 
			
		||||
clients will also be saved for each month. Groups not in I<TLH> will
 | 
			
		||||
be ignored. 
 | 
			
		||||
 | 
			
		||||
Data is written to I<DBTableClnts> (see L<doc/INSTALL>); you can
 | 
			
		||||
override that default through the B<--clientsdb> option.
 | 
			
		||||
 | 
			
		||||
=back
 | 
			
		||||
 | 
			
		||||
=head2 Configuration
 | 
			
		||||
| 
						 | 
				
			
			@ -561,14 +874,14 @@ Override I<DBTableRaw> from F<newsstats.conf>.
 | 
			
		|||
 | 
			
		||||
Override I<DBTableGrps> from F<newsstats.conf>.
 | 
			
		||||
 | 
			
		||||
=item B<--clientsdb> I<table> (client data table)
 | 
			
		||||
 | 
			
		||||
Override I<DBTableClnts> from F<newsstats.conf>.
 | 
			
		||||
 | 
			
		||||
=item B<--hostsdb> I<table> (host data table)
 | 
			
		||||
 | 
			
		||||
Override I<DBTableHosts> from F<newsstats.conf>.
 | 
			
		||||
 | 
			
		||||
=item B<--clientsdb> I<table> (client data table)
 | 
			
		||||
 | 
			
		||||
Override I<DBTableClnts> from F<newsstats.conf>.
 | 
			
		||||
 | 
			
		||||
=item B<--conffile> I<filename>
 | 
			
		||||
 | 
			
		||||
Load configuration from I<filename> instead of F<newsstats.conf>.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -4,6 +4,7 @@ NewsStats 0.4.0 (unreleased)
 | 
			
		|||
  * Extract getting raw headers from HostStats to subroutine.
 | 
			
		||||
  * Improve documentation for config file.
 | 
			
		||||
  * ParseHeader: re-merge continuation lines.
 | 
			
		||||
  * Add ClientStats to gatherstats.
 | 
			
		||||
 | 
			
		||||
NewsStats 0.3.0 (2025-05-18)
 | 
			
		||||
  * Extract GroupStats (in gatherstats) to subroutine.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -13,7 +13,7 @@ DBDatabase = newsstats
 | 
			
		|||
DBTableRaw   = raw_de
 | 
			
		||||
DBTableGrps  = groups_de
 | 
			
		||||
DBTableHosts = hosts_de
 | 
			
		||||
#DBTableClnts =
 | 
			
		||||
DBTableClnts = clnts_de
 | 
			
		||||
 | 
			
		||||
### hierarchy configuration
 | 
			
		||||
# comma-separated list of TLHs to parse
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue