Merge branch 'gatherstats' into next

This commit is contained in:
Thomas Hochstein 2013-08-11 21:56:37 +02:00
commit c3973e7d0d
2 changed files with 46 additions and 23 deletions

View file

@ -184,7 +184,7 @@ sub ListNewsgroups {
next if($TLH and !/^$TLH/); next if($TLH and !/^$TLH/);
# don't count invalid newsgroups # don't count invalid newsgroups
if(%ValidGroups and !defined($ValidGroups{$_})) { if(%ValidGroups and !defined($ValidGroups{$_})) {
&Bleat(1,sprintf("DROPPED: %s",$_)); warn (sprintf("DROPPED: %s\n",$_));
next; next;
} }
# add original newsgroup to %Newsgroups # add original newsgroup to %Newsgroups
@ -230,8 +230,9 @@ sub ReadGroupList {
my %ValidGroups; my %ValidGroups;
open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!"); open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!");
while (<$LIST>) { while (<$LIST>) {
s/^(\S+).*$/$1/; s/^\s*(\S+).*$/$1/;
chomp; chomp;
next if /^$/;
$ValidGroups{$_} = '1'; $ValidGroups{$_} = '1';
}; };
close $LIST; close $LIST;

View file

@ -78,28 +78,30 @@ my $TLH;
if ($Conf{'TLH'}) { if ($Conf{'TLH'}) {
# $Conf{'TLH'} is parsed as an array by Config::Auto; # $Conf{'TLH'} is parsed as an array by Config::Auto;
# make a flat list again, separated by : # make a flat list again, separated by :
if (ref($TLH) eq 'ARRAY') { if (ref($Conf{'TLH'}) eq 'ARRAY') {
$TLH = join(':',@{$Conf{'TLH'}}); $TLH = join(':',@{$Conf{'TLH'}});
} else { } else {
$TLH = $Conf{'TLH'}; $TLH = $Conf{'TLH'};
} }
# strip whitespace # strip whitespace
$TLH =~ s/\s//g; $TLH =~ s/\s//g;
# add trailing dots if none are present yet
# (using negative look-behind assertions)
$TLH =~ s/(?<!\.):/.:/g;
$TLH =~ s/(?<!\.)$/./;
# check for illegal characters # check for illegal characters
&Bleat(2,'Config error - illegal characters in TLH definition!') &Bleat(2,'Config error - illegal characters in TLH definition!')
if ($TLH !~ /^[a-zA-Z0-9:]+$/); if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
# escape dots
$TLH =~ s/\./\\./g;
if ($TLH =~ /:/) { if ($TLH =~ /:/) {
# reformat $TLH from a:b to (a)|(b), # reformat $TLH from a:b to (a)|(b),
# e.g. replace '.' by '|' # e.g. replace ':' by ')|('
$TLH =~ s/:/)|(/g; $TLH =~ s/:/)|(/g;
$TLH = '(' . $TLH . ')'; $TLH = '(' . $TLH . ')';
}; };
}; };
# read list of newsgroups from --checkgroups
# into a hash
my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
### init database ### init database
my $DBHandle = InitDB(\%Conf,1); my $DBHandle = InitDB(\%Conf,1);
@ -110,6 +112,11 @@ foreach my $Month (&ListMonth($Period)) {
print "---------- $Month ----------\n" if $OptDebug; print "---------- $Month ----------\n" if $OptDebug;
if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') { if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
# read list of newsgroups from --checkgroups
# into a hash
my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
if $OptCheckgroupsFile;
### ---------------------------------------------- ### ----------------------------------------------
### get groups data (number of postings per group) ### get groups data (number of postings per group)
# get groups data from raw table for given month # get groups data from raw table for given month
@ -125,7 +132,7 @@ foreach my $Month (&ListMonth($Period)) {
# count postings per group # count postings per group
my %Postings; my %Postings;
while (($_) = $DBQuery->fetchrow_array) { while (($_) = $DBQuery->fetchrow_array) {
# get list oft newsgroups and hierarchies from Newsgroups: # get list of newsgroups and hierarchies from Newsgroups:
my %Newsgroups = ListNewsgroups($_,$TLH, my %Newsgroups = ListNewsgroups($_,$TLH,
$OptCheckgroupsFile ? \%ValidGroups : ''); $OptCheckgroupsFile ? \%ValidGroups : '');
# count each newsgroup and hierarchy once # count each newsgroup and hierarchy once
@ -138,8 +145,15 @@ foreach my $Month (&ListMonth($Period)) {
if (%ValidGroups) { if (%ValidGroups) {
foreach (sort keys %ValidGroups) { foreach (sort keys %ValidGroups) {
if (!defined($Postings{$_})) { if (!defined($Postings{$_})) {
$Postings{$_} = 0 ; # expand newsgroup with hierarchies
warn (sprintf("ADDED: %s as empty group\n",$_)); my @Newsgroups = ParseHierarchies($_);
# add each empty newsgroup and empty hierarchies, too, as needed
foreach (@Newsgroups) {
if (!defined($Postings{$_})) {
$Postings{$_} = 0;
warn (sprintf("ADDED: %s as empty group\n",$_));
};
};
} }
}; };
}; };
@ -189,7 +203,7 @@ gatherstats - process statistical data from a raw source
=head1 SYNOPSIS =head1 SYNOPSIS
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats> [B<-c> I<checkgroups file>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats> [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<--groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
=head1 REQUIREMENTS =head1 REQUIREMENTS
@ -283,15 +297,23 @@ Set processing type to one of I<all> and I<groups>. Defaults to all
(and is currently rather pointless as only I<groups> has been (and is currently rather pointless as only I<groups> has been
implemented). implemented).
=item B<-c>, B<--checkgroups> I<filename> =item B<-c>, B<--checkgroups> I<filename template>
Check each group against a list of valid newsgroups read from Check each group against a list of valid newsgroups read from a file,
I<filename>, one group on each line and ignoring everything after the one group on each line and ignoring everything after the first
first whitespace (so you can use a file in checkgroups format or (part whitespace (so you can use a file in checkgroups format or (part of)
of) your INN active file). your INN active file).
Newsgroups not found in I<filename> will be dropped (and logged to The filename is taken from I<filename template>, amended by each B<--
STDERR), and newsgroups found in I<filename> but having no postings month> B<gatherstats> is processing, so that
gatherstats -m 2010-01:2010-12 -c checkgroups
will check against F<checkgroups-2010-01> for January 2010, against
F<checkgroups-2010-02> for February 2010 and so on.
Newsgroups not found in the checkgroups file will be dropped (and
logged to STDERR), and newsgroups found there but having no postings
will be added with a count of 0 (and logged to STDERR). will be added with a count of 0 (and logged to STDERR).
=item B<--hierarchy> I<TLH> (newsgroup hierarchy) =item B<--hierarchy> I<TLH> (newsgroup hierarchy)
@ -335,9 +357,9 @@ Process all types of information for January of 2010:
gatherstats --month 2010-01 gatherstats --month 2010-01
Process only number of postings for the year of 2010, Process only number of postings for the year of 2010,
checking against checkgroups-2010.txt: checking against checkgroups-*:
gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
=head1 FILES =head1 FILES