Merge branch 'gatherstats' into next

This commit is contained in:
Thomas Hochstein 2013-08-11 21:56:37 +02:00
commit c3973e7d0d
2 changed files with 46 additions and 23 deletions

View file

@ -184,7 +184,7 @@ sub ListNewsgroups {
next if($TLH and !/^$TLH/);
# don't count invalid newsgroups
if(%ValidGroups and !defined($ValidGroups{$_})) {
&Bleat(1,sprintf("DROPPED: %s",$_));
warn (sprintf("DROPPED: %s\n",$_));
next;
}
# add original newsgroup to %Newsgroups
@ -230,8 +230,9 @@ sub ReadGroupList {
my %ValidGroups;
open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!");
while (<$LIST>) {
s/^(\S+).*$/$1/;
s/^\s*(\S+).*$/$1/;
chomp;
next if /^$/;
$ValidGroups{$_} = '1';
};
close $LIST;

View file

@ -78,28 +78,30 @@ my $TLH;
if ($Conf{'TLH'}) {
# $Conf{'TLH'} is parsed as an array by Config::Auto;
# make a flat list again, separated by :
if (ref($TLH) eq 'ARRAY') {
if (ref($Conf{'TLH'}) eq 'ARRAY') {
$TLH = join(':',@{$Conf{'TLH'}});
} else {
$TLH = $Conf{'TLH'};
}
# strip whitespace
$TLH =~ s/\s//g;
# add trailing dots if none are present yet
# (using negative look-behind assertions)
$TLH =~ s/(?<!\.):/.:/g;
$TLH =~ s/(?<!\.)$/./;
# check for illegal characters
&Bleat(2,'Config error - illegal characters in TLH definition!')
if ($TLH !~ /^[a-zA-Z0-9:]+$/);
if ($TLH !~ /^[a-zA-Z0-9:+.-]+$/);
# escape dots
$TLH =~ s/\./\\./g;
if ($TLH =~ /:/) {
# reformat $TLH from a:b to (a)|(b),
# e.g. replace '.' by '|'
# e.g. replace ':' by ')|('
$TLH =~ s/:/)|(/g;
$TLH = '(' . $TLH . ')';
};
};
# read list of newsgroups from --checkgroups
# into a hash
my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile;
### init database
my $DBHandle = InitDB(\%Conf,1);
@ -110,6 +112,11 @@ foreach my $Month (&ListMonth($Period)) {
print "---------- $Month ----------\n" if $OptDebug;
if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') {
# read list of newsgroups from --checkgroups
# into a hash
my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))}
if $OptCheckgroupsFile;
### ----------------------------------------------
### get groups data (number of postings per group)
# get groups data from raw table for given month
@ -125,7 +132,7 @@ foreach my $Month (&ListMonth($Period)) {
# count postings per group
my %Postings;
while (($_) = $DBQuery->fetchrow_array) {
# get list oft newsgroups and hierarchies from Newsgroups:
# get list of newsgroups and hierarchies from Newsgroups:
my %Newsgroups = ListNewsgroups($_,$TLH,
$OptCheckgroupsFile ? \%ValidGroups : '');
# count each newsgroup and hierarchy once
@ -137,9 +144,16 @@ foreach my $Month (&ListMonth($Period)) {
# add valid but empty groups if --checkgroups is set
if (%ValidGroups) {
foreach (sort keys %ValidGroups) {
if (!defined($Postings{$_})) {
# expand newsgroup with hierarchies
my @Newsgroups = ParseHierarchies($_);
# add each empty newsgroup and empty hierarchies, too, as needed
foreach (@Newsgroups) {
if (!defined($Postings{$_})) {
$Postings{$_} = 0;
warn (sprintf("ADDED: %s as empty group\n",$_));
};
};
}
};
};
@ -189,7 +203,7 @@ gatherstats - process statistical data from a raw source
=head1 SYNOPSIS
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats] [B<-c> I<checkgroups file>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>]
=head1 REQUIREMENTS
@ -283,15 +297,23 @@ Set processing type to one of I<all> and I<groups>. Defaults to all
(and is currently rather pointless as only I<groups> has been
implemented).
=item B<-c>, B<--checkgroups> I<filename>
=item B<-c>, B<--checkgroups> I<filename template>
Check each group against a list of valid newsgroups read from
I<filename>, one group on each line and ignoring everything after the
first whitespace (so you can use a file in checkgroups format or (part
of) your INN active file).
Check each group against a list of valid newsgroups read from a file,
one group on each line and ignoring everything after the first
whitespace (so you can use a file in checkgroups format or (part of)
your INN active file).
Newsgroups not found in I<filename> will be dropped (and logged to
STDERR), and newsgroups found in I<filename> but having no postings
The filename is taken from I<filename template>, amended by each B<--
month> B<gatherstats> is processing, so that
gatherstats -m 2010-01:2010-12 -c checkgroups
will check against F<checkgroups-2010-01> for January 2010, against
F<checkgroups-2010-02> for February 2010 and so on.
Newsgroups not found in the checkgroups file will be dropped (and
logged to STDERR), and newsgroups found there but having no postings
will be added with a count of 0 (and logged to STDERR).
=item B<--hierarchy> I<TLH> (newsgroup hierarchy)
@ -335,9 +357,9 @@ Process all types of information for January of 2010:
gatherstats --month 2010-01
Process only number of postings for the year of 2010,
checking against checkgroups-2010.txt:
checking against checkgroups-*:
gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt
gatherstats -m 2010-01:2010-12 -s groups -c checkgroups
=head1 FILES