From 43a0fc776902f3a7b3ea019e94b67cb7e4500039 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Sun, 27 May 2012 13:56:06 +0200 Subject: [PATCH 1/7] Fix parsing of more than one TLH in config. The code introduced in 17ffbebad562acd3af71328cdbf187297b5a9e6d did not check the correct variable for being an array. Improve an unrelated comment, too. Signed-off-by: Thomas Hochstein --- gatherstats.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gatherstats.pl b/gatherstats.pl index b570cd8..d2d4faa 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -78,7 +78,7 @@ my $TLH; if ($Conf{'TLH'}) { # $Conf{'TLH'} is parsed as an array by Config::Auto; # make a flat list again, separated by : - if (ref($TLH) eq 'ARRAY') { + if (ref($Conf{'TLH'}) eq 'ARRAY') { $TLH = join(':',@{$Conf{'TLH'}}); } else { $TLH = $Conf{'TLH'}; @@ -90,7 +90,7 @@ if ($Conf{'TLH'}) { if ($TLH !~ /^[a-zA-Z0-9:]+$/); if ($TLH =~ /:/) { # reformat $TLH from a:b to (a)|(b), - # e.g. replace '.' by '|' + # e.g. replace ':' by ')|(' $TLH =~ s/:/)|(/g; $TLH = '(' . $TLH . ')'; }; From 7773fb6d8f26f2d2331a4bb394ed164160a9a210 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Sun, 27 May 2012 13:58:32 +0200 Subject: [PATCH 2/7] Match TLHs correctly, not only partially. The TLH was checked to match the beginning of the newsgroup name, not the whole TLH part. So the TLH "de" would match not only "de.test", but also "denver.test", which was not the desired outcome. Signed-off-by: Thomas Hochstein --- gatherstats.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gatherstats.pl b/gatherstats.pl index d2d4faa..6e41485 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -85,9 +85,15 @@ if ($Conf{'TLH'}) { } # strip whitespace $TLH =~ s/\s//g; + # add trailing dots if none are present yet + # (using negative look-behind assertions) + $TLH =~ s/(? Date: Sun, 27 May 2012 14:00:14 +0200 Subject: [PATCH 3/7] Allow more characters in TLH definitions. 
TLH may now also contain literal dots '.', allowing for using second or third level hierarchies as "TLH". To facilitate that, '+' and '-' will be allowed, too. Signed-off-by: Thomas Hochstein --- gatherstats.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gatherstats.pl b/gatherstats.pl index 6e41485..0a5a9a2 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -91,7 +91,7 @@ if ($Conf{'TLH'}) { $TLH =~ s/(? Date: Sun, 27 May 2012 15:31:49 +0200 Subject: [PATCH 4/7] Remove call to &Bleat where not appropriate. Some warn()ings are used for debugging purposes. Signed-off-by: Thomas Hochstein --- NewsStats.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NewsStats.pm b/NewsStats.pm index bdfdcf5..f0a4066 100644 --- a/NewsStats.pm +++ b/NewsStats.pm @@ -184,7 +184,7 @@ sub ListNewsgroups { next if($TLH and !/^$TLH/); # don't count invalid newsgroups if(%ValidGroups and !defined($ValidGroups{$_})) { - &Bleat(1,sprintf("DROPPED: %s",$_)); + warn (sprintf("DROPPED: %s\n",$_)); next; } # add original newsgroup to %Newsgroups From 7662b1065e85874ceee234701d93aa50a1cde408 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Sun, 27 May 2012 15:33:11 +0200 Subject: [PATCH 5/7] Be more fault-tolerant when reading checkgroups. * Accept lines starting with whitespace. * Drop empty "groups", i.e. lines containing only whitespace. 
Signed-off-by: Thomas Hochstein --- NewsStats.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NewsStats.pm b/NewsStats.pm index f0a4066..428719e 100644 --- a/NewsStats.pm +++ b/NewsStats.pm @@ -230,8 +230,9 @@ sub ReadGroupList { my %ValidGroups; open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!"); while (<$LIST>) { - s/^(\S+).*$/$1/; + s/^\s*(\S+).*$/$1/; chomp; + next if /^$/; $ValidGroups{$_} = '1'; }; close $LIST; From 93c8eae2edcb7ccb4a3fe25817908fdb0723f324 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Sun, 27 May 2012 15:53:29 +0200 Subject: [PATCH 6/7] Change interpretation of --checkgroups to template In most hierarchies, the list of valid newsgroups will change over time, so you'll have to use another checkgroups file for each month. gatherstats will now understand the value of --checkgroups to be a template and amend it with each month it is processing. Documentation changed accordingly. Signed-off-by: Thomas Hochstein --- gatherstats.pl | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/gatherstats.pl b/gatherstats.pl index 0a5a9a2..2bc426b 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -102,10 +102,6 @@ if ($Conf{'TLH'}) { }; }; -# read list of newsgroups from --checkgroups -# into a hash -my %ValidGroups = %{ReadGroupList($OptCheckgroupsFile)} if $OptCheckgroupsFile; - ### init database my $DBHandle = InitDB(\%Conf,1); @@ -116,6 +112,11 @@ foreach my $Month (&ListMonth($Period)) { print "---------- $Month ----------\n" if $OptDebug; if ($OptStatsType eq 'all' or $OptStatsType eq 'groups') { + # read list of newsgroups from --checkgroups + # into a hash + my %ValidGroups = %{ReadGroupList(sprintf('%s-%s',$OptCheckgroupsFile,$Month))} + if $OptCheckgroupsFile; + ### ---------------------------------------------- ### get groups data (number of postings per group) # get groups data from raw table for given month @@ -195,7 +196,7 @@ gatherstats - 
process statistical data from a raw source =head1 SYNOPSIS -B [B<-Vhdt>] [B<-m> I | I] [B<-s> I I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] +B [B<-Vhdt>] [B<-m> I | I] [B<-s> I I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] =head1 REQUIREMENTS @@ -289,15 +290,23 @@ Set processing type to one of I and I. Defaults to all (and is currently rather pointless as only I has been implemented). -=item B<-c>, B<--checkgroups> I +=item B<-c>, B<--checkgroups> I -Check each group against a list of valid newsgroups read from -I, one group on each line and ignoring everything after the -first whitespace (so you can use a file in checkgroups format or (part -of) your INN active file). +Check each group against a list of valid newsgroups read from a file, +one group on each line and ignoring everything after the first +whitespace (so you can use a file in checkgroups format or (part of) +your INN active file). -Newsgroups not found in I will be dropped (and logged to -STDERR), and newsgroups found in I but having no postings +The filename is taken from I, amended by each B<-- +month> B is processing, so that + + gatherstats -m 2010-01:2010-12 -c checkgroups + +will check against F for January 2010, against +F for February 2010 and so on. + +Newsgroups not found in the checkgroups file will be dropped (and +logged to STDERR), and newsgroups found there but having no postings will be added with a count of 0 (and logged to STDERR). 
=item B<--hierarchy> I (newsgroup hierarchy) @@ -341,9 +350,9 @@ Process all types of information for January of 2010: gatherstats --month 2010-01 Process only number of postings for the year of 2010, -checking against checkgroups-2010.txt: +checking against checkgroups-*: - gatherstats -m 2010-01:2010-12 -s groups -c checkgroups-2010.txt + gatherstats -m 2010-01:2010-12 -s groups -c checkgroups =head1 FILES From b5125b1099cf5cf12beb0520d5896b9a1d7850ae Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Sun, 11 Aug 2013 01:47:32 +0200 Subject: [PATCH 7/7] Add empty 'virtual' .ALL hierarchies as needed. When using a --checkgroups file while tabulating, valid but empty groups will be added with a posting count of zero as needed. If all groups in a sub-hierarchy are empty, the virtual '.ALL' group for that sub-hierarchy was not created, though. If local.test.dummy and local.test.binary were both empty, both groups were added with a posting count of '0', but local.test.ALL was not. Now we loop through all hierarchy elements using ParseHierarchies and add empty .ALL hierarchies as needed. Fixes #49. Also fixing a typo in some comment. :-) Signed-off-by: Thomas Hochstein --- gatherstats.pl | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/gatherstats.pl b/gatherstats.pl index 2bc426b..ae7d65d 100755 --- a/gatherstats.pl +++ b/gatherstats.pl @@ -132,7 +132,7 @@ foreach my $Month (&ListMonth($Period)) { # count postings per group my %Postings; while (($_) = $DBQuery->fetchrow_array) { - # get list oft newsgroups and hierarchies from Newsgroups: + # get list of newsgroups and hierarchies from Newsgroups: my %Newsgroups = ListNewsgroups($_,$TLH, $OptCheckgroupsFile ? 
\%ValidGroups : ''); # count each newsgroup and hierarchy once @@ -145,12 +145,19 @@ foreach my $Month (&ListMonth($Period)) { if (%ValidGroups) { foreach (sort keys %ValidGroups) { if (!defined($Postings{$_})) { - $Postings{$_} = 0 ; - warn (sprintf("ADDED: %s as empty group\n",$_)); + # expand newsgroup with hierarchies + my @Newsgroups = ParseHierarchies($_); + # add each empty newsgroup and empty hierarchies, too, as needed + foreach (@Newsgroups) { + if (!defined($Postings{$_})) { + $Postings{$_} = 0; + warn (sprintf("ADDED: %s as empty group\n",$_)); + }; + }; } }; }; - + # delete old data for that month if (!$OptTest) { $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s WHERE month = ?",