diff --git a/NewsStats.pm b/NewsStats.pm
index 35e0c90..2cb1be2 100644
--- a/NewsStats.pm
+++ b/NewsStats.pm
@@ -31,6 +31,7 @@ require Exporter;
   SplitPeriod
   ListMonth
   ListNewsgroups
+  ReadGroupList
   OutputData
   FormatOutput
   SQLHierarchies
@@ -154,15 +155,25 @@ sub ListNewsgroups {
 ### explode a (scalar) list of newsgroup names to a list of newsgroup and
 ### hierarchy names where every newsgroup and hierarchy appears only once:
 ### de.alt.test,de.alt.admin -> de.ALL, de.alt.ALL, de.alt.test, de.alt.admin
-### IN : $Newsgroups: a list of newsgroups (content of Newsgroups: header)
-### OUT: %Newsgroups: hash containing all newsgroup and hierarchy names as keys
-  my ($Newsgroups) = @_;
+### IN : $Newsgroups  : a list of newsgroups (content of Newsgroups: header)
+###      $ValidGroupsR: reference to a hash containing all valid newsgroups
+###                     as keys
+### OUT: %Newsgroups  : hash containing all newsgroup and hierarchy names as keys
+  my ($Newsgroups,$ValidGroupsR) = @_;
+  # check groups only if a non-empty hash of valid groups was passed; the
+  # hash is used through its reference, so it is not copied on every call
+  my $CheckGroups = ref($ValidGroupsR) eq 'HASH' && %{$ValidGroupsR};
   my %Newsgroups;
   chomp($Newsgroups);
   # remove whitespace from contents of Newsgroups:
-  $Newsgroups =~ s/\s//;
+  $Newsgroups =~ s/\s//g;
   # call &HierarchyCount for each newsgroup in $Newsgroups:
   for (split /,/, $Newsgroups) {
+    # don't count invalid newsgroups
+    if($CheckGroups and !defined($ValidGroupsR->{$_})) {
+      warn (sprintf("DROPPED: %s\n",$_));
+      next;
+    }
     # add original newsgroup to %Newsgroups
     $Newsgroups{$_} = 1;
     # add all hierarchy elements to %Newsgroups, amended by '.ALL',
@@ -194,6 +205,29 @@ sub ParseHierarchies {
   return @Hierarchies;
 };
 
+################################################################################
+sub ReadGroupList {
+################################################################################
+### read a list of valid newsgroups from file (each group on one line,
+### ignoring everything after the first whitespace and so accepting files
+### in checkgroups format as well as (parts of) an INN active file);
+### leading whitespace and empty lines are skipped
+### IN : $Filename    : file to read
+### OUT: \%ValidGroups: hash containing all valid newsgroups
+  my ($Filename) = @_;
+  my %ValidGroups;
+  # three-argument open, so $Filename is never interpreted as mode or pipe
+  open (my $LIST,'<',$Filename) or die "$MySelf: E: Cannot read $Filename: $!\n";
+  while (<$LIST>) {
+    # keep only the group name, i.e. the first whitespace-delimited word;
+    # lines without one (empty lines) must not create an empty hash key
+    next if !/^\s*(\S+)/;
+    $ValidGroups{$1} = '1';
+  };
+  close $LIST;
+  return \%ValidGroups;
+};
+
 ################################################################################
 #####----------------------------- TimePeriods ----------------------------#####
 
diff --git a/gatherstats.pl b/gatherstats.pl
index 26b71e5..a721b3e 100755
--- a/gatherstats.pl
+++ b/gatherstats.pl
@@ -19,7 +19,7 @@ BEGIN {
 }
 use strict;
 
-use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups);
+use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
 
 use DBI;
 
@@ -33,7 +33,7 @@ my %LegalTypes;
 ################################# Main program #################################
 
 ### read commandline options
-my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:');
+my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:');
 
 ### read configuration
 my %Conf = %{ReadConfig('newsstats.conf')};
@@ -54,6 +54,10 @@ die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Opt
 ### get time period (-m or -p)
 my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
 
+### read newsgroups list from -l (hash stays empty if -l is not set)
+my %ValidGroups;
+%ValidGroups = %{ReadGroupList($Options{'l'})} if $Options{'l'};
+
 ### init database
 my $DBHandle = InitDB(\%Conf,1);
 
@@ -72,18 +76,25 @@ foreach my $Month (&ListMonth($StartMonth,$EndMonth)) {
 
   # count postings per group
   my %Postings;
-  
   while (($_) = $DBQuery->fetchrow_array) {
     # get list oft newsgroups and hierarchies from Newsgroups:
-    my %Newsgroups = ListNewsgroups($_);
+    my %Newsgroups = ListNewsgroups($_,$Options{'l'} ? \%ValidGroups : '');
     # count each newsgroup and hierarchy once
     foreach (sort keys %Newsgroups) {
-      # don't count newsgroup/hierarchy in wrong TLH
-      next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/);
       $Postings{$_}++;
     };
   };
 
+  # add valid but empty groups if -l is set
+  if (%ValidGroups) {
+    foreach (sort keys %ValidGroups) {
+      if (!defined($Postings{$_})) {
+        $Postings{$_} = 0 ;
+        warn (sprintf("ADDED: %s as empty group\n",$_));
+      }
+    };
+  };
+
   print "----- GroupStats -----\n" if $Options{'d'};
   foreach my $Newsgroup (sort keys %Postings) {
     print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
@@ -112,7 +123,7 @@ gatherstats - process statistical data from a raw source
 
 =head1 SYNOPSIS
 
-B [B<-Vhdo>] [B<-m> I] [B<-p> I] [B<-t> I] [B<-n> I] [B<-r> I] [B<-g> I] [B<-c> I] [B<-s> I]
+B [B<-Vhdo>] [B<-m> I] [B<-p> I] [B<-t> I] [B<-l> I<filename>] [B<-n> I] [B<-r> I] [B<-g> I] [B<-c> I] [B<-s> I]
 
 =head1 REQUIREMENTS
 
@@ -219,6 +230,17 @@ Set processing type to one of I and I. Defaults to all
 (and is currently rather pointless as only I has been
 implemented).
 
+=item B<-l> I<filename> (check against list)
+
+Check each group against a list of valid newsgroups read from
+I<filename>, one group on each line and ignoring everything after the
+first whitespace (so you can use a file in checkgroups format or (part
+of) your INN active file).
+
+Newsgroups not found in I<filename> will be dropped (and logged to
+STDERR), and newsgroups found in I<filename> but having no postings
+will be added with a count of 0 (and logged to STDERR).
+
 =item B<-n> I (newsgroup hierarchy)
 
 Override I from F.
@@ -259,9 +281,10 @@ Process all types of information for January of 2010:
 
     gatherstats -m 2010-01
 
-Process only number of postings for the year of 2010:
+Process only number of postings for the year of 2010,
+checking against checkgroups-2010.txt:
 
-    gatherstats -p 2010-01:2010-12 -t groups
+    gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
 
 =head1 FILES
 