gatherstats.pl: Add '-l' option.

Add '-l' option to check newsgroup names against a list of valid
newsgroups read from a file (each group on one line, ignoring
everything after the first whitespace and so accepting files in
checkgroups format as well as (parts of) an INN active file).
New ReadGroupList().
Change ListNewsgroups() accordingly.
Amend documentation accordingly.

See #17.

Signed-off-by: Thomas Hochstein <thh@inter.net>
This commit is contained in:
Thomas Hochstein 2010-11-01 17:04:05 +01:00
parent 54d04e84a5
commit ad60979271
2 changed files with 63 additions and 12 deletions

View file

@ -31,6 +31,7 @@ require Exporter;
SplitPeriod SplitPeriod
ListMonth ListMonth
ListNewsgroups ListNewsgroups
ReadGroupList
OutputData OutputData
FormatOutput FormatOutput
SQLHierarchies SQLHierarchies
@ -155,14 +156,22 @@ sub ListNewsgroups {
### hierarchy names where every newsgroup and hierarchy appears only once: ### hierarchy names where every newsgroup and hierarchy appears only once:
### de.alt.test,de.alt.admin -> de.ALL, de.alt.ALL, de.alt.test, de.alt.admin ### de.alt.test,de.alt.admin -> de.ALL, de.alt.ALL, de.alt.test, de.alt.admin
### IN : $Newsgroups : a list of newsgroups (content of Newsgroups: header) ### IN : $Newsgroups : a list of newsgroups (content of Newsgroups: header)
### $ValidGroupsR: reference to a hash containing all valid newsgroups
### as keys
### OUT: %Newsgroups : hash containing all newsgroup and hierarchy names as keys ### OUT: %Newsgroups : hash containing all newsgroup and hierarchy names as keys
my ($Newsgroups) = @_; my ($Newsgroups,$ValidGroupsR) = @_;
my %ValidGroups = %{$ValidGroupsR} if $ValidGroupsR;
my %Newsgroups; my %Newsgroups;
chomp($Newsgroups); chomp($Newsgroups);
# remove whitespace from contents of Newsgroups: # remove whitespace from contents of Newsgroups:
$Newsgroups =~ s/\s//; $Newsgroups =~ s/\s//;
# call &HierarchyCount for each newsgroup in $Newsgroups: # call &HierarchyCount for each newsgroup in $Newsgroups:
for (split /,/, $Newsgroups) { for (split /,/, $Newsgroups) {
# don't count invalid newsgroups
if(%ValidGroups and !defined($ValidGroups{$_})) {
warn (sprintf("DROPPED: %s\n",$_));
next;
}
# add original newsgroup to %Newsgroups # add original newsgroup to %Newsgroups
$Newsgroups{$_} = 1; $Newsgroups{$_} = 1;
# add all hierarchy elements to %Newsgroups, amended by '.ALL', # add all hierarchy elements to %Newsgroups, amended by '.ALL',
@ -194,6 +203,26 @@ sub ParseHierarchies {
return @Hierarchies; return @Hierarchies;
}; };
################################################################################
sub ReadGroupList {
################################################################################
### read a list of valid newsgroups from file (each group on one line,
### ignoring everything after the first whitespace and so accepting files
### in checkgroups format as well as (parts of) an INN active file);
### lines without a leading group name (empty lines, lines starting with
### whitespace) are skipped
### IN : $Filename : file to read
### OUT: \%ValidGroups: reference to a hash containing all valid newsgroups
###                     as keys
  my ($Filename) = @_;
  my %ValidGroups;
  # three-argument open: the filename must never be interpreted as a mode
  # or a pipe, whatever characters it starts or ends with
  open (my $LIST,'<',$Filename) or die "$MySelf: E: Cannot read $Filename: $!\n";
  while (<$LIST>) {
    # the group name is the first whitespace-delimited token of the line;
    # a line that does not start with one would otherwise end up as a
    # bogus (empty or all-blank) group name in the list
    next if !/^(\S+)/;
    $ValidGroups{$1} = '1';
  };
  close $LIST;
  return \%ValidGroups;
};
################################################################################ ################################################################################
#####----------------------------- TimePeriods ----------------------------##### #####----------------------------- TimePeriods ----------------------------#####

View file

@ -19,7 +19,7 @@ BEGIN {
} }
use strict; use strict;
use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups); use NewsStats qw(:DEFAULT :TimePeriods ListNewsgroups ReadGroupList);
use DBI; use DBI;
@ -33,7 +33,7 @@ my %LegalTypes;
################################# Main program ################################# ################################# Main program #################################
### read commandline options ### read commandline options
my %Options = &ReadOptions('dom:p:t:n:r:g:c:s:'); my %Options = &ReadOptions('dom:p:t:l:n:r:g:c:s:');
### read configuration ### read configuration
my %Conf = %{ReadConfig('newsstats.conf')}; my %Conf = %{ReadConfig('newsstats.conf')};
@ -54,6 +54,9 @@ die "$MySelf: E: Unknown type '-t $Options{'t'}'!\n" if !exists($LegalTypes{$Opt
### get time period (-m or -p) ### get time period (-m or -p)
my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); my ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'});
### read newsgroups list from -l
# declare first and assign conditionally: the behaviour of "my ... if ..."
# is documented as undefined (perlsyn); without -l the hash stays empty
my %ValidGroups;
%ValidGroups = %{ReadGroupList($Options{'l'})} if $Options{'l'};
### init database ### init database
my $DBHandle = InitDB(\%Conf,1); my $DBHandle = InitDB(\%Conf,1);
@ -72,18 +75,25 @@ foreach my $Month (&ListMonth($StartMonth,$EndMonth)) {
# count postings per group # count postings per group
my %Postings; my %Postings;
while (($_) = $DBQuery->fetchrow_array) { while (($_) = $DBQuery->fetchrow_array) {
# get list of newsgroups and hierarchies from Newsgroups: # get list of newsgroups and hierarchies from Newsgroups:
my %Newsgroups = ListNewsgroups($_); my %Newsgroups = ListNewsgroups($_,$Options{'l'} ? \%ValidGroups : '');
# count each newsgroup and hierarchy once # count each newsgroup and hierarchy once
foreach (sort keys %Newsgroups) { foreach (sort keys %Newsgroups) {
# don't count newsgroup/hierarchy in wrong TLH
next if(defined($Conf{'TLH'}) and !/^$Conf{'TLH'}/);
$Postings{$_}++; $Postings{$_}++;
}; };
}; };
# add valid but empty groups if -l is set: every group from the list that
# has no entry in %Postings yet is entered with a count of 0 and reported
# on STDERR
if (%ValidGroups) {
  for my $Group (sort keys %ValidGroups) {
    next if defined $Postings{$Group};
    $Postings{$Group} = 0;
    warn sprintf("ADDED: %s as empty group\n", $Group);
  }
};
print "----- GroupStats -----\n" if $Options{'d'}; print "----- GroupStats -----\n" if $Options{'d'};
foreach my $Newsgroup (sort keys %Postings) { foreach my $Newsgroup (sort keys %Postings) {
print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'}; print "$Newsgroup => $Postings{$Newsgroup}\n" if $Options{'d'};
@ -112,7 +122,7 @@ gatherstats - process statistical data from a raw source
=head1 SYNOPSIS =head1 SYNOPSIS
B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>] B<gatherstats> [B<-Vhdo>] [B<-m> I<YYYY-MM>] [B<-p> I<YYYY-MM:YYYY-MM>] [B<-t> I<type>] [B<-l> I<filename>] [B<-n> I<TLH>] [B<-r> I<database table>] [B<-g> I<database table>] [B<-c> I<database table>] [B<-s> I<database table>]
=head1 REQUIREMENTS =head1 REQUIREMENTS
@ -219,6 +229,17 @@ Set processing type to one of I<all> and I<groups>. Defaults to all
(and is currently rather pointless as only I<groups> has been (and is currently rather pointless as only I<groups> has been
implemented). implemented).
=item B<-l> I<filename> (check against list)
Check each group against a list of valid newsgroups read from
I<filename>, one group on each line and ignoring everything after the
first whitespace (so you can use a file in checkgroups format or (part
of) your INN active file).
Newsgroups not found in I<filename> will be dropped (and logged to
STDERR), and newsgroups found in I<filename> but having no postings
will be added with a count of 0 (and logged to STDERR).
=item B<-n> I<TLH> (newsgroup hierarchy) =item B<-n> I<TLH> (newsgroup hierarchy)
Override I<TLH> from F<newsstats.conf>. Override I<TLH> from F<newsstats.conf>.
@ -259,9 +280,10 @@ Process all types of information for January of 2010:
gatherstats -m 2010-01 gatherstats -m 2010-01
Process only number of postings for the year of 2010: Process only number of postings for the year of 2010,
checking against checkgroups-2010.txt:
gatherstats -p 2010-01:2010-12 -t groups gatherstats -p 2010-01:2010-12 -t groups -l checkgroups-2010.txt
=head1 FILES =head1 FILES