From edd250f2652311094fff5ef352702af44f3a8a1b Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 2 May 2012 18:11:43 +0200 Subject: [PATCH] Complete rewrite of groupstats.pl and NewsStats.pm. * Switch to Getopt::Long, change coding style; limit line length. * Replace 'die' and 'warn' by calls to &Bleat(). * Completely redo options and processing: - merge -m/-p/-a into --month - replace -i/-q/-d with - much more powerful - --group-by/--order-by - replace -t/-l with - much more powerful - --lower/--upper/--boundary - remove -b and replace it with --report Fixes #33. * Add new report types, boundaries and sorting options: - report types 'average' and 'sums' - boundaries 'average' and 'sums' - upper and/or lower boundary - sort output independently Issue #35. Fixes #34, #38. * Add possibility to cross-check newsgroups against checkgroups file. * Complete rewrite of groupstats.pl internal logic: - modularize construction fo SQL queries - remove unnecessary special cases - refactor code into NewsStats.pm functions as much as possible Issue #37. Fixes #36. * Rework output formats, fix padding problem by making use of modularized SQL queries. Fixes #15, #32. * Add some more consistency checks. Issue #12. * Redo documentation. * Update TODO list. Signed-off-by: Thomas Hochstein --- NewsStats.pm | 493 +++++++++++++++++++++++-------- doc/TODO | 15 +- groupstats.pl | 804 +++++++++++++++++++++++++++++--------------------- 3 files changed, 845 insertions(+), 467 deletions(-) diff --git a/NewsStats.pm b/NewsStats.pm index cc3dd83..a04ac0f 100644 --- a/NewsStats.pm +++ b/NewsStats.pm @@ -2,7 +2,7 @@ # # Library functions for the NewsStats package. # -# Copyright (c) 2010 Thomas Hochstein +# Copyright (c) 2010-2012 Thomas Hochstein # # This module can be redistributed and/or modified under the same terms under # which Perl itself is published. @@ -16,18 +16,20 @@ our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS); require Exporter; @ISA = qw(Exporter); @EXPORT = qw( - $MySelf $MyVersion $PackageVersion - ReadOptions + $FullPath + $HomePath + ShowVersion + ShowPOD ReadConfig OverrideConfig InitDB + Bleat ); @EXPORT_OK = qw( GetTimePeriod LastMonth - CheckMonth SplitPeriod ListMonth ListNewsgroups @@ -35,58 +37,47 @@ require Exporter; OutputData FormatOutput SQLHierarchies + SQLSortOrder SQLGroupList + SQLSetBounds + SQLBuildClause GetMaxLength ); -%EXPORT_TAGS = ( TimePeriods => [qw(GetTimePeriod LastMonth CheckMonth SplitPeriod ListMonth)], +%EXPORT_TAGS = ( TimePeriods => [qw(GetTimePeriod LastMonth SplitPeriod + ListMonth)], Output => [qw(OutputData FormatOutput)], - SQLHelper => [qw(SQLHierarchies SQLGroupList GetMaxLength)]); + SQLHelper => [qw(SQLHierarchies SQLSortOrder SQLGroupList + SQLSetBounds SQLBuildClause GetMaxLength)]); $VERSION = '0.01'; our $PackageVersion = '0.01'; use Data::Dumper; use File::Basename; -use Getopt::Std; use Config::Auto; use DBI; #####-------------------------------- Vars --------------------------------##### -our $MySelf = fileparse($0, '.pl'); -our $MyVersion = "$MySelf $::VERSION (NewsStats.pm $VERSION)"; +# trim the path +our $FullPath = $0; +our $HomePath = dirname($0); +$0 =~ s%.*/%%; +# set version string +our $MyVersion = "$0 $::VERSION (NewsStats.pm $VERSION)"; #####------------------------------- Basics -------------------------------##### -################################################################################ -sub ReadOptions { -################################################################################ -### read commandline options and act on standard options -h and -V -### IN : $Params: list of legal commandline paramaters (without -h and -V) -### OUT: a hash containing the commandline options - $Getopt::Std::STANDARD_HELP_VERSION = 1; - - my ($Params) = @_; - my %Options; - - getopts('Vh'.$Params, \%Options); - - # -V: display version - &ShowVersion if ($Options{'V'}); - - # -h: feed myself to perldoc - &ShowPOD if ($Options{'h'}); - - return %Options; -}; ################################################################################ ################################################################################ sub ShowVersion { ################################################################################ ### display version and exit - print "NewsStats v$PackageVersion\n$MyVersion\nCopyright (c) 2010 Thomas Hochstein \n"; - print "This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself.\n"; + print "NewsStats v$PackageVersion\n$MyVersion\n"; + print "Copyright (c) 2010-2012 Thomas Hochstein \n"; + print "This program is free software; you may redistribute it ". + "and/or modify it under the same terms as Perl itself.\n"; exit(100); }; ################################################################################ @@ -95,7 +86,7 @@ sub ShowVersion { sub ShowPOD { ################################################################################ ### feed myself to perldoc and exit - exec('perldoc', $0); + exec('perldoc', $FullPath); exit(100); }; ################################################################################ @@ -112,7 +103,7 @@ sub ReadConfig { ################################################################################ ################################################################################ -sub OverrideConfig { +sub OverrideConfig { ################################################################################ ### override configuration values ### IN : $ConfigR : reference to configuration hash @@ -120,7 +111,8 @@ sub OverrideConfig { my ($ConfigR,$OverrideR) = @_; my %Override = %$OverrideR; # Config hash empty? - warn "$MySelf W: Empty configuration hash passed to OverrideConfig().\n" if ( keys %$ConfigR < 1); + &Bleat(1,"Empty configuration hash passed to OverrideConfig()") + if ( keys %$ConfigR < 1); # return if no overrides return if (keys %Override < 1 or keys %$ConfigR < 1); foreach my $Key (keys %Override) { @@ -138,15 +130,35 @@ sub InitDB { ### OUT: DBHandle my ($ConfigR,$Die) = @_; my %Conf = %$ConfigR; - my $DBHandle = DBI->connect(sprintf('DBI:%s:database=%s;host=%s',$Conf{'DBDriver'},$Conf{'DBDatabase'},$Conf{'DBHost'}), $Conf{'DBUser'}, $Conf{'DBPw'}, { PrintError => 0 }); + my $DBHandle = DBI->connect(sprintf('DBI:%s:database=%s;host=%s', + $Conf{'DBDriver'},$Conf{'DBDatabase'}, + $Conf{'DBHost'}), $Conf{'DBUser'}, + $Conf{'DBPw'}, { PrintError => 0 }); if (!$DBHandle) { - die ("$MySelf: E: $DBI::errstr\n") if (defined($Die) and $Die); - warn("$MySelf: W: $DBI::errstr\n"); + &Bleat(2,$DBI::errstr) if (defined($Die) and $Die); + &Bleat(1,$DBI::errstr); }; return $DBHandle; }; ################################################################################ +################################################################################ +sub Bleat { +################################################################################ +### print warning or error messages and terminate in case of error +### IN : $Level : 1 = warning, 2 = error +### $Message: warning or error message + my ($Level,$Message) = @_; + if ($Level == 1) { + warn "$0 W: $Message\n" + } elsif ($Level == 2) { + die "$0 E: $Message\n" + } else { + print "$0: $Message\n" + } +}; +################################################################################ + #####------------------------------ GetStats ------------------------------##### ################################################################################ @@ -172,7 +184,7 @@ sub ListNewsgroups { next if($TLH and !/^$TLH/); # don't count invalid newsgroups if(%ValidGroups and !defined($ValidGroups{$_})) { - warn (sprintf("DROPPED: %s\n",$_)); + &Bleat(1,sprintf("DROPPED: %s",$_)); next; } # add original newsgroup to %Newsgroups @@ -216,7 +228,7 @@ sub ReadGroupList { ### OUT: \%ValidGroups: hash containing all valid newsgroups my ($Filename) = @_; my %ValidGroups; - open (my $LIST,"<$Filename") or die "$MySelf: E: Cannot read $Filename: $!\n"; + open (my $LIST,"<$Filename") or &Bleat(2,"Cannot read $Filename: $!"); while (<$LIST>) { s/^(\S+).*$/$1/; chomp; @@ -233,29 +245,42 @@ sub ReadGroupList { ################################################################################ sub GetTimePeriod { ################################################################################ -### get a time period to act on, in order of preference: by default the -### last month; or a month submitted by -m YYYY-MM; or a time period submitted -### by -p YYYY-MM:YYYY-MM -### IN : $Month,$Period: contents of -m and -p -### OUT: $StartMonth, $EndMonth (identical if period is just one month) - my ($Month,$Period) = @_; - # exit if -m is set and not like YYYY-MM - die "$MySelf: E: Wrong date format - use '$MySelf -m YYYY-MM'!\n" if not &CheckMonth($Month); - # warn if -m and -p is set - warn "$MySelf: W: Time period assigned by '-p' takes precendece over month assigned by '-m'.\n" if ($Month && $Period); - # default: set -m to last month - $Month = &LastMonth if (!defined($Month) and !defined($Period)); - # set $StartMonth, $EndMonth - my ($StartMonth, $EndMonth); - if ($Period) { - # -p: get date range - ($StartMonth, $EndMonth) = &SplitPeriod($Period); - die "$MySelf: E: Wrong format for time period - use '$MySelf -p YYYY-MM:YYYY-MM'!\n" if !defined($StartMonth); +### get a time period to act on from --month option; +### if empty, default to last month +### IN : $Month: may be empty, 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'all' +### OUT: $Verbal,$SQL: verbal description and WHERE-clause +### of the chosen time period + my ($Month) = @_; + # define result variables + my ($Verbal, $SQL); + # define a regular expression for a month + my $REMonth = '\d{4}-\d{2}'; + + # default to last month if option is not set + if(!$Month) { + $Month = &LastMonth; + } + + # check for valid input + if ($Month =~ /^$REMonth$/) { + # single month (YYYY-MM) + ($Month) = &CheckMonth($Month); + $Verbal = $Month; + $SQL = sprintf("month = '%s'",$Month); + } elsif ($Month =~ /^$REMonth:$REMonth$/) { + # time period (YYYY-MM:YYYY-MM) + $Verbal = sprintf('%s to %s',&SplitPeriod($Month)); + $SQL = sprintf("month BETWEEN '%s' AND '%s'",&SplitPeriod($Month)); + } elsif ($Month =~ /^all$/i) { + # special case: ALL + $Verbal = 'all time'; + $SQL = ''; } else { - # set $StartMonth = $EndMonth = $Month if -p is not set - $StartMonth = $EndMonth = $Month; - }; - return ($StartMonth, $EndMonth); + # invalid input + return (undef,undef); + } + + return ($Verbal,$SQL); }; ################################################################################ @@ -278,12 +303,23 @@ sub LastMonth { ################################################################################ sub CheckMonth { ################################################################################ -### check if input is a valid month in YYYY-MM form -### IN : $Month: month -### OUT: TRUE / FALSE - my ($Month) = @_; - return 0 if (defined($Month) and $Month !~ /^\d{4}-\d{2}$/); - return 1; +### check if input (in YYYY-MM form) is valid with MM between 01 and 12; +### otherwise, fix it +### IN : @Month: array of month +### OUT: @Month: a valid month + my (@Month) = @_; + foreach my $Month (@Month) { + my ($OldMonth) = $Month; + my ($CalMonth) = substr ($Month, -2); + if ($CalMonth < 1 or $CalMonth > 12) { + $CalMonth = '12' if $CalMonth > 12; + $CalMonth = '01' if $CalMonth < 1; + substr($Month, -2) = $CalMonth; + &Bleat(1,sprintf("'%s' is an invalid date (MM must be between '01' ". + "and '12'), set to '%s'.",$OldMonth,$Month)); + } + } + return @Month; }; ################################################################################ @@ -291,10 +327,10 @@ sub SplitPeriod { ################################################################################ ### split a time period denoted by YYYY-MM:YYYY-MM into start and end month ### IN : $Period: time period -### OUT: $StartMonth, Â$EndMonth +### OUT: $StartMonth, $EndMonth my ($Period) = @_; - return (undef,undef) if $Period !~ /^\d{4}-\d{2}:\d{4}-\d{2}$/; my ($StartMonth, $EndMonth) = split /:/, $Period; + ($StartMonth,$EndMonth) = CheckMonth($StartMonth,$EndMonth); # switch parameters as necessary if ($EndMonth gt $StartMonth) { return ($StartMonth, $EndMonth); @@ -310,7 +346,8 @@ sub ListMonth { ### IN : $StartMonth, $EndMonth ### OUT: @Months: array containing all months from $StartMonth to $EndMonth my ($StartMonth, $EndMonth) = @_; - return (undef,undef) if ($StartMonth !~ /^\d{4}-\d{2}$/ or $EndMonth !~ /^\d{4}-\d{2}$/); + return (undef,undef) + if ($StartMonth !~ /^\d{4}-\d{2}$/ or $EndMonth !~ /^\d{4}-\d{2}$/); # return if $StartMonth = $EndMonth return ($StartMonth) if ($StartMonth eq $EndMonth); # set $Year, $Month from $StartMonth @@ -335,64 +372,96 @@ sub ListMonth { sub OutputData { ################################################################################ ### read database query results from DBHandle and print results with formatting -### IN : $Format : format specifier -### $FileName: file name template (-f): filename-YYYY-MM -### $DBQuery : database query handle with executed query, +### IN : $Format : format specifier +### $Comments : print or suppress all comments for machine-readable output +### $GroupBy : primary sorting order (month or key) +### $Precision: number of digits right of decimal point (0 or 2) +### $ValidKeys: reference to a hash containing all valid keys +### $FileTempl: file name template (--filetemplate): filetempl-YYYY-MM +### $DBQuery : database query handle with executed query, ### containing $Month, $Key, $Value -### $PadGroup: padding length for newsgroups field (optional) for 'pretty' - my ($Format, $FileName, $DBQuery, $PadGroup) = @_; - my ($Handle, $OUT); +### $PadGroup : padding length for key field (optional) for 'pretty' + my ($Format, $Comments, $GroupBy, $Precision, $ValidKeys, $FileTempl, + $DBQuery, $PadGroup) = @_; + my %ValidKeys = %{$ValidKeys} if $ValidKeys; + my ($FileName, $Handle, $OUT); our $LastIteration; + + # define output types + my %LegalOutput; + @LegalOutput{('dump',,'list','pretty')} = (); + # bail out if format is unknown + &Bleat(2,"Unknown output type '$Format'!") if !exists($LegalOutput{$Format}); + while (my ($Month, $Key, $Value) = $DBQuery->fetchrow_array) { + # don't display invalid keys + if(%ValidKeys and !defined($ValidKeys{$Key})) { + # FIXME + # &Bleat(1,sprintf("DROPPED: %s",$Key)); + next; + }; + # care for correct sorting order and abstract from month and keys: + # $Caption will be $Month or $Key, according to sorting order, + # and $Key will be $Key or $Month, respectively + my $Caption; + if ($GroupBy eq 'key') { + $Caption = $Key; + $Key = $Month; + } else { + $Caption = $Month; + } # set output file handle - if (!$FileName) { + if (!$FileTempl) { $Handle = *STDOUT{IO}; # set $Handle to a reference to STDOUT - } elsif (!defined($LastIteration) or $LastIteration ne $Month) { + } elsif (!defined($LastIteration) or $LastIteration ne $Caption) { close $OUT if ($LastIteration); - open ($OUT,sprintf('>%s-%s',$FileName,$Month)) or die sprintf("$MySelf: E: Cannot open output file '%s-%s': $!\n",$FileName,$Month); + # safeguards for filename creation: + # replace potential problem characters with '_' + $FileName = sprintf('%s-%s',$FileTempl,$Caption); + $FileName =~ s/[^a-zA-Z0-9_-]+/_/g; + open ($OUT,">$FileName") + or &Bleat(2,sprintf("Cannot open output file '%s': $!", + $FileName)); $Handle = $OUT; }; - print $Handle &FormatOutput($Format, $Month, $Key, $Value, $PadGroup); - $LastIteration = $Month; + print $Handle &FormatOutput($Format, $Comments, $Caption, $Key, $Value, + $Precision, $PadGroup); + $LastIteration = $Caption; }; - close $OUT if ($FileName); + close $OUT if ($FileTempl); }; ################################################################################ sub FormatOutput { ################################################################################ ### format information for output according to format specifier -### IN : $Format : format specifier -### $Month : month (as YYYY-MM) -### $Key : newsgroup, client, ... -### $Value : number of postings with that attribute -### $PadGroup: padding length for key field (optional) for 'pretty' +### IN : $Format : format specifier +### $Comments : print or suppress all comments for machine-readable output +### $Caption : month (as YYYY-MM) or $Key, according to sorting order +### $Key : newsgroup, client, ... or $Month, as above +### $Value : number of postings with that attribute +### $Precision: number of digits right of decimal point (0 or 2) +### $PadGroup : padding length for key field (optional) for 'pretty' ### OUT: $Output: formatted output - my ($Format, $Month, $Key, $Value, $PadGroup) = @_; - - # define output types - my %LegalOutput; - @LegalOutput{('dump','dumpgroup','list','pretty')} = (); - # bail out if format is unknown - die "$MySelf: E: Unknown output type '$Format'!\n" if !exists($LegalOutput{$Format}); - + my ($Format, $Comments, $Caption, $Key, $Value, $Precision, $PadGroup) = @_; my ($Output); - # keep last month in mind + # keep last caption in mind our ($LastIteration); + # create one line of output if ($Format eq 'dump') { - # output as dump (ng nnnnn) - $Output = sprintf ("%s %u\n",$Key,$Value); - } elsif ($Format eq 'dumpgroup') { - # output as dump (YYYY-NN: nnnnn) - $Output = sprintf ("%s: %5u\n",$Month,$Value); + # output as dump (key value) + $Output = sprintf ("# %s:\n",$Caption) + if ($Comments and (!defined($LastIteration) or $Caption ne $LastIteration)); + $Output .= sprintf ("%s %u\n",$Key,$Value); } elsif ($Format eq 'list') { - # output as list (YYYY-NN: ng nnnnn) - $Output = sprintf ("%s: %s %u\n",$Month,$Key,$Value); + # output as list (caption key value) + $Output = sprintf ("%s %s %u\n",$Caption,$Key,$Value); } elsif ($Format eq 'pretty') { - # output as table - $Output = sprintf ("----- %s:\n",$Month) if (!defined($LastIteration) or $Month ne $LastIteration); - $LastIteration = $Month; - $Output .= sprintf ($PadGroup ? sprintf("%%-%us %%5u\n",$PadGroup) : "%s %u\n",$Key,$Value); + # output as a table + $Output = sprintf ("# ----- %s:\n",$Caption) + if ($Comments and (!defined($LastIteration) or $Caption ne $LastIteration)); + $Output .= sprintf ($PadGroup ? sprintf("%%-%us %%10.*f\n",$PadGroup) : + "%s %.*f\n",$Key,$Precision,$Value); }; return $Output; }; @@ -408,46 +477,226 @@ sub SQLHierarchies { ### IN : $ShowHierarchies: boolean value ### OUT: SQL code my ($ShowHierarchies) = @_; - return $ShowHierarchies ? '' : "AND newsgroup NOT LIKE '%.ALL'"; + return $ShowHierarchies ? '' : "newsgroup NOT LIKE '%.ALL'"; }; ################################################################################ sub GetMaxLength { ################################################################################ ### get length of longest field in future query result -### IN : $DBHandle : database handel -### $Table : table to query -### $Field : field to check -### $WhereClause: WHERE clause -### @BindVars : bind variables for WHERE clause +### IN : $DBHandle : database handel +### $Table : table to query +### $Field : field to check +### $WhereClause : WHERE clause +### $HavingClause: HAVING clause +### @BindVars : bind variables for WHERE clause ### OUT: $Length: length of longest instnace of $Field - my ($DBHandle,$Table,$Field,$WhereClause,@BindVars) = @_; - my $DBQuery = $DBHandle->prepare(sprintf("SELECT MAX(LENGTH(%s)) FROM %s WHERE %s",$Field,$Table,$WhereClause)); - $DBQuery->execute(@BindVars) or warn sprintf("$MySelf: W: Can't get field length for %s from table %s: $DBI::errstr\n",$Field,$Table); + my ($DBHandle,$Table,$Field,$WhereClause,$HavingClause,@BindVars) = @_; + my $DBQuery = $DBHandle->prepare(sprintf("SELECT MAX(LENGTH(%s)) ". + "FROM %s %s %s",$Field,$Table, + $WhereClause,$HavingClause ? + 'GROUP BY newsgroup' . $HavingClause . + ' ORDER BY LENGTH(newsgroup) '. + 'DESC LIMIT 1': '')); + $DBQuery->execute(@BindVars) or &Bleat(1,sprintf("Can't get field length ". + "for '%s' from table '%s': ". + "$DBI::errstr",$Field,$Table)); my ($Length) = $DBQuery->fetchrow_array; return $Length; }; +################################################################################ +sub SQLSortOrder { +################################################################################ +### build a SQL 'ORDER BY' clause from $OptGroupBy (primary sorting) and +### $OptOrderBy (secondary sorting), both ascending or descending; +### descending sorting order is done by adding '-desc' +### IN : $GroupBy: primary sort by 'month' (default) or 'newsgroups' +### $OrderBy: secondary sort by month/newsgroups (default) +### or number of 'postings' +### OUT: a SQL ORDER BY clause + my ($GroupBy,$OrderBy) = @_; + my ($GroupSort,$OrderSort) = ('',''); + # $GroupBy (primary sorting) + if (!$GroupBy) { + $GroupBy = 'month'; + } else { + ($GroupBy, $GroupSort) = SQLParseOrder($GroupBy); + if ($GroupBy =~ /group/i) { + $GroupBy = 'newsgroup'; + } else { + $GroupBy = 'month'; + } + } + my $Secondary = ($GroupBy eq 'month') ? 'newsgroup' : 'month'; + # $OrderBy (secondary sorting) + if (!$OrderBy) { + $OrderBy = $Secondary; + } else { + ($OrderBy, $OrderSort) = SQLParseOrder($OrderBy); + if ($OrderBy =~ /posting/i) { + $OrderBy = "postings $OrderSort, $Secondary"; + } else { + $OrderBy = "$Secondary $OrderSort"; + } + } + return ($GroupBy,&SQLBuildClause('order',"$GroupBy $GroupSort",$OrderBy)); +}; + +################################################################################ +sub SQLParseOrder { +################################################################################ +### parse $OptGroupBy or $OptOrderBy option of the form param[-desc], e.g. +### 'month', 'month-desc', 'newsgroups-desc', but also just 'desc' +### IN : $OrderOption: order option (see above) +### OUT: parameter to sort by, +### sort order ('DESC' or nothing, meaning 'ASC') + my ($OrderOption) = @_; + my $SortOrder = ''; + if ($OrderOption =~ s/-?desc$//i) { + $SortOrder = 'DESC'; + } else { + $OrderOption =~ s/-?asc$//i + } + return ($OrderOption,$SortOrder); +}; + ################################################################################ sub SQLGroupList { ################################################################################ -### explode list of newsgroups separated by : (with wildcards) to a SQL WHERE -### clause +### explode list of newsgroups separated by : (with wildcards) +### to a SQL 'WHERE' expression ### IN : $Newsgroups: list of newsgroups (group.one.*:group.two:group.three.*) -### OUT: SQL code, list of newsgroups +### OUT: SQL code to become part of a 'WHERE' clause, +### list of newsgroups for SQL bindings my ($Newsgroups) = @_; + # substitute '*' wildcard with SQL wildcard character '%' $Newsgroups =~ s/\*/%/g; - return ('newsgroup LIKE ?', $Newsgroups) if $Newsgroups !~ /:/; + # just one newsgroup? + return (SQLGroupWildcard($Newsgroups),$Newsgroups) if $Newsgroups !~ /:/; + # list of newsgroups separated by ':' my $SQL = '('; my @GroupList = split /:/, $Newsgroups; foreach (@GroupList) { $SQL .= ' OR ' if $SQL gt '('; - $SQL .= 'newsgroup LIKE ?'; + $SQL .= SQLGroupWildcard($_); }; $SQL .= ')'; return ($SQL,@GroupList); }; +################################################################################ +sub SQLGroupWildcard { +################################################################################ +### build a valid SQL 'WHERE' expression with or without wildcards +### IN : $Newsgroup: newsgroup expression, probably with wildcard +### (group.name or group.name.%) +### OUT: SQL code to become part of a 'WHERE' clause + my ($Newsgroup) = @_; + # FIXME: check for validity + if ($Newsgroup !~ /%/) { + return 'newsgroup = ?'; + } else { + return 'newsgroup LIKE ?'; + } +}; + +################################################################################ +sub SQLSetBounds { +################################################################################ +### set upper and/or lower boundary (number of postings) +### IN : $Type: 'level', 'average', 'sum' or 'default' +### $LowBound,$UppBound: lower/upper boundary, respectively +### OUT: SQL code to become part of a WHERE or HAVING clause + my ($Type,$LowBound,$UppBound) = @_; + ($LowBound,$UppBound) = SQLCheckNumber($LowBound,$UppBound); + if($LowBound and $UppBound and $LowBound > $UppBound) { + &Bleat(1,"Lower boundary $LowBound is larger than Upper boundary ". + "$UppBound, exchanging boundaries."); + ($LowBound,$UppBound) = ($UppBound,$LowBound); + } + # default to 'default' + my $WhereHavingFunction = 'postings'; + # set $LowBound to SQL statement: + # 'WHERE postings >=', 'HAVING MIN(postings) >=' or 'HAVING AVG(postings) >=' + if ($Type eq 'level') { + $WhereHavingFunction = 'MIN(postings)' + } elsif ($Type eq 'average') { + $WhereHavingFunction = 'AVG(postings)' + } elsif ($Type eq 'sum') { + $WhereHavingFunction = 'SUM(postings)' + } + $LowBound = sprintf('%s >= '.$LowBound,$WhereHavingFunction) if ($LowBound); + # set $LowBound to SQL statement: + # 'WHERE postings <=', 'HAVING MAX(postings) <=' or 'HAVING AVG(postings) <=' + if ($Type eq 'level') { + $WhereHavingFunction = 'MAX(postings)' + } elsif ($Type eq 'average') { + $WhereHavingFunction = 'AVG(postings)' + } elsif ($Type eq 'sum') { + $WhereHavingFunction = 'SUM(postings)' + } + $UppBound = sprintf('%s <= '.$UppBound,$WhereHavingFunction) if ($UppBound); + return ($LowBound,$UppBound); +}; + +################################################################################ +sub SQLCheckNumber { +################################################################################ +### check if input is a valid positive integer; otherwise, make it one +### IN : @Numbers: array of parameters +### OUT: @Numbers: a valid positive integer + my (@Numbers) = @_; + foreach my $Number (@Numbers) { + if ($Number and $Number < 0) { + &Bleat(1,"Boundary $Number is < 0, set to ".-$Number); + $Number = -$Number; + } + $Number = '' if ($Number and $Number !~ /^\d+$/); + } + return @Numbers; +}; + +################################################################################ +sub SQLBuildClause { +################################################################################ +### build a valid SQL WHERE, GROUP BY, ORDER BY or HAVING clause +### from multiple expressions which *may* be empty +### IN : $Type: 'where', 'having', 'group' or 'order' +### @Expressions: array of expressions +### OUT: $SQLClause: a SQL clause + my ($Type,@Expressions) = @_; + my ($SQLClause,$Separator,$Statement); + # set separator ('AND' or ',') + if ($Type eq 'where' or $Type eq 'having') { + $Separator = 'AND'; + } else { + $Separator = ','; + } + # set statement + if ($Type eq 'where') { + $Statement = 'WHERE'; + } elsif ($Type eq 'order') { + $Statement = 'ORDER BY'; + } elsif ($Type eq 'having') { + $Statement = 'HAVING'; + } else { + $Statement = 'GROUP BY'; + } + # build query from expressions with separators + foreach my $Expression (@Expressions) { + if ($Expression) { + $SQLClause .= " $Separator " if ($SQLClause); + $SQLClause .= $Expression; + } + } + # add statement in front if not already present + $SQLClause = " $Statement " . $SQLClause + if ($SQLClause and $SQLClause !~ /$Statement/); + return $SQLClause; +}; + + #####------------------------------- done ---------------------------------##### 1; diff --git a/doc/TODO b/doc/TODO index 52c60ac..ce6f795 100644 --- a/doc/TODO +++ b/doc/TODO @@ -29,6 +29,9 @@ Bug numbers refer to the Mantis issue tracker at . - Add other reports NewsStats should include some other kinds of reports (stats on used clients, on postings hosts/servers, ...) + - Add tools for database management + NewsStats should offer tools e.g. to inject postings into the 'raw' database, + or to split databases. * Individual improvements + NewsStats.pm @@ -43,10 +46,15 @@ Bug numbers refer to the Mantis issue tracker at . - General tests and optimisations + feedlog.pl + - Complete rewrite (like groupstats.pl, include changes in NewsStats.pm) + - Gracefully handle interruptions to database connection, at least log + lost postings (Bug #30). + - Buffer output that cannot be written to database (Bug #31). - Add / enhance / test error handling - General tests and optimisations + gatherstats.pl + - Complete rewrite (like groupstats.pl, include changes in NewsStats.pm) - Use hierarchy information (see GroupInfo above) - Add gathering of other stats (clients, hosts, ...) - better modularisation (code reuse for other reports!) @@ -54,13 +62,6 @@ Bug numbers refer to the Mantis issue tracker at . - General tests and optimisations + groupstats.pl - - output: fix formatting - GetMaxLength does not work for special queries - like -l or -b - - beautify output (formats / captions) - - -a: improve code - it doesn't make that much sense to get first/last - month from database just to query that same database with a time period - that equals no time period ... this part of the WHERE clause should be - dropped instead - better modularisation (code reuse for other reports!) - Add / enhance / test error handling - General tests and optimisations diff --git a/groupstats.pl b/groupstats.pl index bed36e2..14d19c6 100755 --- a/groupstats.pl +++ b/groupstats.pl @@ -7,7 +7,7 @@ # # It is part of the NewsStats package. # -# Copyright (c) 2010 Thomas Hochstein +# Copyright (c) 2010-2012 Thomas Hochstein # # It can be redistributed and/or modified under the same terms under # which Perl itself is published. @@ -19,226 +19,229 @@ BEGIN { } use strict; -use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper); +use NewsStats qw(:DEFAULT :TimePeriods :Output :SQLHelper ReadGroupList); use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); ################################# Main program ################################# ### read commandline options -my %Options = &ReadOptions('m:p:an:o:t:l:b:iscqdf:g:'); +my ($OptBoundType,$OptCaptions,$OptCheckgroupsFile,$OptComments, + $OptFileTemplate,$OptFormat,$OptGroupBy,$OptGroupsDB,$LowBound,$OptMonth, + $OptNewsgroups,$OptOrderBy,$OptReportType,$OptSums,$UppBound); +GetOptions ('b|boundary=s' => \$OptBoundType, + 'c|captions!' => \$OptCaptions, + 'checkgroups=s' => \$OptCheckgroupsFile, + 'comments!' => \$OptComments, + 'filetemplate=s' => \$OptFileTemplate, + 'f|format=s' => \$OptFormat, + 'g|group-by=s' => \$OptGroupBy, + 'groupsdb=s' => \$OptGroupsDB, + 'l|lower=i' => \$LowBound, + 'm|month=s' => \$OptMonth, + 'n|newsgroups=s' => \$OptNewsgroups, + 'o|order-by=s' => \$OptOrderBy, + 'r|report=s' => \$OptReportType, + 's|sums!' => \$OptSums, + 'u|upper=i' => \$UppBound, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; +# parse parameters +# $OptComments defaults to TRUE +$OptComments = 1 if (!defined($OptComments)); +# force --nocomments when --filetemplate is used +$OptComments = 0 if ($OptFileTemplate); +# parse $OptBoundType +if ($OptBoundType) { + if ($OptBoundType =~ /level/i) { + $OptBoundType = 'level'; + } elsif ($OptBoundType =~ /av(era)?ge?/i) { + $OptBoundType = 'average'; + } elsif ($OptBoundType =~ /sums?/i) { + $OptBoundType = 'sum'; + } else { + $OptBoundType = 'default'; + } +} +# parse $OptReportType +if ($OptReportType) { + if ($OptReportType =~ /av(era)?ge?/i) { + $OptReportType = 'average'; + } elsif ($OptReportType =~ /sums?/i) { + $OptReportType = 'sum'; + } else { + $OptReportType = 'default'; + } +} +# read list of newsgroups from --checkgroups +# into a hash reference +my $ValidGroups = &ReadGroupList($OptCheckgroupsFile) if $OptCheckgroupsFile; ### read configuration -my %Conf = %{ReadConfig('newsstats.conf')}; +my %Conf = %{ReadConfig($HomePath.'/newsstats.conf')}; ### override configuration via commandline options my %ConfOverride; -$ConfOverride{'DBTableGrps'} = $Options{'g'} if $Options{'g'}; +$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; &OverrideConfig(\%Conf,\%ConfOverride); -### check for incompatible command line options -# you can't mix '-t', '-b' and '-l' -# -b/-l take preference over -t, and -b takes preference over -l -# you can't use '-f' with '-b' or '-l' -if ($Options{'b'} or $Options{'l'}) { - if ($Options{'f'}) { - # drop -f - warn ("$MySelf: W: You cannot save the report to monthly files when using top lists (-b) or levels (-l). Filename template '-f $Options{'f'}' was ignored.\n"); - undef($Options{'f'}); - }; - if ($Options{'t'}) { - # drop -t - warn ("$MySelf: W: You cannot combine thresholds (-t) and top lists (-b) or levels (-l). Threshold '-t $Options{'t'}' was ignored.\n"); - undef($Options{'t'}); - }; - if ($Options{'b'} and $Options{'l'}) { - # drop -l - warn ("$MySelf: W: You cannot combine top lists (-b) and levels (-l). Level '-l $Options{'l'}' was ignored.\n"); - undef($Options{'l'}); - }; - # -q/-d don't work with -b or -l - warn ("$MySelf: W: Sorting by number of postings (-q) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'q'}; - warn ("$MySelf: W: Reverse sorting (-d) ignored due to top list mode (-b) / levels (-l).\n") if $Options{'d'}; -}; - -### check output type -# default output type to 'pretty' -$Options{'o'} = 'pretty' if !$Options{'o'}; -# fail if more than one newsgroup is combined with 'dumpgroup' type -die ("$MySelf: E: You cannot combine newsgroup lists (-n) with more than one group with '-o dumpgroup'!\n") if ($Options{'o'} eq 'dumpgroup' and defined($Options{'n'}) and $Options{'n'} =~ /:|\*/); -# accept 'dumpgroup' only with -n -if ($Options{'o'} eq 'dumpgroup' and !defined($Options{'n'})) { - $Options{'o'} = 'dump'; - warn ("$MySelf: W: You must submit exactly one newsgroup ('-n news.group') for '-o dumpgroup'. Output type was set to 'dump'.\n"); -}; -# set output type to 'pretty' for -l -if ($Options{'l'} and $Options{'o'} ne 'pretty') { - $Options{'o'} = 'pretty'; - warn ("$MySelf: W: Output type forced to '-o pretty' due to usage of '-l'.\n"); -}; -# set output type to 'dump' for -f -if ($Options{'f'} and $Options{'o'} ne 'dump') { - $Options{'o'} = 'dump'; - warn ("$MySelf: W: Output type forced to '-o dump' due to usage of '-f'.\n"); -}; - ### init database my $DBHandle = InitDB(\%Conf,1); -### get time period -my ($StartMonth,$EndMonth); -# if '-a' is set, set start/end month from database -# FIXME - it doesn't make that much sense to get first/last month from database to query it -# with a time period that equals no time period ... -if ($Options{'a'}) { - undef($Options{'m'}); - undef($Options{'p'}); - my $DBQuery = $DBHandle->prepare(sprintf("SELECT MIN(month),MAX(month) FROM %s.%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'})); - $DBQuery->execute or die sprintf("$MySelf: E: Can't get MIN/MAX month from %s.%s: %s\n",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr); - ($StartMonth,$EndMonth) = $DBQuery->fetchrow_array; -} else { - ($StartMonth,$EndMonth) = &GetTimePeriod($Options{'m'},$Options{'p'}); -}; -# if -p or -a are set: drop -m -undef $Options{'m'} if ($Options{'p'} or $Options{'a'}); -# if time period is more than one month: force output type to '-o pretty' or '-o dumpgroup' -if ($Options{'o'} eq 'dump' and ($Options{'p'} or $Options{'a'})) { - if (defined($Options{'n'}) and $Options{'n'} !~ /:|\*/) { - # just one newsgroup is defined - warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o dumpgroup'.\n"); - $Options{'o'} = 'dumpgroup'; - } elsif (!defined($Options{'f'})) { - # more than one newsgroup - and no file output - warn ("$MySelf: W: You cannot combine time periods (-p) with '-o dump', changing output type to '-o pretty'.\n"); - $Options{'o'} = 'pretty'; - } -}; +### get time period and newsgroups, prepare SQL 'WHERE' clause +# get time period +# and set caption for output and expression for SQL 'WHERE' clause +my ($CaptionPeriod,$SQLWherePeriod) = &GetTimePeriod($OptMonth); +# bail out if --month is invalid +&Bleat(2,"--month option has an invalid format - ". + "please use 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'ALL'!") if !$CaptionPeriod; +# get list of newsgroups and set expression for SQL 'WHERE' clause +# with placeholders as well as a list of newsgroup to bind to them +my ($SQLWhereNewsgroups,@SQLBindNewsgroups) = &SQLGroupList($OptNewsgroups) + if $OptNewsgroups;; -### create report -# get list of newsgroups (-n) -my ($QueryGroupList,$QueryThreshold,@GroupList,@Params); -my $Newsgroups = $Options{'n'}; -if ($Newsgroups) { - # explode list of newsgroups for WHERE clause - ($QueryGroupList,@GroupList) = &SQLGroupList($Newsgroups); +### build SQL WHERE clause (and HAVING clause, if needed) +my ($SQLWhereClause,$SQLHavingClause); +# $OptBoundType 'level' +if ($OptBoundType and $OptBoundType ne 'default') { + $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod, + $SQLWhereNewsgroups,&SQLHierarchies($OptSums)); + $SQLHavingClause = SQLBuildClause('having',&SQLSetBounds($OptBoundType, + $LowBound,$UppBound)); +# $OptBoundType 'threshold' / 'default' or none } else { - # set to dummy value (always true) - $QueryGroupList = 1; -}; - -# manage thresholds -if (defined($Options{'t'})) { - if ($Options{'i'}) { - # -i: list groups below threshold - $QueryThreshold .= ' postings < ?'; - } else { - # default: list groups above threshold - $QueryThreshold .= ' postings > ?'; - }; - # push threshold to Params - push @Params,$Options{'t'}; -} else { - # set to dummy value (always true) - $QueryThreshold = 1; + $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod, + $SQLWhereNewsgroups,&SQLHierarchies($OptSums), + &SQLSetBounds('default',$LowBound,$UppBound)); } -# construct WHERE clause -# $QueryGroupList is "list of newsgroup" (or 1), -# $QueryThreshold is threshold definition (or 1), -# &SQLHierarchies() takes care of the exclusion of hierarchy levels (.ALL) -# according to setting of -s -my $WhereClause = sprintf('month BETWEEN ? AND ? AND %s AND %s %s',$QueryGroupList,$QueryThreshold,&SQLHierarchies($Options{'s'})); +### get sort order and build SQL 'ORDER BY' clause +# default to 'newsgroup' for $OptBoundType 'level' or 'average' +$OptGroupBy = 'newsgroup' if (!$OptGroupBy and + $OptBoundType and $OptBoundType ne 'default'); +# force to 'month' for $OptReportType 'average' or 'sum' +$OptGroupBy = 'month' if ($OptReportType and $OptReportType ne 'default'); +# parse $OptGroupBy to $GroupBy, create ORDER BY clause $SQLOrderClause +my ($GroupBy,$SQLOrderClause) = SQLSortOrder($OptGroupBy, $OptOrderBy); +# $GroupBy will contain 'month' or 'newsgroup' (parsed result of $OptGroupBy) +# set it to 'month' or 'key' for OutputData() +$GroupBy = ($GroupBy eq 'month') ? 'month' : 'key'; -# get length of longest newsgroup delivered by query for formatting purposes -# FIXME -my $MaxLength = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'},'newsgroup',$WhereClause,$StartMonth,$EndMonth,(@GroupList,@Params)); +### get report type and build SQL 'SELECT' query +my $SQLSelect; +my $SQLGroupClause = ''; +my $Precision = 0; # number of digits right of decimal point for output +if ($OptReportType and $OptReportType ne 'default') { + $SQLGroupClause = 'GROUP BY newsgroup'; + # change $SQLOrderClause: replace everything before 'postings' + $SQLOrderClause =~ s/BY.+postings/BY postings/; + if ($OptReportType eq 'average') { + $SQLSelect = "'All months',newsgroup,AVG(postings)"; + $Precision = 2; + # change $SQLOrderClause: replace 'postings' with 'AVG(postings)' + $SQLOrderClause =~ s/postings/AVG(postings)/; + } elsif ($OptReportType eq 'sum') { + $SQLSelect = "'All months',newsgroup,SUM(postings)"; + # change $SQLOrderClause: replace 'postings' with 'SUM(postings)' + $SQLOrderClause =~ s/postings/SUM(postings)/; + } + } else { + $SQLSelect = 'month,newsgroup,postings'; +}; -my ($OrderClause,$DBQuery); -# -b (best of / top list) defined? -if (!defined($Options{'b'}) and !defined($Options{'l'})) { - # default: neither -b nor -l - # set ordering (ORDER BY) to "newsgroups" or "postings", "ASC" or "DESC" - # according to -q and -d - $OrderClause = 'newsgroup'; - $OrderClause = 'postings' if $Options{'q'}; - $OrderClause .= ' DESC' if $Options{'d'}; - # prepare query: get number of postings per group from groups table for given months and newsgroups - $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE %s ORDER BY month,%s",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause)); -} elsif ($Options{'b'}) { - # -b is set (then -l can't be!) - # set sorting order (-i): top or flop list? - if ($Options{'i'}) { - $OrderClause = 'postings'; - } else { - $OrderClause = 'postings DESC'; - }; - # set -b to 10 if < 1 (Top 10) - $Options{'b'} = 10 if $Options{'b'} !~ /^\d*$/ or $Options{'b'} < 1; - # push LIMIT to Params - push @Params,$Options{'b'}; - # prepare query: get sum of postings per group from groups table for given months and newsgroups with LIMIT - $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup,SUM(postings) AS postings FROM %s.%s WHERE %s GROUP BY newsgroup ORDER BY %s,newsgroup LIMIT ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$OrderClause)); -} else { - # -l must be set now, as all other cases have been taken care of - # which kind of level (-i): more than -l x or less than -l x? - my ($Level); - if ($Options{'i'}) { - $Level = '<'; - } else { - $Level = '>'; - }; - # prepare and execute query: get list of newsgroups meeting level condition - $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroup FROM %s.%s WHERE %s GROUP BY newsgroup HAVING MAX(postings) %s ?",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$WhereClause,$Level)); - $DBQuery->execute($StartMonth,$EndMonth,@GroupList,$Options{'l'}) - or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr); +### get length of longest newsgroup name delivered by query +### for formatting purposes +my $Field = ($GroupBy eq 'month') ? 'newsgroup' : 'month'; +my $MaxLength = &GetMaxLength($DBHandle,$Conf{'DBTableGrps'}, + $Field,$SQLWhereClause,$SQLHavingClause, + @SQLBindNewsgroups); + +### build and execute SQL query +my ($DBQuery); +# special query preparation for $OptBoundType 'level', 'average' or 'sums' +if ($OptBoundType and $OptBoundType ne 'default') { + # prepare and execute first query: + # get list of newsgroups meeting level conditions + $DBQuery = $DBHandle->prepare(sprintf('SELECT newsgroup FROM %s.%s %s '. + 'GROUP BY newsgroup %s', + $Conf{'DBDatabase'},$Conf{'DBTableGrps'}, + $SQLWhereClause,$SQLHavingClause)); + $DBQuery->execute(@SQLBindNewsgroups) + or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n", + $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}, + $DBI::errstr)); # add newsgroups to a comma-seperated list ready for IN(...) query my $GroupList; while (my ($Newsgroup) = $DBQuery->fetchrow_array) { - $GroupList .= ',' if (defined($GroupList) and $GroupList ne ''); + $GroupList .= ',' if $GroupList; $GroupList .= "'$Newsgroup'"; }; - $DBQuery = $DBHandle->prepare(sprintf("SELECT month,newsgroup,postings FROM %s.%s WHERE newsgroup IN (%s) AND %s ORDER BY newsgroup,month",$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$GroupList,$WhereClause)); -}; + # enhance $WhereClause + if ($GroupList) { + $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause, + sprintf('newsgroup IN (%s)',$GroupList)); + } else { + # condition cannot be satisfied; + # force query to fail by adding '0=1' + $SQLWhereClause = SQLBuildClause('where',$SQLWhereClause,'0=1'); + } +} + +# prepare query +$DBQuery = $DBHandle->prepare(sprintf('SELECT %s FROM %s.%s %s %s %s', + $SQLSelect, + $Conf{'DBDatabase'},$Conf{'DBTableGrps'}, + $SQLWhereClause,$SQLGroupClause,$ + SQLOrderClause)); # execute query -$DBQuery->execute($StartMonth,$EndMonth,@GroupList,@Params) - or die sprintf("$MySelf: E: Can't get groups data for %s to %s from %s.%s: %s\n",$StartMonth,$EndMonth,$Conf{'DBDatabase'},$Conf{'DBTableGrps'},$DBI::errstr); +$DBQuery->execute(@SQLBindNewsgroups) + or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: %s\n", + $CaptionPeriod,$Conf{'DBDatabase'},$Conf{'DBTableGrps'}, + $DBI::errstr)); -# output results -# reset caption (-c) if -f is set -undef($Options{'c'}) if $Options{'f'}; -# print caption (-c) with time period if -m or -p is set -if ($Options{'c'}) { - if ($Options{'m'}) { - printf ("----- Report for %s\n",$StartMonth); - } else { - printf ("----- Report from %s to %s %s\n",$StartMonth,$EndMonth,$Options{'a'} ? '(all months)' : ''); - }; -}; -# print caption (-c) with newsgroup list if -n is set -printf ("----- Newsgroups: %s\n",join(',',split(/:/,$Newsgroups))) if $Options{'c'} and $Options{'n'}; -# print caption (-c) with threshold if -t is set, taking -i in account -printf ("----- Threshold: %s %u\n",$Options{'i'} ? '<' : '>',$Options{'t'}) if $Options{'c'} and $Options{'t'}; -if (!defined($Options{'b'}) and !defined($Options{'l'})) { - # default: neither -b nor -l - &OutputData($Options{'o'},$Options{'f'},$DBQuery,$MaxLength); -} elsif ($Options{'b'}) { - # -b is set (then -l can't be!) - # we have to read in the query results ourselves, as they do not have standard layout - while (my ($Newsgroup,$Postings) = $DBQuery->fetchrow_array) { - # we just assign "top x" or "bottom x" instead of a month for the caption and force an output type of pretty - print &FormatOutput('pretty', ($Options{'i'} ? 'Bottom ' : 'Top ').$Options{'b'}, $Newsgroup, $Postings, $MaxLength); - }; -} else { - # -l must be set now, as all other cases have been taken care of - # print caption (-c) with level, taking -i in account - printf ("----- Newsgroups with %s than %u postings over the whole time period\n",$Options{'i'} ? 'less' : 'more',$Options{'l'}) if $Options{'c'}; - # we have to read in the query results ourselves, as they do not have standard layout - while (my ($Month,$Newsgroup,$Postings) = $DBQuery->fetchrow_array) { - # we just switch $Newsgroups and $Month for output generation - print &FormatOutput($Options{'o'}, $Newsgroup, $Month, $Postings, 7); - }; -}; +### output results +# set default to 'pretty' +$OptFormat = 'pretty' if !$OptFormat; +# print captions if --caption is set +if ($OptCaptions && $OptComments) { + # print time period with report type + my $CaptionReportType= '(number of postings for each month)'; + if ($OptReportType and $OptReportType ne 'default') { + $CaptionReportType= '(average number of postings for each month)' + if $OptReportType eq 'average'; + $CaptionReportType= '(number of all postings for that time period)' + if $OptReportType eq 'sum'; + } + printf("# ----- Report for %s %s\n",$CaptionPeriod,$CaptionReportType); + # print newsgroup list if --newsgroups is set + printf("# ----- Newsgroups: %s\n",join(',',split(/:/,$OptNewsgroups))) + if $OptNewsgroups; + # print boundaries, if set + my $CaptionBoundary= '(counting only month fulfilling this condition)'; + if ($OptBoundType and $OptBoundType ne 'default') { + $CaptionBoundary= '(every single month)' if $OptBoundType eq 'level'; + $CaptionBoundary= '(on average)' if $OptBoundType eq 'average'; + $CaptionBoundary= '(all month summed up)' if $OptBoundType eq 'sum'; + } + printf("# ----- Threshold: %s %s x %s %s %s\n", + $LowBound ? $LowBound : '',$LowBound ? '=>' : '', + $UppBound ? '<=' : '',$UppBound ? $UppBound : '',$CaptionBoundary) + if ($LowBound or $UppBound); + # print primary and secondary sort order + printf("# ----- Grouped by %s (%s), sorted %s%s\n", + ($GroupBy eq 'month') ? 'Months' : 'Newsgroups', + ($OptGroupBy and $OptGroupBy =~ /-?desc$/i) ? 'descending' : 'ascending', + ($OptOrderBy and $OptOrderBy =~ /posting/i) ? 'by number of postings ' : '', + ($OptOrderBy and $OptOrderBy =~ /-?desc$/i) ? 'descending' : 'ascending'); +} + +# output data +&OutputData($OptFormat,$OptComments,$GroupBy,$Precision, + $OptCheckgroupsFile ? $ValidGroups : '', + $OptFileTemplate,$DBQuery,$MaxLength); ### close handles $DBHandle->disconnect; @@ -253,60 +256,67 @@ groupstats - create reports on newsgroup usage =head1 SYNOPSIS -B [B<-Vhiscqd>] [B<-m> I | B<-p> I | B<-a>] [B<-n> I] [B<-t> I] [B<-l> I] [B<-b> I] [B<-o> I] [B<-f> I] [B<-g> I] +B [B<-Vhcs> B<--comments>] [B<-m> I[:I] | I] [B<-n> I] [B<--checkgroups> I] [B<-r> I] [B<-l> I] [B<-u> I] [B<-b> I] [B<-g> I] [B<-o> I] [B<-f> I] [B<--filetemplate> I] [B<--groupsdb> I] =head1 REQUIREMENTS -See doc/README: Perl 5.8.x itself and the following modules from CPAN: - -=over 2 - -=item - - -Config::Auto - -=item - - -DBI - -=back +See L. =head1 DESCRIPTION This script create reports on newsgroup usage (number of postings per group per month) taken from result tables created by -F. +B. -The time period to act on defaults to last month; you can assign -another month via the B<-m> switch or a time period via the B<-p> -switch; the latter takes preference. +=head2 Features and options + +=head3 Time period and newsgroups + +The time period to act on defaults to last month; you can assign another +time period or a single month (or drop all time constraints) via the +B<--month> option (see below). B will process all newsgroups by default; you can limit -that to only some newsgroups by supplying a list of those groups via -B<-n> (see below). You can include hierarchy levels in the output by -adding the B<-s> switch (see below). +processing to only some newsgroups by supplying a list of those groups via +B<--newsgroups> option (see below). You can include hierarchy levels in +the output by adding the B<--sums> switch (see below). Optionally +newsgroups not present in a checkgroups file can be excluded from output, +sse B<--checkgroups> below. -Furthermore you can set a threshold via B<-t> so that only newsgroups -with more postings per month will be included in the report. You can -invert that by the B<-i> switch so only newsgroups with less than -I postings per month will be included. +=head3 Report type -You can sort the output by number of postings per month instead of the -default (alphabetical list of newsgroups) by using B<-q>; you can -reverse the sorting order (from highest to lowest or in reversed -alphabetical order) by using B<-d>. +You can choose between different B<--report> types: postings per month, +average postings per month or all postings summed up; for details, see +below. -Furthermore, you can create a list of newsgroups that had consistently -more (or less) than x postings per month during the whole report -period by using B<-l> (together with B as needed). +=head3 Upper and lower boundaries -Last but not least you can create a "best of" list of the top x -newsgroups via B<-b> (or a "worst of" list by adding B). +Furthermore you can set an upper and/or lower boundary to exclude some +results from output via the B<--lower> and B<--upper> options, +respectively. By default, all newsgroups with more and/or less postings +per month will be excluded from the result set (i.e. not shown and not +considered for average and sum reports). You can change the meaning of +those boundaries with the B<--boundary> option. For details, please see +below. -By default, B will dump an alphabetical list of newsgroups, -one per line, followed by the number of postings in that group, for -every month. You can change the output format by using B<-o> (see -below). Captions can be added by setting the B<-c> switch. +=head3 Sorting and formatting the output + +By default, all results are grouped by month; you can group results by +newsgroup instead via the B<--groupy-by> option. Within those groups, the +list of newsgroups (or months) is sorted alphabetically (or +chronologically, respectively) ascending. You can change that order (and +sort by number of postings) with the B<--order-by> option. For details and +exceptions, please see below. + +The results will be formatted as a kind of table; you can change the +output format to a simple list or just a list of newsgroups and number of +postings with the B<--format> option. Captions will be added by means of +the B<--caption> option; all comments (and captions) can be supressed by +using B<--nocomments>. + +Last but not least you can redirect all output to a number of files, e.g. +one for each month, by submitting the B<--filetemplate> option, see below. +Captions and comments are automatically disabled in this case. =head2 Configuration @@ -315,37 +325,28 @@ which should be present in the same directory via Config::Auto. See doc/INSTALL for an overview of possible configuration options. -You can override configuration options via the B<-g> switch. +You can override some configuration options via the B<--groupsdb> option. =head1 OPTIONS =over 3 -=item B<-V> (version) +=item B<-V>, B<--version> -Print out version and copyright information on B and exit. +Print out version and copyright information and exit. -=item B<-h> (help) +=item B<-h>, B<--help> Print this man page and exit. -=item B<-m> I (month) +=item B<-m>, B<--month> I -Set processing period to a month in YYYY-MM format. Ignored if B<-p> -or B<-a> is set. +Set processing period to a single month in YYYY-MM format or to a time +period between two month in YYYY-MM:YYYY-MM format (two month, separated +by a colon). By using the keyword I instead, you can set no +processing period to process the whole database. -=item B<-p> I (period) - -Set processing period to a time period between two month, each in -YYYY-MM format, separated by a colon. Overrides B<-m>. Ignored if -B<-a> is set. - -=item B<-a> (all) - -Set no processing period (process whole database). Overrides B<-m> -and B<-p>. - -=item B<-n> I (newsgroups) +=item B<-n>, B<--newsgroups> I Limit processing to a certain set of newsgroups. I can be a single newsgroup name (de.alt.test), a newsgroup hierarchy @@ -354,41 +355,7 @@ example de.test:de.alt.test:de.newusers.* -=item B<-t> I (threshold) - -Only include newsgroups with more than I postings per -month. Can be inverted by the B<-i> switch so that only newsgroups -with less than I postings will be included. - -This setting will be ignored if B<-l> or B<-b> is set. - -=item B<-l> I (level) - -Only include newsgroups with more than I postings per -month, every month during the whole reporting period. Can be inverted -by the B<-i> switch so that only newsgroups with less than I -postings every single month will be included. Output will be ordered -by newsgroup name, followed by month. - -This setting will be ignored if B<-b> is set. Overrides B<-t> and -can't be used together with B<-q>, B<-d> or B<-f>. - -=item B<-b> I (best of) - -Create a list of the I newsgroups with the most postings over the -whole reporting period. Can be inverted by the B<-i> switch so that a -list of the I newsgroups with the least postings over the whole -period is generated. Output will be ordered by sum of postings. - -Overrides B<-t> and B<-l> and can't be used together with B<-q>, B<-d> -or B<-f>. Output format is set to I (see below). - -=item B<-i> (invert) - -Used in conjunction with B<-t>, B<-l> or B<-b> to set a lower -threshold or level or generate a "bottom list" instead of a top list. - -=item B<-s> (sum per hierarchy level) +=item B<-s>, B<--sums|--nosums> (sum per hierarchy level) Include "virtual" groups for every hierarchy level in output, for example: @@ -399,62 +366,219 @@ example: See the B man page for details. -=item B<-o> I (output format) +=item B<--checkgroups> I -Set output format. Default is I, which will print a header for -each new month, followed by an alphabetical list of newsgroups, each -on a new line, followed by the number of postings in that month. -B will try to align newsgroup names and posting counts. -Usage of B<-b> will force this format; it cannot be used together with -B<-f>. +Restrict output to those newgroups present in a file in checkgroups format +(one newgroup name per line; everything after the first whitespace on each +line is ignored). All other newsgroups will be removed from output. -I format is used to create an easily parsable output consisting -of an alphabetical list of newsgroups, each on a new line, followed by -the number of postings in that month, without any alignment. This -default format can't be used with time periods of more than one month. -Usage of B<-f> will force this format. +=item B<-r>, B<--report> I -I format is like I, but will print the month in front of -the newsgroup name. +Choose the report type: I, I or I -I format can only be use with a group list (see B<-n>) of -exactly one newsgroup and is like I, but will output months, -followed by the number of postings. +By default, B will report the number of postings for each +newsgroup in each month. But it can also report the average number of +postings per group for all months or the total sum of postings per group +for all months. -=item B<-c> (captions) +For report types I and I, the B option has no +meaning and will be silently ignored (see below). -Add captions to output (reporting period, newsgroups list, threshold -and so on). +=item B<-l>, B<--lower> I -This setting will be ignored if B<-f> is set. +Set the lower boundary. See B<--boundary> below. -=item B<-q> (quantity of postings) +=item B<-l>, B<--upper> I -Sort by number of postings instead of by newsgroup names. +Set the upper boundary. See B<--boundary> below. -Cannot be used with B<-l> or B<-b>. +=item B<-b>, B<--boundary> I -=item B<-d> (descending) +Set the boundary type to one of I, I, I or +I. -Change sort order to descending. +By default, all newsgroups with more postings per month than the upper +boundary and/or less postings per month than the lower boundary will be +excluded from further processing. For the default report that means each +month only newsgroups with a number of postings between the boundaries +will be displayed. For the other report types, newsgroups with a number of +postings exceeding the boundaries in all (!) months will not be +considered. -Cannot be used with B<-l> or B<-b>. +For example, lets take a list of newsgroups like this: -=item B<-f> I (output file) + ----- 2012-01: + de.comp.datenbanken.misc 6 + de.comp.datenbanken.ms-access 84 + de.comp.datenbanken.mysql 88 + ----- 2012-02: + de.comp.datenbanken.misc 8 + de.comp.datenbanken.ms-access 126 + de.comp.datenbanken.mysql 21 + ----- 2012-03: + de.comp.datenbanken.misc 24 + de.comp.datenbanken.ms-access 83 + de.comp.datenbanken.mysql 36 -Save output to file instead of dumping it to STDOUT. B -will create one file for each month, with filenames composed by -adding year and month to the I, for example -with B<-f> I: +With C, +you'll get the following result: - stats-2010-01 - stats-2010-02 + ----- All months: + de.comp.datenbanken.ms-access 293 + de.comp.datenbanken.mysql 124 + +de.comp.datenbanken.misc has not been considered even though it has 38 +postings in total, because it has less than 25 postings in every single +month. If you want to list all newsgroups with more than 25 postings U, you'll have to set the boundary type to I, see below. + +A boundary type of I will show only those newsgroups - at all - +that satisfy the boundaries in each and every single month. With the above +list of newsgroups and +C, +you'll get this result: + + ----- All months: + de.comp.datenbanken.ms-access 293 + +de.comp.datenbanken.mysql has not been considered because it had less than +25 postings in 2012-02. + +You can use that to get a list of newsgroups that have more (or less) then +x postings during the whole reporting period. + +A boundary type of I will show only those newsgroups - at all -that +satisfy the boundaries on average. With the above list of newsgroups and +C, +you'll get this result: + + ----- All months: + de.comp.datenbanken.ms-access 293 + de.comp.datenbanken.mysql 145 + +The average number of postings in the three groups is: + + de.comp.datenbanken.misc 12.67 + de.comp.datenbanken.ms-access 97.67 + de.comp.datenbanken.mysql 48.33 + +Last but not least, a boundary type of I will show only those +newsgroups - at all - that satisfy the boundaries with the total sum of +all postings during the reporting period. With the above list of +newsgroups and +C, +you'll finally get this result: + + ----- All months: + de.comp.datenbanken.misc 38 + de.comp.datenbanken.ms-access 293 + de.comp.datenbanken.mysql 145 + + +=item B<-g>, B<--group-by> I + +By default, all results are grouped by month, sorted chronologically in +ascending order, like this: + + ----- 2012-01: + de.comp.datenbanken.ms-access 84 + de.comp.datenbanken.mysql 88 + ----- 2012-02: + de.comp.datenbanken.ms-access 126 + de.comp.datenbanken.mysql 21 + +The results can be grouped by newsgroups instead via +B<--group-by> I: + + ----- de.comp.datenbanken.ms-access: + 2012-01 84 + 2012-02 126 + ----- de.comp.datenbanken.mysql: + 2012-01 88 + 2012-02 21 + +By appending I<-desc> to the group-by option parameter, you can reverse +the sort order - e.g. B<--group-by> I will give: + + ----- 2012-02: + de.comp.datenbanken.ms-access 126 + de.comp.datenbanken.mysql 21 + ----- 2012-01: + de.comp.datenbanken.ms-access 84 + de.comp.datenbanken.mysql 88 + +Average and sums reports (see above) will always be grouped by months; +this option will therefore be ignored. + +=item B<-o>, B<--order-by> I + +Within each group (a single month or single newsgroup, see above), the +report will be sorted by newsgroup names in ascending alphabetical order +by default. You can change the sort order to descending or sort by number +of postings instead. + +=item B<-f>, B<--format> I + +Select the output format, I being the default: + + ----- 2012-01: + de.comp.datenbanken.ms-access 84 + de.comp.datenbanken.mysql 88 + ----- 2012-02: + de.comp.datenbanken.ms-access 126 + de.comp.datenbanken.mysql 21 + +I format looks like this: + + 2012-01 de.comp.datenbanken.ms-access 84 + 2012-01 de.comp.datenbanken.mysql 88 + 2012-02 de.comp.datenbanken.ms-access 126 + 2012-02 de.comp.datenbanken.mysql 21 + +And I format looks like this: + + # 2012-01: + de.comp.datenbanken.ms-access 84 + de.comp.datenbanken.mysql 88 + # 2012-02: + de.comp.datenbanken.ms-access 126 + de.comp.datenbanken.mysql 21 + +You can remove the comments by using B<--nocomments>, see below. + +=item B<-c>, B<--captions|--nocaptions> + +Add captions to output, like this: + + ----- Report for 2012-01 to 2012-02 (number of postings for each month) + ----- Newsgroups: de.comp.datenbanken.* + ----- Threshold: 10 => x <= 20 (on average) + ----- Grouped by Newsgroups (ascending), sorted by number of postings descending + +False by default. + +=item B<--comments|--nocomments> + +Add comments (group headers) to I and I output. True by default. + +Use I<--nocomments> to suppress anything except newsgroup names/months and +numbers of postings. This is enforced when using B<--filetemplate>, see below. + +=item B<--filetemplate> I + +Save output to file(s) instead of dumping it to STDOUT. B will +create one file for each month (or each newsgroup, accordant to the +setting of B<--group-by>, see above), with filenames composed by adding +year and month (or newsgroup names) to the I, for +example with B<--filetemplate> I: + + stats-2012-01 + stats-2012-02 ... and so on -This setting will be ignored if B<-l> or B<-b> is set. Output format -is set to I (see above). +B<--nocomments> is enforced, see above. -=item B<-g> I (postings per group table) +=item B<--groupsdb> I Override I from F. @@ -462,36 +586,40 @@ Override I from F. =head1 INSTALLATION -See doc/INSTALL. +See L. =head1 EXAMPLES -Show number of postings per group for lasth month in I format: +Show number of postings per group for lasth month in I format: groupstats Show that report for January of 2010 and de.alt.* plus de.test, including display of hierarchy levels: - groupstats -m 2010-01 -n de.alt.*:de.test -s + groupstats --month 2010-01 --newsgroups de.alt.*:de.test --sums -Show that report for the year of 2010 in I format: - - groupstats -p 2010-01:2010-12 -o pretty - -Only show newsgroups with less than 30 postings last month, ordered +Only show newsgroups with 30 postings or less last month, ordered by number of postings, descending, in I format: - groupstats -iqdt 30 -o pretty + groupstats --upper 30 --order-by postings-desc -Show top 10 for the first half-year of of 2010 in I format: +Show the total of all postings for the year of 2010 for all groups that +had 30 postings or less in every single month in that year, ordered by +number of postings in descending order: - groupstats -p 2010-01:2010-06 -b 10 -o pretty + groupstats -m 2010-01:2010-12 -u 30 -b level -r sums -o postings-desc -Report all groups that had less than 30 postings every singele month -in the year of 2010 (I format is forced) +The same for the average number of postings in the year of 2010: + + groupstats -m 2010-01:2010-12 -u 30 -b level -r avg -o postings-desc + +List number of postings per group for eacht month of 2010 and redirect +output to one file for each month, namend stats-2010-01 and so on, in +machine-readable form (without formatting): + + groupstats -m 2010-01:2010-12 -f dump --filetemplate stats - groupstats -p 2010-01:2010-12 -il 30 =head1 FILES @@ -507,7 +635,7 @@ Library functions for the NewsStats package. =item F -Runtime configuration file for B. +Runtime configuration file. =back @@ -522,11 +650,11 @@ bug tracker at L! =item - -doc/README +L =item - -doc/INSTALL +l>doc/INSTALL> =item - @@ -542,7 +670,7 @@ Thomas Hochstein =head1 COPYRIGHT AND LICENSE -Copyright (c) 2010 Thomas Hochstein +Copyright (c) 2010-2012 Thomas Hochstein This program is free software; you may redistribute it and/or modify it under the same terms as Perl itself.