Merge branch 'thh-parsedb' into pu

* thh-parsedb:
  Add some input validation.
  Add documentation to parsedb.pl.
  Handle more than one entitiy in From: etc.
  Let gatherstats read its data from DBTableParse.
  Add decoding and parsing of From: etc.
  Create a database table with parsed raw data.
  Make GetTimePeriod() and others accept days.

# Conflicts:
#	bin/gatherstats.pl
This commit is contained in:
Thomas Hochstein 2018-01-01 16:56:56 +01:00
commit 84e9923abe
7 changed files with 578 additions and 69 deletions

26
bin/gatherstats.pl Executable file → Normal file
View file

@ -3,7 +3,7 @@
# gatherstats.pl # gatherstats.pl
# #
# This script will gather statistical information from a database # This script will gather statistical information from a database
# containing headers and other information from a INN feed. # containing headers and other information from an INN feed.
# #
# It is part of the NewsStats package. # It is part of the NewsStats package.
# #
@ -38,7 +38,7 @@ my %LegalStats;
### read commandline options ### read commandline options
my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH, my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH,
$OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,$OptConfFile); $OptHostsDB,$OptMonth,$OptParseDB,$OptStatsType,$OptTest,$OptConfFile);
GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
'clientsdb=s' => \$OptClientsDB, 'clientsdb=s' => \$OptClientsDB,
'd|debug!' => \$OptDebug, 'd|debug!' => \$OptDebug,
@ -46,7 +46,7 @@ GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile,
'hierarchy=s' => \$OptTLH, 'hierarchy=s' => \$OptTLH,
'hostsdb=s' => \$OptHostsDB, 'hostsdb=s' => \$OptHostsDB,
'm|month=s' => \$OptMonth, 'm|month=s' => \$OptMonth,
'rawdb=s' => \$OptRawDB, 'parsedb=s' => \$OptParseDB,
's|stats=s' => \$OptStatsType, 's|stats=s' => \$OptStatsType,
't|test!' => \$OptTest, 't|test!' => \$OptTest,
'conffile=s' => \$OptConfFile, 'conffile=s' => \$OptConfFile,
@ -58,7 +58,7 @@ my %Conf = %{ReadConfig($OptConfFile)};
### override configuration via commandline options ### override configuration via commandline options
my %ConfOverride; my %ConfOverride;
$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; $ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
$ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB;
$ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB; $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB;
$ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB; $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB;
@ -124,15 +124,15 @@ foreach my $Month (&ListMonth($Period)) {
### ---------------------------------------------- ### ----------------------------------------------
### get groups data (number of postings per group) ### get groups data (number of postings per group)
# get groups data from raw table for given month # get groups data from parsed table for given month
my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ". my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ".
"WHERE day LIKE ? AND NOT disregard", "WHERE day LIKE ? AND NOT disregard",
$Conf{'DBDatabase'}, $Conf{'DBDatabase'},
$Conf{'DBTableRaw'})); $Conf{'DBTableParse'}));
$DBQuery->execute($Month.'-%') $DBQuery->execute($Month.'-%')
or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ". or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ".
"$DBI::errstr\n",$Month, "$DBI::errstr\n",$Month,
$Conf{'DBDatabase'},$Conf{'DBTableRaw'})); $Conf{'DBDatabase'},$Conf{'DBTableParse'}));
# count postings per group # count postings per group
my %Postings; my %Postings;
@ -206,11 +206,11 @@ __END__
=head1 NAME =head1 NAME
gatherstats - process statistical data from a raw source gatherstats - process statistical data from a parsed source
=head1 SYNOPSIS =head1 SYNOPSIS
B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--rawdb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>] B<gatherstats> [B<-Vhdt>] [B<-m> I<YYYY-MM> | I<YYYY-MM:YYYY-MM>] [B<-s> I<stats>] [B<-c> I<filename template>]] [B<--hierarchy> I<TLH>] [B<--parsedb> I<database table>] [B<-groupsdb> I<database table>] [B<--clientsdb> I<database table>] [B<--hostsdb> I<database table>] [B<--conffile> I<filename>]
=head1 REQUIREMENTS =head1 REQUIREMENTS
@ -219,7 +219,7 @@ See L<doc/README>.
=head1 DESCRIPTION =head1 DESCRIPTION
This script will extract and process statistical information from a This script will extract and process statistical information from a
database table which is fed from F<feedlog.pl> for a given time period database table which is filled from F<parsedb.pl> for a given time period
and write its results to (an)other database table(s). Entries marked and write its results to (an)other database table(s). Entries marked
with I<'disregard'> in the database will be ignored; currently, you with I<'disregard'> in the database will be ignored; currently, you
have to set this flag yourself, using your database management tools. have to set this flag yourself, using your database management tools.
@ -267,7 +267,7 @@ submitted by the B<--conffile> option.
See L<doc/INSTALL> for an overview of possible configuration options. See L<doc/INSTALL> for an overview of possible configuration options.
You can override configuration options via the B<--hierarchy>, You can override configuration options via the B<--hierarchy>,
B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, B<--parsedb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options,
respectively. respectively.
=head1 OPTIONS =head1 OPTIONS
@ -328,9 +328,9 @@ will be added with a count of 0 (and logged to STDERR).
Override I<TLH> from F<newsstats.conf>. Override I<TLH> from F<newsstats.conf>.
=item B<--rawdb> I<table> (raw data table) =item B<--parsedb> I<table> (parsed data table)
Override I<DBTableRaw> from F<newsstats.conf>. Override I<DBTableParse> from F<newsstats.conf>.
=item B<--groupsdb> I<table> (postings per group table) =item B<--groupsdb> I<table> (postings per group table)

425
bin/parsedb.pl Executable file
View file

@ -0,0 +1,425 @@
#! /usr/bin/perl
#
# parsedb.pl
#
# This script will parse a database with raw header information
# from a INN feed to a structured database.
#
# It is part of the NewsStats package.
#
# Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
#
# It can be redistributed and/or modified under the same terms under
# which Perl itself is published.
BEGIN {
our $VERSION = "0.01";
use File::Basename;
# we're in .../bin, so our module is in ../lib
push(@INC, dirname($0).'/../lib');
}
use strict;
use warnings;
use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
use DBI;
use Getopt::Long qw(GetOptions);
Getopt::Long::config ('bundling');
use Encode qw/decode/;
use Mail::Address;
################################# Definitions ##################################
# define header names with separate database fields
my %DBFields = ('date' => 'date',
'references' => 'refs',
'followup-to' => 'fupto',
'from' => 'from_',
'sender' => 'sender',
'reply-to' => 'replyto',
'subject' => 'subject',
'organization' => 'organization',
'lines' => 'linecount',
'approved' => 'approved',
'supersedes' => 'supersedes',
'expires' => 'expires',
'user-agent' => 'useragent',
'x-newsreader' => 'xnewsreader',
'x-mailer' => 'xmailer',
'x-no-archive' => 'xnoarchive',
'content-type' => 'contenttype',
'content-transfer-encoding' => 'contentencoding',
'cancel-lock' => 'cancellock',
'injection-info' => 'injectioninfo',
'x-trace' => 'xtrace',
'nntp-posting-host' => 'postinghost');
# define field list for database
my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
from_name from_address sender sender_parsed sender_name
sender_address replyto replyto_parsed replyto_name
replyto_address subject subject_parsed organization linecount
approved supersedes expires useragent xnewsreader xmailer
xnoarchive contenttype contentencoding cancellock injectioninfo
xtrace postinghost headers disregard/;
################################# Main program #################################
### read commandline options
my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
GetOptions ('d|day=s' => \$OptDay,
'debug!' => \$OptDebug,
'parsedb=s' => \$OptParseDB,
'rawdb=s' => \$OptRawDB,
't|test!' => \$OptTest,
'conffile=s' => \$OptConfFile,
'h|help' => \&ShowPOD,
'V|version' => \&ShowVersion) or exit 1;
### read configuration
my %Conf = %{ReadConfig($OptConfFile)};
### override configuration via commandline options
my %ConfOverride;
$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
&OverrideConfig(\%Conf,\%ConfOverride);
### get time period
### and set $Period for output and expression for SQL 'WHERE' clause
my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
# bail out if --month is invalid or "all"
&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
"'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
### init database
my $DBHandle = InitDB(\%Conf,1);
### get & write data
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
# create $SQLWhereClause
my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
# delete old data for current period
if (!$OptTest) {
print "----------- Deleting old data ... -----------\n" if $OptDebug;
my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
$Conf{'DBDatabase'},$Conf{'DBTableParse'},
$SQLWhereClause))
or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
};
# read from DBTableRaw
print "-------------- Reading data ... -------------\n" if $OptDebug;
my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
"newsgroups, headers, disregard ".
"FROM %s.%s %s", $Conf{'DBDatabase'},
$Conf{'DBTableRaw'}, $SQLWhereClause));
$DBQuery->execute()
or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
# set output and database connection to UTF-8
# as we're going to write decoded header contents containing UTF-8 chars
binmode(STDOUT, ":utf8");
$DBHandle->do("SET NAMES 'utf8'");
# create a list of supported encondings
my %LegalEncodings;
foreach (Encode->encodings()) {
$LegalEncodings{$_} = 1;
}
# parse data in a loop and write it out
print "-------------- Parsing data ... -------------\n" if $OptDebug;
while (my $HeadersR = $DBQuery->fetchrow_hashref) {
my %Headers = %{$HeadersR};
# parse $Headers{'headers'} ('headers' from DBTableRaw)
# remove empty lines (that should not even exist in a header!)
$Headers{'headers'} =~ s/\n\s*\n/\n/g;
# merge continuation lines
# from Perl Cookbook, 1st German ed. 1999, pg. 91
$Headers{'headers'} =~ s/\n\s+/ /g;
# split headers in single lines
my $OtherHeaders;
for (split(/\n/,$Headers{'headers'})) {
# split header lines in header name and header content
my ($key,$value);
if ($_ =~ /:/) {
($key,$value) = split(/:/,$_,2);
$key =~ s/\s*//;
$value =~ s/^\s*(.+)\s*$/$1/;
} else {
&Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s",
$Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
$Headers{'id'},$_));
next;
}
# check for empty (mandatory) fields from DBTableRaw
# and set them from $Headers{'headers', if necessary
if (lc($key) =~ /^(message-id|path|newsgroups)$/) {
my $HeaderName = lc($key);
$HeaderName = 'mid' if ($HeaderName eq 'message-id');
if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') {
$Headers{$HeaderName} = $value;
&Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.",
$HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
$Headers{'id'}));
}
}
# save each header, separate database fields in %Headers,
# the rest in $OtherHeaders (but not Message-ID, Path, Peer
# and Newsgroups as those do already exist)
if (defined($DBFields{lc($key)})) {
$Headers{$DBFields{lc($key)}} = $value;
} else {
$OtherHeaders .= sprintf("%s: %s\n",$key,$value)
if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
}
}
# replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
chomp($OtherHeaders);
$Headers{'headers'} = $OtherHeaders;
foreach ('from_','sender', 'replyto', 'subject') {
if ($Headers{$_}) {
my $HeaderName = $_;
$HeaderName =~ s/_$//;
# decode From: / Sender: / Reply-To: / Subject:
if ($Headers{$_} =~ /\?(B|Q)\?/) {
# check for legal encoding and decode
(my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/;
$Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_})
if (exists($LegalEncodings{$Encoding}));
}
# extract name(s) and mail(s) from From: / Sender: / Reply-To:
# in parsed form, if available
if ($_ ne 'subject') {
my @Address;
# start parser on header or parsed header
# @Address will have an array of Mail::Address objects, one for
# each name/mail (you can have more than one person in From:!)
if (defined($Headers{$HeaderName.'_parsed'})) {
@Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'});
} else {
@Address = Mail::Address->parse($Headers{$_});
}
# split each Mail::Address object to @Names and @Adresses
my (@Names,@Adresses);
foreach (@Address) {
# take address part in @Addresses
push (@Adresses, $_->address());
# take name part form "phrase", if there is one:
# From: My Name <addr@ess> (Comment)
# otherwise, take it from "comment":
# From: addr@ess (Comment)
# and push it in @Names
my ($Name);
$Name = $_->comment() unless $Name = $_->phrase;
$Name =~ s/^\((.+)\)$/$1/;
push (@Names, $Name);
}
# put all @Adresses and all @Names in %Headers as comma separated lists
$Headers{$HeaderName.'_address'} = join(', ',@Adresses);
$Headers{$HeaderName.'_name'} = join(', ',@Names);
}
}
}
# order output for database entry: fill @SQLBindVars
print "-------------- Next entry:\n" if $OptDebug;
my @SQLBindVars;
foreach (@DBFields) {
if (defined($Headers{$_}) and $Headers{$_} ne '') {
push (@SQLBindVars,$Headers{$_});
printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
} else {
push (@SQLBindVars,undef);
}
}
# write data to DBTableParse
if (!$OptTest) {
print "-------------- Writing data ... -------------\n" if $OptDebug;
my $DBWrite =
$DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
$Conf{'DBDatabase'},
$Conf{'DBTableParse'},
# get field names from @DBFields
join(', ',@DBFields),
# create a list of '?' for each DBField
join(', ',
split(/ /,'? ' x scalar(@DBFields)))
));
$DBWrite->execute(@SQLBindVars)
or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
$DBWrite->finish;
}
};
$DBQuery->finish;
### close handles
$DBHandle->disconnect;
print "------------------- DONE! -------------------\n" if $OptDebug;
__END__
################################ Documentation #################################
=head1 NAME
parsedb - parse raw data and save it to a database
=head1 SYNOPSIS
B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
=head1 REQUIREMENTS
See L<doc/README>.
=head1 DESCRIPTION
This script will parse raw, unstructured headers from a database table which is
fed from F<feedlog.pl> for a given time period and write its results to
nother database table with separate fields (columns) for most (or even all)
relevant headers.
I<Subject:>, I<From:>, I<Sender:> and I<Reply-To:> will be parsed from MIME
encoded words to UTF-8 as needed while the unparsed copy is kept. From that
parsed copy, I<From:>, I<Sender:> and I<Reply-To:> will also be split into
separate name(s) and address(es) fields while the un-splitted copy is kept,
too.
B<parsedb> should be run nightly from cron for yesterdays data so all
other scripts get current information. The time period to act on defaults to
yesterday, accordingly; you can assign another time period or a single day via
the B<--day> option (see below).
=head2 Configuration
B<parsedb> will read its configuration from F<newsstats.conf>
should be present in etc/ via Config::Auto or from a configuration file
submitted by the B<--conffile> option.
See L<doc/INSTALL> for an overview of possible configuration options.
You can override configuration options via the B<--rawdb> and
B<--parsedb> options, respectively.
=head1 OPTIONS
=over 3
=item B<-V>, B<--version>
Print out version and copyright information and exit.
=item B<-h>, B<--help>
Print this man page and exit.
=item B<--debug>
Output (rather much) debugging information to STDOUT while processing.
=item B<-t>, B<--test>
Do not write results to database. You should use B<--debug> in
conjunction with B<--test> ... everything else seems a bit pointless.
=item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
Set processing period to a single day in YYYY-MM-DD format or to a time
period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
by a colon).
Defaults to yesterday.
=item B<--rawdb> I<table> (raw data table)
Override I<DBTableRaw> from F<newsstats.conf>.
=item B<--parsedb> I<table> (parsed data table)
Override I<DBTableParse> from F<newsstats.conf>.
=item B<--conffile> I<filename>
Load configuration from I<filename> instead of F<newsstats.conf>.
=back
=head1 INSTALLATION
See L<doc/INSTALL>.
=head1 EXAMPLES
An example crontab entry:
0 1 * * * /path/to/bin/parsedb.pl
Do a dry run for yesterday's data, showing results of processing:
parsedb --debug --test | less
=head1 FILES
=over 4
=item F<bin/parsedb.pl>
The script itself.
=item F<lib/NewsStats.pm>
Library functions for the NewsStats package.
=item F<etc/newsstats.conf>
Runtime configuration file.
=back
=head1 BUGS
Please report any bugs or feature requests to the author or use the
bug tracker at L<http://bugs.th-h.de/>!
=head1 SEE ALSO
=over 2
=item -
L<doc/README>
=item -
L<doc/INSTALL>
=back
This script is part of the B<NewsStats> package.
=head1 AUTHOR
Thomas Hochstein <thh@inter.net>
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.
=cut

View file

@ -13,6 +13,8 @@ INSTALLATION INSTRUCTIONS
1) Install the scripts 1) Install the scripts
* Get INN, mysql, Perl, and the necessary modules installed (see README).
* Download the current version of NewsStats from * Download the current version of NewsStats from
<http://th-h.de/download/scripts.php>. <http://th-h.de/download/scripts.php>.

View file

@ -47,6 +47,7 @@ Prerequisites
- Config::Auto - Config::Auto
- Date::Format - Date::Format
- DBI - DBI
- Mail::Address
* mysql 5.0.x * mysql 5.0.x

View file

@ -11,6 +11,7 @@ DBDatabase = newsstats
# tables # tables
# #
DBTableRaw = raw_de DBTableRaw = raw_de
DBTableParse = parsed_de
DBTableGrps = groups_de DBTableGrps = groups_de
#DBTableClnts = #DBTableClnts =
#DBTableHosts = #DBTableHosts =

View file

@ -47,7 +47,7 @@ my $DBCreate = <<SQLDB;
CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8; CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8;
SQLDB SQLDB
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableGrps' => <<GRPS); my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableParse' => <<PARSE, 'DBTableGrps' => <<GRPS);
-- --
-- Table structure for table DBTableRaw -- Table structure for table DBTableRaw
-- --
@ -72,6 +72,56 @@ CREATE TABLE IF NOT EXISTS `$Conf{'DBTableRaw'}` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Raw data'; ) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Raw data';
RAW RAW
-- --
-- Table structure for table DBTableParse
--
CREATE TABLE IF NOT EXISTS `$Conf{'DBTableParse'}` (
`id` bigint(20) unsigned NOT NULL auto_increment,
`day` date NOT NULL,
`mid` varchar(250) character set ascii NOT NULL,
`refs` varchar(1000) character set ascii,
`date` varchar(100) NOT NULL,
`path` varchar(1000) NOT NULL,
`newsgroups` varchar(1000) NOT NULL,
`fupto` varchar(200),
`from_` varchar(500),
`from_parsed` varchar(200),
`from_name` varchar(200),
`from_address` varchar(200),
`sender` varchar(500),
`sender_parsed` varchar(200),
`sender_name` varchar(200),
`sender_address` varchar(200),
`replyto` varchar(500),
`replyto_parsed` varchar(200),
`replyto_name` varchar(200),
`replyto_address` varchar(200),
`subject` varchar(1000) NOT NULL,
`subject_parsed` varchar(1000),
`organization` varchar(1000),
`linecount` int(4) unsigned,
`approved` varchar(250),
`supersedes` varchar(250),
`expires` varchar(100),
`useragent` varchar(500),
`xnewsreader` varchar(500),
`xmailer` varchar(500),
`xnoarchive` varchar(100),
`contenttype` varchar(500),
`contentencoding` varchar(500),
`cancellock` varchar(500),
`injectioninfo` varchar(500),
`xtrace` varchar(500),
`postinghost` varchar(1000),
`headers` longtext,
`disregard` tinyint(1) default '0',
PRIMARY KEY (`id`),
KEY `day` (`day`),
KEY `mid` (`mid`),
KEY `newsgroups` (`newsgroups`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Parsed data';
PARSE
--
-- Table structure for table DBTableGrps -- Table structure for table DBTableGrps
-- --

View file

@ -107,7 +107,7 @@ sub ReadConfig {
$ConfFile = $HomePath . '/etc/newsstats.conf' if !$ConfFile; $ConfFile = $HomePath . '/etc/newsstats.conf' if !$ConfFile;
# mandatory configuration options # mandatory configuration options
my @Mandatory = ('DBDriver','DBHost','DBUser','DBPw','DBDatabase', my @Mandatory = ('DBDriver','DBHost','DBUser','DBPw','DBDatabase',
'DBTableRaw','DBTableGrps'); 'DBTableRaw','DBTableParse','DBTableGrps');
# read config via Config::Auto # read config via Config::Auto
my $ConfR = Config::Auto::parse($ConfFile, format => 'equal'); my $ConfR = Config::Auto::parse($ConfFile, format => 'equal');
my %Conf = %{$ConfR}; my %Conf = %{$ConfR};
@ -265,33 +265,39 @@ sub ReadGroupList {
################################################################################ ################################################################################
sub GetTimePeriod { sub GetTimePeriod {
################################################################################ ################################################################################
### get a time period to act on from --month option; ### get a time period to act on from --month / --day option;
### if empty, default to last month ### if empty, default to last month / day
### IN : $Month: may be empty, 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'all' ### IN : $Period: may be empty, 'YYYY-MM(-DD)', 'YYYY-MM(-DD):YYYY-MM(-DD)'
### or 'all'
### $Type : may be 'month' or 'day'
### OUT: $Verbal,$SQL: verbal description and WHERE-clause ### OUT: $Verbal,$SQL: verbal description and WHERE-clause
### of the chosen time period ### of the chosen time period
my ($Month) = @_; my ($Period,$Type) = @_;
# define result variables # define result variables
my ($Verbal, $SQL); my ($Verbal, $SQL);
# define a regular expression for a month # check $Type
my $REMonth = '\d{4}-\d{2}'; $Type = 'month' if (!$Type or ($Type ne 'month' and $Type ne 'day'));
# define a regular expressions for a month or day
my $REPeriod = '\d{4}-\d{2}';
$REPeriod .= '-\d{2}' if ($Type eq 'day');
# default to last month if option is not set # default to last month / day if option is not set
if(!$Month) { if(!$Period) {
$Month = &LastMonth; $Period = &LastMonthDay($Type);
} }
# check for valid input # check for valid input
if ($Month =~ /^$REMonth$/) { if ($Period =~ /^$REPeriod$/) {
# single month (YYYY-MM) # single month/day [YYYY-MM(-DD)]
($Month) = &CheckMonth($Month); ($Period) = &CheckPeriod($Type,$Period);
$Verbal = $Month; $Verbal = $Period;
$SQL = sprintf("month = '%s'",$Month); $SQL = sprintf("%s = '%s'",$Type,$Period);
} elsif ($Month =~ /^$REMonth:$REMonth$/) { } elsif ($Period =~ /^$REPeriod:$REPeriod$/) {
# time period (YYYY-MM:YYYY-MM) # time period [YYYY-MM(-DD):YYYY-MM(-DD)]
$Verbal = sprintf('%s to %s',&SplitPeriod($Month)); $Verbal = sprintf('%s to %s',&SplitPeriod($Period,$Type));
$SQL = sprintf("month BETWEEN '%s' AND '%s'",&SplitPeriod($Month)); $SQL = sprintf("%s BETWEEN '%s' AND '%s'",$Type,
} elsif ($Month =~ /^all$/i) { &SplitPeriod($Period,$Type));
} elsif ($Period =~ /^all$/i) {
# special case: ALL # special case: ALL
$Verbal = 'all time'; $Verbal = 'all time';
$SQL = ''; $SQL = '';
@ -304,58 +310,82 @@ sub GetTimePeriod {
}; };
################################################################################ ################################################################################
sub LastMonth { sub LastMonthDay {
################################################################################ ################################################################################
### get last month from todays date in YYYY-MM format ### get last month/day from todays date in YYYY-MM format
### OUT: last month as YYYY-MM ### IN : $Type : may be 'month' or 'day'
# get today's date ### OUT: last month/day as YYYY-MM(-DD)
my (undef,undef,undef,undef,$Month,$Year,undef,undef,undef) = localtime(time); my ($Type) = @_;
# $Month is already defined from 0 to 11, so no need to decrease it by 1 my ($Day,$Month,$Year);
if ($Type eq 'day') {
# get yesterdays's date
(undef,undef,undef,$Day,$Month,$Year,undef,undef,undef) = localtime(time-86400);
# $Month is defined from 0 to 11, so add 1
$Month++;
} else {
# get today's date (month and year)
(undef,undef,undef,undef,$Month,$Year,undef,undef,undef) = localtime(time);
# $Month is already defined from 0 to 11, so no need to decrease it by 1
if ($Month < 1) {
$Month = 12;
$Year--;
};
}
$Year += 1900; $Year += 1900;
if ($Month < 1) { # return last month / day
$Month = 12; if ($Type eq 'day') {
$Year--; return sprintf('%4d-%02d-%02d',$Year,$Month,$Day);
}; } else {
# return last month return sprintf('%4d-%02d',$Year,$Month);
return sprintf('%4d-%02d',$Year,$Month); }
}; };
################################################################################ ################################################################################
sub CheckMonth { sub CheckPeriod {
################################################################################ ################################################################################
### check if input (in YYYY-MM form) is valid with MM between 01 and 12; ### check if input (in YYYY-MM(-DD) form) is a valid month / day;
### otherwise, fix it ### otherwise, fix it
### IN : @Month: array of month ### IN : $Type : may be 'month' or 'day'
### OUT: @Month: a valid month ### @Period: array of month/day
my (@Month) = @_; ### OUT: @Period: a valid month/day
foreach my $Month (@Month) { my ($Type,@Period) = @_;
my ($OldMonth) = $Month; foreach my $Period (@Period) {
my ($CalMonth) = substr ($Month, -2); my ($OldPeriod) = $Period;
if ($CalMonth < 1 or $CalMonth > 12) { my ($CalMonth,$CalDay);
$Period .= '-01' if ($Type eq 'month');
$CalDay = substr ($Period, -2);
$CalMonth = substr ($Period, 5, 2);
if ($CalMonth < 1 or $CalMonth > 12 or $CalDay < 1 or $CalDay > 31) {
$CalMonth = '12' if $CalMonth > 12; $CalMonth = '12' if $CalMonth > 12;
$CalMonth = '01' if $CalMonth < 1; $CalMonth = '01' if $CalMonth < 1;
substr($Month, -2) = $CalMonth; substr($Period, 5, 2) = $CalMonth;
&Bleat(1,sprintf("'%s' is an invalid date (MM must be between '01' ". $CalDay = '01' if $CalDay < 1;
"and '12'), set to '%s'.",$OldMonth,$Month)); $CalDay = '31' if $CalDay > 31;
# FIXME! - month with less than 31 days ...
substr($Period, -2) = $CalDay;
&Bleat(1,sprintf("'%s' is an invalid date, set to '%s'.",
$OldPeriod,$Period));
} }
$Period = substr($Period,0,7) if ($Type eq 'month');
} }
return @Month; return @Period;
}; };
################################################################################ ################################################################################
sub SplitPeriod { sub SplitPeriod {
################################################################################ ################################################################################
### split a time period denoted by YYYY-MM:YYYY-MM into start and end month ### split a time period denoted by YYYY-MM(-DD):YYYY-MM(-DD) into start and end
### IN : $Period: time period ### IN : $Period: time period
### OUT: $StartMonth, $EndMonth ### $Type : may be 'month' or 'day'
my ($Period) = @_; ### OUT: $StartTime, $EndTime
my ($StartMonth, $EndMonth) = split /:/, $Period; my ($Period,$Type) = @_;
($StartMonth,$EndMonth) = CheckMonth($StartMonth,$EndMonth); my ($StartTime, $EndTime) = split /:/, $Period;
($StartTime,$EndTime) = CheckPeriod($Type,$StartTime,$EndTime);
# switch parameters as necessary # switch parameters as necessary
if ($EndMonth gt $StartMonth) { if ($EndTime gt $StartTime) {
return ($StartMonth, $EndMonth); return ($StartTime, $EndTime);
} else { } else {
return ($EndMonth, $StartMonth); return ($EndTime, $StartTime);
}; };
}; };