From 3634010808edf7421d7034e94b8f7928dfadd059 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Tue, 3 Sep 2013 21:47:25 +0200 Subject: [PATCH 1/7] Make GetTimePeriod() and others accept days. GetTimePeriod() was written to take a month ('YYYY-MM') and work with that. Make it accept not only a month, but also a day ('YYYY-MM-DD') by adding a $TYpe modifier. Rename LastMonth() to LastMonthDay() and rewrite it accordingly. Rename CheckMonth() to CheckPeriod() and rewrite it accordingly. As GetTimePeriod() defaults to 'month' if no modifier is passed this change should be backwards compatible. Signed-off-by: Thomas Hochstein --- lib/NewsStats.pm | 134 +++++++++++++++++++++++++++++------------------ 1 file changed, 82 insertions(+), 52 deletions(-) diff --git a/lib/NewsStats.pm b/lib/NewsStats.pm index a3676d0..f7d8897 100644 --- a/lib/NewsStats.pm +++ b/lib/NewsStats.pm @@ -265,33 +265,39 @@ sub ReadGroupList { ################################################################################ sub GetTimePeriod { ################################################################################ -### get a time period to act on from --month option; -### if empty, default to last month -### IN : $Month: may be empty, 'YYYY-MM', 'YYYY-MM:YYYY-MM' or 'all' +### get a time period to act on from --month / --day option; +### if empty, default to last month / day +### IN : $Period: may be empty, 'YYYY-MM(-DD)', 'YYYY-MM(-DD):YYYY-MM(-DD)' +### or 'all' +### $Type : may be 'month' or 'day' ### OUT: $Verbal,$SQL: verbal description and WHERE-clause ### of the chosen time period - my ($Month) = @_; + my ($Period,$Type) = @_; # define result variables my ($Verbal, $SQL); - # define a regular expression for a month - my $REMonth = '\d{4}-\d{2}'; + # check $Type + $Type = 'month' if (!$Type or ($Type ne 'month' and $Type ne 'day')); + # define a regular expressions for a month or day + my $REPeriod = '\d{4}-\d{2}'; + $REPeriod .= '-\d{2}' if ($Type eq 'day'); - # default to last month if option is not set - if(!$Month) { - $Month = &LastMonth; + # default to last month / day if option is not set + if(!$Period) { + $Period = &LastMonthDay($Type); } # check for valid input - if ($Month =~ /^$REMonth$/) { - # single month (YYYY-MM) - ($Month) = &CheckMonth($Month); - $Verbal = $Month; - $SQL = sprintf("month = '%s'",$Month); - } elsif ($Month =~ /^$REMonth:$REMonth$/) { - # time period (YYYY-MM:YYYY-MM) - $Verbal = sprintf('%s to %s',&SplitPeriod($Month)); - $SQL = sprintf("month BETWEEN '%s' AND '%s'",&SplitPeriod($Month)); - } elsif ($Month =~ /^all$/i) { + if ($Period =~ /^$REPeriod$/) { + # single month/day [YYYY-MM(-DD)] + ($Period) = &CheckPeriod($Type,$Period); + $Verbal = $Period; + $SQL = sprintf("%s = '%s'",$Type,$Period); + } elsif ($Period =~ /^$REPeriod:$REPeriod$/) { + # time period [YYYY-MM(-DD):YYYY-MM(-DD)] + $Verbal = sprintf('%s to %s',&SplitPeriod($Period,$Type)); + $SQL = sprintf("%s BETWEEN '%s' AND '%s'",$Type, + &SplitPeriod($Period,$Type)); + } elsif ($Period =~ /^all$/i) { # special case: ALL $Verbal = 'all time'; $SQL = ''; @@ -304,58 +310,82 @@ sub GetTimePeriod { }; ################################################################################ -sub LastMonth { +sub LastMonthDay { ################################################################################ -### get last month from todays date in YYYY-MM format -### OUT: last month as YYYY-MM - # get today's date - my (undef,undef,undef,undef,$Month,$Year,undef,undef,undef) = localtime(time); - # $Month is already defined from 0 to 11, so no need to decrease it by 1 +### get last month/day from todays date in YYYY-MM format +### IN : $Type : may be 'month' or 'day' +### OUT: last month/day as YYYY-MM(-DD) + my ($Type) = @_; + my ($Day,$Month,$Year); + if ($Type eq 'day') { + # get yesterdays's date + (undef,undef,undef,$Day,$Month,$Year,undef,undef,undef) = localtime(time-86400); + # $Month is defined from 0 to 11, so add 1 + $Month++; + } else { + # get today's date (month and year) + (undef,undef,undef,undef,$Month,$Year,undef,undef,undef) = localtime(time); + # $Month is already defined from 0 to 11, so no need to decrease it by 1 + if ($Month < 1) { + $Month = 12; + $Year--; + }; + } $Year += 1900; - if ($Month < 1) { - $Month = 12; - $Year--; - }; - # return last month - return sprintf('%4d-%02d',$Year,$Month); + # return last month / day + if ($Type eq 'day') { + return sprintf('%4d-%02d-%02d',$Year,$Month,$Day); + } else { + return sprintf('%4d-%02d',$Year,$Month); + } }; ################################################################################ -sub CheckMonth { +sub CheckPeriod { ################################################################################ -### check if input (in YYYY-MM form) is valid with MM between 01 and 12; +### check if input (in YYYY-MM(-DD) form) is a valid month / day; ### otherwise, fix it -### IN : @Month: array of month -### OUT: @Month: a valid month - my (@Month) = @_; - foreach my $Month (@Month) { - my ($OldMonth) = $Month; - my ($CalMonth) = substr ($Month, -2); - if ($CalMonth < 1 or $CalMonth > 12) { +### IN : $Type : may be 'month' or 'day' +### @Period: array of month/day +### OUT: @Period: a valid month/day + my ($Type,@Period) = @_; + foreach my $Period (@Period) { + my ($OldPeriod) = $Period; + my ($CalMonth,$CalDay); + $Period .= '-01' if ($Type eq 'month'); + $CalDay = substr ($Period, -2); + $CalMonth = substr ($Period, 5, 2); + if ($CalMonth < 1 or $CalMonth > 12 or $CalDay < 1 or $CalDay > 31) { $CalMonth = '12' if $CalMonth > 12; $CalMonth = '01' if $CalMonth < 1; - substr($Month, -2) = $CalMonth; - &Bleat(1,sprintf("'%s' is an invalid date (MM must be between '01' ". - "and '12'), set to '%s'.",$OldMonth,$Month)); + substr($Period, 5, 2) = $CalMonth; + $CalDay = '01' if $CalDay < 1; + $CalDay = '31' if $CalDay > 31; + # FIXME! - month with less than 31 days ... + substr($Period, -2) = $CalDay; + &Bleat(1,sprintf("'%s' is an invalid date, set to '%s'.", + $OldPeriod,$Period)); } + $Period = substr($Period,0,7) if ($Type eq 'month'); } - return @Month; + return @Period; }; ################################################################################ sub SplitPeriod { ################################################################################ -### split a time period denoted by YYYY-MM:YYYY-MM into start and end month +### split a time period denoted by YYYY-MM(-DD):YYYY-MM(-DD) into start and end ### IN : $Period: time period -### OUT: $StartMonth, $EndMonth - my ($Period) = @_; - my ($StartMonth, $EndMonth) = split /:/, $Period; - ($StartMonth,$EndMonth) = CheckMonth($StartMonth,$EndMonth); +### $Type : may be 'month' or 'day' +### OUT: $StartTime, $EndTime + my ($Period,$Type) = @_; + my ($StartTime, $EndTime) = split /:/, $Period; + ($StartTime,$EndTime) = CheckPeriod($Type,$StartTime,$EndTime); # switch parameters as necessary - if ($EndMonth gt $StartMonth) { - return ($StartMonth, $EndMonth); + if ($EndTime gt $StartTime) { + return ($StartTime, $EndTime); } else { - return ($EndMonth, $StartMonth); + return ($EndTime, $StartTime); }; }; From 6d72dad2c0b70499877bfa844d378fb0ecb58322 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 00:03:03 +0200 Subject: [PATCH 2/7] Create a database table with parsed raw data. Incoming data is written to DBTableRaw without much interpretation. To allow for more and better analysis that raw data should be parsed daily and copied to another database table with separate fields for most header lines. All other scripts could use that pre-parsed data. * Add database schema to install.pl * Add DBTableParse to newsstats.conf.sample and as mandatory to NewsStats.pm * Add parsedb.pl TODO: - Documentation is only rudimentary. - From:, Sender:, Reply-To: and Subject: are not yet parsed. - gatherstats.pl does not yet use DbTableParse. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 323 ++++++++++++++++++++++++++++++++++++++ etc/newsstats.conf.sample | 5 +- install/install.pl | 52 +++++- lib/NewsStats.pm | 2 +- 4 files changed, 378 insertions(+), 4 deletions(-) create mode 100755 bin/parsedb.pl diff --git a/bin/parsedb.pl b/bin/parsedb.pl new file mode 100755 index 0000000..10a1a5d --- /dev/null +++ b/bin/parsedb.pl @@ -0,0 +1,323 @@ +#! /usr/bin/perl +# +# parsedb.pl +# +# This script will parse a database with raw header information +# from a INN feed to a structured database. +# +# It is part of the NewsStats package. +# +# Copyright (c) 2013 Thomas Hochstein +# +# It can be redistributed and/or modified under the same terms under +# which Perl itself is published. + +BEGIN { + our $VERSION = "0.01"; + use File::Basename; + # we're in .../bin, so our module is in ../lib + push(@INC, dirname($0).'/../lib'); +} +use strict; +use warnings; + +use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper); + +use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); + +################################# Definitions ################################## + +# define header names with separate database fields +my %DBFields = ('date' => 'date', + 'references' => 'refs', + 'followup-to' => 'fupto', + 'from' => 'from_', + 'sender' => 'sender', + 'reply-to' => 'replyto', + 'subject' => 'subject', + 'organization' => 'organization', + 'lines' => 'linecount', + 'approved' => 'approved', + 'supersedes' => 'supersedes', + 'expires' => 'expires', + 'user-agent' => 'useragent', + 'x-newsreader' => 'xnewsreader', + 'x-mailer' => 'xmailer', + 'x-no-archive' => 'xnoarchive', + 'content-type' => 'contenttype', + 'content-transfer-encoding' => 'contentencoding', + 'cancel-lock' => 'cancellock', + 'injection-info' => 'injectioninfo', + 'x-trace' => 'xtrace', + 'nntp-posting-host' => 'postinghost'); + +# define field list for database +my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed + from_name from_address sender sender_parsed sender_name + sender_address replyto replyto_parsed replyto_name + replyto_address subject subject_parsed organization linecount + approved supersedes expires useragent xnewsreader xmailer + xnoarchive contenttype contentencoding cancellock injectioninfo + xtrace postinghost headers disregard/; + +################################# Main program ################################# + +### read commandline options +my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile); +GetOptions ('d|day=s' => \$OptDay, + 'debug!' => \$OptDebug, + 'parsedb=s' => \$OptParseDB, + 'rawdb=s' => \$OptRawDB, + 't|test!' => \$OptTest, + 'conffile=s' => \$OptConfFile, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; + +### read configuration +my %Conf = %{ReadConfig($OptConfFile)}; + +### override configuration via commandline options +my %ConfOverride; +$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB; +&OverrideConfig(\%Conf,\%ConfOverride); + +### get time period +### and set $Period for output and expression for SQL 'WHERE' clause +my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day'); +# bail out if --month is invalid or "all" +&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ". + "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time'); + +### init database +my $DBHandle = InitDB(\%Conf,1); + +### get & write data +&Bleat(1,'Test mode. Database is not updated.') if $OptTest; + +# create $SQLWhereClause +my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard'); + +# delete old data for current period +if (!$OptTest) { + print "----------- Deleting old data ... -----------\n" if $OptDebug; + my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s", + $Conf{'DBDatabase'},$Conf{'DBTableParse'}, + $SQLWhereClause)) + or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); +}; + +# read from DBTableRaw +print "-------------- Reading data ... -------------\n" if $OptDebug; +my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ". + "newsgroups, headers, disregard ". + "FROM %s.%s %s", $Conf{'DBDatabase'}, + $Conf{'DBTableRaw'}, $SQLWhereClause)); +$DBQuery->execute() + or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + +# set output and database connection to UTF-8 +# as we're going to write decoded header contents containing UTF-8 chars +binmode(STDOUT, ":utf8"); +$DBHandle->do("SET NAMES 'utf8'"); + +# parse data in a loop and write it out +print "-------------- Parsing data ... -------------\n" if $OptDebug; +while (my $HeadersR = $DBQuery->fetchrow_hashref) { + my %Headers = %{$HeadersR}; + + # parse $Headers{'headers'} ('headers' from DBTableRaw) + # merge continuation lines + # from Perl Cookbook, 1st German ed. 1999, pg. 91 + $Headers{'headers'} =~ s/\n\s+/ /g; + # split headers in single lines + my $OtherHeaders; + for (split(/\n/,$Headers{'headers'})) { + # split header lines in header name and header content + my ($key,$value) = split(/:/,$_,2); + $key =~ s/\s*//; + $value =~ s/^\s*(.+)\s*$/$1/; + # save each header, separate database fields in %Headers, + # the rest in $OtherHeaders (but not Message-ID, Path, Peer + # and Newsgroups as those do already exist) + if (defined($DBFields{lc($key)})) { + $Headers{$DBFields{lc($key)}} = $value; + } else { + $OtherHeaders .= sprintf("%s: %s\n",$key,$value) + if lc($key) !~ /^(message-id|path|peer|newsgroups)$/; + } + } + # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders + chomp($OtherHeaders); + $Headers{'headers'} = $OtherHeaders; + + # order output for database entry: fill @SQLBindVars + print "-------------- Next entry:\n" if $OptDebug; + my @SQLBindVars; + foreach (@DBFields) { + if (defined($Headers{$_}) and $Headers{$_} ne '') { + push (@SQLBindVars,$Headers{$_}); + printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug; + } else { + push (@SQLBindVars,undef); + } + } + + # write data to DBTableParse + if (!$OptTest) { + print "-------------- Writing data ... -------------\n" if $OptDebug; + my $DBWrite = + $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)", + $Conf{'DBDatabase'}, + $Conf{'DBTableParse'}, + # get field names from @DBFields + join(', ',@DBFields), + # create a list of '?' for each DBField + join(', ', + split(/ /,'? ' x scalar(@DBFields))) + )); + $DBWrite->execute(@SQLBindVars) + or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); + $DBWrite->finish; + } +}; +$DBQuery->finish; + +### close handles +$DBHandle->disconnect; + +print "------------------- DONE! -------------------\n" if $OptDebug; +__END__ + +################################ Documentation ################################# + +=head1 NAME + +parsedb - parse raw data and save it to a database + +=head1 SYNOPSIS + +B [B<-Vht>] [B<--day> I | I] [B<--rawdb> I] [B<--parsedb> I] [B<--conffile> I] [B<--debug>] + +=head1 REQUIREMENTS + +See L. + +=head1 DESCRIPTION + +... + +=head2 Configuration + +... + +=head1 OPTIONS + +=over 3 + +=item B<-V>, B<--version> + +Print out version and copyright information and exit. + +=item B<-h>, B<--help> + +Print this man page and exit. + +=item B<--debug> + +Output (rather much) debugging information to STDOUT while processing. + +=item B<-t>, B<--test> + +Do not write results to database. You should use B<--debug> in +conjunction with B<--test> ... everything else seems a bit pointless. + +=item B<-d>, B<--day> I + +Set processing period to a single day in YYYY-MM-DD format or to a time +period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated +by a colon). + +Defaults to yesterday. + +=item B<--rawdb> I (raw data table) + +Override I from F. + +=item B<--parsedb> I
(parsed data table) + +Override I from F. + +=item B<--conffile> I + +Load configuration from I instead of F. + +=back + +=head1 INSTALLATION + +See L. + +=head1 EXAMPLES + +... + +=head1 FILES + +=over 4 + +=item F + +The script itself. + +=item F + +Library functions for the NewsStats package. + +=item F + +Runtime configuration file. + +=back + +=head1 BUGS + +Please report any bugs or feature requests to the author or use the +bug tracker at L! + +=head1 SEE ALSO + +=over 2 + +=item - + +L + +=item - + +L + +=back + +This script is part of the B package. + +=head1 AUTHOR + +Thomas Hochstein + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2013 Thomas Hochstein + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +=cut diff --git a/etc/newsstats.conf.sample b/etc/newsstats.conf.sample index 3133ed2..19a9d67 100644 --- a/etc/newsstats.conf.sample +++ b/etc/newsstats.conf.sample @@ -4,13 +4,14 @@ # DBDriver = mysql DBHost = localhost -DBUser = -DBPw = +DBUser = +DBPw = DBDatabase = newsstats # # tables # DBTableRaw = raw_de +DBTableParse = parsed_de DBTableGrps = groups_de #DBTableClnts = #DBTableHosts = diff --git a/install/install.pl b/install/install.pl index 12cc8ec..2f53a25 100755 --- a/install/install.pl +++ b/install/install.pl @@ -53,7 +53,7 @@ my $DBCreate = < < < < < < 'equal'); my %Conf = %{$ConfR}; From 9630376c31454201c0031906bec754580cccaf96 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 00:04:17 +0200 Subject: [PATCH 3/7] Add decoding and parsing of From: etc. Decode From:, Sender:, Reply-To:, Subject:; parse From:, Sender:, Reply-To:. Add Mail::Address to prerequisites. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 40 ++++++++++++++++++++++++++++++++++++++++ doc/INSTALL | 2 ++ doc/README | 1 + 3 files changed, 43 insertions(+) diff --git a/bin/parsedb.pl b/bin/parsedb.pl index 10a1a5d..fc29747 100755 --- a/bin/parsedb.pl +++ b/bin/parsedb.pl @@ -27,6 +27,9 @@ use DBI; use Getopt::Long qw(GetOptions); Getopt::Long::config ('bundling'); +use Encode qw/decode/; +use Mail::Address; + ################################# Definitions ################################## # define header names with separate database fields @@ -157,6 +160,43 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) { chomp($OtherHeaders); $Headers{'headers'} = $OtherHeaders; + foreach ('from_','sender', 'replyto', 'subject') { + if ($Headers{$_}) { + my $HeaderName = $_; + $HeaderName =~ s/_$//; + # decode From: / Sender: / Reply-To: / Subject: + if ($Headers{$_} =~ /\?(B|Q)\?/) { + $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_}); + } + # extract name(s) and mail(s) from From: / Sender: / Reply-To: + # in parsed form, if available + if ($_ ne 'subject') { + my @Address; + # start parser on header or parsed header + # @Address will have an array of Mail::Address objects, one for + # each name/mail (you can have more than one person in From:!) + if (defined($Headers{$HeaderName.'_parsed'})) { + @Address = Mail::Address->parse($Headers{$HeaderName.'_parsed'}); + } else { + @Address = Mail::Address->parse($Headers{$_}); + } + # split each Mail::Address object + foreach (@Address) { + # take address part + $Headers{$HeaderName.'_address'} = $_->address(); + # take name part form "phrase", if there is one: + # From: My Name (Comment) + # otherwise, take it from "comment": + # From: addr@ess (Comment) + $Headers{$HeaderName.'_name'} = $_->comment() + unless $Headers{$HeaderName.'_name'}= $_->phrase; + $Headers{$HeaderName.'_name'} =~ s/^\((.+)\)$/$1/; + # FIMXE - handle more than one Mail::Address object! + } + } + } + } + # order output for database entry: fill @SQLBindVars print "-------------- Next entry:\n" if $OptDebug; my @SQLBindVars; diff --git a/doc/INSTALL b/doc/INSTALL index 1c154af..a5ca01d 100644 --- a/doc/INSTALL +++ b/doc/INSTALL @@ -13,6 +13,8 @@ INSTALLATION INSTRUCTIONS 1) Install the scripts + * Get INN, mysql, Perl, and the necessary modules installed (see README). + * Download the current version of NewsStats from . diff --git a/doc/README b/doc/README index e809cea..0ccfad5 100644 --- a/doc/README +++ b/doc/README @@ -47,6 +47,7 @@ Prerequisites - Config::Auto - Date::Format - DBI + - Mail::Address * mysql 5.0.x From ca8ac4d50f46a3ab43be87448f903997a42613e3 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 10:45:18 +0200 Subject: [PATCH 4/7] Let gatherstats read its data from DBTableParse. Switch gatherstat.pl over to the parsed database. Signed-off-by: Thomas Hochstein --- bin/gatherstats.pl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/bin/gatherstats.pl b/bin/gatherstats.pl index a5a486a..7f42448 100755 --- a/bin/gatherstats.pl +++ b/bin/gatherstats.pl @@ -3,7 +3,7 @@ # gatherstats.pl # # This script will gather statistical information from a database -# containing headers and other information from a INN feed. +# containing headers and other information from an INN feed. # # It is part of the NewsStats package. # @@ -38,7 +38,7 @@ my %LegalStats; ### read commandline options my ($OptCheckgroupsFile,$OptClientsDB,$OptDebug,$OptGroupsDB,$OptTLH, - $OptHostsDB,$OptMonth,$OptRawDB,$OptStatsType,$OptTest,$OptConfFile); + $OptHostsDB,$OptMonth,$OptParseDB,$OptStatsType,$OptTest,$OptConfFile); GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, 'clientsdb=s' => \$OptClientsDB, 'd|debug!' => \$OptDebug, @@ -46,7 +46,7 @@ GetOptions ('c|checkgroups=s' => \$OptCheckgroupsFile, 'hierarchy=s' => \$OptTLH, 'hostsdb=s' => \$OptHostsDB, 'm|month=s' => \$OptMonth, - 'rawdb=s' => \$OptRawDB, + 'parsedb=s' => \$OptParseDB, 's|stats=s' => \$OptStatsType, 't|test!' => \$OptTest, 'conffile=s' => \$OptConfFile, @@ -58,7 +58,7 @@ my %Conf = %{ReadConfig($OptConfFile)}; ### override configuration via commandline options my %ConfOverride; -$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB; $ConfOverride{'DBTableGrps'} = $OptGroupsDB if $OptGroupsDB; $ConfOverride{'DBTableClnts'} = $OptClientsDB if $OptClientsDB; $ConfOverride{'DBTableHosts'} = $OptHostsDB if $OptHostsDB; @@ -124,15 +124,15 @@ foreach my $Month (&ListMonth($Period)) { ### ---------------------------------------------- ### get groups data (number of postings per group) - # get groups data from raw table for given month + # get groups data from parsed table for given month my $DBQuery = $DBHandle->prepare(sprintf("SELECT newsgroups FROM %s.%s ". "WHERE day LIKE ? AND NOT disregard", $Conf{'DBDatabase'}, - $Conf{'DBTableRaw'})); + $Conf{'DBTableParse'})); $DBQuery->execute($Month.'-%') or &Bleat(2,sprintf("Can't get groups data for %s from %s.%s: ". "$DBI::errstr\n",$Month, - $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); # count postings per group my %Postings; @@ -206,11 +206,11 @@ __END__ =head1 NAME -gatherstats - process statistical data from a raw source +gatherstats - process statistical data from a parsed source =head1 SYNOPSIS -B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--rawdb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] [--conffile I] +B [B<-Vhdt>] [B<-m> I | I] [B<-s> I] [B<-c> I]] [B<--hierarchy> I] [B<--parsedb> I] [B<-groupsdb> I] [B<--clientsdb> I] [B<--hostsdb> I] [--conffile I] =head1 REQUIREMENTS @@ -219,7 +219,7 @@ See L. =head1 DESCRIPTION This script will extract and process statistical information from a -database table which is fed from F for a given time period +database table which is filled from F for a given time period and write its results to (an)other database table(s). Entries marked with I<'disregard'> in the database will be ignored; currently, you have to set this flag yourself, using your database management tools. @@ -266,7 +266,7 @@ which should be present in the same directory via Config::Auto. See L for an overview of possible configuration options. You can override configuration options via the B<--hierarchy>, -B<--rawdb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, +B<--parsedb>, B<--groupsdb>, B<--clientsdb> and B<--hostsdb> options, respectively. =head1 OPTIONS @@ -327,9 +327,9 @@ will be added with a count of 0 (and logged to STDERR). Override I from F. -=item B<--rawdb> I
(raw data table) +=item B<--parsedb> I
(parsed data table) -Override I from F. +Override I from F. =item B<--groupsdb> I
(postings per group table) From aef5467bfecfd4aeb83146212218c88837466de1 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 11:11:27 +0200 Subject: [PATCH 5/7] Handle more than one entitiy in From: etc. From:, Sender: etc. may contain more than one entity in a comma separated list, i.e. a From: line like "From: Me , You " is perfectly valid. Handle multiple entities when splitting those headers and save all names and all adresses as (new) comma separated lists in the corresponding database fields. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/bin/parsedb.pl b/bin/parsedb.pl index fc29747..1a0fa39 100755 --- a/bin/parsedb.pl +++ b/bin/parsedb.pl @@ -180,19 +180,24 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) { } else { @Address = Mail::Address->parse($Headers{$_}); } - # split each Mail::Address object + # split each Mail::Address object to @Names and @Adresses + my (@Names,@Adresses); foreach (@Address) { - # take address part - $Headers{$HeaderName.'_address'} = $_->address(); + # take address part in @Addresses + push (@Adresses, $_->address()); # take name part form "phrase", if there is one: # From: My Name (Comment) # otherwise, take it from "comment": # From: addr@ess (Comment) - $Headers{$HeaderName.'_name'} = $_->comment() - unless $Headers{$HeaderName.'_name'}= $_->phrase; - $Headers{$HeaderName.'_name'} =~ s/^\((.+)\)$/$1/; - # FIMXE - handle more than one Mail::Address object! + # and push it in @Names + my ($Name); + $Name = $_->comment() unless $Name = $_->phrase; + $Name =~ s/^\((.+)\)$/$1/; + push (@Names, $Name); } + # put all @Adresses and all @Names in %Headers as comma separated lists + $Headers{$HeaderName.'_address'} = join(', ',@Adresses); + $Headers{$HeaderName.'_name'} = join(', ',@Names); } } } From 13e006104bf83a3bdb3be16f44908fb1c2670213 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 11:53:28 +0200 Subject: [PATCH 6/7] Add documentation to parsedb.pl. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/bin/parsedb.pl b/bin/parsedb.pl index 1a0fa39..27a9229 100755 --- a/bin/parsedb.pl +++ b/bin/parsedb.pl @@ -258,11 +258,32 @@ See L. =head1 DESCRIPTION -... +This script will parse raw, unstructured headers from a database table which is +fed from F for a given time period and write its results to +nother database table with separate fields (columns) for most (or even all) +relevant headers. + +I, I, I and I will be parsed from MIME +encoded words to UTF-8 as needed while the unparsed copy is kept. From that +parsed copy, I, I and I will also be split into +separate name(s) and address(es) fields while the un-splitted copy is kept, +too. + +B should be run nightly from cron for yesterdays data so all +other scripts get current information. The time period to act on defaults to +yesterday, accordingly; you can assign another time period or a single day via +the B<--day> option (see below). =head2 Configuration -... +B will read its configuration from F +should be present in etc/ via Config::Auto or from a configuration file +submitted by the B<--conffile> option. + +See L for an overview of possible configuration options. + +You can override configuration options via the B<--rawdb> and +B<--parsedb> options, respectively. =head1 OPTIONS @@ -313,7 +334,13 @@ See L. =head1 EXAMPLES -... +An example crontab entry: + + 0 1 * * * /path/to/bin/parsedb.pl + +Do a dry run for yesterday's data, showing results of processing: + + parsedb --debug --test | less =head1 FILES From 48c8d4bb8e0585ce26e63d1332e6efb20babdf6f Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 13:04:14 +0200 Subject: [PATCH 7/7] Add some input validation. Our raw data doesn't have the qualitiy one should expect. There are empty header lines only containing whitespace (leading to wrong joining of apparent continuation lines); header lines that contain garbage without ':' so split is failing; empty 'newsgroups' fields; unsupported encondings in MIME encoded words ... and so on. Add fixes for the aforementioned problems. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/bin/parsedb.pl b/bin/parsedb.pl index 27a9229..b4c2056 100755 --- a/bin/parsedb.pl +++ b/bin/parsedb.pl @@ -130,12 +130,19 @@ $DBQuery->execute() binmode(STDOUT, ":utf8"); $DBHandle->do("SET NAMES 'utf8'"); +# create a list of supported encondings +my %LegalEncodings; +foreach (Encode->encodings()) { + $LegalEncodings{$_} = 1; +} # parse data in a loop and write it out print "-------------- Parsing data ... -------------\n" if $OptDebug; while (my $HeadersR = $DBQuery->fetchrow_hashref) { my %Headers = %{$HeadersR}; # parse $Headers{'headers'} ('headers' from DBTableRaw) + # remove empty lines (that should not even exist in a header!) + $Headers{'headers'} =~ s/\n\s*\n/\n/g; # merge continuation lines # from Perl Cookbook, 1st German ed. 1999, pg. 91 $Headers{'headers'} =~ s/\n\s+/ /g; @@ -143,9 +150,29 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) { my $OtherHeaders; for (split(/\n/,$Headers{'headers'})) { # split header lines in header name and header content - my ($key,$value) = split(/:/,$_,2); - $key =~ s/\s*//; - $value =~ s/^\s*(.+)\s*$/$1/; + my ($key,$value); + if ($_ =~ /:/) { + ($key,$value) = split(/:/,$_,2); + $key =~ s/\s*//; + $value =~ s/^\s*(.+)\s*$/$1/; + } else { + &Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s", + $Conf{'DBDatabase'}, $Conf{'DBTableRaw'}, + $Headers{'id'},$_)); + next; + } + # check for empty (mandatory) fields from DBTableRaw + # and set them from $Headers{'headers', if necessary + if (lc($key) =~ /^(message-id|path|newsgroups)$/) { + my $HeaderName = lc($key); + $HeaderName = 'mid' if ($HeaderName eq 'message-id'); + if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') { + $Headers{$HeaderName} = $value; + &Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.", + $HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'}, + $Headers{'id'})); + } + } # save each header, separate database fields in %Headers, # the rest in $OtherHeaders (but not Message-ID, Path, Peer # and Newsgroups as those do already exist) @@ -166,7 +193,10 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) { $HeaderName =~ s/_$//; # decode From: / Sender: / Reply-To: / Subject: if ($Headers{$_} =~ /\?(B|Q)\?/) { - $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_}); + # check for legal encoding and decode + (my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/; + $Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_}) + if (exists($LegalEncodings{$Encoding})); } # extract name(s) and mail(s) from From: / Sender: / Reply-To: # in parsed form, if available