From 6d72dad2c0b70499877bfa844d378fb0ecb58322 Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Wed, 4 Sep 2013 00:03:03 +0200 Subject: [PATCH] Create a database table with parsed raw data. Incoming data is written to DBTableRaw without much interpretation. To allow for more and better analysis that raw data should be parsed daily and copied to another database table with separate fields for most header lines. All other scripts could use that pre-parsed data. * Add database schema to install.pl * Add DBTableParse to newsstats.conf.sample and as mandatory to NewsStats.pm * Add parsedb.pl TODO: - Documentation is only rudimentary. - From:, Sender:, Reply-To: and Subject: are not yet parsed. - gatherstats.pl does not yet use DbTableParse. Signed-off-by: Thomas Hochstein --- bin/parsedb.pl | 323 ++++++++++++++++++++++++++++++++++++++ etc/newsstats.conf.sample | 5 +- install/install.pl | 52 +++++- lib/NewsStats.pm | 2 +- 4 files changed, 378 insertions(+), 4 deletions(-) create mode 100755 bin/parsedb.pl diff --git a/bin/parsedb.pl b/bin/parsedb.pl new file mode 100755 index 0000000..10a1a5d --- /dev/null +++ b/bin/parsedb.pl @@ -0,0 +1,323 @@ +#! /usr/bin/perl +# +# parsedb.pl +# +# This script will parse a database with raw header information +# from a INN feed to a structured database. +# +# It is part of the NewsStats package. +# +# Copyright (c) 2013 Thomas Hochstein +# +# It can be redistributed and/or modified under the same terms under +# which Perl itself is published. + +BEGIN { + our $VERSION = "0.01"; + use File::Basename; + # we're in .../bin, so our module is in ../lib + push(@INC, dirname($0).'/../lib'); +} +use strict; +use warnings; + +use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper); + +use DBI; +use Getopt::Long qw(GetOptions); +Getopt::Long::config ('bundling'); + +################################# Definitions ################################## + +# define header names with separate database fields +my %DBFields = ('date' => 'date', + 'references' => 'refs', + 'followup-to' => 'fupto', + 'from' => 'from_', + 'sender' => 'sender', + 'reply-to' => 'replyto', + 'subject' => 'subject', + 'organization' => 'organization', + 'lines' => 'linecount', + 'approved' => 'approved', + 'supersedes' => 'supersedes', + 'expires' => 'expires', + 'user-agent' => 'useragent', + 'x-newsreader' => 'xnewsreader', + 'x-mailer' => 'xmailer', + 'x-no-archive' => 'xnoarchive', + 'content-type' => 'contenttype', + 'content-transfer-encoding' => 'contentencoding', + 'cancel-lock' => 'cancellock', + 'injection-info' => 'injectioninfo', + 'x-trace' => 'xtrace', + 'nntp-posting-host' => 'postinghost'); + +# define field list for database +my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed + from_name from_address sender sender_parsed sender_name + sender_address replyto replyto_parsed replyto_name + replyto_address subject subject_parsed organization linecount + approved supersedes expires useragent xnewsreader xmailer + xnoarchive contenttype contentencoding cancellock injectioninfo + xtrace postinghost headers disregard/; + +################################# Main program ################################# + +### read commandline options +my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile); +GetOptions ('d|day=s' => \$OptDay, + 'debug!' => \$OptDebug, + 'parsedb=s' => \$OptParseDB, + 'rawdb=s' => \$OptRawDB, + 't|test!' => \$OptTest, + 'conffile=s' => \$OptConfFile, + 'h|help' => \&ShowPOD, + 'V|version' => \&ShowVersion) or exit 1; + +### read configuration +my %Conf = %{ReadConfig($OptConfFile)}; + +### override configuration via commandline options +my %ConfOverride; +$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB; +$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB; +&OverrideConfig(\%Conf,\%ConfOverride); + +### get time period +### and set $Period for output and expression for SQL 'WHERE' clause +my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day'); +# bail out if --month is invalid or "all" +&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ". + "'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time'); + +### init database +my $DBHandle = InitDB(\%Conf,1); + +### get & write data +&Bleat(1,'Test mode. Database is not updated.') if $OptTest; + +# create $SQLWhereClause +my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard'); + +# delete old data for current period +if (!$OptTest) { + print "----------- Deleting old data ... -----------\n" if $OptDebug; + my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s", + $Conf{'DBDatabase'},$Conf{'DBTableParse'}, + $SQLWhereClause)) + or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); +}; + +# read from DBTableRaw +print "-------------- Reading data ... -------------\n" if $OptDebug; +my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ". + "newsgroups, headers, disregard ". + "FROM %s.%s %s", $Conf{'DBDatabase'}, + $Conf{'DBTableRaw'}, $SQLWhereClause)); +$DBQuery->execute() + or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableRaw'})); + +# set output and database connection to UTF-8 +# as we're going to write decoded header contents containing UTF-8 chars +binmode(STDOUT, ":utf8"); +$DBHandle->do("SET NAMES 'utf8'"); + +# parse data in a loop and write it out +print "-------------- Parsing data ... -------------\n" if $OptDebug; +while (my $HeadersR = $DBQuery->fetchrow_hashref) { + my %Headers = %{$HeadersR}; + + # parse $Headers{'headers'} ('headers' from DBTableRaw) + # merge continuation lines + # from Perl Cookbook, 1st German ed. 1999, pg. 91 + $Headers{'headers'} =~ s/\n\s+/ /g; + # split headers in single lines + my $OtherHeaders; + for (split(/\n/,$Headers{'headers'})) { + # split header lines in header name and header content + my ($key,$value) = split(/:/,$_,2); + $key =~ s/\s*//; + $value =~ s/^\s*(.+)\s*$/$1/; + # save each header, separate database fields in %Headers, + # the rest in $OtherHeaders (but not Message-ID, Path, Peer + # and Newsgroups as those do already exist) + if (defined($DBFields{lc($key)})) { + $Headers{$DBFields{lc($key)}} = $value; + } else { + $OtherHeaders .= sprintf("%s: %s\n",$key,$value) + if lc($key) !~ /^(message-id|path|peer|newsgroups)$/; + } + } + # replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders + chomp($OtherHeaders); + $Headers{'headers'} = $OtherHeaders; + + # order output for database entry: fill @SQLBindVars + print "-------------- Next entry:\n" if $OptDebug; + my @SQLBindVars; + foreach (@DBFields) { + if (defined($Headers{$_}) and $Headers{$_} ne '') { + push (@SQLBindVars,$Headers{$_}); + printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug; + } else { + push (@SQLBindVars,undef); + } + } + + # write data to DBTableParse + if (!$OptTest) { + print "-------------- Writing data ... -------------\n" if $OptDebug; + my $DBWrite = + $DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)", + $Conf{'DBDatabase'}, + $Conf{'DBTableParse'}, + # get field names from @DBFields + join(', ',@DBFields), + # create a list of '?' for each DBField + join(', ', + split(/ /,'? ' x scalar(@DBFields))) + )); + $DBWrite->execute(@SQLBindVars) + or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ". + "$DBI::errstr\n",$Period, + $Conf{'DBDatabase'},$Conf{'DBTableParse'})); + $DBWrite->finish; + } +}; +$DBQuery->finish; + +### close handles +$DBHandle->disconnect; + +print "------------------- DONE! -------------------\n" if $OptDebug; +__END__ + +################################ Documentation ################################# + +=head1 NAME + +parsedb - parse raw data and save it to a database + +=head1 SYNOPSIS + +B [B<-Vht>] [B<--day> I | I] [B<--rawdb> I] [B<--parsedb> I] [B<--conffile> I] [B<--debug>] + +=head1 REQUIREMENTS + +See L. + +=head1 DESCRIPTION + +... + +=head2 Configuration + +... + +=head1 OPTIONS + +=over 3 + +=item B<-V>, B<--version> + +Print out version and copyright information and exit. + +=item B<-h>, B<--help> + +Print this man page and exit. + +=item B<--debug> + +Output (rather much) debugging information to STDOUT while processing. + +=item B<-t>, B<--test> + +Do not write results to database. You should use B<--debug> in +conjunction with B<--test> ... everything else seems a bit pointless. + +=item B<-d>, B<--day> I + +Set processing period to a single day in YYYY-MM-DD format or to a time +period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated +by a colon). + +Defaults to yesterday. + +=item B<--rawdb> I (raw data table) + +Override I from F. + +=item B<--parsedb> I
(parsed data table) + +Override I from F. + +=item B<--conffile> I + +Load configuration from I instead of F. + +=back + +=head1 INSTALLATION + +See L. + +=head1 EXAMPLES + +... + +=head1 FILES + +=over 4 + +=item F + +The script itself. + +=item F + +Library functions for the NewsStats package. + +=item F + +Runtime configuration file. + +=back + +=head1 BUGS + +Please report any bugs or feature requests to the author or use the +bug tracker at L! + +=head1 SEE ALSO + +=over 2 + +=item - + +L + +=item - + +L + +=back + +This script is part of the B package. + +=head1 AUTHOR + +Thomas Hochstein + +=head1 COPYRIGHT AND LICENSE + +Copyright (c) 2013 Thomas Hochstein + +This program is free software; you may redistribute it and/or modify it +under the same terms as Perl itself. + +=cut diff --git a/etc/newsstats.conf.sample b/etc/newsstats.conf.sample index 3133ed2..19a9d67 100644 --- a/etc/newsstats.conf.sample +++ b/etc/newsstats.conf.sample @@ -4,13 +4,14 @@ # DBDriver = mysql DBHost = localhost -DBUser = -DBPw = +DBUser = +DBPw = DBDatabase = newsstats # # tables # DBTableRaw = raw_de +DBTableParse = parsed_de DBTableGrps = groups_de #DBTableClnts = #DBTableHosts = diff --git a/install/install.pl b/install/install.pl index 12cc8ec..2f53a25 100755 --- a/install/install.pl +++ b/install/install.pl @@ -53,7 +53,7 @@ my $DBCreate = < < < < < < 'equal'); my %Conf = %{$ConfR};