Create a database table with parsed raw data.

Incoming data is written to DBTableRaw without
much interpretation. To allow for more and
better analysis that raw data should be parsed
daily and copied to another database table
with separate fields for most header lines.
All other scripts could use that pre-parsed
data.

* Add database schema to install.pl
* Add DBTableParse to newsstats.conf.sample
  and as mandatory to NewsStats.pm
* Add parsedb.pl

TODO:
- Documentation is only rudimentary.
- From:, Sender:, Reply-To: and Subject:
  are not yet parsed.
- gatherstats.pl does not yet use DbTableParse.

Signed-off-by: Thomas Hochstein <thh@inter.net>
This commit is contained in:
Thomas Hochstein 2013-09-04 00:03:03 +02:00
parent 3634010808
commit 6d72dad2c0
4 changed files with 378 additions and 4 deletions

323
bin/parsedb.pl Executable file
View file

@ -0,0 +1,323 @@
#! /usr/bin/perl
#
# parsedb.pl
#
# This script will parse a database with raw header information
# from a INN feed to a structured database.
#
# It is part of the NewsStats package.
#
# Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
#
# It can be redistributed and/or modified under the same terms under
# which Perl itself is published.
BEGIN {
our $VERSION = "0.01";
use File::Basename;
# we're in .../bin, so our module is in ../lib
push(@INC, dirname($0).'/../lib');
}
use strict;
use warnings;
use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
use DBI;
use Getopt::Long qw(GetOptions);
Getopt::Long::config ('bundling');
################################# Definitions ##################################
# define header names with separate database fields
my %DBFields = ('date' => 'date',
'references' => 'refs',
'followup-to' => 'fupto',
'from' => 'from_',
'sender' => 'sender',
'reply-to' => 'replyto',
'subject' => 'subject',
'organization' => 'organization',
'lines' => 'linecount',
'approved' => 'approved',
'supersedes' => 'supersedes',
'expires' => 'expires',
'user-agent' => 'useragent',
'x-newsreader' => 'xnewsreader',
'x-mailer' => 'xmailer',
'x-no-archive' => 'xnoarchive',
'content-type' => 'contenttype',
'content-transfer-encoding' => 'contentencoding',
'cancel-lock' => 'cancellock',
'injection-info' => 'injectioninfo',
'x-trace' => 'xtrace',
'nntp-posting-host' => 'postinghost');
# define field list for database
my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
from_name from_address sender sender_parsed sender_name
sender_address replyto replyto_parsed replyto_name
replyto_address subject subject_parsed organization linecount
approved supersedes expires useragent xnewsreader xmailer
xnoarchive contenttype contentencoding cancellock injectioninfo
xtrace postinghost headers disregard/;
################################# Main program #################################
### read commandline options
my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
GetOptions ('d|day=s' => \$OptDay,
'debug!' => \$OptDebug,
'parsedb=s' => \$OptParseDB,
'rawdb=s' => \$OptRawDB,
't|test!' => \$OptTest,
'conffile=s' => \$OptConfFile,
'h|help' => \&ShowPOD,
'V|version' => \&ShowVersion) or exit 1;
### read configuration
my %Conf = %{ReadConfig($OptConfFile)};
### override configuration via commandline options
my %ConfOverride;
$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
&OverrideConfig(\%Conf,\%ConfOverride);
### get time period
### and set $Period for output and expression for SQL 'WHERE' clause
my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
# bail out if --month is invalid or "all"
&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
"'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
### init database
my $DBHandle = InitDB(\%Conf,1);
### get & write data
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
# create $SQLWhereClause
my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
# delete old data for current period
if (!$OptTest) {
print "----------- Deleting old data ... -----------\n" if $OptDebug;
my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
$Conf{'DBDatabase'},$Conf{'DBTableParse'},
$SQLWhereClause))
or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
};
# read from DBTableRaw
print "-------------- Reading data ... -------------\n" if $OptDebug;
my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
"newsgroups, headers, disregard ".
"FROM %s.%s %s", $Conf{'DBDatabase'},
$Conf{'DBTableRaw'}, $SQLWhereClause));
$DBQuery->execute()
or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
# set output and database connection to UTF-8
# as we're going to write decoded header contents containing UTF-8 chars
binmode(STDOUT, ":utf8");
$DBHandle->do("SET NAMES 'utf8'");
# parse data in a loop and write it out
print "-------------- Parsing data ... -------------\n" if $OptDebug;
while (my $HeadersR = $DBQuery->fetchrow_hashref) {
my %Headers = %{$HeadersR};
# parse $Headers{'headers'} ('headers' from DBTableRaw)
# merge continuation lines
# from Perl Cookbook, 1st German ed. 1999, pg. 91
$Headers{'headers'} =~ s/\n\s+/ /g;
# split headers in single lines
my $OtherHeaders;
for (split(/\n/,$Headers{'headers'})) {
# split header lines in header name and header content
my ($key,$value) = split(/:/,$_,2);
$key =~ s/\s*//;
$value =~ s/^\s*(.+)\s*$/$1/;
# save each header, separate database fields in %Headers,
# the rest in $OtherHeaders (but not Message-ID, Path, Peer
# and Newsgroups as those do already exist)
if (defined($DBFields{lc($key)})) {
$Headers{$DBFields{lc($key)}} = $value;
} else {
$OtherHeaders .= sprintf("%s: %s\n",$key,$value)
if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
}
}
# replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
chomp($OtherHeaders);
$Headers{'headers'} = $OtherHeaders;
# order output for database entry: fill @SQLBindVars
print "-------------- Next entry:\n" if $OptDebug;
my @SQLBindVars;
foreach (@DBFields) {
if (defined($Headers{$_}) and $Headers{$_} ne '') {
push (@SQLBindVars,$Headers{$_});
printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
} else {
push (@SQLBindVars,undef);
}
}
# write data to DBTableParse
if (!$OptTest) {
print "-------------- Writing data ... -------------\n" if $OptDebug;
my $DBWrite =
$DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
$Conf{'DBDatabase'},
$Conf{'DBTableParse'},
# get field names from @DBFields
join(', ',@DBFields),
# create a list of '?' for each DBField
join(', ',
split(/ /,'? ' x scalar(@DBFields)))
));
$DBWrite->execute(@SQLBindVars)
or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
"$DBI::errstr\n",$Period,
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
$DBWrite->finish;
}
};
$DBQuery->finish;
### close handles
$DBHandle->disconnect;
print "------------------- DONE! -------------------\n" if $OptDebug;
__END__
################################ Documentation #################################
=head1 NAME
parsedb - parse raw data and save it to a database
=head1 SYNOPSIS
B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
=head1 REQUIREMENTS
See L<doc/README>.
=head1 DESCRIPTION
...
=head2 Configuration
...
=head1 OPTIONS
=over 3
=item B<-V>, B<--version>
Print out version and copyright information and exit.
=item B<-h>, B<--help>
Print this man page and exit.
=item B<--debug>
Output (rather much) debugging information to STDOUT while processing.
=item B<-t>, B<--test>
Do not write results to database. You should use B<--debug> in
conjunction with B<--test> ... everything else seems a bit pointless.
=item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
Set processing period to a single day in YYYY-MM-DD format or to a time
period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
by a colon).
Defaults to yesterday.
=item B<--rawdb> I<table> (raw data table)
Override I<DBTableRaw> from F<newsstats.conf>.
=item B<--parsedb> I<table> (parsed data table)
Override I<DBTableParse> from F<newsstats.conf>.
=item B<--conffile> I<filename>
Load configuration from I<filename> instead of F<newsstats.conf>.
=back
=head1 INSTALLATION
See L<doc/INSTALL>.
=head1 EXAMPLES
...
=head1 FILES
=over 4
=item F<bin/parsedb.pl>
The script itself.
=item F<lib/NewsStats.pm>
Library functions for the NewsStats package.
=item F<etc/newsstats.conf>
Runtime configuration file.
=back
=head1 BUGS
Please report any bugs or feature requests to the author or use the
bug tracker at L<http://bugs.th-h.de/>!
=head1 SEE ALSO
=over 2
=item -
L<doc/README>
=item -
L<doc/INSTALL>
=back
This script is part of the B<NewsStats> package.
=head1 AUTHOR
Thomas Hochstein <thh@inter.net>
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
This program is free software; you may redistribute it and/or modify it
under the same terms as Perl itself.
=cut

View file

@ -11,6 +11,7 @@ DBDatabase = newsstats
# tables
#
DBTableRaw = raw_de
DBTableParse = parsed_de
DBTableGrps = groups_de
#DBTableClnts =
#DBTableHosts =

View file

@ -53,7 +53,7 @@ my $DBCreate = <<SQLDB;
CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8;
SQLDB
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableGrps' => <<GRPS);
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableParse' => <<PARSE, 'DBTableGrps' => <<GRPS);
--
-- Table structure for table DBTableRaw
--
@ -78,6 +78,56 @@ CREATE TABLE IF NOT EXISTS `$Conf{'DBTableRaw'}` (
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Raw data';
RAW
--
-- Table structure for table DBTableParse
--
CREATE TABLE IF NOT EXISTS `$Conf{'DBTableParse'}` (
`id` bigint(20) unsigned NOT NULL auto_increment,
`day` date NOT NULL,
`mid` varchar(250) character set ascii NOT NULL,
`refs` varchar(1000) character set ascii,
`date` varchar(100) NOT NULL,
`path` varchar(1000) NOT NULL,
`newsgroups` varchar(1000) NOT NULL,
`fupto` varchar(200),
`from_` varchar(500),
`from_parsed` varchar(200),
`from_name` varchar(200),
`from_address` varchar(200),
`sender` varchar(500),
`sender_parsed` varchar(200),
`sender_name` varchar(200),
`sender_address` varchar(200),
`replyto` varchar(500),
`replyto_parsed` varchar(200),
`replyto_name` varchar(200),
`replyto_address` varchar(200),
`subject` varchar(1000) NOT NULL,
`subject_parsed` varchar(1000),
`organization` varchar(1000),
`linecount` int(4) unsigned,
`approved` varchar(250),
`supersedes` varchar(250),
`expires` varchar(100),
`useragent` varchar(500),
`xnewsreader` varchar(500),
`xmailer` varchar(500),
`xnoarchive` varchar(100),
`contenttype` varchar(500),
`contentencoding` varchar(500),
`cancellock` varchar(500),
`injectioninfo` varchar(500),
`xtrace` varchar(500),
`postinghost` varchar(1000),
`headers` longtext,
`disregard` tinyint(1) default '0',
PRIMARY KEY (`id`),
KEY `day` (`day`),
KEY `mid` (`mid`),
KEY `newsgroups` (`newsgroups`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Parsed data';
PARSE
--
-- Table structure for table DBTableGrps
--

View file

@ -107,7 +107,7 @@ sub ReadConfig {
$ConfFile = $HomePath . '/etc/newsstats.conf' if !$ConfFile;
# mandatory configuration options
my @Mandatory = ('DBDriver','DBHost','DBUser','DBPw','DBDatabase',
'DBTableRaw','DBTableGrps');
'DBTableRaw','DBTableParse','DBTableGrps');
# read config via Config::Auto
my $ConfR = Config::Auto::parse($ConfFile, format => 'equal');
my %Conf = %{$ConfR};