Create a database table with parsed raw data.
Incoming data is written to DBTableRaw without much interpretation. To allow for more and better analysis that raw data should be parsed daily and copied to another database table with separate fields for most header lines. All other scripts could use that pre-parsed data. * Add database schema to install.pl * Add DBTableParse to newsstats.conf.sample and as mandatory to NewsStats.pm * Add parsedb.pl TODO: - Documentation is only rudimentary. - From:, Sender:, Reply-To: and Subject: are not yet parsed. - gatherstats.pl does not yet use DbTableParse. Signed-off-by: Thomas Hochstein <thh@inter.net>
This commit is contained in:
parent
3634010808
commit
6d72dad2c0
323
bin/parsedb.pl
Executable file
323
bin/parsedb.pl
Executable file
|
@ -0,0 +1,323 @@
|
||||||
|
#! /usr/bin/perl
|
||||||
|
#
|
||||||
|
# parsedb.pl
|
||||||
|
#
|
||||||
|
# This script will parse a database with raw header information
|
||||||
|
# from a INN feed to a structured database.
|
||||||
|
#
|
||||||
|
# It is part of the NewsStats package.
|
||||||
|
#
|
||||||
|
# Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
|
||||||
|
#
|
||||||
|
# It can be redistributed and/or modified under the same terms under
|
||||||
|
# which Perl itself is published.
|
||||||
|
|
||||||
|
BEGIN {
|
||||||
|
our $VERSION = "0.01";
|
||||||
|
use File::Basename;
|
||||||
|
# we're in .../bin, so our module is in ../lib
|
||||||
|
push(@INC, dirname($0).'/../lib');
|
||||||
|
}
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
|
||||||
|
use NewsStats qw(:DEFAULT :TimePeriods :SQLHelper);
|
||||||
|
|
||||||
|
use DBI;
|
||||||
|
use Getopt::Long qw(GetOptions);
|
||||||
|
Getopt::Long::config ('bundling');
|
||||||
|
|
||||||
|
################################# Definitions ##################################
|
||||||
|
|
||||||
|
# define header names with separate database fields
|
||||||
|
my %DBFields = ('date' => 'date',
|
||||||
|
'references' => 'refs',
|
||||||
|
'followup-to' => 'fupto',
|
||||||
|
'from' => 'from_',
|
||||||
|
'sender' => 'sender',
|
||||||
|
'reply-to' => 'replyto',
|
||||||
|
'subject' => 'subject',
|
||||||
|
'organization' => 'organization',
|
||||||
|
'lines' => 'linecount',
|
||||||
|
'approved' => 'approved',
|
||||||
|
'supersedes' => 'supersedes',
|
||||||
|
'expires' => 'expires',
|
||||||
|
'user-agent' => 'useragent',
|
||||||
|
'x-newsreader' => 'xnewsreader',
|
||||||
|
'x-mailer' => 'xmailer',
|
||||||
|
'x-no-archive' => 'xnoarchive',
|
||||||
|
'content-type' => 'contenttype',
|
||||||
|
'content-transfer-encoding' => 'contentencoding',
|
||||||
|
'cancel-lock' => 'cancellock',
|
||||||
|
'injection-info' => 'injectioninfo',
|
||||||
|
'x-trace' => 'xtrace',
|
||||||
|
'nntp-posting-host' => 'postinghost');
|
||||||
|
|
||||||
|
# define field list for database
|
||||||
|
my @DBFields = qw/day mid refs date path newsgroups fupto from_ from_parsed
|
||||||
|
from_name from_address sender sender_parsed sender_name
|
||||||
|
sender_address replyto replyto_parsed replyto_name
|
||||||
|
replyto_address subject subject_parsed organization linecount
|
||||||
|
approved supersedes expires useragent xnewsreader xmailer
|
||||||
|
xnoarchive contenttype contentencoding cancellock injectioninfo
|
||||||
|
xtrace postinghost headers disregard/;
|
||||||
|
|
||||||
|
################################# Main program #################################
|
||||||
|
|
||||||
|
### read commandline options
|
||||||
|
my ($OptDay,$OptDebug,$OptParseDB,$OptRawDB,$OptTest,$OptConfFile);
|
||||||
|
GetOptions ('d|day=s' => \$OptDay,
|
||||||
|
'debug!' => \$OptDebug,
|
||||||
|
'parsedb=s' => \$OptParseDB,
|
||||||
|
'rawdb=s' => \$OptRawDB,
|
||||||
|
't|test!' => \$OptTest,
|
||||||
|
'conffile=s' => \$OptConfFile,
|
||||||
|
'h|help' => \&ShowPOD,
|
||||||
|
'V|version' => \&ShowVersion) or exit 1;
|
||||||
|
|
||||||
|
### read configuration
|
||||||
|
my %Conf = %{ReadConfig($OptConfFile)};
|
||||||
|
|
||||||
|
### override configuration via commandline options
|
||||||
|
my %ConfOverride;
|
||||||
|
$ConfOverride{'DBTableRaw'} = $OptRawDB if $OptRawDB;
|
||||||
|
$ConfOverride{'DBTableParse'} = $OptParseDB if $OptParseDB;
|
||||||
|
&OverrideConfig(\%Conf,\%ConfOverride);
|
||||||
|
|
||||||
|
### get time period
|
||||||
|
### and set $Period for output and expression for SQL 'WHERE' clause
|
||||||
|
my ($Period,$SQLWherePeriod) = &GetTimePeriod($OptDay,'day');
|
||||||
|
# bail out if --month is invalid or "all"
|
||||||
|
&Bleat(2,"--day option has an invalid format - please use 'YYYY-MM-DD' or ".
|
||||||
|
"'YYYY-MM-DD:YYYY-MM-DD'!") if (!$Period or $Period eq 'all time');
|
||||||
|
|
||||||
|
### init database
|
||||||
|
my $DBHandle = InitDB(\%Conf,1);
|
||||||
|
|
||||||
|
### get & write data
|
||||||
|
&Bleat(1,'Test mode. Database is not updated.') if $OptTest;
|
||||||
|
|
||||||
|
# create $SQLWhereClause
|
||||||
|
my $SQLWhereClause = SQLBuildClause('where',$SQLWherePeriod,'NOT disregard');
|
||||||
|
|
||||||
|
# delete old data for current period
|
||||||
|
if (!$OptTest) {
|
||||||
|
print "----------- Deleting old data ... -----------\n" if $OptDebug;
|
||||||
|
my $DBQuery = $DBHandle->do(sprintf("DELETE FROM %s.%s %s",
|
||||||
|
$Conf{'DBDatabase'},$Conf{'DBTableParse'},
|
||||||
|
$SQLWhereClause))
|
||||||
|
or &Bleat(2,sprintf("Can't delete old parsed data for %s from %s.%s: ".
|
||||||
|
"$DBI::errstr\n",$Period,
|
||||||
|
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
|
||||||
|
};
|
||||||
|
|
||||||
|
# read from DBTableRaw
|
||||||
|
print "-------------- Reading data ... -------------\n" if $OptDebug;
|
||||||
|
my $DBQuery = $DBHandle->prepare(sprintf("SELECT id, day, mid, peer, path, ".
|
||||||
|
"newsgroups, headers, disregard ".
|
||||||
|
"FROM %s.%s %s", $Conf{'DBDatabase'},
|
||||||
|
$Conf{'DBTableRaw'}, $SQLWhereClause));
|
||||||
|
$DBQuery->execute()
|
||||||
|
or &Bleat(2,sprintf("Can't get data for %s from %s.%s: ".
|
||||||
|
"$DBI::errstr\n",$Period,
|
||||||
|
$Conf{'DBDatabase'},$Conf{'DBTableRaw'}));
|
||||||
|
|
||||||
|
# set output and database connection to UTF-8
|
||||||
|
# as we're going to write decoded header contents containing UTF-8 chars
|
||||||
|
binmode(STDOUT, ":utf8");
|
||||||
|
$DBHandle->do("SET NAMES 'utf8'");
|
||||||
|
|
||||||
|
# parse data in a loop and write it out
|
||||||
|
print "-------------- Parsing data ... -------------\n" if $OptDebug;
|
||||||
|
while (my $HeadersR = $DBQuery->fetchrow_hashref) {
|
||||||
|
my %Headers = %{$HeadersR};
|
||||||
|
|
||||||
|
# parse $Headers{'headers'} ('headers' from DBTableRaw)
|
||||||
|
# merge continuation lines
|
||||||
|
# from Perl Cookbook, 1st German ed. 1999, pg. 91
|
||||||
|
$Headers{'headers'} =~ s/\n\s+/ /g;
|
||||||
|
# split headers in single lines
|
||||||
|
my $OtherHeaders;
|
||||||
|
for (split(/\n/,$Headers{'headers'})) {
|
||||||
|
# split header lines in header name and header content
|
||||||
|
my ($key,$value) = split(/:/,$_,2);
|
||||||
|
$key =~ s/\s*//;
|
||||||
|
$value =~ s/^\s*(.+)\s*$/$1/;
|
||||||
|
# save each header, separate database fields in %Headers,
|
||||||
|
# the rest in $OtherHeaders (but not Message-ID, Path, Peer
|
||||||
|
# and Newsgroups as those do already exist)
|
||||||
|
if (defined($DBFields{lc($key)})) {
|
||||||
|
$Headers{$DBFields{lc($key)}} = $value;
|
||||||
|
} else {
|
||||||
|
$OtherHeaders .= sprintf("%s: %s\n",$key,$value)
|
||||||
|
if lc($key) !~ /^(message-id|path|peer|newsgroups)$/;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# replace old (now parsed) $Headers{'headers'} with remanining $OtherHeaders
|
||||||
|
chomp($OtherHeaders);
|
||||||
|
$Headers{'headers'} = $OtherHeaders;
|
||||||
|
|
||||||
|
# order output for database entry: fill @SQLBindVars
|
||||||
|
print "-------------- Next entry:\n" if $OptDebug;
|
||||||
|
my @SQLBindVars;
|
||||||
|
foreach (@DBFields) {
|
||||||
|
if (defined($Headers{$_}) and $Headers{$_} ne '') {
|
||||||
|
push (@SQLBindVars,$Headers{$_});
|
||||||
|
printf ("FOUND: %s -> %s\n",$_,$Headers{$_}) if $OptDebug;
|
||||||
|
} else {
|
||||||
|
push (@SQLBindVars,undef);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# write data to DBTableParse
|
||||||
|
if (!$OptTest) {
|
||||||
|
print "-------------- Writing data ... -------------\n" if $OptDebug;
|
||||||
|
my $DBWrite =
|
||||||
|
$DBHandle->prepare(sprintf("INSERT INTO %s.%s (%s) VALUES (%s)",
|
||||||
|
$Conf{'DBDatabase'},
|
||||||
|
$Conf{'DBTableParse'},
|
||||||
|
# get field names from @DBFields
|
||||||
|
join(', ',@DBFields),
|
||||||
|
# create a list of '?' for each DBField
|
||||||
|
join(', ',
|
||||||
|
split(/ /,'? ' x scalar(@DBFields)))
|
||||||
|
));
|
||||||
|
$DBWrite->execute(@SQLBindVars)
|
||||||
|
or &Bleat(2,sprintf("Can't write parsed data for %s to %s.%s: ".
|
||||||
|
"$DBI::errstr\n",$Period,
|
||||||
|
$Conf{'DBDatabase'},$Conf{'DBTableParse'}));
|
||||||
|
$DBWrite->finish;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
$DBQuery->finish;
|
||||||
|
|
||||||
|
### close handles
|
||||||
|
$DBHandle->disconnect;
|
||||||
|
|
||||||
|
print "------------------- DONE! -------------------\n" if $OptDebug;
|
||||||
|
__END__
|
||||||
|
|
||||||
|
################################ Documentation #################################
|
||||||
|
|
||||||
|
=head1 NAME
|
||||||
|
|
||||||
|
parsedb - parse raw data and save it to a database
|
||||||
|
|
||||||
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
|
B<parsedb> [B<-Vht>] [B<--day> I<YYYY-MM-DD> | I<YYYY-MM-DD:YYYY-MM-DD>] [B<--rawdb> I<database table>] [B<--parsedb> I<database table>] [B<--conffile> I<filename>] [B<--debug>]
|
||||||
|
|
||||||
|
=head1 REQUIREMENTS
|
||||||
|
|
||||||
|
See L<doc/README>.
|
||||||
|
|
||||||
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
=head2 Configuration
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
=head1 OPTIONS
|
||||||
|
|
||||||
|
=over 3
|
||||||
|
|
||||||
|
=item B<-V>, B<--version>
|
||||||
|
|
||||||
|
Print out version and copyright information and exit.
|
||||||
|
|
||||||
|
=item B<-h>, B<--help>
|
||||||
|
|
||||||
|
Print this man page and exit.
|
||||||
|
|
||||||
|
=item B<--debug>
|
||||||
|
|
||||||
|
Output (rather much) debugging information to STDOUT while processing.
|
||||||
|
|
||||||
|
=item B<-t>, B<--test>
|
||||||
|
|
||||||
|
Do not write results to database. You should use B<--debug> in
|
||||||
|
conjunction with B<--test> ... everything else seems a bit pointless.
|
||||||
|
|
||||||
|
=item B<-d>, B<--day> I<YYYY-MM-DD[:YYYY-MM-DD]>
|
||||||
|
|
||||||
|
Set processing period to a single day in YYYY-MM-DD format or to a time
|
||||||
|
period between two days in YYYY-MM-DD:YYYY-MM-DD format (two days, separated
|
||||||
|
by a colon).
|
||||||
|
|
||||||
|
Defaults to yesterday.
|
||||||
|
|
||||||
|
=item B<--rawdb> I<table> (raw data table)
|
||||||
|
|
||||||
|
Override I<DBTableRaw> from F<newsstats.conf>.
|
||||||
|
|
||||||
|
=item B<--parsedb> I<table> (parsed data table)
|
||||||
|
|
||||||
|
Override I<DBTableParse> from F<newsstats.conf>.
|
||||||
|
|
||||||
|
=item B<--conffile> I<filename>
|
||||||
|
|
||||||
|
Load configuration from I<filename> instead of F<newsstats.conf>.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=head1 INSTALLATION
|
||||||
|
|
||||||
|
See L<doc/INSTALL>.
|
||||||
|
|
||||||
|
=head1 EXAMPLES
|
||||||
|
|
||||||
|
...
|
||||||
|
|
||||||
|
=head1 FILES
|
||||||
|
|
||||||
|
=over 4
|
||||||
|
|
||||||
|
=item F<bin/parsedb.pl>
|
||||||
|
|
||||||
|
The script itself.
|
||||||
|
|
||||||
|
=item F<lib/NewsStats.pm>
|
||||||
|
|
||||||
|
Library functions for the NewsStats package.
|
||||||
|
|
||||||
|
=item F<etc/newsstats.conf>
|
||||||
|
|
||||||
|
Runtime configuration file.
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
=head1 BUGS
|
||||||
|
|
||||||
|
Please report any bugs or feature requests to the author or use the
|
||||||
|
bug tracker at L<http://bugs.th-h.de/>!
|
||||||
|
|
||||||
|
=head1 SEE ALSO
|
||||||
|
|
||||||
|
=over 2
|
||||||
|
|
||||||
|
=item -
|
||||||
|
|
||||||
|
L<doc/README>
|
||||||
|
|
||||||
|
=item -
|
||||||
|
|
||||||
|
L<doc/INSTALL>
|
||||||
|
|
||||||
|
=back
|
||||||
|
|
||||||
|
This script is part of the B<NewsStats> package.
|
||||||
|
|
||||||
|
=head1 AUTHOR
|
||||||
|
|
||||||
|
Thomas Hochstein <thh@inter.net>
|
||||||
|
|
||||||
|
=head1 COPYRIGHT AND LICENSE
|
||||||
|
|
||||||
|
Copyright (c) 2013 Thomas Hochstein <thh@inter.net>
|
||||||
|
|
||||||
|
This program is free software; you may redistribute it and/or modify it
|
||||||
|
under the same terms as Perl itself.
|
||||||
|
|
||||||
|
=cut
|
|
@ -11,6 +11,7 @@ DBDatabase = newsstats
|
||||||
# tables
|
# tables
|
||||||
#
|
#
|
||||||
DBTableRaw = raw_de
|
DBTableRaw = raw_de
|
||||||
|
DBTableParse = parsed_de
|
||||||
DBTableGrps = groups_de
|
DBTableGrps = groups_de
|
||||||
#DBTableClnts =
|
#DBTableClnts =
|
||||||
#DBTableHosts =
|
#DBTableHosts =
|
||||||
|
|
|
@ -53,7 +53,7 @@ my $DBCreate = <<SQLDB;
|
||||||
CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8;
|
CREATE DATABASE IF NOT EXISTS `$Conf{'DBDatabase'}` DEFAULT CHARSET=utf8;
|
||||||
SQLDB
|
SQLDB
|
||||||
|
|
||||||
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableGrps' => <<GRPS);
|
my %DBCreate = ('DBTableRaw' => <<RAW, 'DBTableParse' => <<PARSE, 'DBTableGrps' => <<GRPS);
|
||||||
--
|
--
|
||||||
-- Table structure for table DBTableRaw
|
-- Table structure for table DBTableRaw
|
||||||
--
|
--
|
||||||
|
@ -78,6 +78,56 @@ CREATE TABLE IF NOT EXISTS `$Conf{'DBTableRaw'}` (
|
||||||
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Raw data';
|
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Raw data';
|
||||||
RAW
|
RAW
|
||||||
--
|
--
|
||||||
|
-- Table structure for table DBTableParse
|
||||||
|
--
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS `$Conf{'DBTableParse'}` (
|
||||||
|
`id` bigint(20) unsigned NOT NULL auto_increment,
|
||||||
|
`day` date NOT NULL,
|
||||||
|
`mid` varchar(250) character set ascii NOT NULL,
|
||||||
|
`refs` varchar(1000) character set ascii,
|
||||||
|
`date` varchar(100) NOT NULL,
|
||||||
|
`path` varchar(1000) NOT NULL,
|
||||||
|
`newsgroups` varchar(1000) NOT NULL,
|
||||||
|
`fupto` varchar(200),
|
||||||
|
`from_` varchar(500),
|
||||||
|
`from_parsed` varchar(200),
|
||||||
|
`from_name` varchar(200),
|
||||||
|
`from_address` varchar(200),
|
||||||
|
`sender` varchar(500),
|
||||||
|
`sender_parsed` varchar(200),
|
||||||
|
`sender_name` varchar(200),
|
||||||
|
`sender_address` varchar(200),
|
||||||
|
`replyto` varchar(500),
|
||||||
|
`replyto_parsed` varchar(200),
|
||||||
|
`replyto_name` varchar(200),
|
||||||
|
`replyto_address` varchar(200),
|
||||||
|
`subject` varchar(1000) NOT NULL,
|
||||||
|
`subject_parsed` varchar(1000),
|
||||||
|
`organization` varchar(1000),
|
||||||
|
`linecount` int(4) unsigned,
|
||||||
|
`approved` varchar(250),
|
||||||
|
`supersedes` varchar(250),
|
||||||
|
`expires` varchar(100),
|
||||||
|
`useragent` varchar(500),
|
||||||
|
`xnewsreader` varchar(500),
|
||||||
|
`xmailer` varchar(500),
|
||||||
|
`xnoarchive` varchar(100),
|
||||||
|
`contenttype` varchar(500),
|
||||||
|
`contentencoding` varchar(500),
|
||||||
|
`cancellock` varchar(500),
|
||||||
|
`injectioninfo` varchar(500),
|
||||||
|
`xtrace` varchar(500),
|
||||||
|
`postinghost` varchar(1000),
|
||||||
|
`headers` longtext,
|
||||||
|
`disregard` tinyint(1) default '0',
|
||||||
|
PRIMARY KEY (`id`),
|
||||||
|
KEY `day` (`day`),
|
||||||
|
KEY `mid` (`mid`),
|
||||||
|
KEY `newsgroups` (`newsgroups`)
|
||||||
|
) ENGINE=MyISAM DEFAULT CHARSET=utf8 COMMENT='Parsed data';
|
||||||
|
PARSE
|
||||||
|
--
|
||||||
-- Table structure for table DBTableGrps
|
-- Table structure for table DBTableGrps
|
||||||
--
|
--
|
||||||
|
|
||||||
|
|
|
@ -107,7 +107,7 @@ sub ReadConfig {
|
||||||
$ConfFile = $HomePath . '/etc/newsstats.conf' if !$ConfFile;
|
$ConfFile = $HomePath . '/etc/newsstats.conf' if !$ConfFile;
|
||||||
# mandatory configuration options
|
# mandatory configuration options
|
||||||
my @Mandatory = ('DBDriver','DBHost','DBUser','DBPw','DBDatabase',
|
my @Mandatory = ('DBDriver','DBHost','DBUser','DBPw','DBDatabase',
|
||||||
'DBTableRaw','DBTableGrps');
|
'DBTableRaw','DBTableParse','DBTableGrps');
|
||||||
# read config via Config::Auto
|
# read config via Config::Auto
|
||||||
my $ConfR = Config::Auto::parse($ConfFile, format => 'equal');
|
my $ConfR = Config::Auto::parse($ConfFile, format => 'equal');
|
||||||
my %Conf = %{$ConfR};
|
my %Conf = %{$ConfR};
|
||||||
|
|
Loading…
Reference in a new issue