Add some input validation.

Our raw data doesn't have the qualitiy one should
expect. There are empty header lines only containing
whitespace (leading to wrong joining of apparent
continuation lines); header lines that contain garbage
without ':' so split is failing; empty 'newsgroups'
fields; unsupported encondings in MIME encoded words
... and so on.

Add fixes for the aforementioned problems.

Signed-off-by: Thomas Hochstein <thh@inter.net>
This commit is contained in:
Thomas Hochstein 2013-09-04 13:04:14 +02:00
parent 13e006104b
commit 48c8d4bb8e

View file

@ -130,12 +130,19 @@ $DBQuery->execute()
binmode(STDOUT, ":utf8"); binmode(STDOUT, ":utf8");
$DBHandle->do("SET NAMES 'utf8'"); $DBHandle->do("SET NAMES 'utf8'");
# create a list of supported encondings
my %LegalEncodings;
foreach (Encode->encodings()) {
$LegalEncodings{$_} = 1;
}
# parse data in a loop and write it out # parse data in a loop and write it out
print "-------------- Parsing data ... -------------\n" if $OptDebug; print "-------------- Parsing data ... -------------\n" if $OptDebug;
while (my $HeadersR = $DBQuery->fetchrow_hashref) { while (my $HeadersR = $DBQuery->fetchrow_hashref) {
my %Headers = %{$HeadersR}; my %Headers = %{$HeadersR};
# parse $Headers{'headers'} ('headers' from DBTableRaw) # parse $Headers{'headers'} ('headers' from DBTableRaw)
# remove empty lines (that should not even exist in a header!)
$Headers{'headers'} =~ s/\n\s*\n/\n/g;
# merge continuation lines # merge continuation lines
# from Perl Cookbook, 1st German ed. 1999, pg. 91 # from Perl Cookbook, 1st German ed. 1999, pg. 91
$Headers{'headers'} =~ s/\n\s+/ /g; $Headers{'headers'} =~ s/\n\s+/ /g;
@ -143,9 +150,29 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
my $OtherHeaders; my $OtherHeaders;
for (split(/\n/,$Headers{'headers'})) { for (split(/\n/,$Headers{'headers'})) {
# split header lines in header name and header content # split header lines in header name and header content
my ($key,$value) = split(/:/,$_,2); my ($key,$value);
$key =~ s/\s*//; if ($_ =~ /:/) {
$value =~ s/^\s*(.+)\s*$/$1/; ($key,$value) = split(/:/,$_,2);
$key =~ s/\s*//;
$value =~ s/^\s*(.+)\s*$/$1/;
} else {
&Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s",
$Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
$Headers{'id'},$_));
next;
}
# check for empty (mandatory) fields from DBTableRaw
# and set them from $Headers{'headers', if necessary
if (lc($key) =~ /^(message-id|path|newsgroups)$/) {
my $HeaderName = lc($key);
$HeaderName = 'mid' if ($HeaderName eq 'message-id');
if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') {
$Headers{$HeaderName} = $value;
&Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.",
$HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
$Headers{'id'}));
}
}
# save each header, separate database fields in %Headers, # save each header, separate database fields in %Headers,
# the rest in $OtherHeaders (but not Message-ID, Path, Peer # the rest in $OtherHeaders (but not Message-ID, Path, Peer
# and Newsgroups as those do already exist) # and Newsgroups as those do already exist)
@ -166,7 +193,10 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
$HeaderName =~ s/_$//; $HeaderName =~ s/_$//;
# decode From: / Sender: / Reply-To: / Subject: # decode From: / Sender: / Reply-To: / Subject:
if ($Headers{$_} =~ /\?(B|Q)\?/) { if ($Headers{$_} =~ /\?(B|Q)\?/) {
$Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_}); # check for legal encoding and decode
(my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/;
$Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_})
if (exists($LegalEncodings{$Encoding}));
} }
# extract name(s) and mail(s) from From: / Sender: / Reply-To: # extract name(s) and mail(s) from From: / Sender: / Reply-To:
# in parsed form, if available # in parsed form, if available