Add some input validation.
Our raw data doesn't have the qualitiy one should expect. There are empty header lines only containing whitespace (leading to wrong joining of apparent continuation lines); header lines that contain garbage without ':' so split is failing; empty 'newsgroups' fields; unsupported encondings in MIME encoded words ... and so on. Add fixes for the aforementioned problems. Signed-off-by: Thomas Hochstein <thh@inter.net>
This commit is contained in:
parent
13e006104b
commit
48c8d4bb8e
|
@ -130,12 +130,19 @@ $DBQuery->execute()
|
||||||
binmode(STDOUT, ":utf8");
|
binmode(STDOUT, ":utf8");
|
||||||
$DBHandle->do("SET NAMES 'utf8'");
|
$DBHandle->do("SET NAMES 'utf8'");
|
||||||
|
|
||||||
|
# create a list of supported encondings
|
||||||
|
my %LegalEncodings;
|
||||||
|
foreach (Encode->encodings()) {
|
||||||
|
$LegalEncodings{$_} = 1;
|
||||||
|
}
|
||||||
# parse data in a loop and write it out
|
# parse data in a loop and write it out
|
||||||
print "-------------- Parsing data ... -------------\n" if $OptDebug;
|
print "-------------- Parsing data ... -------------\n" if $OptDebug;
|
||||||
while (my $HeadersR = $DBQuery->fetchrow_hashref) {
|
while (my $HeadersR = $DBQuery->fetchrow_hashref) {
|
||||||
my %Headers = %{$HeadersR};
|
my %Headers = %{$HeadersR};
|
||||||
|
|
||||||
# parse $Headers{'headers'} ('headers' from DBTableRaw)
|
# parse $Headers{'headers'} ('headers' from DBTableRaw)
|
||||||
|
# remove empty lines (that should not even exist in a header!)
|
||||||
|
$Headers{'headers'} =~ s/\n\s*\n/\n/g;
|
||||||
# merge continuation lines
|
# merge continuation lines
|
||||||
# from Perl Cookbook, 1st German ed. 1999, pg. 91
|
# from Perl Cookbook, 1st German ed. 1999, pg. 91
|
||||||
$Headers{'headers'} =~ s/\n\s+/ /g;
|
$Headers{'headers'} =~ s/\n\s+/ /g;
|
||||||
|
@ -143,9 +150,29 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
|
||||||
my $OtherHeaders;
|
my $OtherHeaders;
|
||||||
for (split(/\n/,$Headers{'headers'})) {
|
for (split(/\n/,$Headers{'headers'})) {
|
||||||
# split header lines in header name and header content
|
# split header lines in header name and header content
|
||||||
my ($key,$value) = split(/:/,$_,2);
|
my ($key,$value);
|
||||||
|
if ($_ =~ /:/) {
|
||||||
|
($key,$value) = split(/:/,$_,2);
|
||||||
$key =~ s/\s*//;
|
$key =~ s/\s*//;
|
||||||
$value =~ s/^\s*(.+)\s*$/$1/;
|
$value =~ s/^\s*(.+)\s*$/$1/;
|
||||||
|
} else {
|
||||||
|
&Bleat(1,sprintf("Illegal header line in %s.%s id %s: %s",
|
||||||
|
$Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
|
||||||
|
$Headers{'id'},$_));
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
# check for empty (mandatory) fields from DBTableRaw
|
||||||
|
# and set them from $Headers{'headers', if necessary
|
||||||
|
if (lc($key) =~ /^(message-id|path|newsgroups)$/) {
|
||||||
|
my $HeaderName = lc($key);
|
||||||
|
$HeaderName = 'mid' if ($HeaderName eq 'message-id');
|
||||||
|
if (!defined($Headers{$HeaderName}) or $Headers{$HeaderName} eq '') {
|
||||||
|
$Headers{$HeaderName} = $value;
|
||||||
|
&Bleat(1,sprintf("Taking missing %s from 'headers' in %s.%s id %s.",
|
||||||
|
$HeaderName, $Conf{'DBDatabase'}, $Conf{'DBTableRaw'},
|
||||||
|
$Headers{'id'}));
|
||||||
|
}
|
||||||
|
}
|
||||||
# save each header, separate database fields in %Headers,
|
# save each header, separate database fields in %Headers,
|
||||||
# the rest in $OtherHeaders (but not Message-ID, Path, Peer
|
# the rest in $OtherHeaders (but not Message-ID, Path, Peer
|
||||||
# and Newsgroups as those do already exist)
|
# and Newsgroups as those do already exist)
|
||||||
|
@ -166,7 +193,10 @@ while (my $HeadersR = $DBQuery->fetchrow_hashref) {
|
||||||
$HeaderName =~ s/_$//;
|
$HeaderName =~ s/_$//;
|
||||||
# decode From: / Sender: / Reply-To: / Subject:
|
# decode From: / Sender: / Reply-To: / Subject:
|
||||||
if ($Headers{$_} =~ /\?(B|Q)\?/) {
|
if ($Headers{$_} =~ /\?(B|Q)\?/) {
|
||||||
$Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_});
|
# check for legal encoding and decode
|
||||||
|
(my $Encoding) = $Headers{$_} =~ /\?([^?]+)\?(B|Q)\?/;
|
||||||
|
$Headers{$HeaderName.'_parsed'} = decode('MIME-Header',$Headers{$_})
|
||||||
|
if (exists($LegalEncodings{$Encoding}));
|
||||||
}
|
}
|
||||||
# extract name(s) and mail(s) from From: / Sender: / Reply-To:
|
# extract name(s) and mail(s) from From: / Sender: / Reply-To:
|
||||||
# in parsed form, if available
|
# in parsed form, if available
|
||||||
|
|
Loading…
Reference in a new issue