Refactor and fix TLH check.

Signed-off-by: Thomas Hochstein <thh@thh.name>
This commit is contained in:
Thomas Hochstein 2025-05-29 18:03:10 +02:00
parent 3447cdabff
commit 995173456b
2 changed files with 37 additions and 13 deletions

View file

@ -137,7 +137,7 @@ foreach my $Month (&ListMonth($Period)) {
googlegroups.com heirich.name news.neostrada.pl netcologne.de newsdawg.com newscene.com googlegroups.com heirich.name news.neostrada.pl netcologne.de newsdawg.com newscene.com
news-service.com octanews.com readnews.com wieslauf.sub.de highway.telekom.at news-service.com octanews.com readnews.com wieslauf.sub.de highway.telekom.at
united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl); united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl);
&HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptMID,$OptTest,$OptDebug,@KnownHosts); &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@KnownHosts);
}; };
}; };
@ -153,10 +153,10 @@ sub GroupStats {
### $DBRaw : database table for raw data (to read from) ### $DBRaw : database table for raw data (to read from)
### $DBGrps : database table for groups data (to write to) ### $DBGrps : database table for groups data (to write to)
### $Month : current month to do ### $Month : current month to do
### $MID : specific Message-ID to fetch (testing purposes)
### $TLH : TLHs to collect ### $TLH : TLHs to collect
### $Checkgroupsfile : filename template for checkgroups file ### $Checkgroupsfile : filename template for checkgroups file
### (expanded to $Checkgroupsfile-$Month) ### (expanded to $Checkgroupsfile-$Month)
### $MID : specific Message-ID to fetch (testing purposes)
### $Test : test mode ### $Test : test mode
### $Debug : debug mode ### $Debug : debug mode
### OUT: (nothing) ### OUT: (nothing)
@ -251,12 +251,13 @@ sub HostStats {
### $DBRaw : database table for raw data (to read from) ### $DBRaw : database table for raw data (to read from)
### $DBHosts : database table for hosts data (to write to) ### $DBHosts : database table for hosts data (to write to)
### $Month : current month to do ### $Month : current month to do
### $TLH : TLHs to collect
### $MID : specific Message-ID to fetch (testing purposes) ### $MID : specific Message-ID to fetch (testing purposes)
### $Test : test mode ### $Test : test mode
### $Debug : debug mode ### $Debug : debug mode
### @KnownHosts : list of known hosts with subdomains ### @KnownHosts : list of known hosts with subdomains
### OUT: (nothing) ### OUT: (nothing)
my ($DBHandle,$DBRaw,$DBHosts,$Month,$MID,$Test,$Debug,@KnownHosts) = @_; my ($DBHandle,$DBRaw,$DBHosts,$Month,$TLH,$MID,$Test,$Debug,@KnownHosts) = @_;
my (%Postings,$DBQuery); my (%Postings,$DBQuery);
@ -281,16 +282,7 @@ sub HostStats {
### parse headers ### parse headers
while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) { while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) {
### skip postings with wrong TLH ### skip postings with wrong TLH
# remove whitespace from contents of Newsgroups: next if ($TLH && !CheckTLH($Newsgroups,$TLH));
chomp($Newsgroups);
$Newsgroups =~ s/\s//;
my $GroupCount;
for (split /,/, $Newsgroups) {
# don't count newsgroup/hierarchy in wrong TLH
next if($TLH and !/^$TLH/);
$GroupCount++;
};
next if !$GroupCount;
my $Host; my $Host;
my %Header = ParseHeaders(split(/\n/,$Headers)); my %Header = ParseHeaders(split(/\n/,$Headers));
@ -380,6 +372,37 @@ sub HostStats {
}; };
}; };
sub CheckTLH {
### ----------------------------------------------------------------------------
### count newsgroups from legal TLH(s)
### IN : $Newsgroups : comma separated list of newsgroups
### $TLH : (reference to an array of) legal TLH(s)
### OUT: number of newsgroups from legal TLH(s)
my ($Newsgroups,$TLH) = @_;
my (@TLH,$GroupCount);
# fill @TLH from $TLH, which can be an array reference or a scalar value
if (ref($TLH) eq 'ARRAY') {
@TLH = @{$TLH};
} else {
push @TLH, $TLH;
}
# remove whitespace from contents of Newsgroups:
chomp($Newsgroups);
$Newsgroups =~ s/\s//;
for (split /,/, $Newsgroups) {
my $Newsgroup = $_;
foreach (@TLH) {
# increment $GroupCount if $Newsgroup starts with $TLH
$GroupCount++ if $Newsgroup =~ /^$_/;
}
};
return $GroupCount;
}
__END__ __END__
################################ Documentation ################################# ################################ Documentation #################################

View file

@ -1,5 +1,6 @@
NewsStats 0.4.0 (unreleased) NewsStats 0.4.0 (unreleased)
* Reformat $Conf{TLH} for GroupStats only. * Reformat $Conf{TLH} for GroupStats only.
* Extract TLH check from HostStats to subroutine, fix no-op check.
NewsStats 0.3.0 (2025-05-18) NewsStats 0.3.0 (2025-05-18)
* Extract GroupStats (in gatherstats) to subroutine. * Extract GroupStats (in gatherstats) to subroutine.