From 995173456bcac27b4d6ca24b2562f90e1e6777ce Mon Sep 17 00:00:00 2001 From: Thomas Hochstein Date: Thu, 29 May 2025 18:03:10 +0200 Subject: [PATCH] Refactor and fix TLH check. Signed-off-by: Thomas Hochstein --- bin/gatherstats.pl | 49 ++++++++++++++++++++++++++++++++++------------ doc/ChangeLog | 1 + 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/bin/gatherstats.pl b/bin/gatherstats.pl index e900104..ca90cc7 100755 --- a/bin/gatherstats.pl +++ b/bin/gatherstats.pl @@ -137,7 +137,7 @@ foreach my $Month (&ListMonth($Period)) { googlegroups.com heirich.name news.neostrada.pl netcologne.de newsdawg.com newscene.com news-service.com octanews.com readnews.com wieslauf.sub.de highway.telekom.at united-newsserver.de xennanews.com xlned.com xsnews.nl news.xs4all.nl); - &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptMID,$OptTest,$OptDebug,@KnownHosts); + &HostStats($DBHandle,$DBRaw,$DBHosts,$Month,$OptTLH,$OptMID,$OptTest,$OptDebug,@KnownHosts); }; }; @@ -153,10 +153,10 @@ sub GroupStats { ### $DBRaw : database table for raw data (to read from) ### $DBGrps : database table for groups data (to write to) ### $Month : current month to do -### $MID : specific Message-ID to fetch (testing purposes) ### $TLH : TLHs to collect ### $Checkgroupsfile : filename template for checkgroups file ### (expanded to $Checkgroupsfile-$Month) +### $MID : specific Message-ID to fetch (testing purposes) ### $Test : test mode ### $Debug : debug mode ### OUT: (nothing) @@ -251,12 +251,13 @@ sub HostStats { ### $DBRaw : database table for raw data (to read from) ### $DBHosts : database table for hosts data (to write to) ### $Month : current month to do +### $TLH : TLHs to collect ### $MID : specific Message-ID to fetch (testing purposes) ### $Test : test mode ### $Debug : debug mode ### @KnownHosts : list of known hosts with subdomains ### OUT: (nothing) - my ($DBHandle,$DBRaw,$DBHosts,$Month,$MID,$Test,$Debug,@KnownHosts) = @_; + my ($DBHandle,$DBRaw,$DBHosts,$Month,$TLH,$MID,$Test,$Debug,@KnownHosts) = @_; my (%Postings,$DBQuery); @@ -281,16 +282,7 @@ sub HostStats { ### parse headers while (my ($Newsgroups,$Headers) = $DBQuery->fetchrow_array) { ### skip postings with wrong TLH - # remove whitespace from contents of Newsgroups: - chomp($Newsgroups); - $Newsgroups =~ s/\s//; - my $GroupCount; - for (split /,/, $Newsgroups) { - # don't count newsgroup/hierarchy in wrong TLH - next if($TLH and !/^$TLH/); - $GroupCount++; - }; - next if !$GroupCount; + next if ($TLH && !CheckTLH($Newsgroups,$TLH)); my $Host; my %Header = ParseHeaders(split(/\n/,$Headers)); @@ -380,6 +372,37 @@ sub HostStats { }; }; +sub CheckTLH { +### ---------------------------------------------------------------------------- +### count newsgroups from legal TLH(s) +### IN : $Newsgroups : comma separated list of newsgroups +### $TLH : (reference to an array of) legal TLH(s) +### OUT: number of newsgroups from legal TLH(s) + my ($Newsgroups,$TLH) = @_; + + my (@TLH,$GroupCount); + + # fill @TLH from $TLH, which can be an array reference or a scalar value + if (ref($TLH) eq 'ARRAY') { + @TLH = @{$TLH}; + } else { + push @TLH, $TLH; + } + + # remove whitespace from contents of Newsgroups: + chomp($Newsgroups); + $Newsgroups =~ s/\s//; + for (split /,/, $Newsgroups) { + my $Newsgroup = $_; + foreach (@TLH) { + # increment $GroupCount if $Newsgroup starts with $TLH + $GroupCount++ if $Newsgroup =~ /^$_/; + } + }; + + return $GroupCount; +} + __END__ ################################ Documentation ################################# diff --git a/doc/ChangeLog b/doc/ChangeLog index 9664638..9f248cc 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -1,5 +1,6 @@ NewsStats 0.4.0 (unreleased) * Reformat $Conf{TLH} for GroupStats only. + * Extract TLH check from HostStats to subroutine, fix no-op check. NewsStats 0.3.0 (2025-05-18) * Extract GroupStats (in gatherstats) to subroutine.