#!/local/bin/perl -w # # Produce spamhippo-like summaries of narf logs. # # Bugs: this has entirely too few comments for its size, especially given # the high eeg factor in some of the code (hi, references; hi, eval of forms). use 5.004; use strict; # the hashes that we track things in my (%pathcnt, %pathsbi, %hostcnt, %hostsbi); # we show this many entries. my $topn = 25; # do we sort on article count or sbi? my $sorttype = "cnt"; # ... do we do both cnt and sbi reports? my $doboth = 1; # list class when possible instead of potentially forged path origin? my $doclass = 0; # Okay, it's GETOPT TIME. sub usage { print STDERR "usage: $0 [-S|-C] [-c] [-t N] [file ....]\n"; exit 1; } use Getopt::Std; getopts('SCt:c') || &usage; if ($::opt_S && $::opt_C) { print STDERR "$0: only one of -S and -C can be specified.\n"; exit 1; } if (defined $::opt_t && $::opt_t <= 0) { print STDERR "$0: -t argument must be larger than zero.\n"; exit 1; } $sorttype = "cnt" if ($::opt_C); $sorttype = "sbi" if ($::opt_S); $topn = $::opt_t if ($::opt_t); $doclass = !$doclass if ($::opt_c || $::opt_c); # da big loop. # we swallow the narf log format, looking for the _ lines that have # what we want. while (<>) { my ($msgid, $cntrl, $path, $nntph, $sbi, $class); my ($mark); my (@cr); my (@p); my ($pp, $pt); # do we want it? next unless /^_ /; chomp; # get what we want. # this would suffice if it were not for the CLEVER people who put SPACES # in their 'Path:' header entries. ARGH. # we are actually going to fix this in narf, which is the right place. # but for now cope with the brokenness. # (undef, $msgid, $cntrl, $path, $nntph, $sbi) = split; (undef, $msgid, $cntrl, @cr) = split; # I die, all die, oh the embarassment. argh. $mark = $#cr; # oh, how I have gonged myself in the head here... # there are too many formats of _ lines that we try to deal with. undef $class; if ($cr[$mark] eq "_" || $cr[$mark] eq "NETZILLA") { # We have at least <_|TYPE> at the end. $mark -= 2; $class = $cr[$#cr] if ($cr[$#cr] ne "_"); # do we have an MD5 body hash just before them? # (remembering that the MD5 body hash might not have been # computed because the article was too long...) $mark -- if ($cr[$mark] =~ /^[a-f0-9]{32}_\d+$/ || $cr[$mark] eq '_'); } # $mark -= 2 if ($cr[$mark] eq "_" || $cr[$mark] eq "NETZILLA"); $sbi = $cr[$mark]; $nntph = $cr[$mark-1]; $path = "@cr[0 .. $mark-2]"; # ditch cancels next if $cntrl eq "cancel"; # explode path with a kaboom @p = split '!', $path; # and add up what we want. # Cope with jokers who have '!!' near the end of their paths. # (Bite me, Netzilla) # compuserve may also bite me. $pp = $#p-1; $pp-- while (($pp >= 0) && (!$p[$pp] || $p[$pp] =~ /^n[it][dha].....$/)); $pt = ($class && $doclass) ? $class : $p[$pp]; if ($pt) { $pathcnt{$pt}++; $pathsbi{$pt} += $sbi; } if (!($nntph eq "_")) { $hostcnt{$nntph}++; $hostsbi{$nntph} += $sbi; } } # Generate the ordering for path and host. # in fact, generate both orderings for both paths and hosts (by count # and by SBI). my (@poc, @hoc, @pos, @hos); my (%prc, %hrc, %prs, %hrs); # map path (or host) to its rank. @poc = sort {$pathcnt{$b} <=> $pathcnt{$a}} (keys %pathcnt); @hoc = sort {$hostcnt{$b} <=> $hostcnt{$a}} (keys %hostcnt); @pos = sort {$pathsbi{$b} <=> $pathsbi{$a}} (keys %pathsbi); @hos = sort {$hostsbi{$b} <=> $hostsbi{$a}} (keys %hostsbi); # generate the rank orderings too. # observe the references, so we don't have to write this code # over and over again. sub doranks { my $i; my ($rankhash, $ranklist) = @_; for ($i = 0; $i <= $#$ranklist; $i++) { $$rankhash{$$ranklist[$i]} = $i+1; # make it start from 1 } } # I'm sure you see what's coming by now. &doranks(\%prc, \@poc); &doranks(\%prs, \@pos); &doranks(\%hrc, \@hoc); &doranks(\%hrs, \@hos); # In order to make this print nice without tearing my head out over # formats, we force all SBI numbers into having two decimal digits # by sprintf()ing them. You may AUGH at will. # ... so I AUGHed at the time it took and went with eval'ing the # form to get it to look right. #for (keys %hostsbi) { # $hostsbi{$_} = sprintf "%.2f", $hostsbi{$_}; #} #for (keys %pathsbi) { # $pathsbi{$_} = sprintf "%.2f", $pathsbi{$_}; #} # Now, the fun bit: FORMATS. # Let's see how far we explode. my ($reptype, $primary, $sec); # these will be references. Pay attention. my ($prank, $pcount, $scount, $srank, $rlist); format STDOUT_TOP = Rank @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>>>>> @>>>>>>>>> (Rank) $reptype, $primary, $sec . # This is necessary to avoid ugly time-wasting goo, like the sprintf()'s up # above. # $field1 and $field2 are the primary and secondary fields. One has to be # a ## format while the other one can't be it (unless we want our counts # to all have '.00' on the end; no thanks). sub mkformat { my ($field1, $field2) = @_; eval <>> @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< @>>>>>>>>> @>>>>>>>>> @>>> @>>> @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< $field1 $field2 @>>> \$\$prank{\$_}, \$_, \$\$pcount{\$_}, \$\$scount{\$_}, \$\$srank{\$_}; . EOF } # Infinite lines per page. $= = 100000000; # pages -- ie a new X for the 'top N by X' -- are separated by a blank line. $^L = "\n"; # What sort? sub setorder { if ($sorttype eq "cnt") { $primary = "Count"; $sec = "SBI"; &mkformat('@>>>>>>>>>', '@######.##'); } else { $primary = "SBI"; $sec = "Count"; &mkformat('@######.##', '@>>>>>>>>>'); } } &setorder(); # Set the variables as necessary, based on what sort order we want. # this takes references and sets more references, but the second are # global and the first are parameters. Confused yet? You should be. # (I think I am.) # Having this code here saves me annoying cut & paste later. We could # even duplicate the actual loop. sub setrefs { my ($rankc, $ranks, $countc, $counts, $listc, $lists) = @_; if ($sorttype eq "cnt") { $prank = $rankc; $srank = $ranks; $pcount = $countc; $scount = $counts; $rlist = $listc; } else { $prank = $ranks; $srank = $rankc; $pcount = $counts; $scount = $countc; $rlist = $lists; } } # and ... go! (well, for one); $reptype = "Top $topn rejected by Path origin"; &setrefs(\%prc, \%prs, \%pathcnt, \%pathsbi, \@poc, \@pos); # urgh urk # This is: generate the list of the first $topn elements of the list # pointed to by $rlist (which is the right list) and iterate making # $_ be that element, and it happens that $_ is the name of the thing # we're interested in. for (@$rlist[0 .. ($topn-1)]) { write; } # Now do the by-hosts report. $reptype = "Top $topn rejected by NNTP-Posting-Host"; $- = 0; # force new 'page' to print the header. &setrefs(\%hrc, \%hrs, \%hostcnt, \%hostsbi, \@hoc, \@hos); # urgh urk for (@$rlist[0 .. ($topn-1)]) { write; } # This is getting REALLY ugly. exit 0 unless $doboth; if ($sorttype eq "cnt") { $sorttype="sbi"; } else { $sorttype="cnt"; } $^W = 0; # so how DO you undef a form...? &setorder(); $^W = 1; $- = 0; # force new page. $reptype = "Top $topn rejected by Path origin"; &setrefs(\%prc, \%prs, \%pathcnt, \%pathsbi, \@poc, \@pos); for (@$rlist[0 .. ($topn-1)]) { write; } $- = 0; $reptype = "Top $topn rejected by NNTP-Posting-Host"; &setrefs(\%hrc, \%hrs, \%hostcnt, \%hostsbi, \@hoc, \@hos); for (@$rlist[0 .. ($topn-1)]) { write; }