#!/local/bin/perl -w
#
# Produce spamhippo-like summaries of narf logs.
#
# Bugs: this has entirely too few comments for its size, especially given
# the high eeg factor in some of the code (hi, references; hi, eval of forms).
use 5.004;
use strict;

# the hashes that we track things in
my (%pathcnt, %pathsbi, %hostcnt, %hostsbi);

# we show this many entries.
my $topn = 25;
# do we sort on article count or sbi?
my $sorttype = "cnt";
# ... do we do both cnt and sbi reports?
my $doboth = 1;
# list class when possible instead of potentially forged path origin?
my $doclass = 0;

# Okay, it's GETOPT TIME.
sub usage {
	print STDERR "usage: $0 [-S|-C] [-c] [-t N] [file ....]\n";
	exit 1;
}
use Getopt::Std;
getopts('SCt:c') || &usage;
if ($::opt_S && $::opt_C) {
	print STDERR "$0: only one of -S and -C can be specified.\n";
	exit 1;
}
if (defined $::opt_t && $::opt_t <= 0) {
	print STDERR "$0: -t argument must be larger than zero.\n";
	exit 1;
}
$sorttype = "cnt"		if ($::opt_C);
$sorttype = "sbi"		if ($::opt_S);
$topn = $::opt_t		if ($::opt_t);
$doclass = !$doclass		if ($::opt_c || $::opt_c);

# da big loop.
# we swallow the narf log format, looking for the _ lines that have
# what we want.
while (<>) {
	my ($msgid, $cntrl, $path, $nntph, $sbi, $class);
	my ($mark);
	my (@cr);
	my (@p);
	my ($pp, $pt);
	
	# do we want it?
	next unless /^_ /;
	chomp;
	# get what we want.
# this would suffice if it were not for the CLEVER people who put SPACES
# in their 'Path:' header entries. ARGH.
# we are actually going to fix this in narf, which is the right place.
# but for now cope with the brokenness.
#	(undef, $msgid, $cntrl, $path, $nntph, $sbi) = split;
	(undef, $msgid, $cntrl, @cr) = split;
	# I die, all die, oh the embarassment. argh.
	$mark = $#cr;
	# oh, how I have gonged myself in the head here...
	# there are too many formats of _ lines that we try to deal with.
	undef $class;
	if ($cr[$mark] eq "_" || $cr[$mark] eq "NETZILLA") {
		# We have at least <NGS|_> <_|TYPE> at the end.
		$mark -= 2;
		$class = $cr[$#cr]	if ($cr[$#cr] ne "_");
		# do we have an MD5 body hash just before them?
		# (remembering that the MD5 body hash might not have been
		# computed because the article was too long...)
		$mark --	if ($cr[$mark] =~ /^[a-f0-9]{32}_\d+$/ ||
				    $cr[$mark] eq '_');
	}
#	$mark -= 2	if ($cr[$mark] eq "_" || $cr[$mark] eq "NETZILLA");
	$sbi = $cr[$mark];
	$nntph = $cr[$mark-1];
	$path = "@cr[0 .. $mark-2]";
	# ditch cancels
	next if $cntrl eq "cancel";
	# explode path with a kaboom
	@p = split '!', $path;
	# and add up what we want.
	# Cope with jokers who have '!!' near the end of their paths.
	# (Bite me, Netzilla)
	# compuserve may also bite me.
	$pp = $#p-1;
	$pp--		while (($pp >= 0) &&
			       (!$p[$pp] || $p[$pp] =~ /^n[it][dha].....$/));
	$pt = ($class && $doclass) ? $class : $p[$pp];
	if ($pt) {	
		$pathcnt{$pt}++;
		$pathsbi{$pt} += $sbi;
	}
	if (!($nntph eq "_")) {
		$hostcnt{$nntph}++;
		$hostsbi{$nntph} += $sbi;
	}
}

# Generate the ordering for path and host.
# in fact, generate both orderings for both paths and hosts (by count
# and by SBI).
my (@poc, @hoc, @pos, @hos);
my (%prc, %hrc, %prs, %hrs);		# map path (or host) to its rank.

@poc = sort {$pathcnt{$b} <=> $pathcnt{$a}} (keys %pathcnt);
@hoc = sort {$hostcnt{$b} <=> $hostcnt{$a}} (keys %hostcnt);
@pos = sort {$pathsbi{$b} <=> $pathsbi{$a}} (keys %pathsbi);
@hos = sort {$hostsbi{$b} <=> $hostsbi{$a}} (keys %hostsbi);

# generate the rank orderings too.
# observe the references, so we don't have to write this code
# over and over again.
sub doranks {
	my $i;
	my ($rankhash, $ranklist) = @_;
	for ($i = 0; $i <= $#$ranklist; $i++) {
		$$rankhash{$$ranklist[$i]} = $i+1;	# make it start from 1
	}
}
# I'm sure you see what's coming by now.
&doranks(\%prc, \@poc);
&doranks(\%prs, \@pos);
&doranks(\%hrc, \@hoc);
&doranks(\%hrs, \@hos);

# In order to make this print nice without tearing my head out over
# formats, we force all SBI numbers into having two decimal digits
# by sprintf()ing them. You may AUGH at will.
# ... so I AUGHed at the time it took and went with eval'ing the
# form to get it to look right.
#for (keys %hostsbi) {
#	$hostsbi{$_} = sprintf "%.2f", $hostsbi{$_};
#}
#for (keys %pathsbi) {
#	$pathsbi{$_} = sprintf "%.2f", $pathsbi{$_};
#}

# Now, the fun bit: FORMATS.
# Let's see how far we explode.
my ($reptype, $primary, $sec);
# these will be references. Pay attention.
my ($prank, $pcount, $scount, $srank, $rlist);
format STDOUT_TOP =
Rank  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  @>>>>>>>>>  @>>>>>>>>> (Rank)
       $reptype,			    $primary,  $sec
.
# This is necessary to avoid ugly time-wasting goo, like the sprintf()'s up
# above.
# $field1 and $field2 are the primary and secondary fields. One has to be
# a ## format while the other one can't be it (unless we want our counts
# to all have '.00' on the end; no thanks).
sub mkformat {
	my ($field1, $field2) = @_;
	eval <<EOF
format STDOUT =
# comment preserved for posterity so you can see it line up.
#@>>>  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  @>>>>>>>>>  @>>>>>>>>>  @>>>
@>>>  @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<  $field1  $field2  @>>>
\$\$prank{\$_}, \$_,		    \$\$pcount{\$_}, \$\$scount{\$_}, \$\$srank{\$_};
.
EOF
}

# Infinite lines per page.
$= = 100000000;
# pages -- ie a new X for the 'top N by X' -- are separated by a blank line.
$^L = "\n";

# What sort?
sub setorder {
	if ($sorttype eq "cnt") {
		$primary = "Count";
		$sec = "SBI";
		&mkformat('@>>>>>>>>>',
			  '@######.##');
	} else {
		$primary = "SBI";
		$sec = "Count";
		&mkformat('@######.##',
			  '@>>>>>>>>>');
	}
}
&setorder();
# Set the variables as necessary, based on what sort order we want.
# this takes references and sets more references, but the second are
# global and the first are parameters. Confused yet? You should be.
# (I think I am.)
# Having this code here saves me annoying cut & paste later. We could
# even duplicate the actual loop.
sub setrefs {
	my ($rankc, $ranks, $countc, $counts, $listc, $lists) = @_;
	if ($sorttype eq "cnt") {
		$prank = $rankc; $srank = $ranks;
		$pcount = $countc; $scount = $counts;
		$rlist = $listc;
	} else {
		$prank = $ranks; $srank = $rankc;
		$pcount = $counts; $scount = $countc;
		$rlist = $lists;
	}
}

# and ... go! (well, for one);
$reptype = "Top $topn rejected by Path origin";
&setrefs(\%prc, \%prs, \%pathcnt, \%pathsbi, \@poc, \@pos);
# urgh urk
# This is: generate the list of the first $topn elements of the list
# pointed to by $rlist (which is the right list) and iterate making
# $_ be that element, and it happens that $_ is the name of the thing
# we're interested in.
for (@$rlist[0 .. ($topn-1)]) {
	write;
}

# Now do the by-hosts report.
$reptype = "Top $topn rejected by NNTP-Posting-Host";
$- = 0;			# force new 'page' to print the header.
&setrefs(\%hrc, \%hrs, \%hostcnt, \%hostsbi, \@hoc, \@hos);
# urgh urk
for (@$rlist[0 .. ($topn-1)]) {
	write;
}

# This is getting REALLY ugly.
exit 0		unless $doboth;

if ($sorttype eq "cnt") {
	$sorttype="sbi";
} else {
	$sorttype="cnt";
}
$^W = 0;			# so how DO you undef a form...?
&setorder();
$^W = 1;
$- = 0;			# force new page.
$reptype = "Top $topn rejected by Path origin";
&setrefs(\%prc, \%prs, \%pathcnt, \%pathsbi, \@poc, \@pos);
for (@$rlist[0 .. ($topn-1)]) {
	write;
}
$- = 0;
$reptype = "Top $topn rejected by NNTP-Posting-Host";
&setrefs(\%hrc, \%hrs, \%hostcnt, \%hostsbi, \@hoc, \@hos);
for (@$rlist[0 .. ($topn-1)]) {
	write;
}