#!/usr/bin/perl
#
# re benchmarking
#
# (translated to Perl by Daniel Martin, http://snowplow.org/martin/.)

use strict;
use Time::HiRes qw(gettimeofday tv_interval);

my $basetxt = 
q[This is some base text in which the match does not occur, nor do match
precursors, but we do find all of the regular features that I expect to
find in routine text, like newlines. And sentences.

And even paragraphs. We'll soon have to throw in _some_ non-ascii
characters, although I'm not being crazy enough to go all the way
to utf-8 or something.

== No, really.

So this is the end. We'll composite this together repeatedly to get
the actual test strings.];

# The precursor is FOO, followed by either BAR BAZ or MORK MINK.
my @bits = (
	"BAR BAZ",
	"MORK MINK",
	"F",
	"FO",
	"FOO",
	"FOO BAR",
	"FOO MORK",
	"FOO BAR BAZ",
	"FOO MORK MINK",
	);

# Regular expressions.
my @regexps = (
	['prefix alternate', qr'FOO (BAR BAZ|MORK MINK)'],
	['plain alternate', qr'(FOO BAR BAZ|FOO MORK MINK)'],
	['two regexps',
	 [qr'FOO BAR BAZ', qr'FOO MORK MINK']],
	['FOO+ alternate', qr'FOO+ (BAR BAZ|MORK MINK)'],
	['F+OO alternate', qr'F+OO (BAR BAZ|MORK MINK)'],
	['plain FOO BAR BAZ', qr'FOO BAR BAZ' ],
	['(?=FOO) start', qr'(?=FOO)(FOO BAR BAZ|FOO MORK MINK)'],

	# Comment these in or out for test runs.
	# FIXME: should be some sort of command line argument.
	#['three alternates', qr'(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK)'],
	#['four alternates', qr'(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK|FOO ABR ACO)'],
	#['two no prefix', qr'(?:BAR BAZ|MORK MINK)'],
	#['three no prefix', qr'(BAR BAZ|MORK MINK|HIK HAEK)'],
	#['four no prefix', qr'(BAR BAZ|MORK MINK|HIK HAEK|ABR ACO)'],

	# This is really not an interesting variant, so I take it
	# out for now.
	#('plain FOO MORK MINK', re.compile('FOO MORK MINK')),
	);


# The text size is calculated to work out to be about 22K, which
# is big enough for various effects to kick in. (We hope.)

# a separate class is too heavyweight here;
sub make_tblock {
	my ($txt, $pos, $bit) = @_;
	my $self = {};
	$self->{bit} = $bit;
	$self->{posn} = $pos;  # pos is a reserved word in perl
	my $fmt = "%60s\n";
	$bit = sprintf($fmt ,"some text $bit around the bit.");
	$self->{text} =
	{
		"none", sub { $_[1] x 50 . sprintf($fmt, "abc def ghi");},
		"early", sub { $_[1] x 15 . $_[0] . $_[1] x 35 },
		"middle", sub { $_[1] x 25 . $_[0] . $_[1] x 25 },
		"late", sub { $_[1] x 35 . $_[0] . $_[1] x 15 },
		"end", sub { $_[1] x 50 . $_[0]  }
	}->{$pos}->($bit,$txt);
	$self;
}

sub expl_tblock {
	my ($txtblk) = shift;
	if ($txtblk->{posn} eq "none") {"plain unmatching"}
	else {"$txtblk->{posn} $txtblk->{bit}"}
}

sub gentext {
	my $x;
	(
		make_tblock($basetxt, 'none', ''),
		map {$x=$_;map {make_tblock($basetxt,$x,$_)} @bits}
		qw(early middle late end)
	);
}

# loops should maybe vary, but ennh. 10000, 5000, depends on the
# size of the text.
use constant LOOPS => 2000;
sub timer {
	my ($re, $txt) = @_;
	my $t0 = [gettimeofday];
	if (ref($re) eq 'ARRAY') {
		my ($r1,$r2) = @$re;
		my $i;
		for $i (1..LOOPS) {
			$txt =~ /$r1/ or $txt =~ /$r2/
		}
	} else {
		my $r1 = $re;
		my $i;
		for $i (1..LOOPS) {
			$txt =~ /$r1/
		}
	}
	tv_interval($t0)/LOOPS;
}

sub testit {
	my ($res, $texts) = @_;
	my $fmt = join('',"%-20s ", map {" %18s"} @$res) . "\n";
	printf($fmt, "Text type:", map {$_->[0]} @$res);
	for my $i (@$texts)
	{
		my $txt = $i->{text};
		my @times = map{timer($_->[1], $txt)} @$res;
		# was '%.3g', but that flips to scientific notation
		# sometime, which I hate. %.2f always prints digits.
		my @ttxt = map {sprintf("%.4g usecs",$_ * 1e6)} @times;
		printf($fmt,expl_tblock($i),@ttxt);
		
		#for expl, rex in res:
		#	print "\t%.3g usecs\t%s" % (timer(rex, txt) * 1e6,
		#				    expl)
	}
}

#def main():
#	testit(regexps, gentext())
#
#if __name__ == "__main__":
#	main()

# the __name__ == "__main__" idiom doesn't work in perl

testit([@regexps], [gentext]);