#!/usr/bin/perl # # re benchmarking # # (translated to Perl by Daniel Martin, http://snowplow.org/martin/.) use strict; use Time::HiRes qw(gettimeofday tv_interval); my $basetxt = q[This is some base text in which the match does not occur, nor do match precursors, but we do find all of the regular features that I expect to find in routine text, like newlines. And sentences. And even paragraphs. We'll soon have to throw in _some_ non-ascii characters, although I'm not being crazy enough to go all the way to utf-8 or something. == No, really. So this is the end. We'll composite this together repeatedly to get the actual test strings.]; # The precursor is FOO, followed by either BAR BAZ or MORK MINK. my @bits = ( "BAR BAZ", "MORK MINK", "F", "FO", "FOO", "FOO BAR", "FOO MORK", "FOO BAR BAZ", "FOO MORK MINK", ); # Regular expressions. my @regexps = ( ['prefix alternate', qr'FOO (BAR BAZ|MORK MINK)'], ['plain alternate', qr'(FOO BAR BAZ|FOO MORK MINK)'], ['two regexps', [qr'FOO BAR BAZ', qr'FOO MORK MINK']], ['FOO+ alternate', qr'FOO+ (BAR BAZ|MORK MINK)'], ['F+OO alternate', qr'F+OO (BAR BAZ|MORK MINK)'], ['plain FOO BAR BAZ', qr'FOO BAR BAZ' ], ['(?=FOO) start', qr'(?=FOO)(FOO BAR BAZ|FOO MORK MINK)'], # Comment these in or out for test runs. # FIXME: should be some sort of command line argument. #['three alternates', qr'(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK)'], #['four alternates', qr'(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK|FOO ABR ACO)'], #['two no prefix', qr'(?:BAR BAZ|MORK MINK)'], #['three no prefix', qr'(BAR BAZ|MORK MINK|HIK HAEK)'], #['four no prefix', qr'(BAR BAZ|MORK MINK|HIK HAEK|ABR ACO)'], # This is really not an interesting variant, so I take it # out for now. #('plain FOO MORK MINK', re.compile('FOO MORK MINK')), ); # The text size is calculated to work out to be about 22K, which # is big enough for various effects to kick in. (We hope.) # a separate class is too heavyweight here; sub make_tblock { my ($txt, $pos, $bit) = @_; my $self = {}; $self->{bit} = $bit; $self->{posn} = $pos; # pos is a reserved word in perl my $fmt = "%60s\n"; $bit = sprintf($fmt ,"some text $bit around the bit."); $self->{text} = { "none", sub { $_[1] x 50 . sprintf($fmt, "abc def ghi");}, "early", sub { $_[1] x 15 . $_[0] . $_[1] x 35 }, "middle", sub { $_[1] x 25 . $_[0] . $_[1] x 25 }, "late", sub { $_[1] x 35 . $_[0] . $_[1] x 15 }, "end", sub { $_[1] x 50 . $_[0] } }->{$pos}->($bit,$txt); $self; } sub expl_tblock { my ($txtblk) = shift; if ($txtblk->{posn} eq "none") {"plain unmatching"} else {"$txtblk->{posn} $txtblk->{bit}"} } sub gentext { my $x; ( make_tblock($basetxt, 'none', ''), map {$x=$_;map {make_tblock($basetxt,$x,$_)} @bits} qw(early middle late end) ); } # loops should maybe vary, but ennh. 10000, 5000, depends on the # size of the text. use constant LOOPS => 2000; sub timer { my ($re, $txt) = @_; my $t0 = [gettimeofday]; if (ref($re) eq 'ARRAY') { my ($r1,$r2) = @$re; my $i; for $i (1..LOOPS) { $txt =~ /$r1/ or $txt =~ /$r2/ } } else { my $r1 = $re; my $i; for $i (1..LOOPS) { $txt =~ /$r1/ } } tv_interval($t0)/LOOPS; } sub testit { my ($res, $texts) = @_; my $fmt = join('',"%-20s ", map {" %18s"} @$res) . "\n"; printf($fmt, "Text type:", map {$_->[0]} @$res); for my $i (@$texts) { my $txt = $i->{text}; my @times = map{timer($_->[1], $txt)} @$res; # was '%.3g', but that flips to scientific notation # sometime, which I hate. %.2f always prints digits. my @ttxt = map {sprintf("%.4g usecs",$_ * 1e6)} @times; printf($fmt,expl_tblock($i),@ttxt); #for expl, rex in res: # print "\t%.3g usecs\t%s" % (timer(rex, txt) * 1e6, # expl) } } #def main(): # testit(regexps, gentext()) # #if __name__ == "__main__": # main() # the __name__ == "__main__" idiom doesn't work in perl testit([@regexps], [gentext]);