#!/usr/bin/python
#
# re benchmarking

import re

# Filler text used as the haystack for every benchmark.  TextBlock repeats
# it many times to build each test string.  It is deliberately written so
# that neither the strings being searched for nor their leading pieces
# (not even a capital F) appear anywhere in it.
basetxt = """
This is some base text in which the match does not occur, nor do match
precursors, but we do find all of the regular features that I expect to
find in routine text, like newlines. And sentences.

And even paragraphs. We'll soon have to throw in _some_ non-ascii
characters, although I'm not being crazy enough to go all the way
to utf-8 or something.

== No, really.

So this is the end. We'll composite this together repeatedly to get
the actual test strings.
"""

# Fragments that get planted in the test text.  A full match is the
# precursor FOO followed by either BAR BAZ or MORK MINK; everything else
# here is only a partial piece of one.
bits = (
	# The two tails with no FOO in front of them.
	"BAR BAZ", "MORK MINK",
	# Ever-longer starts of the precursor, up to all of it.
	"F", "FO", "FOO",
	# The precursor plus the first word of each tail.
	"FOO BAR", "FOO MORK",
	# The two complete matches.
	"FOO BAR BAZ", "FOO MORK MINK",
)

# Regular expressions to benchmark, as (label, matcher) pairs.  testit()
# prints the label as the column heading.  A matcher is either a single
# compiled regexp or a tuple of two compiled regexps; timer() handles the
# tuple form by searching with the first and, only if that finds nothing,
# with the second.
regexps = (
	('prefix alternate', re.compile('FOO (BAR BAZ|MORK MINK)')),
	('plain alternate', re.compile('(FOO BAR BAZ|FOO MORK MINK)')),
	('two regexps',
	 (re.compile('FOO BAR BAZ'), re.compile('FOO MORK MINK'))),
	('FOO+ alternate', re.compile('FOO+ (BAR BAZ|MORK MINK)')),
	('F+OO alternate', re.compile('F+OO (BAR BAZ|MORK MINK)')),
	('plain FOO BAR BAZ', re.compile('FOO BAR BAZ')),
	('(?=FOO) start', re.compile('(?=FOO)(FOO BAR BAZ|FOO MORK MINK)')),

	# Optional variants: comment these in or out for test runs.
	# FIXME: should be some sort of command line argument.
	#('three alternates', re.compile('(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK)')),
	#('four alternates', re.compile('(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK|FOO ABR ACO)')),
	#('two no prefix', re.compile('(?:BAR BAZ|MORK MINK)')),
	#('three no prefix', re.compile('(BAR BAZ|MORK MINK|HIK HAEK)')),
	#('four no prefix', re.compile('(BAR BAZ|MORK MINK|HIK HAEK|ABR ACO)')),

	# This is really not an interesting variant, so I take it
	# out for now.
	#('plain FOO MORK MINK', re.compile('FOO MORK MINK')),
	)


# The text size is calculated to work out to be about 22K, which
# is big enough for various effects to kick in. (We hope.)
class TextBlock(object):
	"""One benchmark input: 50 copies of a base text plus one extra line.

	The constructor takes:
	  txt -- the base text that gets repeated.
	  pos -- where the extra line goes: "none", "early", "middle",
	         "late" or "end".
	  bit -- the fragment embedded in the extra line (not used in the
	         text when pos is "none").

	For "none" the extra line is a harmless "abc def ghi" appended after
	all 50 copies.  For the other positions the extra line is
	"some text <bit> around the bit." inserted after 15, 25, 35 or 50
	copies respectively.  Either way the extra line is right-justified
	in a 60-character field and ends with a newline.

	The resulting string is available as .text; .pos and .bit keep the
	constructor arguments.  An unrecognised pos raises ValueError.
	"""

	# Every extra line is right-justified to 60 columns.
	_LINE_FMT = "%60s\n"

	# pos -> (copies of txt before the extra line, copies after it).
	_SPLITS = {
		'early': (15, 35),
		'middle': (25, 25),
		'late': (35, 15),
		'end': (50, 0),
	}

	def __init__(self, txt, pos, bit):
		self.bit = bit
		self.pos = pos
		if pos == "none":
			self.text = txt * 50 + (self._LINE_FMT % "abc def ghi")
			return
		split = self._SPLITS.get(pos)
		if split is None:
			# Fail here rather than leave self.text unset and blow up
			# later with a confusing AttributeError.
			raise ValueError("unknown text position: %r" % (pos,))
		before, after = split
		line = self._LINE_FMT % ("some text " + bit + " around the bit.",)
		self.text = txt * before + line + txt * after

	def expl(self):
		"""Return a short label for this block, used as its row heading."""
		if self.pos == "none":
			return "plain unmatching"
		return "%s %s" % (self.pos, self.bit)

def gentext():
	"""Return the list of TextBlocks to benchmark against.

	The first entry is the block with no planted fragment.  After it
	comes one block for every combination of position ('early',
	'middle', 'late', 'end') and fragment in bits, with the position
	varying slowest.  Every block is built on basetxt.
	"""
	blocks = [TextBlock(basetxt, 'none', '')]
	for where in ('early', 'middle', 'late', 'end'):
		for fragment in bits:
			blocks.append(TextBlock(basetxt, where, fragment))
	return blocks

import time, itertools
# How many searches timer() runs per measurement; the time it reports is
# the average over this many iterations.  This should maybe vary (10000,
# 5000, ...) depending on the size of the text, but a fixed count will do.
LOOPS = 2000
def timer(rex, txt, loops=None):
	"""Return the average time, in seconds, of one search of txt with rex.

	rex is either a compiled regular expression or a tuple of compiled
	regular expressions.  For a tuple, one "search" means searching with
	the first regexp and, only if that finds nothing, with the second;
	any elements beyond the first two are ignored.

	loops is the number of searches to time.  It defaults to the
	module-level LOOPS and must be at least 1, otherwise ValueError is
	raised.
	"""
	# Imported locally so this function brings its own clock with it.
	# default_timer is the clock the timeit module itself picks for
	# benchmarking on the running platform and Python version.
	import timeit
	if loops is None:
		loops = LOOPS
	if loops < 1:
		raise ValueError("loops must be at least 1, not %r" % (loops,))
	clock = timeit.default_timer
	# itertools.repeat keeps the per-iteration loop overhead low.
	it = itertools.repeat(None, loops)
	t0 = clock()
	if isinstance(rex, tuple):
		_r1 = rex[0]
		_r2 = rex[1]
		for _ in it:
			# The result is deliberately thrown away; only the time
			# taken matters.
			_r1.search(txt) or _r2.search(txt)
	else:
		_r1 = rex
		for _ in it:
			_r1.search(txt)
	t1 = clock()
	return (t1 - t0) / loops

def testit(res, texts):
	fmt = "%-20s " + " %18s" * len(res)
	print fmt % (("Text type:",) + tuple([x[0] for x in res]))
	for i in texts:
		txt = i.text
		times = [timer(x[1], txt) for x in res]
		# was '%.3g', but that flips to scientific notation
		# sometime, which I hate. %.2f always prints digits.
		ttxt = ["%.4g usecs" % (x * 1e6) for x in times]
		print fmt % ((i.expl(),) + tuple(ttxt))
		
		#for expl, rex in res:
		#	print "\t%.3g usecs\t%s" % (timer(rex, txt) * 1e6,
		#				    expl)

def main():
	"""Generate the test texts and print the timing table for regexps."""
	texts = gentext()
	testit(regexps, texts)

# Run the benchmark only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
	main()