#!/usr/bin/python
#
# re benchmarking
#
# Module-level data for the benchmark: the filler text, the probe
# fragments spliced into it, and the labelled regular expressions that
# get timed against the result.
import re

# Filler text. It contains none of the probe fragments below, so any
# match found in a test string comes from the spliced-in probe line.
basetxt = """
This is some base text in which the match does not occur, nor do
match precursors, but we do find all of the regular features that I
expect to find in routine text, like newlines. And sentences.

And even paragraphs. We'll soon have to throw in _some_ non-ascii
characters, although I'm not being crazy enough to go all the way to
utf-8 or something.
==
No, really.

So this is the end. We'll composite this together repeatedly to get
the actual test strings.
"""

# The precursor is FOO, followed by either BAR BAZ or MORK MINK.
# The fragments cover the two tails on their own, progressively longer
# partial prefixes, and finally the two complete matches.
bits = (
    "BAR BAZ",
    "MORK MINK",
    "F",
    "FO",
    "FOO",
    "FOO BAR",
    "FOO MORK",
    "FOO BAR BAZ",
    "FOO MORK MINK",
)

# Regular expressions.
# Each entry is (label, compiled pattern); the 'two regexps' entry
# carries a tuple of two compiled patterns instead of a single one.
regexps = (
    ("prefix alternate", re.compile("FOO (BAR BAZ|MORK MINK)")),
    ("plain alternate", re.compile("(FOO BAR BAZ|FOO MORK MINK)")),
    ("two regexps", (re.compile("FOO BAR BAZ"),
                     re.compile("FOO MORK MINK"))),
    ("FOO+ alternate", re.compile("FOO+ (BAR BAZ|MORK MINK)")),
    ("F+OO alternate", re.compile("F+OO (BAR BAZ|MORK MINK)")),
    ("plain FOO BAR BAZ", re.compile("FOO BAR BAZ")),
    ("(?=FOO) start", re.compile("(?=FOO)(FOO BAR BAZ|FOO MORK MINK)")),
    # Comment these in or out for test runs.
    # FIXME: should be some sort of command line argument.
    #('three alternates', re.compile('(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK)')),
    #('four alternates', re.compile('(FOO BAR BAZ|FOO MORK MINK|FOO HIK HAEK|FOO ABR ACO)')),
    #('two no prefix', re.compile('(?:BAR BAZ|MORK MINK)')),
    #('three no prefix', re.compile('(BAR BAZ|MORK MINK|HIK HAEK)')),
    #('four no prefix', re.compile('(BAR BAZ|MORK MINK|HIK HAEK|ABR ACO)')),
    # This is really not an interesting variant, so I take it
    # out for now.
    #('plain FOO MORK MINK', re.compile('FOO MORK MINK')),
)

# The text size is calculated to work out to be about 22K, which
# is big enough for various effects to kick in. (We hope.)
class TextBlock: def __init__(self, txt, pos, bit): self.bit = bit self.pos = pos fmt = "%60s\n" bit = fmt % ("some text "+bit+" around the bit.",) if pos == "none": self.text = txt * 50 + (fmt % "abc def ghi") elif pos == 'early': self.text = txt * 15 + bit + txt * 35 elif pos == 'middle': self.text = txt * 25 + bit + txt * 25 elif pos == 'late': self.text = txt * 35 + bit + txt * 15 elif pos == 'end': self.text = txt * 50 + bit def expl(self): if self.pos == "none": return "plain unmatching" else: return "%s %s" % (self.pos, self.bit) def gentext(): def mk(pos, bit): return TextBlock(basetxt, pos, bit) l = [mk('none', '')] + \ [mk(x, y) for x in ('early', 'middle', 'late', 'end') for y in bits] return l import time, itertools # loops should maybe vary, but ennh. 10000, 5000, depends on the # size of the text. LOOPS = 2000 def timer(rex, txt): it = itertools.repeat(None, LOOPS) t0 = time.time() if isinstance(rex, tuple): _r1 = rex[0] _r2 = rex[1] for i in it: _r1.search(txt) or _r2.search(txt) else: _r1 = rex for i in it: _r1.search(txt) t1 = time.time() return (t1 - t0) / LOOPS def testit(res, texts): fmt = "%-20s " + " %18s" * len(res) print fmt % (("Text type:",) + tuple([x[0] for x in res])) for i in texts: txt = i.text times = [timer(x[1], txt) for x in res] # was '%.3g', but that flips to scientific notation # sometime, which I hate. %.2f always prints digits. ttxt = ["%.4g usecs" % (x * 1e6) for x in times] print fmt % ((i.expl(),) + tuple(ttxt)) #for expl, rex in res: # print "\t%.3g usecs\t%s" % (timer(rex, txt) * 1e6, # expl) def main(): testit(regexps, gentext()) if __name__ == "__main__": main()