testing/xdelta3-regtest.py - platform/external/xdelta3 - Git at Google

 #!/usr/bin/python2.6
 # xdelta 3 - delta compression tools and library
 # Copyright (C) 2003, 2006, 2007, 2008.  Joshua P. MacDonald
 #
 #  This program is free software; you can redistribute it and/or modify
 #  it under the terms of the GNU General Public License as published by
 #  the Free Software Foundation; either version 2 of the License, or
 #  (at your option) any later version.
 #
 #  This program is distributed in the hope that it will be useful,
 #  but WITHOUT ANY WARRANTY; without even the implied warranty of
 #  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #  GNU General Public License for more details.
 #
 #  You should have received a copy of the GNU General Public License
 #  along with this program; if not, write to the Free Software
 #  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 # TODO: test 1.5 vs. greedy

 import os, sys, math, re, time, types, array, random
 import xdelta3

 #RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS'
 #RCSDIR = '/tmp/PRCS_read_copy'
 #SAMPLEDIR = "/tmp/WESNOTH_tmp/diff"

 #RCSDIR = 'G:/jmacd/PRCS_copy'
 #SAMPLEDIR = "C:/sample_data/Wesnoth/tar"

 RCSDIR = '/Users/jmacd/src/ftp.kernel.org'
 SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux'

 #
 MIN_SIZE       = 0

 TIME_TOO_SHORT = 0.050

 SKIP_TRIALS    = 2
 MIN_TRIALS     = 3
 MAX_TRIALS     = 15

 # 10 = fast 1.5 = slow
 MIN_STDDEV_PCT = 1.5

 # How many results per round
 MAX_RESULTS = 500
 TEST_ROUNDS = 10
 KEEP_P = (0.5)

 # For RCS testing, what percent to select
 FILE_P = (0.50)

 # For run-speed tests
 MIN_RUN = 1000 * 1000 * 1
 MAX_RUN = 1000 * 1000 * 10

 # Testwide defaults
 ALL_ARGS = [
     '-q'  # '-vv'
     ]

 # The first 7 args go to -C
 SOFT_CONFIG_CNT = 7

 CONFIG_ORDER = [ 'large_look',
                  'large_step',
                  'small_look',
                  'small_chain',
                  'small_lchain',
                  'max_lazy',
                  'long_enough',

                  # > SOFT_CONFIG_CNT
                  'nocompress',
                  'winsize',
                  'srcwinsize',
                  'sprevsz',
                  'iopt',
                  'djw',
                  'altcode',
                  ]

 CONFIG_ARGMAP = {
     'winsize'    : '-W',
     'srcwinsize' : '-B',
     'sprevsz'    : '-P',
     'iopt'       : '-I',
     'nocompress' : '-N',
     'djw'        : '-Sdjw',
     'altcode'    : '-T',
     }

 def INPUT_SPEC(rand):
     return {

     # Time/space costs:

     # -C 1,2,3,4,5,6,7
     'large_look' : lambda d: rand.choice([9, 10, 11, 12]),
     'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]),
     'small_look'   : lambda d: rand.choice([4]),
     'small_chain'  : lambda d: rand.choice([1]),
     'small_lchain' : lambda d: rand.choice([1]),
     'max_lazy'     : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]),

     # Note: long_enough only refers to small matching and has no effect if
     # small_chain == 1.
     'long_enough'  : lambda d: rand.choice([4]),

     # -N
     'nocompress'   : lambda d: rand.choice(['false']),

     # -T
     'altcode'      : lambda d: rand.choice(['false']),

     # -S djw
     'djw'          : lambda d: rand.choice(['false']),

     # Memory costs:

     # -W
     'winsize'      : lambda d: 8 * (1<<20),

     # -B
     'srcwinsize'   : lambda d: 64 * (1<<20),

     # -I 0 is unlimited
     'iopt'         : lambda d: 0,

     # -P only powers of two
     'sprevsz'      : lambda d: rand.choice([x * (1<<16) for x in [4]]),
   }
 #end

 #
 TMPDIR = '/tmp/xd3regtest.%d' % os.getpid()

 RUNFILE = os.path.join(TMPDIR, 'run')
 DFILE   = os.path.join(TMPDIR, 'output')
 RFILE   = os.path.join(TMPDIR, 'recon')
 CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1')
 CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2')

 HEAD_STATE = 0
 BAR_STATE  = 1
 REV_STATE  = 2
 DATE_STATE = 3

 #
 IGNORE_FILENAME  = re.compile('.*\\.(gif|jpg).*')

 # rcs output
 RE_TOTREV  = re.compile('total revisions: (\\d+)')
 RE_BAR     = re.compile('----------------------------')
 RE_REV     = re.compile('revision (.+)')
 RE_DATE    = re.compile('date: ([^;]+);.*')
 # xdelta output
 RE_HDRSZ   = re.compile('VCDIFF header size: +(\\d+)')
 RE_EXTCOMP = re.compile('XDELTA ext comp.*')

 def c2str(c):
     return ' '.join(['%s' % x for x in c])
 #end

 def SumList(l):
     return reduce(lambda x,y: x+y, l)
 #end

 # returns (total, mean, stddev, q2 (median),
 #          (q3-q1)/2 ("semi-interquartile range"), max-min (spread))
 class StatList:
     def __init__(self,l,desc):
         cnt = len(l)
         assert(cnt > 1)
         l.sort()
         self.cnt    = cnt
         self.l      = l
         self.total  = SumList(l)
         self.mean   = self.total / float(self.cnt)
         self.s      = math.sqrt(SumList([(x-self.mean) *
                                          (x - self.mean) for x in l]) /
                                 float(self.cnt-1))
         self.q0     = l[0]
         self.q1     = l[int(self.cnt/4.0+0.5)]
         self.q2     = l[int(self.cnt/2.0+0.5)]
         self.q3     = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))]
         self.q4     = l[self.cnt-1]
         self.siqr   = (self.q3-self.q1)/2.0;
         self.spread = (self.q4-self.q0)
         if len(l) == 1:
             self.str = '%s %s' % (desc, l[0])
         else:
             self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \
                 (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4)
     #end
 #end

 def RunCommand(args, ok = [0]):
     #print 'run command %s' % (' '.join(args))
     p = os.spawnvp(os.P_WAIT, args[0], args)
     if p not in ok:
         raise CommandError(args, 'exited %d' % p)
     #end
 #end

 def RunCommandIO(args,infn,outfn):
     p = os.fork()
     if p == 0:
         os.dup2(os.open(infn,os.O_RDONLY),0)
         os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1)
         os.execvp(args[0], args)
     else:
         s = os.waitpid(p,0)
         o = os.WEXITSTATUS(s[1])
         if not os.WIFEXITED(s[1]) or o != 0:
             raise CommandError(args, 'exited %d' % o)
         #end
     #end
 #end

 class TimedTest:
     def __init__(self, target, source, runnable,
                  skip_trials = SKIP_TRIALS,
                  min_trials = MIN_TRIALS,
                  max_trials = MAX_TRIALS,
                  min_stddev_pct = MIN_STDDEV_PCT):
         self.target = target
         self.source = source
         self.runnable = runnable

         self.skip_trials = skip_trials
         self.min_trials = min(min_trials, max_trials)
         self.max_trials = max_trials
         self.min_stddev_pct = min_stddev_pct

         self.encode_time = self.DoTest(DFILE,
                                        lambda x: x.Encode(self.target,
                                                           self.source, DFILE))
         self.encode_size = runnable.EncodeSize(DFILE)

         self.decode_time = self.DoTest(RFILE,
                                        lambda x: x.Decode(DFILE,
                                                           self.source, RFILE),
                                        )
         runnable.Verify(self.target, RFILE)
     #end

     def DoTest(self, fname, func):
         trials   = 0
         measured = []

         while 1:
             try:
                 os.remove(fname)
             except OSError:
                 pass

             start_time  = time.time()
             start_clock = time.clock()

             func(self.runnable)

             total_clock = (time.clock() - start_clock)
             total_time  = (time.time() - start_time)

             elap_time  = max(total_time,  0.0000001)
             elap_clock = max(total_clock, 0.0000001)

             trials = trials + 1

             # skip some of the first trials
             if trials > self.skip_trials:
                 measured.append((elap_clock, elap_time))
                 #print 'measurement total: %.1f ms' % (total_time * 1000.0)

             # at least so many
             if trials < (self.skip_trials + self.min_trials):
                 #print 'continue: need more trials: %d' % trials
                 continue

             # compute %variance
             done = 0
             if self.skip_trials + self.min_trials <= 2:
                 measured = measured + measured;
                 done = 1
             #end

             time_stat = StatList([x[1] for x in measured], 'elap time')
             sp = float(time_stat.s) / float(time_stat.mean)

             # what if MAX_TRIALS is exceeded?
             too_many = (trials - self.skip_trials) >= self.max_trials
             good = (100.0 * sp) < self.min_stddev_pct
             if done or too_many or good:
                 trials = trials - self.skip_trials
                 if not done and not good:
                     #print 'too many trials: %d' % trials
                     pass
                 #clock = StatList([x[0] for x in measured], 'elap clock')
                 return time_stat
             #end
         #end
     #end
 #end

 def Decimals(start, end):
     l = []
     step = start
     while 1:
         r = range(step, step * 10, step)
         l = l + r
         if step * 10 >= end:
             l.append(step * 10)
             break
         step = step * 10
     return l
 #end

 # This tests the raw speed of 0-byte inputs
 def RunSpeedTest():
     for L in Decimals(MIN_RUN, MAX_RUN):
         SetFileSize(RUNFILE, L)

         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)]))
         ReportSpeed(L, trx, '1MB ')

         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)]))
         ReportSpeed(L, trx, '512k')

         trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)]))
         ReportSpeed(L, trx, '256k')

         trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE))
         ReportSpeed(L, trm, 'swig')

         trg = TimedTest(RUNFILE, None, GzipRun1())
         ReportSpeed(L,trg,'gzip')
     #end
 #end

 def SetFileSize(F,L):
     fd = os.open(F, os.O_CREAT | os.O_WRONLY)
     os.ftruncate(fd,L)
     assert os.fstat(fd).st_size == L
     os.close(fd)
 #end

 def ReportSpeed(L,tr,desc):
     print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \
           (desc, L,
            tr.encode_size,
            tr.encode_time.mean * 1000.0,
            tr.decode_time.mean * 1000.0)
 #end

 class Xdelta3RunClass:
     def __init__(self, extra):
         self.extra = extra
     #end

     def __str__(self):
         return ' '.join(self.extra)
     #end

     def New(self):
         return Xdelta3Runner(self.extra)
     #end
 #end

 class Xdelta3Runner:
     # Use "forkexec" to get special command-line only features like
     # external compression support.
     def __init__(self, extra, forkexec=False):
         self.forkexec = forkexec
         self.extra = extra
     #end

     def Encode(self, target, source, output):
         args = (ALL_ARGS +
                 self.extra +
                 ['-e'])
         if source:
             args.append('-s')
             args.append(source)
         #end
         args = args + [target, output]
         self.Main(args)
     #end

     def Decode(self, input, source, output):
         args = (ALL_ARGS +
                 ['-d'])
         if source:
             args.append('-s')
             args.append(source)
         #end
         args = args + [input, output]
         self.Main(args)
     #end

     def Verify(self, target, recon):
         if target[-3:] == ".gz":
             RunCommandIO(('gzip', '-dc'), target, CMPTMP1)
             RunCommandIO(('gzip', '-dc'), recon, CMPTMP2)
             RunCommand(('cmp', CMPTMP1, CMPTMP2))
         else:
             RunCommand(('cmp', target, recon))
     #end

     def EncodeSize(self, output):
         return os.stat(output).st_size
     #end

     def Main(self, args):
         try:
             if self.forkexec:
                 RunCommand(['../xdelta3'] + args)
             else:
                 xdelta3.xd3_main_cmdline(args)
         except Exception, e:
             raise CommandError(args, "xdelta3.main exception: %s" % e)
         #end
     #end
 #end

 class Xdelta3Mod1:
     def __init__(self, file):
         self.target_data = open(file, 'r').read()
     #end

     def Encode(self, ignore1, ignore2, ignore3):
         r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10)
         if r1 != 0:
             raise CommandError('memory', 'encode failed: %s' % r1)
         #end
         self.encoded = encoded
     #end

     def Decode(self, ignore1, ignore2, ignore3):
         r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data))
         if r2 != 0:
             raise CommandError('memory', 'decode failed: %s' % r1)
         #end
         self.decoded = data1
     #end

     def Verify(self, ignore1, ignore2):
         if self.target_data != self.decoded:
             raise CommandError('memory', 'bad decode')
         #end
     #end

     def EncodeSize(self, ignore1):
         return len(self.encoded)
     #end
 #end

 class GzipRun1:
     def Encode(self, target, source, output):
         assert source == None
         RunCommandIO(['gzip', '-cf'], target, output)
     #end

     def Decode(self, input, source, output):
         assert source == None
         RunCommandIO(['gzip', '-dcf'], input, output)
     #end

     def Verify(self, target, recon):
         RunCommand(('cmp', target, recon))
     #end

     def EncodeSize(self, output):
         return os.stat(output).st_size
     #end
 #end

 class Xdelta1RunClass:
     def __str__(self):
         return 'xdelta1'
     #end

     def New(self):
         return Xdelta1Runner()
     #end
 #end

 class Xdelta1Runner:
     def Encode(self, target, source, output):
         assert source != None
         args = ['xdelta1', 'delta', '-q', source, target, output]
         RunCommand(args, [0, 1])
     #end

     def Decode(self, input, source, output):
         assert source != None
         args = ['xdelta1', 'patch', '-q', input, source, output]
         # Note: for dumb historical reasons, xdelta1 returns 1 or 0
         RunCommand(args)
     #end

     def Verify(self, target, recon):
         RunCommand(('cmp', target, recon))
     #end

     def EncodeSize(self, output):
         return os.stat(output).st_size
     #end
 #end

 # exceptions
 class SkipRcsException:
     def __init__(self,reason):
         self.reason = reason
     #end
 #end

 class NotEnoughVersions:
     def __init__(self):
         pass
     #end
 #end

 class CommandError:
     def __init__(self,cmd,str):
         if type(cmd) is types.TupleType or \
            type(cmd) is types.ListType:
             cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd)
         #end
         print 'command was: ',cmd
         print 'command failed: ',str
         print 'have fun debugging'
     #end
 #end

 class RcsVersion:
     def __init__(self,vstr):
         self.vstr = vstr
     #end
     def __cmp__(self,other):
         return cmp(self.date, other.date)
     #end
     def __str__(self):
         return str(self.vstr)
     #end
 #end

 class RcsFile:

     def __init__(self, fname):
         self.fname    = fname
         self.versions = []
         self.state    = HEAD_STATE
     #end

     def SetTotRev(self,s):
         self.totrev = int(s)
     #end

     def Rev(self,s):
         self.rev = RcsVersion(s)
         if len(self.versions) >= self.totrev:
             raise SkipRcsException('too many versions (in log messages)')
         #end
         self.versions.append(self.rev)
     #end

     def Date(self,s):
         self.rev.date = s
     #end

     def Match(self, line, state, rx, gp, newstate, f):
         if state == self.state:
             m = rx.match(line)
             if m:
                 if f:
                     f(m.group(gp))
                 #end
                 self.state = newstate
                 return 1
             #end
         #end
         return None
     #end

     def Sum1Rlog(self):
         f = os.popen('rlog '+self.fname, "r")
         l = f.readline()
         while l:
             if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev):
                 pass
             elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None):
                 pass
             elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev):
                 pass
             elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date):
                 pass
             #end
             l = f.readline()
         #end
         c = f.close()
         if c != None:
             raise c
         #end
     #end

     def Sum1(self):
         st = os.stat(self.fname)
         self.rcssize = st.st_size
         self.Sum1Rlog()
         if self.totrev != len(self.versions):
             raise SkipRcsException('wrong version count')
         #end
         self.versions.sort()
     #end

     def Checkout(self,n):
         v      = self.versions[n]
         out    = open(self.Verf(n), "w")
         cmd    = 'co -ko -p%s %s' % (v.vstr, self.fname)
         total  = 0
         (inf,
          stream,
          err)  = os.popen3(cmd, "r")
         inf.close()
         buf    = stream.read()
         while buf:
             total = total + len(buf)
             out.write(buf)
             buf = stream.read()
         #end
         v.vsize = total
         estr = ''
         buf = err.read()
         while buf:
             estr = estr + buf
             buf = err.read()
         #end
         if stream.close():
             raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr))
         #end
         out.close()
         err.close()
     #end

     def Vdate(self,n):
         return self.versions[n].date
     #end

     def Vstr(self,n):
         return self.versions[n].vstr
     #end

     def Verf(self,n):
         return os.path.join(TMPDIR, 'input.%d' % n)
     #end

     def FilePairsByDate(self, runclass):
         if self.totrev < 2:
             raise NotEnoughVersions()
         #end
         self.Checkout(0)
         ntrials = []
         if self.totrev < 2:
             return vtrials
         #end
         for v in range(0,self.totrev-1):
             if v > 1:
                 os.remove(self.Verf(v-1))
             #end
             self.Checkout(v+1)
             if os.stat(self.Verf(v)).st_size < MIN_SIZE or \
                os.stat(self.Verf(v+1)).st_size < MIN_SIZE:
                 continue
             #end

             result = TimedTest(self.Verf(v+1),
                                self.Verf(v),
                                runclass.New())

             target_size = os.stat(self.Verf(v+1)).st_size

             ntrials.append(result)
         #end

         os.remove(self.Verf(self.totrev-1))
         os.remove(self.Verf(self.totrev-2))
         return ntrials
     #end

     def AppendVersion(self, f, n):
         self.Checkout(n)
         rf = open(self.Verf(n), "r")
         data = rf.read()
         f.write(data)
         rf.close()
         return len(data)
     #end

 class RcsFinder:
     def __init__(self):
         self.subdirs  = []
         self.rcsfiles = []
         self.others   = []
         self.skipped  = []
         self.biground = 0
     #end

     def Scan1(self,dir):
         dents = os.listdir(dir)
         subdirs  = []
         rcsfiles = []
         others   = []
         for dent in dents:
             full = os.path.join(dir, dent)
             if os.path.isdir(full):
                 subdirs.append(full)
             elif dent[len(dent)-2:] == ",v":
                 rcsfiles.append(RcsFile(full))
             else:
                 others.append(full)
             #end
         #end
         self.subdirs  = self.subdirs  + subdirs
         self.rcsfiles = self.rcsfiles + rcsfiles
         self.others   = self.others   + others
         return subdirs
     #end

     def Crawl(self, dir):
         subdirs = [dir]
         while subdirs:
             s1 = self.Scan1(subdirs[0])
             subdirs = subdirs[1:] + s1
         #end
     #end

     def Summarize(self):
         good = []
         for rf in self.rcsfiles:
             try:
                 rf.Sum1()
                 if rf.totrev < 2:
                     raise SkipRcsException('too few versions (< 2)')
                 #end
             except SkipRcsException, e:
                 #print 'skipping file %s: %s' % (rf.fname, e.reason)
                 self.skipped.append(rf)
             else:
                 good.append(rf)
             #end
         self.rcsfiles = good
     #end

     def AllPairsByDate(self, runclass):
         results = []
         good = []
         for rf in self.rcsfiles:
             try:
                 results = results + rf.FilePairsByDate(runclass)
             except SkipRcsException:
                 print 'file %s has compressed versions: skipping' % (rf.fname)
             except NotEnoughVersions:
                 print 'testing %s on %s: not enough versions' % (runclass, rf.fname)
             else:
                 good.append(rf)
             #end
         self.rcsfiles = good
         self.ReportPairs(runclass, results)
         return results
     #end

     def ReportPairs(self, name, results):
         encode_time = 0
         decode_time = 0
         encode_size = 0
         for r in results:
             encode_time += r.encode_time.mean
             decode_time += r.decode_time.mean
             encode_size += r.encode_size
         #end
         print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \
               (name, encode_time, decode_time, encode_size)
     #end

     def MakeBigFiles(self, rand):
         f1 = open(TMPDIR + "/big.1", "w")
         f2 = open(TMPDIR + "/big.2", "w")
         population = []
         for file in self.rcsfiles:
             if len(file.versions) < 2:
                 continue
             population.append(file)
         #end
         f1sz = 0
         f2sz = 0
         fcount = int(len(population) * FILE_P)
         assert fcount > 0
         for file in rand.sample(population, fcount):
             m = IGNORE_FILENAME.match(file.fname)
             if m != None:
                 continue
             #end
             r1, r2 = rand.sample(xrange(0, len(file.versions)), 2)
             f1sz += file.AppendVersion(f1, r1)
             f2sz += file.AppendVersion(f2, r2)
             #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):],
             #file.Vstr(r1), file.Vstr(r2)))
         #end
         testkey = 'rcs%d' % self.biground
         self.biground = self.biground + 1

         print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz)
         f1.close()
         f2.close()
         return (TMPDIR + "/big.1",
                 TMPDIR + "/big.2",
                 testkey)
     #end

     def Generator(self):
         return lambda rand: self.MakeBigFiles(rand)
     #end
 #end

 # find a set of RCS files for testing
 def GetTestRcsFiles():
     rcsf = RcsFinder()
     rcsf.Crawl(RCSDIR)
     if len(rcsf.rcsfiles) == 0:
         raise CommandError('', 'no RCS files')
     #end
     rcsf.Summarize()
     print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (
         len(rcsf.rcsfiles),
         len(rcsf.subdirs),
         len(rcsf.others),
         len(rcsf.skipped))
     print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str
     print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str
     return rcsf
 #end

 class SampleDataTest:
     def __init__(self, dirs):
         dirs_in = dirs
         self.pairs = []
         while dirs:
             d = dirs[0]
             dirs = dirs[1:]
             l = os.listdir(d)
             files = []
             for e in l:
                 p = os.path.join(d, e)
                 if os.path.isdir(p):
                     dirs.append(p)
                 else:
                     files.append(p)
                 #end
             #end
             if len(files) > 1:
                 files.sort()
                 for x in xrange(len(files)):
                     for y in xrange(len(files)):
                         self.pairs.append((files[x], files[y],
                                            '%s-%s' % (files[x], files[y])))
                     #end
                 #end
             #end
         #end
         print "Sample data test using %d file pairs in %s" % (
             len(self.pairs), dirs_in)
     #end

     def Generator(self):
         return lambda rand: rand.choice(self.pairs)
     #end
 #end

 # configs are represented as a list of values,
 # program takes a list of strings:
 def ConfigToArgs(config):
     args = [ '-C',
              ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])]
     for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)):
         key = CONFIG_ARGMAP[CONFIG_ORDER[i]]
         val = config[i]
         if val == 'true' or val == 'false':
             if val == 'true':
                 args.append('%s' % key)
             #end
         else:
             args.append('%s=%s' % (key, val))
         #end
     #end
     return args
 #end

 #
 class RandomTest:
     def __init__(self, tnum, tinput, config, syntuple = None):
         self.mytinput = tinput[2]
         self.myconfig = config
         self.tnum = tnum

         if syntuple != None:
             self.runtime = syntuple[0]
             self.compsize = syntuple[1]
             self.decodetime = None
         else:
             args = ConfigToArgs(config)
             result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args))

             self.runtime = result.encode_time.mean
             self.compsize = result.encode_size
             self.decodetime = result.decode_time.mean
         #end

         self.score = None
         self.time_pos = None
         self.size_pos = None
         self.score_pos = None
     #end

     def __str__(self):
         decodestr = ' %s' % self.decodetime
         return 'time %.6f%s size %d%s << %s >>%s' % (
             self.time(), ((self.time_pos != None) and
                           (" (%s)" % self.time_pos) or ""),
             self.size(), ((self.size_pos != None) and
                           (" (%s)" % self.size_pos) or ""),
             c2str(self.config()),
             decodestr)
     #end

     def time(self):
         return self.runtime
     #end

     def size(self):
         return self.compsize
     #end

     def config(self):
         return self.myconfig
     #end

     def score(self):
         return self.score
     #end

     def tinput(self):
         return self.mytinput
     #end
 #end

 def PosInAlist(l, e):
     for i in range(0, len(l)):
         if l[i][1] == e:
             return i;
         #end
     #end
     return -1
 #end

 # Generates a set of num_results test configurations, given the list of
 # retest-configs.
 def RandomTestConfigs(rand, input_configs, num_results):

     outputs = input_configs[:]
     have_set = dict([(c,c) for c in input_configs])

     # Compute a random configuration
     def RandomConfig():
         config = []
         cmap = {}
         for key in CONFIG_ORDER:
             val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap)
             config.append(val)
         #end
         return tuple(config)
     #end

     while len(outputs) < num_results:
         newc = None
         for i in xrange(100):
             c = RandomConfig()
             if have_set.has_key(c):
                 continue
             #end
             have_set[c] = c
             newc = c
             break
         if newc is None:
             print 'stopped looking for configs at %d' % len(outputs)
             break
         #end
         outputs.append(c)
     #end
     outputs.sort()
     return outputs
 #end

 def RunOptimizationLoop(rand, generator, rounds):
     configs = []
     for rnum in xrange(rounds):
         configs = RandomTestConfigs(rand, configs, MAX_RESULTS)
         tinput = generator(rand)
         tests = []
         for x in xrange(len(configs)):
             t = RandomTest(x, tinput, configs[x])
             print 'Round %d test %d: %s' % (rnum, x, t)
             tests.append(t)
         #end
         results = ScoreTests(tests)

         for r in results:
             c = r.config()
             if not test_all_config_results.has_key(c):
                 test_all_config_results[c] = [r]
             else:
                 test_all_config_results[c].append(r)
             #end
         #end

         #GraphResults('expt%d' % rnum, results)
         #GraphSummary('sum%d' % rnum, results)

         # re-test some fraction
         configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]]
     #end
 #end

 # TODO: cleanup
 test_all_config_results = {}

 def ScoreTests(results):
     scored = []
     timed = []
     sized = []

     t_min = float(min([test.time() for test in results]))
     #t_max = float(max([test.time() for test in results]))
     s_min = float(min([test.size() for test in results]))
     #s_max = float(max([test.size() for test in results]))

     for test in results:

         # Hyperbolic function. Smaller scores still better
         red = 0.999  # minimum factors for each dimension are 1/1000
         test.score = ((test.size() - s_min * red) *
                       (test.time() - t_min * red))

         scored.append((test.score, test))
         timed.append((test.time(), test))
         sized.append((test.size(), test))
     #end

     scored.sort()
     timed.sort()
     sized.sort()

     best_by_size = []
     best_by_time = []

     pos = 0
     for (score, test) in scored:
         pos += 1
         test.score_pos = pos
     #end

     scored = [x[1] for x in scored]

     for test in scored:
         test.size_pos = PosInAlist(sized, test)
         test.time_pos = PosInAlist(timed, test)
     #end

     for test in scored:
         c = test.config()
         s = 0.0
         print 'H-Score: %0.9f %s' % (test.score, test)
     #end

     return scored
 #end

 def GraphResults(desc, results):
     f = open("data-%s.csv" % desc, "w")
     for r in results:
         f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r))
     #end
     f.close()
     os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc))
 #end

 def GraphSummary(desc, results_ignore):
     test_population = 0
     config_ordered = []

     # drops duplicate test/config pairs (TODO: don't retest them)
     for config, cresults in test_all_config_results.items():
         input_config_map = {}
         uniq = []
         for test in cresults:
             assert test.config() == config
             test_population += 1
             key = test.tinput()
             if not input_config_map.has_key(key):
                 input_config_map[key] = {}
             #end
             if input_config_map[key].has_key(config):
                 print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test)
                 continue
             #end
             input_config_map[key][config] = test
             uniq.append(test)
         #end
         config_ordered.append(uniq)
     #end

     # sort configs descending by number of tests
     config_ordered.sort(lambda x, y: len(y) - len(x))

     print 'population %d: %d configs %d results' % \
           (test_population,
            len(config_ordered),
            len(config_ordered[0]))

     if config_ordered[0] == 1:
         return
     #end

     # a map from test-key to test-list w/ various configs
     input_set = {}
     osize = len(config_ordered)

     for i in xrange(len(config_ordered)):
         config = config_ordered[i][0].config()
         config_tests = config_ordered[i]

         #print '%s has %d tested inputs' % (config, len(config_tests))

         if len(input_set) == 0:
             input_set = dict([(t.tinput(), [t]) for t in config_tests])
             continue
         #end

         # a map from test-key to test-list w/ various configs
         update_set = {}
         for r in config_tests:
             t = r.tinput()
             if input_set.has_key(t):
                 update_set[t] = input_set[t] + [r]
             else:
                 #print 'config %s does not have test %s' % (config, t)
                 pass
             #end
         #end

         if len(update_set) <= 1:
             break
         #end

         input_set = update_set

         # continue if there are more w/ the same number of inputs
         if i < (len(config_ordered) - 1) and \
            len(config_ordered[i + 1]) == len(config_tests):
             continue
         #end

         # synthesize results for multi-test inputs
         config_num = None

         # map of config to sum(various test-keys)
         smap = {}
         for (key, tests) in input_set.items():
             if config_num == None:
                 # config_num should be the same in all elements
                 config_num = len(tests)
                 smap = dict([(r.config(),
                               (r.time(),
                                r.size()))
                              for r in tests])
             else:
                 # compuate the per-config sum of time/size
                 assert config_num == len(tests)
                 smap = dict([(r.config(),
                               (smap[r.config()][0] + r.time(),
                                smap[r.config()][1] + r.size()))
                              for r in tests])
             #end
         #end

         if config_num == 1:
             continue
         #end

         if len(input_set) == osize:
             break
         #end

         summary = '%s-%d' % (desc, len(input_set))
         osize = len(input_set)

         print 'generate %s w/ %d configs' % (summary, config_num)
         syn = [RandomTest(0, (None, None, summary), config,
                           syntuple = (smap[config][0], smap[config][1]))
                for config in smap.keys()]
         syn = ScoreTests(syn)
         #print 'smap is %s' % (smap,)
         #print 'syn is %s' % (' and '.join([str(x) for x in syn]))
         #GraphResults(summary, syn)
     #end
 #end

 def RunRegressionTest(pairs, rounds):
     for args in [
         [],
         ['-S=djw'],
         ['-B=412907520'],
         ['-B 412907520', ],

                  ]:
         print "Args %s" % (args)
         for (file1, file2, testkey) in pairs:
             ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True),
                               skip_trials = 0,
                               min_trials = 1,
                               max_trials = 1)
             print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % (
                 file1, file2,
                 ttest.encode_time.str,
                 ttest.decode_time.str,
                 ttest.encode_size)
     #end
 #end

 if __name__ == "__main__":
     try:
         RunCommand(['rm', '-rf', TMPDIR])
         os.mkdir(TMPDIR)

         #rcsf = GetTestRcsFiles()
         #generator = rcsf.Generator()

         sample = SampleDataTest([SAMPLEDIR])
         generator = sample.Generator()

         rand = random.Random(135135135135135)

         RunRegressionTest(sample.pairs, TEST_ROUNDS)

         #RunSpeedTest()

         # the idea below is to add the default configurations and
         # xdelta1 to the optimization loop:
         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6']))
         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9']))
         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw']))
         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw']))
         #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T']))
         #x1r = rcsf.AllPairsByDate(Xdelta1RunClass())

     except CommandError:
         pass
     else:
         RunCommand(['rm', '-rf', TMPDIR])
         pass
     #end
 #end