#!/usr/bin/python2.6 # xdelta 3 - delta compression tools and library # Copyright (C) 2003, 2006, 2007, 2008. Joshua P. MacDonald # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # TODO: test 1.5 vs. greedy import os, sys, math, re, time, types, array, random import xdelta3main import xdelta3 #RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS' #RCSDIR = '/tmp/PRCS_read_copy' #SAMPLEDIR = "/tmp/WESNOTH_tmp/diff" #RCSDIR = 'G:/jmacd/PRCS_copy' #SAMPLEDIR = "C:/sample_data/Wesnoth/tar" RCSDIR = '/Users/jmacd/src/ftp.kernel.org' SAMPLEDIR = '/Users/jmacd/src/xdelta3/linux' # MIN_SIZE = 0 TIME_TOO_SHORT = 0.050 SKIP_TRIALS = 2 MIN_TRIALS = 3 MAX_TRIALS = 15 # 10 = fast 1.5 = slow MIN_STDDEV_PCT = 1.5 # How many results per round MAX_RESULTS = 500 TEST_ROUNDS = 10 KEEP_P = (0.5) # For RCS testing, what percent to select FILE_P = (0.50) # For run-speed tests MIN_RUN = 1000 * 1000 * 1 MAX_RUN = 1000 * 1000 * 10 # Testwide defaults ALL_ARGS = [ '-q' # '-vv' ] # The first 7 args go to -C SOFT_CONFIG_CNT = 7 CONFIG_ORDER = [ 'large_look', 'large_step', 'small_look', 'small_chain', 'small_lchain', 'max_lazy', 'long_enough', # > SOFT_CONFIG_CNT 'nocompress', 'winsize', 'srcwinsize', 'sprevsz', 'iopt', 'djw', 'altcode', ] CONFIG_ARGMAP = { 'winsize' : '-W', 'srcwinsize' : '-B', 'sprevsz' : '-P', 'iopt' : '-I', 'nocompress' : '-N', 'djw' : '-Sdjw', 'altcode' : '-T', } def INPUT_SPEC(rand): return { # Time/space costs: # -C 1,2,3,4,5,6,7 'large_look' : lambda d: rand.choice([9, 10, 11, 12]), 'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]), 'small_look' : lambda d: rand.choice([4]), 'small_chain' : lambda d: rand.choice([1]), 'small_lchain' : lambda d: rand.choice([1]), 'max_lazy' : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]), # Note: long_enough only refers to small matching and has no effect if # small_chain == 1. 'long_enough' : lambda d: rand.choice([4]), # -N 'nocompress' : lambda d: rand.choice(['false']), # -T 'altcode' : lambda d: rand.choice(['false']), # -S djw 'djw' : lambda d: rand.choice(['false']), # Memory costs: # -W 'winsize' : lambda d: 8 * (1<<20), # -B 'srcwinsize' : lambda d: 64 * (1<<20), # -I 0 is unlimited 'iopt' : lambda d: 0, # -P only powers of two 'sprevsz' : lambda d: rand.choice([x * (1<<16) for x in [4]]), } #end # TMPDIR = '/tmp/xd3regtest.%d' % os.getpid() RUNFILE = os.path.join(TMPDIR, 'run') DFILE = os.path.join(TMPDIR, 'output') RFILE = os.path.join(TMPDIR, 'recon') CMPTMP1 = os.path.join(TMPDIR, 'cmptmp1') CMPTMP2 = os.path.join(TMPDIR, 'cmptmp2') HEAD_STATE = 0 BAR_STATE = 1 REV_STATE = 2 DATE_STATE = 3 # IGNORE_FILENAME = re.compile('.*\\.(gif|jpg).*') # rcs output RE_TOTREV = re.compile('total revisions: (\\d+)') RE_BAR = re.compile('----------------------------') RE_REV = re.compile('revision (.+)') RE_DATE = re.compile('date: ([^;]+);.*') # xdelta output RE_HDRSZ = re.compile('VCDIFF header size: +(\\d+)') RE_EXTCOMP = re.compile('XDELTA ext comp.*') def c2str(c): return ' '.join(['%s' % x for x in c]) #end def SumList(l): return reduce(lambda x,y: x+y, l) #end # returns (total, mean, stddev, q2 (median), # (q3-q1)/2 ("semi-interquartile range"), max-min (spread)) class StatList: def __init__(self,l,desc): cnt = len(l) assert(cnt > 1) l.sort() self.cnt = cnt self.l = l self.total = SumList(l) self.mean = self.total / float(self.cnt) self.s = math.sqrt(SumList([(x-self.mean) * (x - self.mean) for x in l]) / float(self.cnt-1)) self.q0 = l[0] self.q1 = l[int(self.cnt/4.0+0.5)] self.q2 = l[int(self.cnt/2.0+0.5)] self.q3 = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))] self.q4 = l[self.cnt-1] self.siqr = (self.q3-self.q1)/2.0; self.spread = (self.q4-self.q0) if len(l) == 1: self.str = '%s %s' % (desc, l[0]) else: self.str = '%s mean %.1f: 25%-ile %d %d %d %d %d' % \ (desc, self.mean, self.q0, self.q1, self.q2, self.q3, self.q4) #end #end def RunCommand(args, ok = [0]): #print 'run command %s' % (' '.join(args)) p = os.spawnvp(os.P_WAIT, args[0], args) if p not in ok: raise CommandError(args, 'exited %d' % p) #end #end def RunCommandIO(args,infn,outfn): p = os.fork() if p == 0: os.dup2(os.open(infn,os.O_RDONLY),0) os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1) os.execvp(args[0], args) else: s = os.waitpid(p,0) o = os.WEXITSTATUS(s[1]) if not os.WIFEXITED(s[1]) or o != 0: raise CommandError(args, 'exited %d' % o) #end #end #end class TimedTest: def __init__(self, target, source, runnable, skip_trials = SKIP_TRIALS, min_trials = MIN_TRIALS, max_trials = MAX_TRIALS, min_stddev_pct = MIN_STDDEV_PCT): self.target = target self.source = source self.runnable = runnable self.skip_trials = skip_trials self.min_trials = min(min_trials, max_trials) self.max_trials = max_trials self.min_stddev_pct = min_stddev_pct self.encode_time = self.DoTest(DFILE, lambda x: x.Encode(self.target, self.source, DFILE)) self.encode_size = runnable.EncodeSize(DFILE) self.decode_time = self.DoTest(RFILE, lambda x: x.Decode(DFILE, self.source, RFILE), ) runnable.Verify(self.target, RFILE) #end def DoTest(self, fname, func): trials = 0 measured = [] while 1: try: os.remove(fname) except OSError: pass start_time = time.time() start_clock = time.clock() func(self.runnable) total_clock = (time.clock() - start_clock) total_time = (time.time() - start_time) elap_time = max(total_time, 0.0000001) elap_clock = max(total_clock, 0.0000001) trials = trials + 1 # skip some of the first trials if trials > self.skip_trials: measured.append((elap_clock, elap_time)) #print 'measurement total: %.1f ms' % (total_time * 1000.0) # at least so many if trials < (self.skip_trials + self.min_trials): #print 'continue: need more trials: %d' % trials continue # compute %variance done = 0 if self.skip_trials + self.min_trials <= 2: measured = measured + measured; done = 1 #end time_stat = StatList([x[1] for x in measured], 'elap time') sp = float(time_stat.s) / float(time_stat.mean) # what if MAX_TRIALS is exceeded? too_many = (trials - self.skip_trials) >= self.max_trials good = (100.0 * sp) < self.min_stddev_pct if done or too_many or good: trials = trials - self.skip_trials if not done and not good: #print 'too many trials: %d' % trials pass #clock = StatList([x[0] for x in measured], 'elap clock') return time_stat #end #end #end #end def Decimals(start, end): l = [] step = start while 1: r = range(step, step * 10, step) l = l + r if step * 10 >= end: l.append(step * 10) break step = step * 10 return l #end # This tests the raw speed of 0-byte inputs def RunSpeedTest(): for L in Decimals(MIN_RUN, MAX_RUN): SetFileSize(RUNFILE, L) trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)])) ReportSpeed(L, trx, '1MB ') trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)])) ReportSpeed(L, trx, '512k') trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)])) ReportSpeed(L, trx, '256k') trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE)) ReportSpeed(L, trm, 'swig') trg = TimedTest(RUNFILE, None, GzipRun1()) ReportSpeed(L,trg,'gzip') #end #end def SetFileSize(F,L): fd = os.open(F, os.O_CREAT | os.O_WRONLY) os.ftruncate(fd,L) assert os.fstat(fd).st_size == L os.close(fd) #end def ReportSpeed(L,tr,desc): print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \ (desc, L, tr.encode_size, tr.encode_time.mean * 1000.0, tr.decode_time.mean * 1000.0) #end class Xdelta3RunClass: def __init__(self, extra): self.extra = extra #end def __str__(self): return ' '.join(self.extra) #end def New(self): return Xdelta3Runner(self.extra) #end #end class Xdelta3Runner: # Use "forkexec" to get special command-line only features like # external compression support. def __init__(self, extra, forkexec=False): self.forkexec = forkexec self.extra = extra #end def Encode(self, target, source, output): args = (ALL_ARGS + self.extra + ['-e']) if source: args.append('-s') args.append(source) #end args = args + [target, output] self.Main(args) #end def Decode(self, input, source, output): args = (ALL_ARGS + ['-d']) if source: args.append('-s') args.append(source) #end args = args + [input, output] self.Main(args) #end def Verify(self, target, recon): if target[-3:] == ".gz": RunCommandIO(('gzip', '-dc'), target, CMPTMP1) RunCommandIO(('gzip', '-dc'), recon, CMPTMP2) RunCommand(('cmp', CMPTMP1, CMPTMP2)) else: RunCommand(('cmp', target, recon)) #end def EncodeSize(self, output): return os.stat(output).st_size #end def Main(self, args): try: if self.forkexec: RunCommand(['../xdelta3'] + args) else: xdelta3.xd3_main_cmdline(args) except Exception, e: raise CommandError(args, "xdelta3.main exception: %s" % e) #end #end #end class Xdelta3Mod1: def __init__(self, file): self.target_data = open(file, 'r').read() #end def Encode(self, ignore1, ignore2, ignore3): r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10) if r1 != 0: raise CommandError('memory', 'encode failed: %s' % r1) #end self.encoded = encoded #end def Decode(self, ignore1, ignore2, ignore3): r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data)) if r2 != 0: raise CommandError('memory', 'decode failed: %s' % r1) #end self.decoded = data1 #end def Verify(self, ignore1, ignore2): if self.target_data != self.decoded: raise CommandError('memory', 'bad decode') #end #end def EncodeSize(self, ignore1): return len(self.encoded) #end #end class GzipRun1: def Encode(self, target, source, output): assert source == None RunCommandIO(['gzip', '-cf'], target, output) #end def Decode(self, input, source, output): assert source == None RunCommandIO(['gzip', '-dcf'], input, output) #end def Verify(self, target, recon): RunCommand(('cmp', target, recon)) #end def EncodeSize(self, output): return os.stat(output).st_size #end #end class Xdelta1RunClass: def __str__(self): return 'xdelta1' #end def New(self): return Xdelta1Runner() #end #end class Xdelta1Runner: def Encode(self, target, source, output): assert source != None args = ['xdelta1', 'delta', '-q', source, target, output] RunCommand(args, [0, 1]) #end def Decode(self, input, source, output): assert source != None args = ['xdelta1', 'patch', '-q', input, source, output] # Note: for dumb historical reasons, xdelta1 returns 1 or 0 RunCommand(args) #end def Verify(self, target, recon): RunCommand(('cmp', target, recon)) #end def EncodeSize(self, output): return os.stat(output).st_size #end #end # exceptions class SkipRcsException: def __init__(self,reason): self.reason = reason #end #end class NotEnoughVersions: def __init__(self): pass #end #end class CommandError: def __init__(self,cmd,str): if type(cmd) is types.TupleType or \ type(cmd) is types.ListType: cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd) #end print 'command was: ',cmd print 'command failed: ',str print 'have fun debugging' #end #end class RcsVersion: def __init__(self,vstr): self.vstr = vstr #end def __cmp__(self,other): return cmp(self.date, other.date) #end def __str__(self): return str(self.vstr) #end #end class RcsFile: def __init__(self, fname): self.fname = fname self.versions = [] self.state = HEAD_STATE #end def SetTotRev(self,s): self.totrev = int(s) #end def Rev(self,s): self.rev = RcsVersion(s) if len(self.versions) >= self.totrev: raise SkipRcsException('too many versions (in log messages)') #end self.versions.append(self.rev) #end def Date(self,s): self.rev.date = s #end def Match(self, line, state, rx, gp, newstate, f): if state == self.state: m = rx.match(line) if m: if f: f(m.group(gp)) #end self.state = newstate return 1 #end #end return None #end def Sum1Rlog(self): f = os.popen('rlog '+self.fname, "r") l = f.readline() while l: if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev): pass elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None): pass elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev): pass elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date): pass #end l = f.readline() #end c = f.close() if c != None: raise c #end #end def Sum1(self): st = os.stat(self.fname) self.rcssize = st.st_size self.Sum1Rlog() if self.totrev != len(self.versions): raise SkipRcsException('wrong version count') #end self.versions.sort() #end def Checkout(self,n): v = self.versions[n] out = open(self.Verf(n), "w") cmd = 'co -ko -p%s %s' % (v.vstr, self.fname) total = 0 (inf, stream, err) = os.popen3(cmd, "r") inf.close() buf = stream.read() while buf: total = total + len(buf) out.write(buf) buf = stream.read() #end v.vsize = total estr = '' buf = err.read() while buf: estr = estr + buf buf = err.read() #end if stream.close(): raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr)) #end out.close() err.close() #end def Vdate(self,n): return self.versions[n].date #end def Vstr(self,n): return self.versions[n].vstr #end def Verf(self,n): return os.path.join(TMPDIR, 'input.%d' % n) #end def FilePairsByDate(self, runclass): if self.totrev < 2: raise NotEnoughVersions() #end self.Checkout(0) ntrials = [] if self.totrev < 2: return vtrials #end for v in range(0,self.totrev-1): if v > 1: os.remove(self.Verf(v-1)) #end self.Checkout(v+1) if os.stat(self.Verf(v)).st_size < MIN_SIZE or \ os.stat(self.Verf(v+1)).st_size < MIN_SIZE: continue #end result = TimedTest(self.Verf(v+1), self.Verf(v), runclass.New()) target_size = os.stat(self.Verf(v+1)).st_size ntrials.append(result) #end os.remove(self.Verf(self.totrev-1)) os.remove(self.Verf(self.totrev-2)) return ntrials #end def AppendVersion(self, f, n): self.Checkout(n) rf = open(self.Verf(n), "r") data = rf.read() f.write(data) rf.close() return len(data) #end class RcsFinder: def __init__(self): self.subdirs = [] self.rcsfiles = [] self.others = [] self.skipped = [] self.biground = 0 #end def Scan1(self,dir): dents = os.listdir(dir) subdirs = [] rcsfiles = [] others = [] for dent in dents: full = os.path.join(dir, dent) if os.path.isdir(full): subdirs.append(full) elif dent[len(dent)-2:] == ",v": rcsfiles.append(RcsFile(full)) else: others.append(full) #end #end self.subdirs = self.subdirs + subdirs self.rcsfiles = self.rcsfiles + rcsfiles self.others = self.others + others return subdirs #end def Crawl(self, dir): subdirs = [dir] while subdirs: s1 = self.Scan1(subdirs[0]) subdirs = subdirs[1:] + s1 #end #end def Summarize(self): good = [] for rf in self.rcsfiles: try: rf.Sum1() if rf.totrev < 2: raise SkipRcsException('too few versions (< 2)') #end except SkipRcsException, e: #print 'skipping file %s: %s' % (rf.fname, e.reason) self.skipped.append(rf) else: good.append(rf) #end self.rcsfiles = good #end def AllPairsByDate(self, runclass): results = [] good = [] for rf in self.rcsfiles: try: results = results + rf.FilePairsByDate(runclass) except SkipRcsException: print 'file %s has compressed versions: skipping' % (rf.fname) except NotEnoughVersions: print 'testing %s on %s: not enough versions' % (runclass, rf.fname) else: good.append(rf) #end self.rcsfiles = good self.ReportPairs(runclass, results) return results #end def ReportPairs(self, name, results): encode_time = 0 decode_time = 0 encode_size = 0 for r in results: encode_time += r.encode_time.mean decode_time += r.decode_time.mean encode_size += r.encode_size #end print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \ (name, encode_time, decode_time, encode_size) #end def MakeBigFiles(self, rand): f1 = open(TMPDIR + "/big.1", "w") f2 = open(TMPDIR + "/big.2", "w") population = [] for file in self.rcsfiles: if len(file.versions) < 2: continue population.append(file) #end f1sz = 0 f2sz = 0 fcount = int(len(population) * FILE_P) assert fcount > 0 for file in rand.sample(population, fcount): m = IGNORE_FILENAME.match(file.fname) if m != None: continue #end r1, r2 = rand.sample(xrange(0, len(file.versions)), 2) f1sz += file.AppendVersion(f1, r1) f2sz += file.AppendVersion(f2, r2) #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):], file.Vstr(r1), file.Vstr(r2))) #end testkey = 'rcs%d' % self.biground self.biground = self.biground + 1 print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz) f1.close() f2.close() return (TMPDIR + "/big.1", TMPDIR + "/big.2", testkey) #end def Generator(self): return lambda rand: self.MakeBigFiles(rand) #end #end # find a set of RCS files for testing def GetTestRcsFiles(): rcsf = RcsFinder() rcsf.Crawl(RCSDIR) if len(rcsf.rcsfiles) == 0: raise CommandError('', 'no RCS files') #end rcsf.Summarize() print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % ( len(rcsf.rcsfiles), len(rcsf.subdirs), len(rcsf.others), len(rcsf.skipped)) print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str return rcsf #end class SampleDataTest: def __init__(self, dirs): dirs_in = dirs self.pairs = [] while dirs: d = dirs[0] dirs = dirs[1:] l = os.listdir(d) files = [] for e in l: p = os.path.join(d, e) if os.path.isdir(p): dirs.append(p) else: files.append(p) #end #end if len(files) > 1: files.sort() for x in xrange(len(files)): for y in xrange(len(files)): self.pairs.append((files[x], files[y], '%s-%s' % (files[x], files[y]))) #end #end #end #end print "Sample data test using %d file pairs in %s" % ( len(self.pairs), dirs_in) #end def Generator(self): return lambda rand: rand.choice(self.pairs) #end #end # configs are represented as a list of values, # program takes a list of strings: def ConfigToArgs(config): args = [ '-C', ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])] for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)): key = CONFIG_ARGMAP[CONFIG_ORDER[i]] val = config[i] if val == 'true' or val == 'false': if val == 'true': args.append('%s' % key) #end else: args.append('%s=%s' % (key, val)) #end #end return args #end # class RandomTest: def __init__(self, tnum, tinput, config, syntuple = None): self.mytinput = tinput[2] self.myconfig = config self.tnum = tnum if syntuple != None: self.runtime = syntuple[0] self.compsize = syntuple[1] self.decodetime = None else: args = ConfigToArgs(config) result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args)) self.runtime = result.encode_time.mean self.compsize = result.encode_size self.decodetime = result.decode_time.mean #end self.score = None self.time_pos = None self.size_pos = None self.score_pos = None #end def __str__(self): decodestr = ' %s' % self.decodetime return 'time %.6f%s size %d%s << %s >>%s' % ( self.time(), ((self.time_pos != None) and (" (%s)" % self.time_pos) or ""), self.size(), ((self.size_pos != None) and (" (%s)" % self.size_pos) or ""), c2str(self.config()), decodestr) #end def time(self): return self.runtime #end def size(self): return self.compsize #end def config(self): return self.myconfig #end def score(self): return self.score #end def tinput(self): return self.mytinput #end #end def PosInAlist(l, e): for i in range(0, len(l)): if l[i][1] == e: return i; #end #end return -1 #end # Generates a set of num_results test configurations, given the list of # retest-configs. def RandomTestConfigs(rand, input_configs, num_results): outputs = input_configs[:] have_set = dict([(c,c) for c in input_configs]) # Compute a random configuration def RandomConfig(): config = [] cmap = {} for key in CONFIG_ORDER: val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap) config.append(val) #end return tuple(config) #end while len(outputs) < num_results: newc = None for i in xrange(100): c = RandomConfig() if have_set.has_key(c): continue #end have_set[c] = c newc = c break if newc is None: print 'stopped looking for configs at %d' % len(outputs) break #end outputs.append(c) #end outputs.sort() return outputs #end def RunOptimizationLoop(rand, generator, rounds): configs = [] for rnum in xrange(rounds): configs = RandomTestConfigs(rand, configs, MAX_RESULTS) tinput = generator(rand) tests = [] for x in xrange(len(configs)): t = RandomTest(x, tinput, configs[x]) print 'Round %d test %d: %s' % (rnum, x, t) tests.append(t) #end results = ScoreTests(tests) for r in results: c = r.config() if not test_all_config_results.has_key(c): test_all_config_results[c] = [r] else: test_all_config_results[c].append(r) #end #end #GraphResults('expt%d' % rnum, results) #GraphSummary('sum%d' % rnum, results) # re-test some fraction configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]] #end #end # TODO: cleanup test_all_config_results = {} def ScoreTests(results): scored = [] timed = [] sized = [] t_min = float(min([test.time() for test in results])) #t_max = float(max([test.time() for test in results])) s_min = float(min([test.size() for test in results])) #s_max = float(max([test.size() for test in results])) for test in results: # Hyperbolic function. Smaller scores still better red = 0.999 # minimum factors for each dimension are 1/1000 test.score = ((test.size() - s_min * red) * (test.time() - t_min * red)) scored.append((test.score, test)) timed.append((test.time(), test)) sized.append((test.size(), test)) #end scored.sort() timed.sort() sized.sort() best_by_size = [] best_by_time = [] pos = 0 for (score, test) in scored: pos += 1 test.score_pos = pos #end scored = [x[1] for x in scored] for test in scored: test.size_pos = PosInAlist(sized, test) test.time_pos = PosInAlist(timed, test) #end for test in scored: c = test.config() s = 0.0 print 'H-Score: %0.9f %s' % (test.score, test) #end return scored #end def GraphResults(desc, results): f = open("data-%s.csv" % desc, "w") for r in results: f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r)) #end f.close() os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc)) #end def GraphSummary(desc, results_ignore): test_population = 0 config_ordered = [] # drops duplicate test/config pairs (TODO: don't retest them) for config, cresults in test_all_config_results.items(): input_config_map = {} uniq = [] for test in cresults: assert test.config() == config test_population += 1 key = test.tinput() if not input_config_map.has_key(key): input_config_map[key] = {} #end if input_config_map[key].has_key(config): print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test) continue #end input_config_map[key][config] = test uniq.append(test) #end config_ordered.append(uniq) #end # sort configs descending by number of tests config_ordered.sort(lambda x, y: len(y) - len(x)) print 'population %d: %d configs %d results' % \ (test_population, len(config_ordered), len(config_ordered[0])) if config_ordered[0] == 1: return #end # a map from test-key to test-list w/ various configs input_set = {} osize = len(config_ordered) for i in xrange(len(config_ordered)): config = config_ordered[i][0].config() config_tests = config_ordered[i] #print '%s has %d tested inputs' % (config, len(config_tests)) if len(input_set) == 0: input_set = dict([(t.tinput(), [t]) for t in config_tests]) continue #end # a map from test-key to test-list w/ various configs update_set = {} for r in config_tests: t = r.tinput() if input_set.has_key(t): update_set[t] = input_set[t] + [r] else: #print 'config %s does not have test %s' % (config, t) pass #end #end if len(update_set) <= 1: break #end input_set = update_set # continue if there are more w/ the same number of inputs if i < (len(config_ordered) - 1) and \ len(config_ordered[i + 1]) == len(config_tests): continue #end # synthesize results for multi-test inputs config_num = None # map of config to sum(various test-keys) smap = {} for (key, tests) in input_set.items(): if config_num == None: # config_num should be the same in all elements config_num = len(tests) smap = dict([(r.config(), (r.time(), r.size())) for r in tests]) else: # compuate the per-config sum of time/size assert config_num == len(tests) smap = dict([(r.config(), (smap[r.config()][0] + r.time(), smap[r.config()][1] + r.size())) for r in tests]) #end #end if config_num == 1: continue #end if len(input_set) == osize: break #end summary = '%s-%d' % (desc, len(input_set)) osize = len(input_set) print 'generate %s w/ %d configs' % (summary, config_num) syn = [RandomTest(0, (None, None, summary), config, syntuple = (smap[config][0], smap[config][1])) for config in smap.keys()] syn = ScoreTests(syn) #print 'smap is %s' % (smap,) #print 'syn is %s' % (' and '.join([str(x) for x in syn])) #GraphResults(summary, syn) #end #end def RunRegressionTest(pairs, rounds): for args in [ [], ['-S djw'], ]: print "Args %s" % (args) for (file1, file2, testkey) in pairs: ttest = TimedTest(file1, file2, Xdelta3Runner(args, forkexec=True), skip_trials = 0, min_trials = 1, max_trials = 1) print "Source %s\nTarget %s\nEncode %s\nDecode %s\nSize %s\n\n" % ( file1, file2, ttest.encode_time.str, ttest.decode_time.str, ttest.encode_size) #end #end if __name__ == "__main__": try: RunCommand(['rm', '-rf', TMPDIR]) os.mkdir(TMPDIR) #rcsf = GetTestRcsFiles() #generator = rcsf.Generator() sample = SampleDataTest([SAMPLEDIR]) generator = sample.Generator() rand = random.Random(135135135135135) RunRegressionTest(sample.pairs, TEST_ROUNDS) #RunSpeedTest() # the idea below is to add the default configurations and # xdelta1 to the optimization loop: #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6'])) #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9'])) #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw'])) #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw'])) #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T'])) #x1r = rcsf.AllPairsByDate(Xdelta1RunClass()) except CommandError: pass else: RunCommand(['rm', '-rf', TMPDIR]) pass #end #end