From 8681251a19d6a845a725fba60eb0a4c9fa3c375c Mon Sep 17 00:00:00 2001 From: "josh.macdonald" Date: Sat, 24 Oct 2009 01:13:46 +0000 Subject: Move and re-invigorate the regtest, fix 1 harmless compiler warning --- xdelta3/Makefile | 6 +- xdelta3/setup.py | 2 +- xdelta3/testing/xdelta3-regtest.py | 1225 ++++++++++++++++++++++++++++++++++++ xdelta3/xdelta3-main.h | 3 +- xdelta3/xdelta3-regtest.py | 1222 ----------------------------------- xdelta3/xdelta3.c | 2 +- xdelta3/xdelta3.h | 15 +- 7 files changed, 1239 insertions(+), 1236 deletions(-) create mode 100755 xdelta3/testing/xdelta3-regtest.py delete mode 100755 xdelta3/xdelta3-regtest.py (limited to 'xdelta3') diff --git a/xdelta3/Makefile b/xdelta3/Makefile index 3a5c6b0..47fea85 100644 --- a/xdelta3/Makefile +++ b/xdelta3/Makefile @@ -8,7 +8,7 @@ PYVER = 2.5 ifeq ("$(CYGWIN)", "") SWIGTGT = xdelta3module.so -PYTGT = build/lib.linux-i686-$(PYVER)/xdelta3main.so +PYTGT = build/lib.macosx-10.5-i386-2.5/xdelta3main.so else SWIGTGT = xdelta3module.dll PYTGT = build/lib.cygwin-1.5.24-i686-$(PYVER)/xdelta3main.dll @@ -57,13 +57,13 @@ CFLAGS= -Wall -Wshadow -fno-builtin WFLAGS= -Wextra -Wsign-compare -Wconversion -Wextra -Wno-unused-parameter # $Format: "REL=$Xdelta3Version$" $ -REL=3.0v +REL=3.0wRC11 RELDIR = xdelta$(REL) EXTRA = Makefile COPYING linkxd3lib.c badcopy.c xdelta3.swig \ draft-korn-vcdiff.txt xdelta3.vcproj badcopy.vcproj \ - xdelta3-regtest.py xdelta3-test.py setup.py \ + testing/xdelta3-regtest.py xdelta3-test.py setup.py \ examples/Makefile examples/small_page_test.c \ examples/README examples/encode_decode_test.c \ examples/compare_test.c examples/speed_test.c \ diff --git a/xdelta3/setup.py b/xdelta3/setup.py index 7fa4b0c..32b20cf 100644 --- a/xdelta3/setup.py +++ b/xdelta3/setup.py @@ -50,7 +50,7 @@ xdelta3_ext = Extension('xdelta3main', ]) # $Format: "REL='$Xdelta3Version$'" $ -REL='3.0v' +REL='3.0wRC1' # This provides xdelta3.main(), which calls the xdelta3 command-line main() # from python. diff --git a/xdelta3/testing/xdelta3-regtest.py b/xdelta3/testing/xdelta3-regtest.py new file mode 100755 index 0000000..3c5bfd6 --- /dev/null +++ b/xdelta3/testing/xdelta3-regtest.py @@ -0,0 +1,1225 @@ +#!/usr/bin/python2.5 +# xdelta 3 - delta compression tools and library +# Copyright (C) 2003, 2006, 2007, 2008. Joshua P. MacDonald +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +# TODO: test 1.5 vs. greedy + +import os, sys, math, re, time, types, array, random +import xdelta3 +import xdelta3main + +#RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS' +#RCSDIR = '/tmp/PRCS_read_copy' +#SAMPLEDIR = "/tmp/WESNOTH_tmp/diff" + +#RCSDIR = 'G:/jmacd/PRCS_copy' +#SAMPLEDIR = "C:/sample_data/Wesnoth/tar" + +#RCSDIR = '/Users/jmacd/src/ftp.kernel.org/pub/scm/linux/kernel/bkcvs/linux-2.4/net/x25' +RCSDIR = '/Users/jmacd/src/ftp.kernel.org/pub/scm/linux/kernel/bkcvs/linux-2.4/fs' +RCSDIR = '/Users/jmacd/src/ftp.kernel.org' + +# +MIN_SIZE = 0 + +TIME_TOO_SHORT = 0.050 + +SKIP_TRIALS = 2 +MIN_TRIALS = 3 +MAX_TRIALS = 15 + +# 10 = fast 1.5 = slow +MIN_STDDEV_PCT = 1.5 + +# How many results per round +MAX_RESULTS = 500 +TEST_ROUNDS = 500 +KEEP_P = (0.5) + +# For RCS testing, what percent to select +FILE_P = (0.50) + +# For run-speed tests +MIN_RUN = 1000 * 1000 * 1 +MAX_RUN = 1000 * 1000 * 10 + +# Testwide defaults +ALL_ARGS = [ + '-vv' + ] + +# The first 7 args go to -C +SOFT_CONFIG_CNT = 7 + +CONFIG_ORDER = [ 'large_look', + 'large_step', + 'small_look', + 'small_chain', + 'small_lchain', + 'max_lazy', + 'long_enough', + + # > SOFT_CONFIG_CNT + 'nocompress', + 'winsize', + 'srcwinsize', + 'sprevsz', + 'iopt', + 'djw', + 'altcode', + ] + +CONFIG_ARGMAP = { + 'winsize' : '-W', + 'srcwinsize' : '-B', + 'sprevsz' : '-P', + 'iopt' : '-I', + 'nocompress' : '-N', + 'djw' : '-Sdjw', + 'altcode' : '-T', + } + +def INPUT_SPEC(rand): + return { + + # Time/space costs: + + # -C 1,2,3,4,5,6,7 + 'large_look' : lambda d: rand.choice([9, 10, 11, 12]), + 'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]), + 'small_look' : lambda d: rand.choice([4]), + 'small_chain' : lambda d: rand.choice([1]), + 'small_lchain' : lambda d: rand.choice([1]), + 'max_lazy' : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]), + + # Note: long_enough only refers to small matching and has no effect if + # small_chain == 1. + 'long_enough' : lambda d: rand.choice([4]), + + # -N + 'nocompress' : lambda d: rand.choice(['false']), + + # -T + 'altcode' : lambda d: rand.choice(['false']), + + # -S djw + 'djw' : lambda d: rand.choice(['false']), + + # Memory costs: + + # -W + 'winsize' : lambda d: 8 * (1<<20), + + # -B + 'srcwinsize' : lambda d: 64 * (1<<20), + + # -I 0 is unlimited + 'iopt' : lambda d: 0, + + # -P only powers of two + 'sprevsz' : lambda d: rand.choice([x * (1<<16) for x in [4]]), + } +#end + +# +TMPDIR = '/tmp/xd3regtest.%d' % os.getpid() + +RUNFILE = os.path.join(TMPDIR, 'run') +DFILE = os.path.join(TMPDIR, 'output') +RFILE = os.path.join(TMPDIR, 'recon') + +HEAD_STATE = 0 +BAR_STATE = 1 +REV_STATE = 2 +DATE_STATE = 3 + +# +IGNORE_FILENAME = re.compile('.*\\.(gif|jpg).*') + +# rcs output +RE_TOTREV = re.compile('total revisions: (\\d+)') +RE_BAR = re.compile('----------------------------') +RE_REV = re.compile('revision (.+)') +RE_DATE = re.compile('date: ([^;]+);.*') +# xdelta output +RE_HDRSZ = re.compile('VCDIFF header size: +(\\d+)') +RE_EXTCOMP = re.compile('XDELTA ext comp.*') + +def c2str(c): + return ' '.join(['%s' % x for x in c]) +#end + +def SumList(l): + return reduce(lambda x,y: x+y, l) +#end + +# returns (total, mean, stddev, q2 (median), +# (q3-q1)/2 ("semi-interquartile range"), max-min (spread)) +class StatList: + def __init__(self,l,desc): + cnt = len(l) + assert(cnt > 1) + l.sort() + self.cnt = cnt + self.l = l + self.total = SumList(l) + self.mean = self.total / float(self.cnt) + self.s = math.sqrt(SumList([(x-self.mean) * (x - self.mean) for x in l]) / float(self.cnt-1)) + self.q0 = l[0] + self.q1 = l[int(self.cnt/4.0+0.5)] + self.q2 = l[int(self.cnt/2.0+0.5)] + self.q3 = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))] + self.q4 = l[self.cnt-1]+1 + self.siqr = (self.q3-self.q1)/2.0; + self.spread = (self.q4-self.q0) + self.str = '%s %d; mean %d; sdev %d; q2 %d; .5(q3-q1) %.1f; spread %d' % \ + (desc, self.total, self.mean, self.s, self.q2, self.siqr, self.spread) + #end +#end + +def RunCommand(args, ok = [0]): + #print 'run command %s' % (' '.join(args)) + p = os.spawnvp(os.P_WAIT, args[0], args) + if p not in ok: + raise CommandError(args, 'exited %d' % p) + #end +#end + +def RunCommandIO(args,infn,outfn): + p = os.fork() + if p == 0: + os.dup2(os.open(infn,os.O_RDONLY),0) + os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1) + os.execvp(args[0], args) + else: + s = os.waitpid(p,0) + o = os.WEXITSTATUS(s[1]) + if not os.WIFEXITED(s[1]) or o != 0: + raise CommandError(args, 'exited %d' % o) + #end + #end +#end + +class TimedTest: + def __init__(self, target, source, runnable, + skip_trials = SKIP_TRIALS, + min_trials = MIN_TRIALS, + max_trials = MAX_TRIALS, + min_stddev_pct = MIN_STDDEV_PCT): + self.target = target + self.source = source + self.runnable = runnable + + self.skip_trials = skip_trials + self.min_trials = min(min_trials, max_trials) + self.max_trials = max_trials + self.min_stddev_pct = min_stddev_pct + + self.encode_time = self.DoTest(DFILE, + lambda x: x.Encode(self.target, self.source, DFILE)) + self.encode_size = runnable.EncodeSize(DFILE) + + self.decode_time = self.DoTest(RFILE, + lambda x: x.Decode(DFILE, self.source, RFILE), + ) + + # verify + runnable.Verify(self.target, RFILE) + #end + + def DoTest(self, fname, func): + trials = 0 + measured = [] + + while 1: + try: + os.remove(fname) + except OSError: + pass + + start_time = time.time() + start_clock = time.clock() + + func(self.runnable) + + total_clock = (time.clock() - start_clock) + total_time = (time.time() - start_time) + + elap_time = max(total_time, 0.0000001) + elap_clock = max(total_clock, 0.0000001) + + trials = trials + 1 + + # skip some of the first trials + if trials > self.skip_trials: + measured.append((elap_clock, elap_time)) + #print 'measurement total: %.1f ms' % (total_time * 1000.0) + + # at least so many + if trials < (self.skip_trials + self.min_trials): + #print 'continue: need more trials: %d' % trials + continue + + # compute %variance + done = 0 + if self.skip_trials + self.min_trials <= 2: + measured = measured + measured; + done = 1 + #end + + time_stat = StatList([x[1] for x in measured], 'elap time') + sp = float(time_stat.s) / float(time_stat.mean) + + # what if MAX_TRIALS is exceeded? + too_many = (trials - self.skip_trials) >= self.max_trials + good = (100.0 * sp) < self.min_stddev_pct + if done or too_many or good: + trials = trials - self.skip_trials + if not done and not good: + #print 'too many trials: %d' % trials + pass + #clock = StatList([x[0] for x in measured], 'elap clock') + return time_stat + #end + #end + #end +#end + +def Decimals(start, end): + l = [] + step = start + while 1: + r = range(step, step * 10, step) + l = l + r + if step * 10 >= end: + l.append(step * 10) + break + step = step * 10 + return l +#end + +# This tests the raw speed of 0-byte inputs +def RunSpeedTest(): + for L in Decimals(MIN_RUN, MAX_RUN): + SetFileSize(RUNFILE, L) + + trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)])) + ReportSpeed(L, trx, '1MB ') + + trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)])) + ReportSpeed(L, trx, '512k') + + trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)])) + ReportSpeed(L, trx, '256k') + + trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE)) + ReportSpeed(L, trm, 'swig') + + trg = TimedTest(RUNFILE, None, GzipRun1()) + ReportSpeed(L,trg,'gzip') + #end +#end + +def SetFileSize(F,L): + fd = os.open(F, os.O_CREAT | os.O_WRONLY) + os.ftruncate(fd,L) + assert os.fstat(fd).st_size == L + os.close(fd) +#end + +def ReportSpeed(L,tr,desc): + print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \ + (desc, L, + tr.encode_size, + tr.encode_time.mean * 1000.0, + tr.decode_time.mean * 1000.0) +#end + +class Xdelta3RunClass: + def __init__(self, extra): + self.extra = extra + #end + + def __str__(self): + return ' '.join(self.extra) + #end + + def New(self): + return Xdelta3Runner(self.extra) + #end +#end + +class Xdelta3Runner: + def __init__(self, extra): + self.extra = extra + #end + + def Encode(self, target, source, output): + args = (ALL_ARGS + + self.extra + + ['-e']) + if source: + args.append('-s') + args.append(source) + #end + args = args + [target, output] + self.Main(args) + #end + + def Decode(self, input, source, output): + args = (ALL_ARGS + + ['-d']) + if source: + args.append('-s') + args.append(source) + #end + args = args + [input, output] + self.Main(args) + #end + + def Verify(self, target, recon): + RunCommand(('cmp', target, recon)) + #end + + def EncodeSize(self, output): + return os.stat(output).st_size + #end + + def Main(self, args): + try: + #print 'Run %s' % (' '.join(args)) + xdelta3.xd3_main_cmdline(args) + except Exception, e: + raise CommandError(args, "xdelta3.main exception: %s" % e) + #end + #end +#end + +class Xdelta3Mod1: + def __init__(self, file): + self.target_data = open(file, 'r').read() + #end + + def Encode(self, ignore1, ignore2, ignore3): + r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10) + if r1 != 0: + raise CommandError('memory', 'encode failed: %s' % r1) + #end + self.encoded = encoded + #end + + def Decode(self, ignore1, ignore2, ignore3): + r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data)) + if r2 != 0: + raise CommandError('memory', 'decode failed: %s' % r1) + #end + self.decoded = data1 + #end + + def Verify(self, ignore1, ignore2): + if self.target_data != self.decoded: + raise CommandError('memory', 'bad decode') + #end + #end + + def EncodeSize(self, ignore1): + return len(self.encoded) + #end +#end + +class GzipRun1: + def Encode(self, target, source, output): + assert source == None + RunCommandIO(['gzip', '-cf'], target, output) + #end + + def Decode(self, input, source, output): + assert source == None + RunCommandIO(['gzip', '-dcf'], input, output) + #end + + def Verify(self, target, recon): + RunCommand(('cmp', target, recon)) + #end + + def EncodeSize(self, output): + return os.stat(output).st_size + #end +#end + +class Xdelta1RunClass: + def __str__(self): + return 'xdelta1' + #end + + def New(self): + return Xdelta1Runner() + #end +#end + +class Xdelta1Runner: + def Encode(self, target, source, output): + assert source != None + args = ['xdelta1', 'delta', '-q', source, target, output] + RunCommand(args, [0, 1]) + #end + + def Decode(self, input, source, output): + assert source != None + args = ['xdelta1', 'patch', '-q', input, source, output] + # Note: for dumb historical reasons, xdelta1 returns 1 or 0 + RunCommand(args) + #end + + def Verify(self, target, recon): + RunCommand(('cmp', target, recon)) + #end + + def EncodeSize(self, output): + return os.stat(output).st_size + #end +#end + +# exceptions +class SkipRcsException: + def __init__(self,reason): + self.reason = reason + #end +#end + +class NotEnoughVersions: + def __init__(self): + pass + #end +#end + +class CommandError: + def __init__(self,cmd,str): + if type(cmd) is types.TupleType or \ + type(cmd) is types.ListType: + cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd) + #end + print 'command was: ',cmd + print 'command failed: ',str + print 'have fun debugging' + #end +#end + +class RcsVersion: + def __init__(self,vstr): + self.vstr = vstr + #end + def __cmp__(self,other): + return cmp(self.date, other.date) + #end + def __str__(self): + return str(self.vstr) + #end +#end + +class RcsFile: + + def __init__(self, fname): + self.fname = fname + self.versions = [] + self.state = HEAD_STATE + #end + + def SetTotRev(self,s): + self.totrev = int(s) + #end + + def Rev(self,s): + self.rev = RcsVersion(s) + if len(self.versions) >= self.totrev: + raise SkipRcsException('too many versions (in log messages)') + #end + self.versions.append(self.rev) + #end + + def Date(self,s): + self.rev.date = s + #end + + def Match(self, line, state, rx, gp, newstate, f): + if state == self.state: + m = rx.match(line) + if m: + if f: + f(m.group(gp)) + #end + self.state = newstate + return 1 + #end + #end + return None + #end + + def Sum1Rlog(self): + f = os.popen('rlog '+self.fname, "r") + l = f.readline() + while l: + if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev): + pass + elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None): + pass + elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev): + pass + elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date): + pass + #end + l = f.readline() + #end + c = f.close() + if c != None: + raise c + #end + #end + + def Sum1(self): + st = os.stat(self.fname) + self.rcssize = st.st_size + self.Sum1Rlog() + if self.totrev != len(self.versions): + raise SkipRcsException('wrong version count') + #end + self.versions.sort() + #end + + def Checkout(self,n): + v = self.versions[n] + out = open(self.Verf(n), "w") + cmd = 'co -ko -p%s %s' % (v.vstr, self.fname) + total = 0 + (inf, + stream, + err) = os.popen3(cmd, "r") + inf.close() + buf = stream.read() + while buf: + total = total + len(buf) + out.write(buf) + buf = stream.read() + #end + v.vsize = total + estr = '' + buf = err.read() + while buf: + estr = estr + buf + buf = err.read() + #end + if stream.close(): + raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr)) + #end + out.close() + err.close() + #end + + def Vdate(self,n): + return self.versions[n].date + #end + + def Vstr(self,n): + return self.versions[n].vstr + #end + + def Verf(self,n): + return os.path.join(TMPDIR, 'input.%d' % n) + #end + + def FilePairsByDate(self, runclass): + if self.totrev < 2: + raise NotEnoughVersions() + #end + self.Checkout(0) + ntrials = [] + if self.totrev < 2: + return vtrials + #end + for v in range(0,self.totrev-1): + if v > 1: + os.remove(self.Verf(v-1)) + #end + self.Checkout(v+1) + if os.stat(self.Verf(v)).st_size < MIN_SIZE or \ + os.stat(self.Verf(v+1)).st_size < MIN_SIZE: + continue + #end + + result = TimedTest(self.Verf(v+1), + self.Verf(v), + runclass.New()) + + target_size = os.stat(self.Verf(v+1)).st_size + + ntrials.append(result) + #end + + os.remove(self.Verf(self.totrev-1)) + os.remove(self.Verf(self.totrev-2)) + return ntrials + #end + + def AppendVersion(self, f, n): + self.Checkout(n) + rf = open(self.Verf(n), "r") + data = rf.read() + f.write(data) + rf.close() + return len(data) + #end + +class RcsFinder: + def __init__(self): + self.subdirs = [] + self.rcsfiles = [] + self.others = [] + self.skipped = [] + self.biground = 0 + #end + + def Scan1(self,dir): + dents = os.listdir(dir) + subdirs = [] + rcsfiles = [] + others = [] + for dent in dents: + full = os.path.join(dir, dent) + if os.path.isdir(full): + subdirs.append(full) + elif dent[len(dent)-2:] == ",v": + rcsfiles.append(RcsFile(full)) + else: + others.append(full) + #end + #end + self.subdirs = self.subdirs + subdirs + self.rcsfiles = self.rcsfiles + rcsfiles + self.others = self.others + others + return subdirs + #end + + def Crawl(self, dir): + subdirs = [dir] + while subdirs: + s1 = self.Scan1(subdirs[0]) + subdirs = subdirs[1:] + s1 + #end + #end + + def Summarize(self): + good = [] + for rf in self.rcsfiles: + try: + rf.Sum1() + if rf.totrev < 2: + raise SkipRcsException('too few versions (< 2)') + #end + except SkipRcsException, e: + #print 'skipping file %s: %s' % (rf.fname, e.reason) + self.skipped.append(rf) + else: + good.append(rf) + #end + self.rcsfiles = good + #end + + def AllPairsByDate(self, runclass): + results = [] + good = [] + for rf in self.rcsfiles: + try: + results = results + rf.FilePairsByDate(runclass) + except SkipRcsException: + print 'file %s has compressed versions: skipping' % (rf.fname) + except NotEnoughVersions: + print 'testing %s on %s: not enough versions' % (runclass, rf.fname) + else: + good.append(rf) + #end + self.rcsfiles = good + self.ReportPairs(runclass, results) + return results + #end + + def ReportPairs(self, name, results): + encode_time = 0 + decode_time = 0 + encode_size = 0 + for r in results: + encode_time += r.encode_time.mean + decode_time += r.decode_time.mean + encode_size += r.encode_size + #end + print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \ + (name, encode_time, decode_time, encode_size) + #end + + def MakeBigFiles(self, rand): + f1 = open(TMPDIR + "/big.1", "w") + f2 = open(TMPDIR + "/big.2", "w") + population = [] + for file in self.rcsfiles: + if len(file.versions) < 2: + continue + population.append(file) + #end + f1sz = 0 + f2sz = 0 + fcount = int(len(population) * FILE_P) + assert fcount > 0 + for file in rand.sample(population, fcount): + m = IGNORE_FILENAME.match(file.fname) + if m != None: + continue + #end + r1, r2 = rand.sample(xrange(0, len(file.versions)), 2) + f1sz += file.AppendVersion(f1, r1) + f2sz += file.AppendVersion(f2, r2) + #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):], file.Vstr(r1), file.Vstr(r2))) + #end + testkey = 'rcs%d' % self.biground + self.biground = self.biground + 1 + + print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz) + f1.close() + f2.close() + return (TMPDIR + "/big.1", + TMPDIR + "/big.2", + testkey) + #end + + def Generator(self): + return lambda rand: self.MakeBigFiles(rand) + #end +#end + +# find a set of RCS files for testing +def GetTestRcsFiles(): + rcsf = RcsFinder() + rcsf.Crawl(RCSDIR) + if len(rcsf.rcsfiles) == 0: + raise CommandError('', 'no RCS files') + #end + rcsf.Summarize() + print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (len(rcsf.rcsfiles), + len(rcsf.subdirs), + len(rcsf.others), + len(rcsf.skipped)) + print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str + print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str + return rcsf +#end + +class SampleDataTest: + def __init__(self, dirs): + self.pairs = [] + while dirs: + d = dirs[0] + dirs = dirs[1:] + l = os.listdir(d) + files = [] + for e in l: + p = os.path.join(d, e) + if os.path.isdir(p): + dirs.append(p) + else: + files.append(p) + #end + #end + if len(files) > 1: + files.sort() + for x in xrange(len(files) - 1): + self.pairs.append((files[x], files[x+1], + '%s-%s' % (files[x], files[x+1]))) + #end + #end + #end + #end + + def Generator(self): + return lambda rand: rand.choice(self.pairs) + #end +#end + +# configs are represented as a list of values, +# program takes a list of strings: +def ConfigToArgs(config): + args = [ '-C', + ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])] + for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)): + key = CONFIG_ARGMAP[CONFIG_ORDER[i]] + val = config[i] + if val == 'true' or val == 'false': + if val == 'true': + args.append('%s' % key) + #end + else: + args.append('%s=%s' % (key, val)) + #end + #end + return args +#end + +# +class RandomTest: + def __init__(self, tnum, tinput, config, syntuple = None): + self.mytinput = tinput[2] + self.myconfig = config + self.tnum = tnum + + if syntuple != None: + self.runtime = syntuple[0] + self.compsize = syntuple[1] + self.decodetime = None + else: + args = ConfigToArgs(config) + result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args)) + + self.runtime = result.encode_time.mean + self.compsize = result.encode_size + self.decodetime = result.decode_time.mean + #end + + self.score = None + self.time_pos = None + self.size_pos = None + self.score_pos = None + #end + + def __str__(self): + decodestr = ' %s' % self.decodetime + return 'time %.6f%s size %d%s << %s >>%s' % ( + self.time(), ((self.time_pos != None) and (" (%s)" % self.time_pos) or ""), + self.size(), ((self.size_pos != None) and (" (%s)" % self.size_pos) or ""), + c2str(self.config()), + decodestr) + #end + + def time(self): + return self.runtime + #end + + def size(self): + return self.compsize + #end + + def config(self): + return self.myconfig + #end + + def score(self): + return self.score + #end + + def tinput(self): + return self.mytinput + #end +#end + +def PosInAlist(l, e): + for i in range(0, len(l)): + if l[i][1] == e: + return i; + #end + #end + return -1 +#end + +# Generates a set of num_results test configurations, given the list of +# retest-configs. +def RandomTestConfigs(rand, input_configs, num_results): + + outputs = input_configs[:] + have_set = dict([(c,c) for c in input_configs]) + + # Compute a random configuration + def RandomConfig(): + config = [] + cmap = {} + for key in CONFIG_ORDER: + val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap) + config.append(val) + #end + return tuple(config) + #end + + while len(outputs) < num_results: + newc = None + for i in xrange(100): + c = RandomConfig() + if have_set.has_key(c): + continue + #end + have_set[c] = c + newc = c + break + if newc is None: + print 'stopped looking for configs at %d' % len(outputs) + break + #end + outputs.append(c) + #end + outputs.sort() + return outputs +#end + +def RunTestLoop(rand, generator, rounds): + configs = [] + for rnum in xrange(rounds): + configs = RandomTestConfigs(rand, configs, MAX_RESULTS) + tinput = generator(rand) + tests = [] + for x in xrange(len(configs)): + t = RandomTest(x, tinput, configs[x]) + print 'Round %d test %d: %s' % (rnum, x, t) + tests.append(t) + #end + results = ScoreTests(tests) + + for r in results: + c = r.config() + if not test_all_config_results.has_key(c): + test_all_config_results[c] = [r] + else: + test_all_config_results[c].append(r) + #end + #end + + #GraphResults('expt%d' % rnum, results) + #GraphSummary('sum%d' % rnum, results) + + # re-test some fraction + configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]] + #end +#end + +# TODO: cleanup +test_all_config_results = {} + +def ScoreTests(results): + scored = [] + timed = [] + sized = [] + + t_min = float(min([test.time() for test in results])) + #t_max = float(max([test.time() for test in results])) + s_min = float(min([test.size() for test in results])) + #s_max = float(max([test.size() for test in results])) + + for test in results: + + # Hyperbolic function. Smaller scores still better + red = 0.999 # minimum factors for each dimension are 1/1000 + test.score = ((test.size() - s_min * red) * + (test.time() - t_min * red)) + + scored.append((test.score, test)) + timed.append((test.time(), test)) + sized.append((test.size(), test)) + #end + + scored.sort() + timed.sort() + sized.sort() + + best_by_size = [] + best_by_time = [] + + pos = 0 + for (score, test) in scored: + pos += 1 + test.score_pos = pos + #end + + scored = [x[1] for x in scored] + + for test in scored: + test.size_pos = PosInAlist(sized, test) + test.time_pos = PosInAlist(timed, test) + #end + + for test in scored: + c = test.config() + s = 0.0 + print 'H-Score: %0.9f %s' % (test.score, test) + #end + + return scored +#end + +def GraphResults(desc, results): + f = open("data-%s.csv" % desc, "w") + for r in results: + f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r)) + #end + f.close() + os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc)) +#end + +def GraphSummary(desc, results_ignore): + test_population = 0 + config_ordered = [] + + # drops duplicate test/config pairs (TODO: don't retest them) + for config, cresults in test_all_config_results.items(): + input_config_map = {} + uniq = [] + for test in cresults: + assert test.config() == config + test_population += 1 + key = test.tinput() + if not input_config_map.has_key(key): + input_config_map[key] = {} + #end + if input_config_map[key].has_key(config): + print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test) + continue + #end + input_config_map[key][config] = test + uniq.append(test) + #end + config_ordered.append(uniq) + #end + + # sort configs descending by number of tests + config_ordered.sort(lambda x, y: len(y) - len(x)) + + print 'population %d: %d configs %d results' % \ + (test_population, + len(config_ordered), + len(config_ordered[0])) + + if config_ordered[0] == 1: + return + #end + + # a map from test-key to test-list w/ various configs + input_set = {} + osize = len(config_ordered) + + for i in xrange(len(config_ordered)): + config = config_ordered[i][0].config() + config_tests = config_ordered[i] + + #print '%s has %d tested inputs' % (config, len(config_tests)) + + if len(input_set) == 0: + input_set = dict([(t.tinput(), [t]) for t in config_tests]) + continue + #end + + # a map from test-key to test-list w/ various configs + update_set = {} + for r in config_tests: + t = r.tinput() + if input_set.has_key(t): + update_set[t] = input_set[t] + [r] + else: + #print 'config %s does not have test %s' % (config, t) + pass + #end + #end + + if len(update_set) <= 1: + break + #end + + input_set = update_set + + # continue if there are more w/ the same number of inputs + if i < (len(config_ordered) - 1) and \ + len(config_ordered[i + 1]) == len(config_tests): + continue + #end + + # synthesize results for multi-test inputs + config_num = None + + # map of config to sum(various test-keys) + smap = {} + for (key, tests) in input_set.items(): + if config_num == None: + # config_num should be the same in all elements + config_num = len(tests) + smap = dict([(r.config(), + (r.time(), + r.size())) + for r in tests]) + else: + # compuate the per-config sum of time/size + assert config_num == len(tests) + smap = dict([(r.config(), + (smap[r.config()][0] + r.time(), + smap[r.config()][1] + r.size())) + for r in tests]) + #end + #end + + if config_num == 1: + continue + #end + + if len(input_set) == osize: + break + #end + + summary = '%s-%d' % (desc, len(input_set)) + osize = len(input_set) + + print 'generate %s w/ %d configs' % (summary, config_num) + syn = [RandomTest(0, (None, None, summary), config, + syntuple = (smap[config][0], smap[config][1])) + for config in smap.keys()] + syn = ScoreTests(syn) + #print 'smap is %s' % (smap,) + #print 'syn is %s' % (' and '.join([str(x) for x in syn])) + #GraphResults(summary, syn) + #end +#end + +if __name__ == "__main__": + try: + RunCommand(['rm', '-rf', TMPDIR]) + os.mkdir(TMPDIR) + + rcsf = GetTestRcsFiles() + generator = rcsf.Generator() + + #sample = SampleDataTest([SAMPLEDIR]) + #generator = sample.Generator() + + rand = random.Random(135135135135135) + RunTestLoop(rand, generator, TEST_ROUNDS) + + #RunSpeedTest() + + #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-3', '-6'])) + #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9'])) + #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw'])) + #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw'])) + #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T'])) + + #x1r = rcsf.AllPairsByDate(Xdelta1RunClass()) + + except CommandError: + pass + else: + RunCommand(['rm', '-rf', TMPDIR]) + pass + #end +#end diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index 60d645d..ea9353d 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h @@ -1,5 +1,6 @@ /* xdelta 3 - delta compression tools and library * Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, + * 2008, 2009 * Joshua P. MacDonald * * This program is free software; you can redistribute it and/or modify @@ -374,7 +375,7 @@ static int main_version (void) { /* $Format: " DP(RINT \"Xdelta version $Xdelta3Version$, Copyright (C) 2007, 2008, Joshua MacDonald\n\");" $ */ - DP(RINT "Xdelta version 3.0u, Copyright (C) 2007, 2008, Joshua MacDonald\n"); + DP(RINT "Xdelta version 3.0wRC1, Copyright (C) 2007, 2008, Joshua MacDonald\n"); DP(RINT "Xdelta comes with ABSOLUTELY NO WARRANTY.\n"); DP(RINT "This is free software, and you are welcome to redistribute it\n"); DP(RINT "under certain conditions; see \"COPYING\" for details.\n"); diff --git a/xdelta3/xdelta3-regtest.py b/xdelta3/xdelta3-regtest.py deleted file mode 100755 index f9a11bd..0000000 --- a/xdelta3/xdelta3-regtest.py +++ /dev/null @@ -1,1222 +0,0 @@ -#!/usr/bin/python2.5 -# xdelta 3 - delta compression tools and library -# Copyright (C) 2003, 2006, 2007, 2008. Joshua P. MacDonald -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -# TODO: test 1.5 vs. greedy - -import os, sys, math, re, time, types, array, random -import xdelta3 - -#RCSDIR = '/mnt/polaroid/Polaroid/orbit_linux/home/jmacd/PRCS' -#RCSDIR = '/tmp/PRCS_read_copy' -#SAMPLEDIR = "/tmp/WESNOTH_tmp/diff" - -#RCSDIR = 'G:/jmacd/PRCS_copy' -#SAMPLEDIR = "C:/sample_data/Wesnoth/tar" - -#RCSDIR = '/Users/jmacd/src/ftp.kernel.org/pub/scm/linux/kernel/bkcvs/linux-2.4/net/x25' -RCSDIR = '/Users/jmacd/src/ftp.kernel.org' - -# -MIN_SIZE = 0 - -TIME_TOO_SHORT = 0.050 - -SKIP_TRIALS = 2 -MIN_TRIALS = 3 -MAX_TRIALS = 15 - -# 10 = fast 1.5 = slow -MIN_STDDEV_PCT = 1.5 - -# How many results per round -MAX_RESULTS = 500 -TEST_ROUNDS = 500 -KEEP_P = (0.5) - -# For RCS testing, what percent to select -FILE_P = (0.50) - -# For run-speed tests -MIN_RUN = 1000 * 1000 * 1 -MAX_RUN = 1000 * 1000 * 10 - -# Testwide defaults -ALL_ARGS = [ - '-vv' - ] - -# The first 7 args go to -C -SOFT_CONFIG_CNT = 7 - -CONFIG_ORDER = [ 'large_look', - 'large_step', - 'small_look', - 'small_chain', - 'small_lchain', - 'max_lazy', - 'long_enough', - - # > SOFT_CONFIG_CNT - 'nocompress', - 'winsize', - 'srcwinsize', - 'sprevsz', - 'iopt', - 'djw', - 'altcode', - ] - -CONFIG_ARGMAP = { - 'winsize' : '-W', - 'srcwinsize' : '-B', - 'sprevsz' : '-P', - 'iopt' : '-I', - 'nocompress' : '-N', - 'djw' : '-Sdjw', - 'altcode' : '-T', - } - -def INPUT_SPEC(rand): - return { - - # Time/space costs: - - # -C 1,2,3,4,5,6,7 - 'large_look' : lambda d: rand.choice([9, 10, 11, 12]), - 'large_step' : lambda d: rand.choice([25, 26, 27, 28, 29, 30]), - 'small_look' : lambda d: rand.choice([4]), - 'small_chain' : lambda d: rand.choice([1]), - 'small_lchain' : lambda d: rand.choice([1]), - 'max_lazy' : lambda d: rand.choice([4, 5, 6, 7, 8, 9, 10 ]), - - # Note: long_enough only refers to small matching and has no effect if - # small_chain == 1. - 'long_enough' : lambda d: rand.choice([4]), - - # -N - 'nocompress' : lambda d: rand.choice(['false']), - - # -T - 'altcode' : lambda d: rand.choice(['false']), - - # -S djw - 'djw' : lambda d: rand.choice(['false']), - - # Memory costs: - - # -W - 'winsize' : lambda d: 8 * (1<<20), - - # -B - 'srcwinsize' : lambda d: 64 * (1<<20), - - # -I 0 is unlimited - 'iopt' : lambda d: 0, - - # -P only powers of two - 'sprevsz' : lambda d: rand.choice([x * (1<<16) for x in [4]]), - } -#end - -# -TMPDIR = '/tmp/xd3regtest.%d' % os.getpid() - -RUNFILE = os.path.join(TMPDIR, 'run') -DFILE = os.path.join(TMPDIR, 'output') -RFILE = os.path.join(TMPDIR, 'recon') - -HEAD_STATE = 0 -BAR_STATE = 1 -REV_STATE = 2 -DATE_STATE = 3 - -# -IGNORE_FILENAME = re.compile('.*\\.(gif|jpg).*') - -# rcs output -RE_TOTREV = re.compile('total revisions: (\\d+)') -RE_BAR = re.compile('----------------------------') -RE_REV = re.compile('revision (.+)') -RE_DATE = re.compile('date: ([^;]+);.*') -# xdelta output -RE_HDRSZ = re.compile('VCDIFF header size: +(\\d+)') -RE_EXTCOMP = re.compile('XDELTA ext comp.*') - -def c2str(c): - return ' '.join(['%s' % x for x in c]) -#end - -def SumList(l): - return reduce(lambda x,y: x+y, l) -#end - -# returns (total, mean, stddev, q2 (median), -# (q3-q1)/2 ("semi-interquartile range"), max-min (spread)) -class StatList: - def __init__(self,l,desc): - cnt = len(l) - assert(cnt > 1) - l.sort() - self.cnt = cnt - self.l = l - self.total = SumList(l) - self.mean = self.total / float(self.cnt) - self.s = math.sqrt(SumList([(x-self.mean) * (x - self.mean) for x in l]) / float(self.cnt-1)) - self.q0 = l[0] - self.q1 = l[int(self.cnt/4.0+0.5)] - self.q2 = l[int(self.cnt/2.0+0.5)] - self.q3 = l[min(self.cnt-1,int((3.0*self.cnt)/4.0+0.5))] - self.q4 = l[self.cnt-1]+1 - self.siqr = (self.q3-self.q1)/2.0; - self.spread = (self.q4-self.q0) - self.str = '%s %d; mean %d; sdev %d; q2 %d; .5(q3-q1) %.1f; spread %d' % \ - (desc, self.total, self.mean, self.s, self.q2, self.siqr, self.spread) - #end -#end - -def RunCommand(args, ok = [0]): - #print 'run command %s' % (' '.join(args)) - p = os.spawnvp(os.P_WAIT, args[0], args) - if p not in ok: - raise CommandError(args, 'exited %d' % p) - #end -#end - -def RunCommandIO(args,infn,outfn): - p = os.fork() - if p == 0: - os.dup2(os.open(infn,os.O_RDONLY),0) - os.dup2(os.open(outfn,os.O_CREAT|os.O_TRUNC|os.O_WRONLY),1) - os.execvp(args[0], args) - else: - s = os.waitpid(p,0) - o = os.WEXITSTATUS(s[1]) - if not os.WIFEXITED(s[1]) or o != 0: - raise CommandError(args, 'exited %d' % o) - #end - #end -#end - -class TimedTest: - def __init__(self, target, source, runnable, - skip_trials = SKIP_TRIALS, - min_trials = MIN_TRIALS, - max_trials = MAX_TRIALS, - min_stddev_pct = MIN_STDDEV_PCT): - self.target = target - self.source = source - self.runnable = runnable - - self.skip_trials = skip_trials - self.min_trials = min(min_trials, max_trials) - self.max_trials = max_trials - self.min_stddev_pct = min_stddev_pct - - self.encode_time = self.DoTest(DFILE, - lambda x: x.Encode(self.target, self.source, DFILE)) - self.encode_size = runnable.EncodeSize(DFILE) - - self.decode_time = self.DoTest(RFILE, - lambda x: x.Decode(DFILE, self.source, RFILE), - ) - - # verify - runnable.Verify(self.target, RFILE) - #end - - def DoTest(self, fname, func): - trials = 0 - measured = [] - - while 1: - try: - os.remove(fname) - except OSError: - pass - - start_time = time.time() - start_clock = time.clock() - - func(self.runnable) - - total_clock = (time.clock() - start_clock) - total_time = (time.time() - start_time) - - elap_time = max(total_time, 0.0000001) - elap_clock = max(total_clock, 0.0000001) - - trials = trials + 1 - - # skip some of the first trials - if trials > self.skip_trials: - measured.append((elap_clock, elap_time)) - #print 'measurement total: %.1f ms' % (total_time * 1000.0) - - # at least so many - if trials < (self.skip_trials + self.min_trials): - #print 'continue: need more trials: %d' % trials - continue - - # compute %variance - done = 0 - if self.skip_trials + self.min_trials <= 2: - measured = measured + measured; - done = 1 - #end - - time_stat = StatList([x[1] for x in measured], 'elap time') - sp = float(time_stat.s) / float(time_stat.mean) - - # what if MAX_TRIALS is exceeded? - too_many = (trials - self.skip_trials) >= self.max_trials - good = (100.0 * sp) < self.min_stddev_pct - if done or too_many or good: - trials = trials - self.skip_trials - if not done and not good: - #print 'too many trials: %d' % trials - pass - #clock = StatList([x[0] for x in measured], 'elap clock') - return time_stat - #end - #end - #end -#end - -def Decimals(start, end): - l = [] - step = start - while 1: - r = range(step, step * 10, step) - l = l + r - if step * 10 >= end: - l.append(step * 10) - break - step = step * 10 - return l -#end - -# This tests the raw speed of 0-byte inputs -def RunSpeedTest(): - for L in Decimals(MIN_RUN, MAX_RUN): - SetFileSize(RUNFILE, L) - - trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<20)])) - ReportSpeed(L, trx, '1MB ') - - trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<19)])) - ReportSpeed(L, trx, '512k') - - trx = TimedTest(RUNFILE, None, Xdelta3Runner(['-W', str(1<<18)])) - ReportSpeed(L, trx, '256k') - - trm = TimedTest(RUNFILE, None, Xdelta3Mod1(RUNFILE)) - ReportSpeed(L, trm, 'swig') - - trg = TimedTest(RUNFILE, None, GzipRun1()) - ReportSpeed(L,trg,'gzip') - #end -#end - -def SetFileSize(F,L): - fd = os.open(F, os.O_CREAT | os.O_WRONLY) - os.ftruncate(fd,L) - assert os.fstat(fd).st_size == L - os.close(fd) -#end - -def ReportSpeed(L,tr,desc): - print '%s run length %u: size %u: time %.3f ms: decode %.3f ms' % \ - (desc, L, - tr.encode_size, - tr.encode_time.mean * 1000.0, - tr.decode_time.mean * 1000.0) -#end - -class Xdelta3RunClass: - def __init__(self, extra): - self.extra = extra - #end - - def __str__(self): - return ' '.join(self.extra) - #end - - def New(self): - return Xdelta3Runner(self.extra) - #end -#end - -class Xdelta3Runner: - def __init__(self, extra): - self.extra = extra - #end - - def Encode(self, target, source, output): - args = (ALL_ARGS + - self.extra + - ['-e']) - if source: - args.append('-s') - args.append(source) - #end - args = args + [target, output] - self.Main(args) - #end - - def Decode(self, input, source, output): - args = (ALL_ARGS + - ['-d']) - if source: - args.append('-s') - args.append(source) - #end - args = args + [input, output] - self.Main(args) - #end - - def Verify(self, target, recon): - RunCommand(('cmp', target, recon)) - #end - - def EncodeSize(self, output): - return os.stat(output).st_size - #end - - def Main(self, args): - try: - #print 'Run %s' % (' '.join(args)) - xdelta3.xd3_main_cmdline(args) - except Exception, e: - raise CommandError(args, "xdelta3.main exception: %s" % e) - #end - #end -#end - -class Xdelta3Mod1: - def __init__(self, file): - self.target_data = open(file, 'r').read() - #end - - def Encode(self, ignore1, ignore2, ignore3): - r1, encoded = xdelta3.xd3_encode_memory(self.target_data, None, 1000000, 1<<10) - if r1 != 0: - raise CommandError('memory', 'encode failed: %s' % r1) - #end - self.encoded = encoded - #end - - def Decode(self, ignore1, ignore2, ignore3): - r2, data1 = xdelta3.xd3_decode_memory(self.encoded, None, len(self.target_data)) - if r2 != 0: - raise CommandError('memory', 'decode failed: %s' % r1) - #end - self.decoded = data1 - #end - - def Verify(self, ignore1, ignore2): - if self.target_data != self.decoded: - raise CommandError('memory', 'bad decode') - #end - #end - - def EncodeSize(self, ignore1): - return len(self.encoded) - #end -#end - -class GzipRun1: - def Encode(self, target, source, output): - assert source == None - RunCommandIO(['gzip', '-cf'], target, output) - #end - - def Decode(self, input, source, output): - assert source == None - RunCommandIO(['gzip', '-dcf'], input, output) - #end - - def Verify(self, target, recon): - RunCommand(('cmp', target, recon)) - #end - - def EncodeSize(self, output): - return os.stat(output).st_size - #end -#end - -class Xdelta1RunClass: - def __str__(self): - return 'xdelta1' - #end - - def New(self): - return Xdelta1Runner() - #end -#end - -class Xdelta1Runner: - def Encode(self, target, source, output): - assert source != None - args = ['xdelta1', 'delta', '-q', source, target, output] - RunCommand(args, [0, 1]) - #end - - def Decode(self, input, source, output): - assert source != None - args = ['xdelta1', 'patch', '-q', input, source, output] - # Note: for dumb historical reasons, xdelta1 returns 1 or 0 - RunCommand(args) - #end - - def Verify(self, target, recon): - RunCommand(('cmp', target, recon)) - #end - - def EncodeSize(self, output): - return os.stat(output).st_size - #end -#end - -# exceptions -class SkipRcsException: - def __init__(self,reason): - self.reason = reason - #end -#end - -class NotEnoughVersions: - def __init__(self): - pass - #end -#end - -class CommandError: - def __init__(self,cmd,str): - if type(cmd) is types.TupleType or \ - type(cmd) is types.ListType: - cmd = reduce(lambda x,y: '%s %s' % (x,y),cmd) - #end - print 'command was: ',cmd - print 'command failed: ',str - print 'have fun debugging' - #end -#end - -class RcsVersion: - def __init__(self,vstr): - self.vstr = vstr - #end - def __cmp__(self,other): - return cmp(self.date, other.date) - #end - def __str__(self): - return str(self.vstr) - #end -#end - -class RcsFile: - - def __init__(self, fname): - self.fname = fname - self.versions = [] - self.state = HEAD_STATE - #end - - def SetTotRev(self,s): - self.totrev = int(s) - #end - - def Rev(self,s): - self.rev = RcsVersion(s) - if len(self.versions) >= self.totrev: - raise SkipRcsException('too many versions (in log messages)') - #end - self.versions.append(self.rev) - #end - - def Date(self,s): - self.rev.date = s - #end - - def Match(self, line, state, rx, gp, newstate, f): - if state == self.state: - m = rx.match(line) - if m: - if f: - f(m.group(gp)) - #end - self.state = newstate - return 1 - #end - #end - return None - #end - - def Sum1Rlog(self): - f = os.popen('rlog '+self.fname, "r") - l = f.readline() - while l: - if self.Match(l, HEAD_STATE, RE_TOTREV, 1, BAR_STATE, self.SetTotRev): - pass - elif self.Match(l, BAR_STATE, RE_BAR, 1, REV_STATE, None): - pass - elif self.Match(l, REV_STATE, RE_REV, 1, DATE_STATE, self.Rev): - pass - elif self.Match(l, DATE_STATE, RE_DATE, 1, BAR_STATE, self.Date): - pass - #end - l = f.readline() - #end - c = f.close() - if c != None: - raise c - #end - #end - - def Sum1(self): - st = os.stat(self.fname) - self.rcssize = st.st_size - self.Sum1Rlog() - if self.totrev != len(self.versions): - raise SkipRcsException('wrong version count') - #end - self.versions.sort() - #end - - def Checkout(self,n): - v = self.versions[n] - out = open(self.Verf(n), "w") - cmd = 'co -ko -p%s %s' % (v.vstr, self.fname) - total = 0 - (inf, - stream, - err) = os.popen3(cmd, "r") - inf.close() - buf = stream.read() - while buf: - total = total + len(buf) - out.write(buf) - buf = stream.read() - #end - v.vsize = total - estr = '' - buf = err.read() - while buf: - estr = estr + buf - buf = err.read() - #end - if stream.close(): - raise CommandError(cmd, 'checkout failed: %s\n%s\n%s' % (v.vstr, self.fname, estr)) - #end - out.close() - err.close() - #end - - def Vdate(self,n): - return self.versions[n].date - #end - - def Vstr(self,n): - return self.versions[n].vstr - #end - - def Verf(self,n): - return os.path.join(TMPDIR, 'input.%d' % n) - #end - - def FilePairsByDate(self, runclass): - if self.totrev < 2: - raise NotEnoughVersions() - #end - self.Checkout(0) - ntrials = [] - if self.totrev < 2: - return vtrials - #end - for v in range(0,self.totrev-1): - if v > 1: - os.remove(self.Verf(v-1)) - #end - self.Checkout(v+1) - if os.stat(self.Verf(v)).st_size < MIN_SIZE or \ - os.stat(self.Verf(v+1)).st_size < MIN_SIZE: - continue - #end - - result = TimedTest(self.Verf(v+1), - self.Verf(v), - runclass.New()) - - target_size = os.stat(self.Verf(v+1)).st_size - - ntrials.append(result) - #end - - os.remove(self.Verf(self.totrev-1)) - os.remove(self.Verf(self.totrev-2)) - return ntrials - #end - - def AppendVersion(self, f, n): - self.Checkout(n) - rf = open(self.Verf(n), "r") - data = rf.read() - f.write(data) - rf.close() - return len(data) - #end - -class RcsFinder: - def __init__(self): - self.subdirs = [] - self.rcsfiles = [] - self.others = [] - self.skipped = [] - self.biground = 0 - #end - - def Scan1(self,dir): - dents = os.listdir(dir) - subdirs = [] - rcsfiles = [] - others = [] - for dent in dents: - full = os.path.join(dir, dent) - if os.path.isdir(full): - subdirs.append(full) - elif dent[len(dent)-2:] == ",v": - rcsfiles.append(RcsFile(full)) - else: - others.append(full) - #end - #end - self.subdirs = self.subdirs + subdirs - self.rcsfiles = self.rcsfiles + rcsfiles - self.others = self.others + others - return subdirs - #end - - def Crawl(self, dir): - subdirs = [dir] - while subdirs: - s1 = self.Scan1(subdirs[0]) - subdirs = subdirs[1:] + s1 - #end - #end - - def Summarize(self): - good = [] - for rf in self.rcsfiles: - try: - rf.Sum1() - if rf.totrev < 2: - raise SkipRcsException('too few versions (< 2)') - #end - except SkipRcsException, e: - #print 'skipping file %s: %s' % (rf.fname, e.reason) - self.skipped.append(rf) - else: - good.append(rf) - #end - self.rcsfiles = good - #end - - def AllPairsByDate(self, runclass): - results = [] - good = [] - for rf in self.rcsfiles: - try: - results = results + rf.FilePairsByDate(runclass) - except SkipRcsException: - print 'file %s has compressed versions: skipping' % (rf.fname) - except NotEnoughVersions: - print 'testing %s on %s: not enough versions' % (runclass, rf.fname) - else: - good.append(rf) - #end - self.rcsfiles = good - self.ReportPairs(runclass, results) - return results - #end - - def ReportPairs(self, name, results): - encode_time = 0 - decode_time = 0 - encode_size = 0 - for r in results: - encode_time += r.encode_time.mean - decode_time += r.decode_time.mean - encode_size += r.encode_size - #end - print '%s rcs: encode %.2f s: decode %.2f s: size %d' % \ - (name, encode_time, decode_time, encode_size) - #end - - def MakeBigFiles(self, rand): - f1 = open(TMPDIR + "/big.1", "w") - f2 = open(TMPDIR + "/big.2", "w") - population = [] - for file in self.rcsfiles: - if len(file.versions) < 2: - continue - population.append(file) - #end - f1sz = 0 - f2sz = 0 - fcount = int(len(population) * FILE_P) - assert fcount > 0 - for file in rand.sample(population, fcount): - m = IGNORE_FILENAME.match(file.fname) - if m != None: - continue - #end - r1, r2 = rand.sample(xrange(0, len(file.versions)), 2) - f1sz += file.AppendVersion(f1, r1) - f2sz += file.AppendVersion(f2, r2) - #m.update('%s,%s,%s ' % (file.fname[len(RCSDIR):], file.Vstr(r1), file.Vstr(r2))) - #end - testkey = 'rcs%d' % self.biground - self.biground = self.biground + 1 - - print '%s; source %u bytes; target %u bytes' % (testkey, f1sz, f2sz) - f1.close() - f2.close() - return (TMPDIR + "/big.1", - TMPDIR + "/big.2", - testkey) - #end - - def Generator(self): - return lambda rand: self.MakeBigFiles(rand) - #end -#end - -# find a set of RCS files for testing -def GetTestRcsFiles(): - rcsf = RcsFinder() - rcsf.Crawl(RCSDIR) - if len(rcsf.rcsfiles) == 0: - raise CommandError('', 'no RCS files') - #end - rcsf.Summarize() - print "rcsfiles: rcsfiles %d; subdirs %d; others %d; skipped %d" % (len(rcsf.rcsfiles), - len(rcsf.subdirs), - len(rcsf.others), - len(rcsf.skipped)) - print StatList([x.rcssize for x in rcsf.rcsfiles], "rcssize").str - print StatList([x.totrev for x in rcsf.rcsfiles], "totrev").str - return rcsf -#end - -class SampleDataTest: - def __init__(self, dirs): - self.pairs = [] - while dirs: - d = dirs[0] - dirs = dirs[1:] - l = os.listdir(d) - files = [] - for e in l: - p = os.path.join(d, e) - if os.path.isdir(p): - dirs.append(p) - else: - files.append(p) - #end - #end - if len(files) > 1: - files.sort() - for x in xrange(len(files) - 1): - self.pairs.append((files[x], files[x+1], - '%s-%s' % (files[x], files[x+1]))) - #end - #end - #end - #end - - def Generator(self): - return lambda rand: rand.choice(self.pairs) - #end -#end - -# configs are represented as a list of values, -# program takes a list of strings: -def ConfigToArgs(config): - args = [ '-C', - ','.join([str(x) for x in config[0:SOFT_CONFIG_CNT]])] - for i in range(SOFT_CONFIG_CNT, len(CONFIG_ORDER)): - key = CONFIG_ARGMAP[CONFIG_ORDER[i]] - val = config[i] - if val == 'true' or val == 'false': - if val == 'true': - args.append('%s' % key) - #end - else: - args.append('%s=%s' % (key, val)) - #end - #end - return args -#end - -# -class RandomTest: - def __init__(self, tnum, tinput, config, syntuple = None): - self.mytinput = tinput[2] - self.myconfig = config - self.tnum = tnum - - if syntuple != None: - self.runtime = syntuple[0] - self.compsize = syntuple[1] - self.decodetime = None - else: - args = ConfigToArgs(config) - result = TimedTest(tinput[1], tinput[0], Xdelta3Runner(args)) - - self.runtime = result.encode_time.mean - self.compsize = result.encode_size - self.decodetime = result.decode_time.mean - #end - - self.score = None - self.time_pos = None - self.size_pos = None - self.score_pos = None - #end - - def __str__(self): - decodestr = ' %.6f' % self.decodetime - return 'time %.6f%s size %d%s << %s >>%s' % ( - self.time(), ((self.time_pos != None) and (" (%s)" % self.time_pos) or ""), - self.size(), ((self.size_pos != None) and (" (%s)" % self.size_pos) or ""), - c2str(self.config()), - decodestr) - #end - - def time(self): - return self.runtime - #end - - def size(self): - return self.compsize - #end - - def config(self): - return self.myconfig - #end - - def score(self): - return self.score - #end - - def tinput(self): - return self.mytinput - #end -#end - -def PosInAlist(l, e): - for i in range(0, len(l)): - if l[i][1] == e: - return i; - #end - #end - return -1 -#end - -# Generates a set of num_results test configurations, given the list of -# retest-configs. -def RandomTestConfigs(rand, input_configs, num_results): - - outputs = input_configs[:] - have_set = dict([(c,c) for c in input_configs]) - - # Compute a random configuration - def RandomConfig(): - config = [] - cmap = {} - for key in CONFIG_ORDER: - val = cmap[key] = (INPUT_SPEC(rand)[key])(cmap) - config.append(val) - #end - return tuple(config) - #end - - while len(outputs) < num_results: - newc = None - for i in xrange(100): - c = RandomConfig() - if have_set.has_key(c): - continue - #end - have_set[c] = c - newc = c - break - if newc is None: - print 'stopped looking for configs at %d' % len(outputs) - break - #end - outputs.append(c) - #end - outputs.sort() - return outputs -#end - -def RunTestLoop(rand, generator, rounds): - configs = [] - for rnum in xrange(rounds): - configs = RandomTestConfigs(rand, configs, MAX_RESULTS) - tinput = generator(rand) - tests = [] - for x in xrange(len(configs)): - t = RandomTest(x, tinput, configs[x]) - print 'Round %d test %d: %s' % (rnum, x, t) - tests.append(t) - #end - results = ScoreTests(tests) - - for r in results: - c = r.config() - if not test_all_config_results.has_key(c): - test_all_config_results[c] = [r] - else: - test_all_config_results[c].append(r) - #end - #end - - GraphResults('expt%d' % rnum, results) - GraphSummary('sum%d' % rnum, results) - - # re-test some fraction - configs = [r.config() for r in results[0:int(MAX_RESULTS * KEEP_P)]] - #end -#end - -# TODO: cleanup -test_all_config_results = {} - -def ScoreTests(results): - scored = [] - timed = [] - sized = [] - - t_min = float(min([test.time() for test in results])) - #t_max = float(max([test.time() for test in results])) - s_min = float(min([test.size() for test in results])) - #s_max = float(max([test.size() for test in results])) - - for test in results: - - # Hyperbolic function. Smaller scores still better - red = 0.999 # minimum factors for each dimension are 1/1000 - test.score = ((test.size() - s_min * red) * - (test.time() - t_min * red)) - - scored.append((test.score, test)) - timed.append((test.time(), test)) - sized.append((test.size(), test)) - #end - - scored.sort() - timed.sort() - sized.sort() - - best_by_size = [] - best_by_time = [] - - pos = 0 - for (score, test) in scored: - pos += 1 - test.score_pos = pos - #end - - scored = [x[1] for x in scored] - - for test in scored: - test.size_pos = PosInAlist(sized, test) - test.time_pos = PosInAlist(timed, test) - #end - - for test in scored: - c = test.config() - s = 0.0 - print 'H-Score: %0.9f %s' % (test.score, test) - #end - - return scored -#end - -def GraphResults(desc, results): - f = open("data-%s.csv" % desc, "w") - for r in results: - f.write("%0.9f\t%d\t# %s\n" % (r.time(), r.size(), r)) - #end - f.close() - os.system("./plot.sh data-%s.csv plot-%s.jpg" % (desc, desc)) -#end - -def GraphSummary(desc, results_ignore): - test_population = 0 - config_ordered = [] - - # drops duplicate test/config pairs (TODO: don't retest them) - for config, cresults in test_all_config_results.items(): - input_config_map = {} - uniq = [] - for test in cresults: - assert test.config() == config - test_population += 1 - key = test.tinput() - if not input_config_map.has_key(key): - input_config_map[key] = {} - #end - if input_config_map[key].has_key(config): - print 'skipping repeat test %s vs. %s' % (input_config_map[key][config], test) - continue - #end - input_config_map[key][config] = test - uniq.append(test) - #end - config_ordered.append(uniq) - #end - - # sort configs descending by number of tests - config_ordered.sort(lambda x, y: len(y) - len(x)) - - print 'population %d: %d configs %d results' % \ - (test_population, - len(config_ordered), - len(config_ordered[0])) - - if config_ordered[0] == 1: - return - #end - - # a map from test-key to test-list w/ various configs - input_set = {} - osize = len(config_ordered) - - for i in xrange(len(config_ordered)): - config = config_ordered[i][0].config() - config_tests = config_ordered[i] - - #print '%s has %d tested inputs' % (config, len(config_tests)) - - if len(input_set) == 0: - input_set = dict([(t.tinput(), [t]) for t in config_tests]) - continue - #end - - # a map from test-key to test-list w/ various configs - update_set = {} - for r in config_tests: - t = r.tinput() - if input_set.has_key(t): - update_set[t] = input_set[t] + [r] - else: - #print 'config %s does not have test %s' % (config, t) - pass - #end - #end - - if len(update_set) <= 1: - break - #end - - input_set = update_set - - # continue if there are more w/ the same number of inputs - if i < (len(config_ordered) - 1) and \ - len(config_ordered[i + 1]) == len(config_tests): - continue - #end - - # synthesize results for multi-test inputs - config_num = None - - # map of config to sum(various test-keys) - smap = {} - for (key, tests) in input_set.items(): - if config_num == None: - # config_num should be the same in all elements - config_num = len(tests) - smap = dict([(r.config(), - (r.time(), - r.size())) - for r in tests]) - else: - # compuate the per-config sum of time/size - assert config_num == len(tests) - smap = dict([(r.config(), - (smap[r.config()][0] + r.time(), - smap[r.config()][1] + r.size())) - for r in tests]) - #end - #end - - if config_num == 1: - continue - #end - - if len(input_set) == osize: - break - #end - - summary = '%s-%d' % (desc, len(input_set)) - osize = len(input_set) - - print 'generate %s w/ %d configs' % (summary, config_num) - syn = [RandomTest(0, (None, None, summary), config, - syntuple = (smap[config][0], smap[config][1])) - for config in smap.keys()] - syn = ScoreTests(syn) - #print 'smap is %s' % (smap,) - #print 'syn is %s' % (' and '.join([str(x) for x in syn])) - GraphResults(summary, syn) - #end -#end - -if __name__ == "__main__": - try: - RunCommand(['rm', '-rf', TMPDIR]) - os.mkdir(TMPDIR) - - rcsf = GetTestRcsFiles() - #generator = rcsf.Generator() - - #sample = SampleDataTest([SAMPLEDIR]) - #generator = sample.Generator() - - #rand = random.Random(135135135135135) - #RunTestLoop(rand, generator, TEST_ROUNDS) - - #RunSpeedTest() - - #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9'])) - x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-S', 'djw'])) - x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-1', '-S', 'djw'])) - #x3r = rcsf.AllPairsByDate(Xdelta3RunClass(['-9', '-T'])) - - #x1r = rcsf.AllPairsByDate(Xdelta1RunClass()) - - except CommandError: - pass - else: - RunCommand(['rm', '-rf', TMPDIR]) - pass - #end -#end diff --git a/xdelta3/xdelta3.c b/xdelta3/xdelta3.c index 3dbb923..d0cf2b1 100644 --- a/xdelta3/xdelta3.c +++ b/xdelta3/xdelta3.c @@ -5085,7 +5085,7 @@ XD3_TEMPLATE(xd3_string_match_) (xd3_stream *stream) const uint8_t *inp; uint32_t scksum = 0; - uint32_t scksum_state; + uint32_t scksum_state = 0; uint32_t lcksum = 0; usize_t sinx; usize_t linx; diff --git a/xdelta3/xdelta3.h b/xdelta3/xdelta3.h index c6226c1..9ee90d6 100644 --- a/xdelta3/xdelta3.h +++ b/xdelta3/xdelta3.h @@ -1024,14 +1024,13 @@ int xd3_decode_memory (const uint8_t *input, usize_t avail_output, int flags); -/* This function encodes an in-memory input. Everything else about - * the xd3_stream is configurable. The output array must be large - * enough to hold the output or else ENOSPC is returned. The source - * (if any) should be set using xd3_set_source() with a single-block - * xd3_source. This calls the underlying non-blocking interface, - * xd3_encode_input(), handling the necessary input/output states. - * This method be considered a reference for any application using - * xd3_encode_input() directly. +/* This function encodes an in-memory input. The output array must be + * large enough to hold the output or else ENOSPC is returned. The + * source (if any) should be set using xd3_set_source() with a + * single-block xd3_source. This calls the underlying non-blocking + * interface, xd3_encode_input(), handling the necessary input/output + * states. This method be considered a reference for any application + * using xd3_encode_input() directly. * * xd3_stream stream; * xd3_config config; -- cgit v1.2.3