summaryrefslogtreecommitdiff
path: root/xdelta3/testing/checksum_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'xdelta3/testing/checksum_test.cc')
-rw-r--r--xdelta3/testing/checksum_test.cc770
1 files changed, 770 insertions, 0 deletions
diff --git a/xdelta3/testing/checksum_test.cc b/xdelta3/testing/checksum_test.cc
new file mode 100644
index 0000000..9cd3740
--- /dev/null
+++ b/xdelta3/testing/checksum_test.cc
@@ -0,0 +1,770 @@
1/* xdelta3 - delta compression tools and library -*- Mode: C++ -*-
2 Copyright 2016 Joshua MacDonald
3
4 Licensed under the Apache License, Version 2.0 (the "License");
5 you may not use this file except in compliance with the License.
6 You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10 Unless required by applicable law or agreed to in writing, software
11 distributed under the License is distributed on an "AS IS" BASIS,
12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 See the License for the specific language governing permissions and
14 limitations under the License.
15*/
16
17#include "test.h"
18#include <assert.h>
19#include <list>
20#include <vector>
21#include <algorithm>
22
23#include "../cpp-btree/btree_map.h"
24
25extern "C" {
26uint32_t xd3_large32_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
27uint32_t xd3_large32_cksum_update_old (xd3_hash_cfg *cfg, uint32_t cksum,
28 const uint8_t *base, const usize_t look);
29
30uint64_t xd3_large64_cksum_old (xd3_hash_cfg *cfg, const uint8_t *base, const usize_t look);
31uint64_t xd3_large64_cksum_update_old (xd3_hash_cfg *cfg, uint64_t cksum,
32 const uint8_t *base, const usize_t look);
33}
34
35using btree::btree_map;
36using std::list;
37using std::vector;
38
39// MLCG parameters
40// a, a*
41uint32_t good_32bit_values[] = {
42 1597334677U, // ...
43 741103597U, 887987685U,
44};
45
46// a, a*
47uint64_t good_64bit_values[] = {
48 1181783497276652981ULL, 4292484099903637661ULL,
49 7664345821815920749ULL, // ...
50};
51
52void print_header() {
53 static int hdr_cnt = 0;
54 if (hdr_cnt++ % 20 == 0) {
55 printf("%-32sConf\t\tCount\tUniq\tFull\tCover\tColls"
56 "\tMB/s\tIters\t#Colls\n", "Name");
57 }
58}
59
60struct true_type { };
61struct false_type { };
62
63template <typename Word>
64usize_t bitsof();
65
66template<>
67usize_t bitsof<unsigned int>() {
68 return sizeof(unsigned int) * 8;
69}
70
71template<>
72usize_t bitsof<unsigned long>() {
73 return sizeof(unsigned long) * 8;
74}
75
76template<>
77usize_t bitsof<unsigned long long>() {
78 return sizeof(unsigned long long) * 8;
79}
80
81template <typename Word>
82struct hhash { // shift "s" bits leaving the high bits as a hash value for
83 // this checksum, which are the most "distant" in terms of the
84 // spectral test for the rabin_karp MLCG. For short windows,
85 // the high bits aren't enough, XOR "mask" worth of these in.
86 Word operator()(const Word t, const Word s, const Word mask) {
87 return (t >> s) ^ (t & mask);
88 }
89};
90
91template <typename Word>
92Word good_word();
93
94template<>
95uint32_t good_word<uint32_t>() {
96 return good_32bit_values[0];
97}
98
99template<>
100uint64_t good_word<uint64_t>() {
101 return good_64bit_values[0];
102}
103
104// CLASSES
105
106#define SELF Word, CksumSize, CksumSkip, Hash, Compaction
107#define MEMBER template <typename Word, \
108 int CksumSize, \
109 int CksumSkip, \
110 typename Hash, \
111 int Compaction>
112
113MEMBER
114struct cksum_params {
115 typedef Word word_type;
116 typedef Hash hash_type;
117
118 static const int cksum_size = CksumSize;
119 static const int cksum_skip = CksumSkip;
120 static const int compaction = Compaction;
121};
122
123MEMBER
124struct rabin_karp : public cksum_params<SELF> {
125 // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
126 rabin_karp()
127 : powers(make_powers()),
128 product(powers[0] * good_word<Word>()),
129 incr_state(0) { }
130
131 static Word* make_powers() {
132 Word *p = new Word[CksumSize];
133 p[CksumSize - 1] = 1;
134 for (int i = CksumSize - 2; i >= 0; i--) {
135 p[i] = p[i + 1] * good_word<Word>();
136 }
137 return p;
138 }
139
140 ~rabin_karp() {
141 delete [] powers;
142 }
143
144 Word step(const uint8_t *ptr) {
145 Word h = 0;
146 for (int i = 0; i < CksumSize; i++) {
147 h += (ptr[i]) * powers[i];
148 }
149 return h;
150 }
151
152 Word state0(const uint8_t *ptr) {
153 incr_state = step(ptr);
154 return incr_state;
155 }
156
157 Word incr(const uint8_t *ptr) {
158 incr_state = good_word<Word>() * incr_state -
159 product * (ptr[-1]) + (ptr[CksumSize - 1]);
160 return incr_state;
161 }
162
163 const Word *const powers;
164 const Word product;
165 Word incr_state;
166};
167
168MEMBER
169struct with_stream : public cksum_params<SELF> {
170 xd3_stream stream;
171
172 with_stream()
173 {
174 xd3_config cfg;
175 memset (&stream, 0, sizeof (stream));
176 xd3_init_config (&cfg, 0);
177 cfg.smatch_cfg = XD3_SMATCH_SOFT;
178 cfg.smatcher_soft.large_look = CksumSize;
179 cfg.smatcher_soft.large_step = CksumSkip;
180 cfg.smatcher_soft.small_look = 4;
181 cfg.smatcher_soft.small_chain = 4;
182 cfg.smatcher_soft.small_lchain = 4;
183 cfg.smatcher_soft.max_lazy = 4;
184 cfg.smatcher_soft.long_enough = 4;
185 CHECK_EQ(0, xd3_config_stream (&stream, &cfg));
186
187 CHECK_EQ(0, xd3_size_hashtable (&stream,
188 1<<10 /* ignored */,
189 stream.smatcher.large_look,
190 & stream.large_hash));
191 }
192 ~with_stream()
193 {
194 xd3_free_stream (&stream);
195 }
196};
197
198MEMBER
199struct large_cksum : public with_stream<SELF> {
200 Word step(const uint8_t *ptr) {
201 return xd3_large_cksum (&this->stream.large_hash, ptr, CksumSize);
202 }
203
204 Word state0(const uint8_t *ptr) {
205 incr_state = step(ptr);
206 return incr_state;
207 }
208
209 Word incr(const uint8_t *ptr) {
210 incr_state = xd3_large_cksum_update (&this->stream.large_hash,
211 incr_state, ptr - 1, CksumSize);
212 return incr_state;
213 }
214
215 Word incr_state;
216};
217
218#if SIZEOF_USIZE_T == 4
219#define xd3_large_cksum_old xd3_large32_cksum_old
220#define xd3_large_cksum_update_old xd3_large32_cksum_update_old
221#elif SIZEOF_USIZE_T == 8
222#define xd3_large_cksum_old xd3_large64_cksum_old
223#define xd3_large_cksum_update_old xd3_large64_cksum_update_old
224#endif
225
226MEMBER
227struct large_cksum_old : public with_stream<SELF> {
228 Word step(const uint8_t *ptr) {
229 return xd3_large_cksum_old (&this->stream.large_hash, ptr, CksumSize);
230 }
231
232 Word state0(const uint8_t *ptr) {
233 incr_state = step(ptr);
234 return incr_state;
235 }
236
237 Word incr(const uint8_t *ptr) {
238 incr_state = xd3_large_cksum_update_old (&this->stream.large_hash,
239 incr_state, ptr - 1, CksumSize);
240 return incr_state;
241 }
242
243 Word incr_state;
244};
245
246// TESTS
247
248template <typename Word>
249struct file_stats {
250 typedef const uint8_t* ptr_type;
251 typedef Word word_type;
252 typedef btree::btree_multimap<word_type, ptr_type> table_type;
253 typedef typename table_type::iterator table_iterator;
254
255 usize_t cksum_size;
256 usize_t cksum_skip;
257 usize_t unique;
258 usize_t unique_values;
259 usize_t count;
260 table_type table;
261
262 file_stats(usize_t size, usize_t skip)
263 : cksum_size(size),
264 cksum_skip(skip),
265 unique(0),
266 unique_values(0),
267 count(0) {
268 }
269
270 void reset() {
271 unique = 0;
272 unique_values = 0;
273 count = 0;
274 table.clear();
275 }
276
277 void update(word_type word, ptr_type ptr) {
278 table_iterator t_i = table.find(word);
279
280 count++;
281 if (t_i != table.end()) {
282 int collisions = 0;
283 for (table_iterator p_i = t_i;
284 p_i != table.end() && p_i->first == word;
285 ++p_i) {
286 if (memcmp(p_i->second, ptr, cksum_size) == 0) {
287 return;
288 }
289 collisions++;
290 }
291 if (collisions >= 1000) {
292 fprintf(stderr, "Something is not right, lots of collisions=%d\n",
293 collisions);
294 abort();
295 }
296 } else {
297 unique_values++;
298 }
299 unique++;
300 table.insert(std::make_pair(word, ptr));
301 return;
302 }
303
304 void freeze() {
305 table.clear();
306 }
307};
308
309struct test_result_base;
310
311static vector<test_result_base*> all_tests;
312
313struct test_result_base {
314 virtual ~test_result_base() {
315 }
316 virtual void reset() = 0;
317 virtual void print() = 0;
318 virtual void get(const uint8_t* buf, const size_t buf_size,
319 usize_t iters) = 0;
320 virtual void stat() = 0;
321 virtual usize_t count() = 0;
322 virtual usize_t dups() = 0;
323 virtual double uniqueness() = 0;
324 virtual double fullness() = 0;
325 virtual double collisions() = 0;
326 virtual double coverage() = 0;
327 virtual double compression() = 0;
328 virtual double time() = 0;
329 virtual double total_time() = 0;
330 virtual usize_t total_count() = 0;
331 virtual usize_t total_dups() = 0;
332};
333
334template <typename Checksum>
335struct test_result : public test_result_base {
336 Checksum cksum;
337 const char *test_name;
338 file_stats<typename Checksum::word_type> fstats;
339 usize_t test_size;
340 usize_t n_steps;
341 usize_t n_incrs;
342 typename Checksum::word_type s_bits;
343 typename Checksum::word_type s_mask;
344 usize_t t_entries;
345 usize_t h_bits;
346 usize_t h_buckets_full;
347 char *hash_table;
348 long accum_millis;
349 usize_t accum_iters;
350
351 // These are not reset
352 double accum_time;
353 usize_t accum_count;
354 usize_t accum_dups;
355 usize_t accum_colls;
356 size_t accum_size;
357
358 test_result(const char *name)
359 : test_name(name),
360 fstats(Checksum::cksum_size, Checksum::cksum_skip),
361 hash_table(NULL),
362 accum_millis(0),
363 accum_iters(0),
364 accum_time(0.0),
365 accum_count(0),
366 accum_dups(0),
367 accum_colls(0),
368 accum_size(0) {
369 all_tests.push_back(this);
370 }
371
372 ~test_result() {
373 reset();
374 }
375
376 void reset() {
377 // size of file
378 test_size = 0;
379
380 // count
381 n_steps = 0;
382 n_incrs = 0;
383
384 // four values used by new_table()/summarize_table()
385 s_bits = 0;
386 s_mask = 0;
387 t_entries = 0;
388 h_bits = 0;
389 h_buckets_full = 0;
390
391 accum_millis = 0;
392 accum_iters = 0;
393
394 fstats.reset();
395
396 // temporary
397 if (hash_table) {
398 delete(hash_table);
399 hash_table = NULL;
400 }
401 }
402
403 usize_t count() {
404 if (Checksum::cksum_skip == 1) {
405 return n_incrs;
406 } else {
407 return n_steps;
408 }
409 }
410
411 usize_t dups() {
412 return fstats.count - fstats.unique;
413 }
414
415 /* Fraction of distinct strings of length cksum_size which are not
416 * represented in the hash table. */
417 double collisions() {
418 return (fstats.unique - fstats.unique_values) / (double) fstats.unique;
419 }
420 usize_t colls() {
421 return (fstats.unique - fstats.unique_values);
422 }
423
424 double uniqueness() {
425 return 1.0 - (double) dups() / count();
426 }
427
428 double fullness() {
429 return (double) h_buckets_full / (1 << h_bits);
430 }
431
432 double coverage() {
433 return (double) h_buckets_full / uniqueness() / count();
434 }
435
436 double compression() {
437 return 1.0 - coverage();
438 }
439
440 double time() {
441 return (double) accum_millis / accum_iters;
442 }
443
444 double total_time() {
445 return accum_time;
446 }
447
448 usize_t total_count() {
449 return accum_count;
450 }
451
452 usize_t total_dups() {
453 return accum_dups;
454 }
455
456 usize_t total_colls() {
457 return accum_dups;
458 }
459
460 void stat() {
461 accum_time += time();
462 accum_count += count();
463 accum_dups += dups();
464 accum_colls += colls();
465 accum_size += test_size;
466 }
467
468 void print() {
469 if (fstats.count != count()) {
470 fprintf(stderr, "internal error: %" W "d != %" W "d\n", fstats.count, count());
471 abort();
472 }
473 print_header();
474 printf("%-32s%d/%d 2^%" W "u\t%" W "u\t%0.4f\t%.4f\t%.4f\t%.1e\t%.2f\t"
475 "%" W "u\t%" W "u\n",
476 test_name,
477 Checksum::cksum_size,
478 Checksum::cksum_skip,
479 h_bits,
480 count(),
481 uniqueness(),
482 fullness(),
483 coverage(),
484 collisions(),
485 0.001 * accum_iters * test_size / accum_millis,
486 accum_iters,
487 colls());
488 }
489
490 usize_t size_log2 (usize_t slots) {
491 usize_t bits = bitsof<typename Checksum::word_type>() - 1;
492 usize_t i;
493
494 for (i = 3; i <= bits; i += 1) {
495 if (slots <= (1U << i)) {
496 return i - Checksum::compaction;
497 }
498 }
499
500 return bits;
501 }
502
503 void new_table(usize_t entries) {
504 t_entries = entries;
505 h_bits = size_log2(entries);
506
507 usize_t n = 1 << h_bits;
508
509 s_bits = bitsof<typename Checksum::word_type>() - h_bits;
510 s_mask = n - 1U;
511
512 hash_table = new char[n / 8];
513 memset(hash_table, 0, n / 8);
514 }
515
516 int get_table_bit(usize_t i) {
517 return hash_table[i/8] & (1 << i%8);
518 }
519
520 int set_table_bit(usize_t i) {
521 return hash_table[i/8] |= (1 << i%8);
522 }
523
524 void summarize_table() {
525 usize_t n = 1 << h_bits;
526 usize_t f = 0;
527 for (usize_t i = 0; i < n; i++) {
528 if (get_table_bit(i)) {
529 f++;
530 }
531 }
532 h_buckets_full = f;
533 }
534
535 void get(const uint8_t* buf, const size_t buf_size, usize_t test_iters) {
536 typename Checksum::hash_type hash;
537 const uint8_t *ptr;
538 const uint8_t *end;
539 usize_t periods;
540 int64_t last_offset;
541 int64_t stop;
542
543 test_size = buf_size;
544 last_offset = buf_size - Checksum::cksum_size;
545
546 if (last_offset < 0) {
547 periods = 0;
548 n_steps = 0;
549 n_incrs = 0;
550 stop = -Checksum::cksum_size;
551 } else {
552 periods = last_offset / Checksum::cksum_skip;
553 n_steps = periods + 1;
554 n_incrs = last_offset + 1;
555 stop = last_offset - (periods + 1) * Checksum::cksum_skip;
556 }
557
558 // Compute file stats once.
559 if (fstats.unique_values == 0) {
560 if (Checksum::cksum_skip == 1) {
561 for (size_t i = 0; i <= buf_size - Checksum::cksum_size; i++) {
562 fstats.update(hash(cksum.step(buf + i), s_bits, s_mask), buf + i);
563 }
564 } else {
565 ptr = buf + last_offset;
566 end = buf + stop;
567
568 for (; ptr != end; ptr -= Checksum::cksum_skip) {
569 fstats.update(hash(cksum.step(ptr), s_bits, s_mask), ptr);
570 }
571 }
572 fstats.freeze();
573 }
574
575 long start_test = get_millisecs_now();
576
577 if (Checksum::cksum_skip != 1) {
578 new_table(n_steps);
579
580 for (usize_t i = 0; i < test_iters; i++) {
581 ptr = buf + last_offset;
582 end = buf + stop;
583
584 for (; ptr != end; ptr -= Checksum::cksum_skip) {
585 set_table_bit(hash(cksum.step(ptr), s_bits, s_mask));
586 }
587 }
588
589 summarize_table();
590 }
591
592 stop = buf_size - Checksum::cksum_size + 1;
593 if (stop < 0) {
594 stop = 0;
595 }
596
597 if (Checksum::cksum_skip == 1) {
598 new_table(n_incrs);
599
600 for (usize_t i = 0; i < test_iters; i++) {
601 ptr = buf;
602 end = buf + stop;
603
604 if (ptr != end) {
605 set_table_bit(hash(cksum.state0(ptr++), s_bits, s_mask));
606 }
607
608 for (; ptr != end; ptr++) {
609 typename Checksum::word_type w = cksum.incr(ptr);
610 CHECK_EQ(w, cksum.step(ptr));
611 set_table_bit(hash(w, s_bits, s_mask));
612 }
613 }
614
615 summarize_table();
616 }
617
618 accum_iters += test_iters;
619 accum_millis += get_millisecs_now() - start_test;
620 }
621};
622
623static int read_whole_file(const char *name,
624 uint8_t **buf_ptr,
625 size_t *buf_len) {
626 main_file file;
627 int ret;
628 xoff_t len;
629 size_t nread;
630 main_file_init(&file);
631 file.filename = name;
632 ret = main_file_open(&file, name, XO_READ);
633 if (ret != 0) {
634 fprintf(stderr, "open failed\n");
635 goto exit;
636 }
637 ret = main_file_stat(&file, &len);
638 if (ret != 0) {
639 fprintf(stderr, "stat failed\n");
640 goto exit;
641 }
642
643 (*buf_len) = (size_t)len;
644 (*buf_ptr) = (uint8_t*) main_malloc(*buf_len);
645 ret = main_file_read(&file, *buf_ptr, *buf_len, &nread,
646 "read failed");
647 if (ret == 0 && *buf_len == nread) {
648 ret = 0;
649 } else {
650 fprintf(stderr, "invalid read\n");
651 ret = XD3_INTERNAL;
652 }
653 exit:
654 main_file_cleanup(&file);
655 return ret;
656}
657
658int main(int argc, char** argv) {
659 int i;
660 uint8_t *buf = NULL;
661 size_t buf_len = 0;
662 int ret;
663
664 if (argc <= 1) {
665 fprintf(stderr, "usage: %s file ...\n", argv[0]);
666 return 1;
667 }
668
669// TODO: The xdelta3-hash.h code is identical now; add sameness test.
670// using rabin_karp<> template.
671#define TEST(T,Z,S,C) \
672 test_result<large_cksum<T,Z,S,hhash<T>,C>> \
673 _xck_ ## T ## _ ## Z ## _ ## S ## _ ## C \
674 ("xck_" #T "_" #Z "_" #S "_" #C); \
675 test_result<large_cksum_old<T,Z,S,hhash<T>,C>> \
676 _old_ ## T ## _ ## Z ## _ ## S ## _ ## C \
677 ("old_" #T "_" #Z "_" #S "_" #C)
678
679#define TESTS(SIZE, SKIP) \
680 TEST(usize_t, SIZE, SKIP, 1); \
681 TEST(usize_t, SIZE, SKIP, 2)
682
683 TESTS(5, 1);
684 TESTS(6, 1);
685 TESTS(7, 1);
686 TESTS(8, 1);
687 TESTS(9, 1);
688 TESTS(10, 1);
689 TESTS(11, 1);
690 TESTS(12, 1);
691 TESTS(13, 1);
692 TESTS(14, 1);
693 TESTS(15, 1);
694 TESTS(16, 1);
695 TESTS(17, 1);
696 TESTS(18, 1);
697 TESTS(19, 1);
698 TESTS(20, 1);
699 TESTS(21, 1);
700 TESTS(22, 1);
701 TESTS(23, 1);
702 TESTS(24, 1);
703 TESTS(25, 1);
704 TESTS(26, 1);
705 TESTS(27, 1);
706 TESTS(28, 1);
707 TESTS(29, 1);
708 TESTS(30, 1);
709 TESTS(31, 1);
710 TESTS(32, 1);
711 TESTS(33, 1);
712 TESTS(34, 1);
713 TESTS(35, 1);
714 TESTS(36, 1);
715 TESTS(37, 1);
716 TESTS(38, 1);
717 TESTS(39, 1);
718
719
720 for (i = 1; i < argc; i++) {
721 if ((ret = read_whole_file(argv[i],
722 & buf,
723 & buf_len))) {
724 return 1;
725 }
726
727 fprintf(stderr, "file %s is %zu bytes\n",
728 argv[i], buf_len);
729
730 double min_time = -1.0;
731 double min_compression = 0.0;
732
733 for (vector<test_result_base*>::iterator iter = all_tests.begin();
734 iter != all_tests.end(); ++iter) {
735 test_result_base *test = *iter;
736 test->reset();
737
738 usize_t iters = 1;
739 long start_test = get_millisecs_now();
740
741 do {
742 test->get(buf, buf_len, iters);
743 iters *= 3;
744 iters /= 2;
745 } while (get_millisecs_now() - start_test < 2000);
746
747 test->stat();
748
749 if (min_time < 0.0) {
750 min_compression = test->compression();
751 min_time = test->time();
752 }
753
754 if (min_time > test->time()) {
755 min_time = test->time();
756 }
757
758 if (min_compression > test->compression()) {
759 min_compression = test->compression();
760 }
761
762 test->print();
763 }
764
765 main_free(buf);
766 buf = NULL;
767 }
768
769 return 0;
770}