summaryrefslogtreecommitdiff
path: root/xdelta3
diff options
context:
space:
mode:
Diffstat (limited to 'xdelta3')
-rw-r--r--xdelta3/xdelta3-djw.h63
-rw-r--r--xdelta3/xdelta3-fgk.h67
-rw-r--r--xdelta3/xdelta3-main.h9
-rw-r--r--xdelta3/xdelta3-test.h6
-rw-r--r--xdelta3/xdelta3.c5
5 files changed, 83 insertions, 67 deletions
diff --git a/xdelta3/xdelta3-djw.h b/xdelta3/xdelta3-djw.h
index 396f73c..a9f6e6f 100644
--- a/xdelta3/xdelta3-djw.h
+++ b/xdelta3/xdelta3-djw.h
@@ -16,15 +16,15 @@
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */ 17 */
18 18
19/* TODO: This code needs a thorough round of commenting. There is some slop in the 19/* TODO: This code needs a thorough round of commenting. There is
20 * declaration of arrays, which are maybe one element larger than they need to be and 20 * some slop in the declaration of arrays, which are maybe one element
21 * comments would help clear it up. */ 21 * larger than they need to be and comments would help clear it up. */
22 22
23#ifndef _XDELTA3_DJW_H_ 23#ifndef _XDELTA3_DJW_H_
24#define _XDELTA3_DJW_H_ 24#define _XDELTA3_DJW_H_
25 25
26/* The following people deserve much credit for the algorithms and techniques contained in 26/* The following people deserve much credit for the algorithms and
27 * this file: 27 * techniques contained in this file:
28 28
29 Julian Seward 29 Julian Seward
30 Bzip2 sources, implementation of the multi-table Huffman technique. 30 Bzip2 sources, implementation of the multi-table Huffman technique.
@@ -43,13 +43,14 @@
43 43
44*/ 44*/
45 45
46/* OPT: during the multi-table iteration, pick the worst-overall performing table and 46/* OPT: during the multi-table iteration, pick the worst-overall
47 * replace it with exactly the frequencies of the worst-overall performing sector or 47 * performing table and replace it with exactly the frequencies of the
48 * N-worst performing sectors. */ 48 * worst-overall performing sector or N-worst performing sectors. */
49 49
50/* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with the Bzip prefix coding 50/* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with
51 * strategy. xdfs-0.256 contains the last of the other-format tests, including RFC1950 51 * the Bzip prefix coding strategy. xdfs-0.256 contains the last of
52 * and the RFC1950+MTF tests. */ 52 * the other-format tests, including RFC1950 and the RFC1950+MTF
53 * tests. */
53 54
54#define DJW_MAX_CODELEN 20 /* Maximum length of an alphabet code. */ 55#define DJW_MAX_CODELEN 20 /* Maximum length of an alphabet code. */
55 56
@@ -111,27 +112,31 @@ struct _djw_stream
111 int unused; 112 int unused;
112}; 113};
113 114
114/* Each Huffman table consists of 256 "code length" (CLEN) codes, which are themselves 115/* Each Huffman table consists of 256 "code length" (CLEN) codes,
115 * Huffman coded after eliminating repeats and move-to-front coding. The prefix consists 116 * which are themselves Huffman coded after eliminating repeats and
116 * of all the CLEN codes in djw_encode_basic plus a 4-bit value stating how many of the 117 * move-to-front coding. The prefix consists of all the CLEN codes in
117 * djw_encode_extra codes are actually coded (the rest are presumed zero, or unused CLEN 118 * djw_encode_basic plus a 4-bit value stating how many of the
118 * codes). 119 * djw_encode_extra codes are actually coded (the rest are presumed
120 * zero, or unused CLEN codes).
119 * 121 *
120 * These values of these two arrays were arrived at by studying the distribution of min 122 * These values of these two arrays were arrived at by studying the
121 * and max clen over a collection of DATA, INST, and ADDR inputs. The goal is to specify 123 * distribution of min and max clen over a collection of DATA, INST,
122 * the order of djw_extra_codes that is most likely to minimize the number of extra codes 124 * and ADDR inputs. The goal is to specify the order of
123 * that must be encoded. 125 * djw_extra_codes that is most likely to minimize the number of extra
126 * codes that must be encoded.
124 * 127 *
125 * Results: 158896 sections were counted by compressing files (window size 512K) listed 128 * Results: 158896 sections were counted by compressing files (window
126 * with: `find / -type f ( -user jmacd -o -perm +444 )` 129 * size 512K) listed with: `find / -type f ( -user jmacd -o -perm +444
130 * )`
127 * 131 *
128 * The distribution of CLEN codes for each efficient invocation of the secondary 132 * The distribution of CLEN codes for each efficient invocation of the
129 * compressor (taking the best number of groups/sector size) was recorded. Then we look at 133 * secondary compressor (taking the best number of groups/sector size)
130 * the distribution of min and max clen values, counting the number of times the value 134 * was recorded. Then we look at the distribution of min and max clen
131 * C_low is less than the min and C_high is greater than the max. Values >= C_high and <= 135 * values, counting the number of times the value C_low is less than
132 * C_low will not have their lengths coded. The results are sorted and the least likely 136 * the min and C_high is greater than the max. Values >= C_high and
133 * 15 are placed into the djw_encode_extra[] array in order. These values are used as 137 * <= C_low will not have their lengths coded. The results are sorted
134 * the initial MTF ordering. 138 * and the least likely 15 are placed into the djw_encode_extra[]
139 * array in order. These values are used as the initial MTF ordering.
135 140
136 clow[1] = 155119 141 clow[1] = 155119
137 clow[2] = 140325 142 clow[2] = 140325
diff --git a/xdelta3/xdelta3-fgk.h b/xdelta3/xdelta3-fgk.h
index f5d8fc6..b2b8f09 100644
--- a/xdelta3/xdelta3-fgk.h
+++ b/xdelta3/xdelta3-fgk.h
@@ -22,11 +22,12 @@
22#ifndef _XDELTA3_FGK_h_ 22#ifndef _XDELTA3_FGK_h_
23#define _XDELTA3_FGK_h_ 23#define _XDELTA3_FGK_h_
24 24
25/* An implementation of the FGK algorithm described by D.E. Knuth in "Dynamic Huffman 25/* An implementation of the FGK algorithm described by D.E. Knuth in
26 * Coding" in Journal of Algorithms 6. */ 26 * "Dynamic Huffman Coding" in Journal of Algorithms 6. */
27 27
28/* A 32bit counter (fgk_weight) is used as the frequency counter for nodes in the huffman 28/* A 32bit counter (fgk_weight) is used as the frequency counter for
29 * tree. TODO: Need to test for overflow and/or reset stats. */ 29 * nodes in the huffman tree. TODO: Need oto test for overflow and/or
30 * reset stats. */
30 31
31typedef struct _fgk_stream fgk_stream; 32typedef struct _fgk_stream fgk_stream;
32typedef struct _fgk_node fgk_node; 33typedef struct _fgk_node fgk_node;
@@ -47,12 +48,14 @@ struct _fgk_block {
47/* The code can also support fixed huffman encoding/decoding. */ 48/* The code can also support fixed huffman encoding/decoding. */
48#define IS_ADAPTIVE 1 49#define IS_ADAPTIVE 1
49 50
50/* weight is a count of the number of times this element has been seen in the current 51/* weight is a count of the number of times this element has been seen
51 * encoding/decoding. parent, right_child, and left_child are pointers defining the tree 52 * in the current encoding/decoding. parent, right_child, and
52 * structure. right and left point to neighbors in an ordered sequence of 53 * left_child are pointers defining the tree structure. right and
53 * weights. The left child of a node is always guaranteed to have weight not greater than 54 * left point to neighbors in an ordered sequence of weights. The
54 * its sibling. fgk_blockLeader points to the element with the same weight as itself which is 55 * left child of a node is always guaranteed to have weight not
55 * closest to the next increasing weight block. */ 56 * greater than its sibling. fgk_blockLeader points to the element
57 * with the same weight as itself which is closest to the next
58 * increasing weight block. */
56struct _fgk_node 59struct _fgk_node
57{ 60{
58 fgk_weight weight; 61 fgk_weight weight;
@@ -64,14 +67,17 @@ struct _fgk_node
64 fgk_block *my_block; 67 fgk_block *my_block;
65}; 68};
66 69
67/* alphabet_size is the a count of the number of possible leaves in the huffman tree. The 70/* alphabet_size is the a count of the number of possible leaves in
68 * number of total nodes counting internal nodes is ((2 * alphabet_size) - 1). 71 * the huffman tree. The number of total nodes counting internal
69 * zero_freq_count is the number of elements remaining which have zero frequency. 72 * nodes is ((2 * alphabet_size) - 1). zero_freq_count is the number
70 * zero_freq_exp and zero_freq_rem satisfy the equation zero_freq_count = 2^zero_freq_exp + 73 * of elements remaining which have zero frequency. zero_freq_exp and
71 * zero_freq_rem. root_node is the root of the tree, which is initialized to a node with 74 * zero_freq_rem satisfy the equation zero_freq_count =
72 * zero frequency and contains the 0th such element. free_node contains a pointer to the 75 * 2^zero_freq_exp + zero_freq_rem. root_node is the root of the
73 * next available fgk_node space. alphabet contains all the elements and is indexed by N. 76 * tree, which is initialized to a node with zero frequency and
74 * remaining_zeros points to the head of the list of zeros. */ 77 * contains the 0th such element. free_node contains a pointer to the
78 * next available fgk_node space. alphabet contains all the elements
79 * and is indexed by N. remaining_zeros points to the head of the
80 * list of zeros. */
75struct _fgk_stream 81struct _fgk_stream
76{ 82{
77 int alphabet_size; 83 int alphabet_size;
@@ -533,9 +539,10 @@ static fgk_node* fgk_increase_zero_weight (fgk_stream *h, int n)
533 return this_zero; 539 return this_zero;
534} 540}
535 541
536/* When a zero frequency element is encoded, it is followed by the binary representation 542/* When a zero frequency element is encoded, it is followed by the
537 * of the index into the remaining elements. Sets a cache to the element before it so 543 * binary representation of the index into the remaining elements.
538 * that it can be removed without calling this procedure again. */ 544 * Sets a cache to the element before it so that it can be removed
545 * without calling this procedure again. */
539static unsigned int fgk_find_nth_zero (fgk_stream* h, int n) 546static unsigned int fgk_find_nth_zero (fgk_stream* h, int n)
540{ 547{
541 fgk_node *target_ptr = h->alphabet + n; 548 fgk_node *target_ptr = h->alphabet + n;
@@ -604,9 +611,10 @@ static void fgk_init_node (fgk_node *node, int i, int size)
604 node->my_block = NULL; 611 node->my_block = NULL;
605} 612}
606 613
607/* The data structure used is an array of blocks, which are unions of free pointers and 614/* The data structure used is an array of blocks, which are unions of
608 * huffnode pointers. free blocks are a linked list of free blocks, the front of which is 615 * free pointers and huffnode pointers. free blocks are a linked list
609 * h->free_block. The used blocks are pointers to the head of each block. */ 616 * of free blocks, the front of which is h->free_block. The used
617 * blocks are pointers to the head of each block. */
610static fgk_block* fgk_make_block (fgk_stream *h, fgk_node* lead) 618static fgk_block* fgk_make_block (fgk_stream *h, fgk_node* lead)
611{ 619{
612 fgk_block *ret = h->free_block; 620 fgk_block *ret = h->free_block;
@@ -627,8 +635,8 @@ static void fgk_free_block (fgk_stream *h, fgk_block *b)
627 h->free_block = b; 635 h->free_block = b;
628} 636}
629 637
630/* sets zero_freq_count, zero_freq_rem, and zero_freq_exp to satsity the equation given 638/* sets zero_freq_count, zero_freq_rem, and zero_freq_exp to satsity
631 * above. */ 639 * the equation given above. */
632static void fgk_factor_remaining (fgk_stream *h) 640static void fgk_factor_remaining (fgk_stream *h)
633{ 641{
634 unsigned int i; 642 unsigned int i;
@@ -705,9 +713,10 @@ static int fgk_nth_zero (fgk_stream* h, int n)
705{ 713{
706 fgk_node *ret = h->remaining_zeros; 714 fgk_node *ret = h->remaining_zeros;
707 715
708 /* ERROR: if during this loop (ret->right_child == NULL) then the encoder's zero count 716 /* ERROR: if during this loop (ret->right_child == NULL) then the
709 * is too high. Could return an error code now, but is probably unnecessary overhead, 717 * encoder's zero count is too high. Could return an error code
710 * since the caller should check integrity anyway. */ 718 * now, but is probably unnecessary overhead, since the caller
719 * should check integrity anyway. */
711 for (; n != 0 && ret->right_child != NULL; n -= 1) 720 for (; n != 0 && ret->right_child != NULL; n -= 1)
712 { 721 {
713 ret = ret->right_child; 722 ret = ret->right_child;
diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h
index 0c5e08d..4d642df 100644
--- a/xdelta3/xdelta3-main.h
+++ b/xdelta3/xdelta3-main.h
@@ -2955,10 +2955,11 @@ done:
2955 main_file_close (ifile); 2955 main_file_close (ifile);
2956 main_file_close (sfile); 2956 main_file_close (sfile);
2957 2957
2958 /* If output file is not open yet because of delayed-open, it means we never encountered 2958 /* If output file is not open yet because of delayed-open, it means
2959 * a window in the delta, but it could have had a VCDIFF header? TODO: solve this 2959 * we never encountered a window in the delta, but it could have had
2960 * elsewhere. For now, it prints "nothing to output" below, but the check doesn't 2960 * a VCDIFF header? TODO: solve this elsewhere. For now, it prints
2961 * happen in case of option_no_output. */ 2961 * "nothing to output" below, but the check doesn't happen in case
2962 * of option_no_output. */
2962 if (! option_no_output) 2963 if (! option_no_output)
2963 { 2964 {
2964 if (!stdout_only && ! main_file_isopen (ofile)) 2965 if (!stdout_only && ! main_file_isopen (ofile))
diff --git a/xdelta3/xdelta3-test.h b/xdelta3/xdelta3-test.h
index f4fb76f..0739e2d 100644
--- a/xdelta3/xdelta3-test.h
+++ b/xdelta3/xdelta3-test.h
@@ -2339,10 +2339,10 @@ xd3_selftest (void)
2339 DO_TEST (decompress_single_bit_error, XD3_ADLER32, 3); 2339 DO_TEST (decompress_single_bit_error, XD3_ADLER32, 3);
2340 2340
2341 IF_FGK (DO_TEST (decompress_single_bit_error, XD3_SEC_FGK, 3)); 2341 IF_FGK (DO_TEST (decompress_single_bit_error, XD3_SEC_FGK, 3));
2342 IF_DJW (DO_TEST (decompress_single_bit_error, XD3_SEC_DJW, 17)); 2342 IF_DJW (DO_TEST (decompress_single_bit_error, XD3_SEC_DJW, 18));
2343 2343
2344 /* There are many expected non-failures for ALT_CODE_TABLE because not all of the 2344 /* There are many expected non-failures for ALT_CODE_TABLE because
2345 * instruction codes are used. */ 2345 * not all of the instruction codes are used. */
2346 IF_GENCODETBL (DO_TEST (decompress_single_bit_error, XD3_ALT_CODE_TABLE, 224)); 2346 IF_GENCODETBL (DO_TEST (decompress_single_bit_error, XD3_ALT_CODE_TABLE, 224));
2347 2347
2348#ifndef WIN32 2348#ifndef WIN32
diff --git a/xdelta3/xdelta3.c b/xdelta3/xdelta3.c
index 5a79bf4..0381206 100644
--- a/xdelta3/xdelta3.c
+++ b/xdelta3/xdelta3.c
@@ -4778,6 +4778,7 @@ static const xd3_smatcher XD3_TEMPLATE(__smatcher_) =
4778 * 4778 *
4779 * TODO: really would like a good test for this logic. how? 4779 * TODO: really would like a good test for this logic. how?
4780 * TODO: optimize the inner loop 4780 * TODO: optimize the inner loop
4781 * TODO: do-templatize this
4781 */ 4782 */
4782static int 4783static int
4783XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_point) 4784XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_point)
@@ -4911,8 +4912,8 @@ XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_poi
4911static int 4912static int
4912XD3_TEMPLATE(xd3_string_match_) (xd3_stream *stream) 4913XD3_TEMPLATE(xd3_string_match_) (xd3_stream *stream)
4913{ 4914{
4914 /* TODO config: These next three variables should be statically compliled in various 4915 /* TODO config: These next three variables should be statically
4915 * scan_cfg configurations? */ 4916 * compliled in various scan_cfg configurations? */
4916 const int DO_SMALL = ! (stream->flags & XD3_NOCOMPRESS); 4917 const int DO_SMALL = ! (stream->flags & XD3_NOCOMPRESS);
4917 const int DO_LARGE = (stream->src != NULL); 4918 const int DO_LARGE = (stream->src != NULL);
4918 const int DO_RUN = (1); 4919 const int DO_RUN = (1);