diff options
-rw-r--r-- | xdelta3/xdelta3-djw.h | 63 | ||||
-rw-r--r-- | xdelta3/xdelta3-fgk.h | 67 | ||||
-rw-r--r-- | xdelta3/xdelta3-main.h | 9 | ||||
-rw-r--r-- | xdelta3/xdelta3-test.h | 6 | ||||
-rw-r--r-- | xdelta3/xdelta3.c | 5 |
5 files changed, 83 insertions, 67 deletions
diff --git a/xdelta3/xdelta3-djw.h b/xdelta3/xdelta3-djw.h index 396f73c..a9f6e6f 100644 --- a/xdelta3/xdelta3-djw.h +++ b/xdelta3/xdelta3-djw.h | |||
@@ -16,15 +16,15 @@ | |||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
17 | */ | 17 | */ |
18 | 18 | ||
19 | /* TODO: This code needs a thorough round of commenting. There is some slop in the | 19 | /* TODO: This code needs a thorough round of commenting. There is |
20 | * declaration of arrays, which are maybe one element larger than they need to be and | 20 | * some slop in the declaration of arrays, which are maybe one element |
21 | * comments would help clear it up. */ | 21 | * larger than they need to be and comments would help clear it up. */ |
22 | 22 | ||
23 | #ifndef _XDELTA3_DJW_H_ | 23 | #ifndef _XDELTA3_DJW_H_ |
24 | #define _XDELTA3_DJW_H_ | 24 | #define _XDELTA3_DJW_H_ |
25 | 25 | ||
26 | /* The following people deserve much credit for the algorithms and techniques contained in | 26 | /* The following people deserve much credit for the algorithms and |
27 | * this file: | 27 | * techniques contained in this file: |
28 | 28 | ||
29 | Julian Seward | 29 | Julian Seward |
30 | Bzip2 sources, implementation of the multi-table Huffman technique. | 30 | Bzip2 sources, implementation of the multi-table Huffman technique. |
@@ -43,13 +43,14 @@ | |||
43 | 43 | ||
44 | */ | 44 | */ |
45 | 45 | ||
46 | /* OPT: during the multi-table iteration, pick the worst-overall performing table and | 46 | /* OPT: during the multi-table iteration, pick the worst-overall |
47 | * replace it with exactly the frequencies of the worst-overall performing sector or | 47 | * performing table and replace it with exactly the frequencies of the |
48 | * N-worst performing sectors. */ | 48 | * worst-overall performing sector or N-worst performing sectors. */ |
49 | 49 | ||
50 | /* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with the Bzip prefix coding | 50 | /* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with |
51 | * strategy. xdfs-0.256 contains the last of the other-format tests, including RFC1950 | 51 | * the Bzip prefix coding strategy. xdfs-0.256 contains the last of |
52 | * and the RFC1950+MTF tests. */ | 52 | * the other-format tests, including RFC1950 and the RFC1950+MTF |
53 | * tests. */ | ||
53 | 54 | ||
54 | #define DJW_MAX_CODELEN 20 /* Maximum length of an alphabet code. */ | 55 | #define DJW_MAX_CODELEN 20 /* Maximum length of an alphabet code. */ |
55 | 56 | ||
@@ -111,27 +112,31 @@ struct _djw_stream | |||
111 | int unused; | 112 | int unused; |
112 | }; | 113 | }; |
113 | 114 | ||
114 | /* Each Huffman table consists of 256 "code length" (CLEN) codes, which are themselves | 115 | /* Each Huffman table consists of 256 "code length" (CLEN) codes, |
115 | * Huffman coded after eliminating repeats and move-to-front coding. The prefix consists | 116 | * which are themselves Huffman coded after eliminating repeats and |
116 | * of all the CLEN codes in djw_encode_basic plus a 4-bit value stating how many of the | 117 | * move-to-front coding. The prefix consists of all the CLEN codes in |
117 | * djw_encode_extra codes are actually coded (the rest are presumed zero, or unused CLEN | 118 | * djw_encode_basic plus a 4-bit value stating how many of the |
118 | * codes). | 119 | * djw_encode_extra codes are actually coded (the rest are presumed |
120 | * zero, or unused CLEN codes). | ||
119 | * | 121 | * |
120 | * These values of these two arrays were arrived at by studying the distribution of min | 122 | * These values of these two arrays were arrived at by studying the |
121 | * and max clen over a collection of DATA, INST, and ADDR inputs. The goal is to specify | 123 | * distribution of min and max clen over a collection of DATA, INST, |
122 | * the order of djw_extra_codes that is most likely to minimize the number of extra codes | 124 | * and ADDR inputs. The goal is to specify the order of |
123 | * that must be encoded. | 125 | * djw_extra_codes that is most likely to minimize the number of extra |
126 | * codes that must be encoded. | ||
124 | * | 127 | * |
125 | * Results: 158896 sections were counted by compressing files (window size 512K) listed | 128 | * Results: 158896 sections were counted by compressing files (window |
126 | * with: `find / -type f ( -user jmacd -o -perm +444 )` | 129 | * size 512K) listed with: `find / -type f ( -user jmacd -o -perm +444 |
130 | * )` | ||
127 | * | 131 | * |
128 | * The distribution of CLEN codes for each efficient invocation of the secondary | 132 | * The distribution of CLEN codes for each efficient invocation of the |
129 | * compressor (taking the best number of groups/sector size) was recorded. Then we look at | 133 | * secondary compressor (taking the best number of groups/sector size) |
130 | * the distribution of min and max clen values, counting the number of times the value | 134 | * was recorded. Then we look at the distribution of min and max clen |
131 | * C_low is less than the min and C_high is greater than the max. Values >= C_high and <= | 135 | * values, counting the number of times the value C_low is less than |
132 | * C_low will not have their lengths coded. The results are sorted and the least likely | 136 | * the min and C_high is greater than the max. Values >= C_high and |
133 | * 15 are placed into the djw_encode_extra[] array in order. These values are used as | 137 | * <= C_low will not have their lengths coded. The results are sorted |
134 | * the initial MTF ordering. | 138 | * and the least likely 15 are placed into the djw_encode_extra[] |
139 | * array in order. These values are used as the initial MTF ordering. | ||
135 | 140 | ||
136 | clow[1] = 155119 | 141 | clow[1] = 155119 |
137 | clow[2] = 140325 | 142 | clow[2] = 140325 |
diff --git a/xdelta3/xdelta3-fgk.h b/xdelta3/xdelta3-fgk.h index f5d8fc6..b2b8f09 100644 --- a/xdelta3/xdelta3-fgk.h +++ b/xdelta3/xdelta3-fgk.h | |||
@@ -22,11 +22,12 @@ | |||
22 | #ifndef _XDELTA3_FGK_h_ | 22 | #ifndef _XDELTA3_FGK_h_ |
23 | #define _XDELTA3_FGK_h_ | 23 | #define _XDELTA3_FGK_h_ |
24 | 24 | ||
25 | /* An implementation of the FGK algorithm described by D.E. Knuth in "Dynamic Huffman | 25 | /* An implementation of the FGK algorithm described by D.E. Knuth in |
26 | * Coding" in Journal of Algorithms 6. */ | 26 | * "Dynamic Huffman Coding" in Journal of Algorithms 6. */ |
27 | 27 | ||
28 | /* A 32bit counter (fgk_weight) is used as the frequency counter for nodes in the huffman | 28 | /* A 32bit counter (fgk_weight) is used as the frequency counter for |
29 | * tree. TODO: Need to test for overflow and/or reset stats. */ | 29 | * nodes in the huffman tree. TODO: Need oto test for overflow and/or |
30 | * reset stats. */ | ||
30 | 31 | ||
31 | typedef struct _fgk_stream fgk_stream; | 32 | typedef struct _fgk_stream fgk_stream; |
32 | typedef struct _fgk_node fgk_node; | 33 | typedef struct _fgk_node fgk_node; |
@@ -47,12 +48,14 @@ struct _fgk_block { | |||
47 | /* The code can also support fixed huffman encoding/decoding. */ | 48 | /* The code can also support fixed huffman encoding/decoding. */ |
48 | #define IS_ADAPTIVE 1 | 49 | #define IS_ADAPTIVE 1 |
49 | 50 | ||
50 | /* weight is a count of the number of times this element has been seen in the current | 51 | /* weight is a count of the number of times this element has been seen |
51 | * encoding/decoding. parent, right_child, and left_child are pointers defining the tree | 52 | * in the current encoding/decoding. parent, right_child, and |
52 | * structure. right and left point to neighbors in an ordered sequence of | 53 | * left_child are pointers defining the tree structure. right and |
53 | * weights. The left child of a node is always guaranteed to have weight not greater than | 54 | * left point to neighbors in an ordered sequence of weights. The |
54 | * its sibling. fgk_blockLeader points to the element with the same weight as itself which is | 55 | * left child of a node is always guaranteed to have weight not |
55 | * closest to the next increasing weight block. */ | 56 | * greater than its sibling. fgk_blockLeader points to the element |
57 | * with the same weight as itself which is closest to the next | ||
58 | * increasing weight block. */ | ||
56 | struct _fgk_node | 59 | struct _fgk_node |
57 | { | 60 | { |
58 | fgk_weight weight; | 61 | fgk_weight weight; |
@@ -64,14 +67,17 @@ struct _fgk_node | |||
64 | fgk_block *my_block; | 67 | fgk_block *my_block; |
65 | }; | 68 | }; |
66 | 69 | ||
67 | /* alphabet_size is the a count of the number of possible leaves in the huffman tree. The | 70 | /* alphabet_size is the a count of the number of possible leaves in |
68 | * number of total nodes counting internal nodes is ((2 * alphabet_size) - 1). | 71 | * the huffman tree. The number of total nodes counting internal |
69 | * zero_freq_count is the number of elements remaining which have zero frequency. | 72 | * nodes is ((2 * alphabet_size) - 1). zero_freq_count is the number |
70 | * zero_freq_exp and zero_freq_rem satisfy the equation zero_freq_count = 2^zero_freq_exp + | 73 | * of elements remaining which have zero frequency. zero_freq_exp and |
71 | * zero_freq_rem. root_node is the root of the tree, which is initialized to a node with | 74 | * zero_freq_rem satisfy the equation zero_freq_count = |
72 | * zero frequency and contains the 0th such element. free_node contains a pointer to the | 75 | * 2^zero_freq_exp + zero_freq_rem. root_node is the root of the |
73 | * next available fgk_node space. alphabet contains all the elements and is indexed by N. | 76 | * tree, which is initialized to a node with zero frequency and |
74 | * remaining_zeros points to the head of the list of zeros. */ | 77 | * contains the 0th such element. free_node contains a pointer to the |
78 | * next available fgk_node space. alphabet contains all the elements | ||
79 | * and is indexed by N. remaining_zeros points to the head of the | ||
80 | * list of zeros. */ | ||
75 | struct _fgk_stream | 81 | struct _fgk_stream |
76 | { | 82 | { |
77 | int alphabet_size; | 83 | int alphabet_size; |
@@ -533,9 +539,10 @@ static fgk_node* fgk_increase_zero_weight (fgk_stream *h, int n) | |||
533 | return this_zero; | 539 | return this_zero; |
534 | } | 540 | } |
535 | 541 | ||
536 | /* When a zero frequency element is encoded, it is followed by the binary representation | 542 | /* When a zero frequency element is encoded, it is followed by the |
537 | * of the index into the remaining elements. Sets a cache to the element before it so | 543 | * binary representation of the index into the remaining elements. |
538 | * that it can be removed without calling this procedure again. */ | 544 | * Sets a cache to the element before it so that it can be removed |
545 | * without calling this procedure again. */ | ||
539 | static unsigned int fgk_find_nth_zero (fgk_stream* h, int n) | 546 | static unsigned int fgk_find_nth_zero (fgk_stream* h, int n) |
540 | { | 547 | { |
541 | fgk_node *target_ptr = h->alphabet + n; | 548 | fgk_node *target_ptr = h->alphabet + n; |
@@ -604,9 +611,10 @@ static void fgk_init_node (fgk_node *node, int i, int size) | |||
604 | node->my_block = NULL; | 611 | node->my_block = NULL; |
605 | } | 612 | } |
606 | 613 | ||
607 | /* The data structure used is an array of blocks, which are unions of free pointers and | 614 | /* The data structure used is an array of blocks, which are unions of |
608 | * huffnode pointers. free blocks are a linked list of free blocks, the front of which is | 615 | * free pointers and huffnode pointers. free blocks are a linked list |
609 | * h->free_block. The used blocks are pointers to the head of each block. */ | 616 | * of free blocks, the front of which is h->free_block. The used |
617 | * blocks are pointers to the head of each block. */ | ||
610 | static fgk_block* fgk_make_block (fgk_stream *h, fgk_node* lead) | 618 | static fgk_block* fgk_make_block (fgk_stream *h, fgk_node* lead) |
611 | { | 619 | { |
612 | fgk_block *ret = h->free_block; | 620 | fgk_block *ret = h->free_block; |
@@ -627,8 +635,8 @@ static void fgk_free_block (fgk_stream *h, fgk_block *b) | |||
627 | h->free_block = b; | 635 | h->free_block = b; |
628 | } | 636 | } |
629 | 637 | ||
630 | /* sets zero_freq_count, zero_freq_rem, and zero_freq_exp to satsity the equation given | 638 | /* sets zero_freq_count, zero_freq_rem, and zero_freq_exp to satsity |
631 | * above. */ | 639 | * the equation given above. */ |
632 | static void fgk_factor_remaining (fgk_stream *h) | 640 | static void fgk_factor_remaining (fgk_stream *h) |
633 | { | 641 | { |
634 | unsigned int i; | 642 | unsigned int i; |
@@ -705,9 +713,10 @@ static int fgk_nth_zero (fgk_stream* h, int n) | |||
705 | { | 713 | { |
706 | fgk_node *ret = h->remaining_zeros; | 714 | fgk_node *ret = h->remaining_zeros; |
707 | 715 | ||
708 | /* ERROR: if during this loop (ret->right_child == NULL) then the encoder's zero count | 716 | /* ERROR: if during this loop (ret->right_child == NULL) then the |
709 | * is too high. Could return an error code now, but is probably unnecessary overhead, | 717 | * encoder's zero count is too high. Could return an error code |
710 | * since the caller should check integrity anyway. */ | 718 | * now, but is probably unnecessary overhead, since the caller |
719 | * should check integrity anyway. */ | ||
711 | for (; n != 0 && ret->right_child != NULL; n -= 1) | 720 | for (; n != 0 && ret->right_child != NULL; n -= 1) |
712 | { | 721 | { |
713 | ret = ret->right_child; | 722 | ret = ret->right_child; |
diff --git a/xdelta3/xdelta3-main.h b/xdelta3/xdelta3-main.h index 0c5e08d..4d642df 100644 --- a/xdelta3/xdelta3-main.h +++ b/xdelta3/xdelta3-main.h | |||
@@ -2955,10 +2955,11 @@ done: | |||
2955 | main_file_close (ifile); | 2955 | main_file_close (ifile); |
2956 | main_file_close (sfile); | 2956 | main_file_close (sfile); |
2957 | 2957 | ||
2958 | /* If output file is not open yet because of delayed-open, it means we never encountered | 2958 | /* If output file is not open yet because of delayed-open, it means |
2959 | * a window in the delta, but it could have had a VCDIFF header? TODO: solve this | 2959 | * we never encountered a window in the delta, but it could have had |
2960 | * elsewhere. For now, it prints "nothing to output" below, but the check doesn't | 2960 | * a VCDIFF header? TODO: solve this elsewhere. For now, it prints |
2961 | * happen in case of option_no_output. */ | 2961 | * "nothing to output" below, but the check doesn't happen in case |
2962 | * of option_no_output. */ | ||
2962 | if (! option_no_output) | 2963 | if (! option_no_output) |
2963 | { | 2964 | { |
2964 | if (!stdout_only && ! main_file_isopen (ofile)) | 2965 | if (!stdout_only && ! main_file_isopen (ofile)) |
diff --git a/xdelta3/xdelta3-test.h b/xdelta3/xdelta3-test.h index f4fb76f..0739e2d 100644 --- a/xdelta3/xdelta3-test.h +++ b/xdelta3/xdelta3-test.h | |||
@@ -2339,10 +2339,10 @@ xd3_selftest (void) | |||
2339 | DO_TEST (decompress_single_bit_error, XD3_ADLER32, 3); | 2339 | DO_TEST (decompress_single_bit_error, XD3_ADLER32, 3); |
2340 | 2340 | ||
2341 | IF_FGK (DO_TEST (decompress_single_bit_error, XD3_SEC_FGK, 3)); | 2341 | IF_FGK (DO_TEST (decompress_single_bit_error, XD3_SEC_FGK, 3)); |
2342 | IF_DJW (DO_TEST (decompress_single_bit_error, XD3_SEC_DJW, 17)); | 2342 | IF_DJW (DO_TEST (decompress_single_bit_error, XD3_SEC_DJW, 18)); |
2343 | 2343 | ||
2344 | /* There are many expected non-failures for ALT_CODE_TABLE because not all of the | 2344 | /* There are many expected non-failures for ALT_CODE_TABLE because |
2345 | * instruction codes are used. */ | 2345 | * not all of the instruction codes are used. */ |
2346 | IF_GENCODETBL (DO_TEST (decompress_single_bit_error, XD3_ALT_CODE_TABLE, 224)); | 2346 | IF_GENCODETBL (DO_TEST (decompress_single_bit_error, XD3_ALT_CODE_TABLE, 224)); |
2347 | 2347 | ||
2348 | #ifndef WIN32 | 2348 | #ifndef WIN32 |
diff --git a/xdelta3/xdelta3.c b/xdelta3/xdelta3.c index 5a79bf4..0381206 100644 --- a/xdelta3/xdelta3.c +++ b/xdelta3/xdelta3.c | |||
@@ -4778,6 +4778,7 @@ static const xd3_smatcher XD3_TEMPLATE(__smatcher_) = | |||
4778 | * | 4778 | * |
4779 | * TODO: really would like a good test for this logic. how? | 4779 | * TODO: really would like a good test for this logic. how? |
4780 | * TODO: optimize the inner loop | 4780 | * TODO: optimize the inner loop |
4781 | * TODO: do-templatize this | ||
4781 | */ | 4782 | */ |
4782 | static int | 4783 | static int |
4783 | XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_point) | 4784 | XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_point) |
@@ -4911,8 +4912,8 @@ XD3_TEMPLATE(xd3_srcwin_move_point_) (xd3_stream *stream, usize_t *next_move_poi | |||
4911 | static int | 4912 | static int |
4912 | XD3_TEMPLATE(xd3_string_match_) (xd3_stream *stream) | 4913 | XD3_TEMPLATE(xd3_string_match_) (xd3_stream *stream) |
4913 | { | 4914 | { |
4914 | /* TODO config: These next three variables should be statically compliled in various | 4915 | /* TODO config: These next three variables should be statically |
4915 | * scan_cfg configurations? */ | 4916 | * compliled in various scan_cfg configurations? */ |
4916 | const int DO_SMALL = ! (stream->flags & XD3_NOCOMPRESS); | 4917 | const int DO_SMALL = ! (stream->flags & XD3_NOCOMPRESS); |
4917 | const int DO_LARGE = (stream->src != NULL); | 4918 | const int DO_LARGE = (stream->src != NULL); |
4918 | const int DO_RUN = (1); | 4919 | const int DO_RUN = (1); |