/* xdelta 3 - delta compression tools and library
 * Copyright (C) 2002, 2006, 2007.  Joshua P. MacDonald
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/* TODO: This code needs a thorough round of commenting.  There is
 * some slop in the declaration of arrays, which are maybe one element
 * larger than they need to be and comments would help clear it up. */

#ifndef _XDELTA3_DJW_H_
#define _XDELTA3_DJW_H_

/* The following people deserve much credit for the algorithms and
 * techniques contained in this file:

 Julian Seward
 Bzip2 sources, implementation of the multi-table Huffman technique.

 Jean-loup Gailly and Mark Adler and L. Peter Deutsch
 Zlib source code, RFC 1951

 Daniel S. Hirschberg and Debra A. LeLewer
 "Efficient Decoding of Prefix Codes"
 Communications of the ACM, April 1990 33(4).

 David J. Wheeler
 Program bred3.c, bexp3 and accompanying documents bred3.ps, huff.ps.
 This contains the idea behind the multi-table Huffman and 1-2 coding
 techniques.
 ftp://ftp.cl.cam.ac.uk/users/djw3/

*/

/* OPT: during the multi-table iteration, pick the worst-overall
 * performing table and replace it with exactly the frequencies of the
 * worst-overall performing sector or N-worst performing sectors. */

/* REF: See xdfs-0.222 and xdfs-0.226 for some old experiments with
 * the Bzip prefix coding strategy.  xdfs-0.256 contains the last of
 * the other-format tests, including RFC1950 and the RFC1950+MTF
 * tests. */

#define DJW_MAX_CODELEN      20U /* Maximum length of an alphabet code. */

/* Code lengths are themselves code-length encoded, so the total number of
 * codes is: [RUN_0, RUN_1, 1-DJW_MAX_CODELEN] */
#define DJW_TOTAL_CODES      (DJW_MAX_CODELEN+2)

#define RUN_0                0U /* Symbols used in MTF+1/2 coding. */
#define RUN_1                1U

/* Number of code lengths always encoded (djw_encode_basic array) */
#define DJW_BASIC_CODES      5U  
#define DJW_RUN_CODES        2U  /* Number of run codes */

/* Offset of extra codes */
#define DJW_EXTRA_12OFFSET   (DJW_BASIC_CODES + DJW_RUN_CODES)

/* Number of optionally encoded code lengths (djw_encode_extra array) */
#define DJW_EXTRA_CODES      15U

/* Number of bits to code [0-DJW_EXTRA_CODES] */
#define DJW_EXTRA_CODE_BITS  4U  

#define DJW_MAX_GROUPS       8U  /* Max number of group coding tables */
#define DJW_GROUP_BITS       3U  /* Number of bits to code [1-DJW_MAX_GROUPS] */

#define DJW_SECTORSZ_MULT     5U  /* Multiplier for encoded sectorsz */
#define DJW_SECTORSZ_BITS     5U  /* Number of bits to code group size */
#define DJW_SECTORSZ_MAX      ((1U << DJW_SECTORSZ_BITS) * DJW_SECTORSZ_MULT)

/* Maximum number of iterations to find group tables. */
#define DJW_MAX_ITER         6U
/* Minimum number of bits an iteration must reduce coding by. */
#define DJW_MIN_IMPROVEMENT  20U 

/* Maximum code length of a prefix code length */
#define DJW_MAX_CLCLEN       15U

/* Number of bits to code [0-DJW_MAX_CLCLEN] */
#define DJW_CLCLEN_BITS      4U  

#define DJW_MAX_GBCLEN       7U  /* Maximum code length of a group selector */

/* Number of bits to code [0-DJW_MAX_GBCLEN]
 * TODO: Actually, should never have zero code lengths here, or else a group
 * went unused.  Write a test for this: if a group goes unused, eliminate
 * it? */
#define DJW_GBCLEN_BITS      3U

/* It has to save at least this many bits... */
#define EFFICIENCY_BITS      16U

typedef struct _djw_stream   djw_stream;
typedef struct _djw_heapen   djw_heapen;
typedef struct _djw_prefix   djw_prefix;
typedef uint32_t             djw_weight;

struct _djw_heapen
{
  uint32_t depth;
  uint32_t freq;
  uint32_t parent;
};

struct _djw_prefix
{
  usize_t   scount;
  uint8_t *symbol;
  usize_t   mcount;
  uint8_t *mtfsym;
  uint8_t *repcnt;
};

struct _djw_stream
{
  int unused;
};

/* Each Huffman table consists of 256 "code length" (CLEN) codes,
 * which are themselves Huffman coded after eliminating repeats and
 * move-to-front coding.  The prefix consists of all the CLEN codes in
 * djw_encode_basic plus a 4-bit value stating how many of the
 * djw_encode_extra codes are actually coded (the rest are presumed
 * zero, or unused CLEN codes).
 *
 * These values of these two arrays were arrived at by studying the
 * distribution of min and max clen over a collection of DATA, INST,
 * and ADDR inputs.  The goal is to specify the order of
 * djw_extra_codes that is most likely to minimize the number of extra
 * codes that must be encoded.
 *
 * Results: 158896 sections were counted by compressing files (window
 * size 512K) listed with: `find / -type f ( -user jmacd -o -perm +444
 * )`
 *
 * The distribution of CLEN codes for each efficient invocation of the
 * secondary compressor (taking the best number of groups/sector size)
 * was recorded.  Then we look at the distribution of min and max clen
 * values, counting the number of times the value C_low is less than
 * the min and C_high is greater than the max.  Values >= C_high and
 * <= C_low will not have their lengths coded.  The results are sorted
 * and the least likely 15 are placed into the djw_encode_extra[]
 * array in order.  These values are used as the initial MTF ordering.

 clow[1] = 155119
 clow[2] = 140325
 clow[3] = 84072
 ---
 clow[4] = 7225
 clow[5] = 1093
 clow[6] = 215
 ---
 chigh[4] = 1
 chigh[5] = 30
 chigh[6] = 218
 chigh[7] = 2060
 chigh[8] = 13271
 ---
 chigh[9] = 39463
 chigh[10] = 77360
 chigh[11] = 118298
 chigh[12] = 141360
 chigh[13] = 154086
 chigh[14] = 157967
 chigh[15] = 158603
 chigh[16] = 158864
 chigh[17] = 158893
 chigh[18] = 158895
 chigh[19] = 158896
 chigh[20] = 158896

*/

static const uint8_t djw_encode_12extra[DJW_EXTRA_CODES] =
  {
    9, 10, 3, 11, 2, 12, 13, 1, 14, 15, 16, 17, 18, 19, 20,
  };

static const uint8_t djw_encode_12basic[DJW_BASIC_CODES] =
  {
    4, 5, 6, 7, 8,
  };

/*********************************************************************/
/*                              DECLS                                */
/*********************************************************************/

static djw_stream*     djw_alloc           (xd3_stream *stream);
static void            djw_init            (djw_stream *h);
static void            djw_destroy         (xd3_stream *stream,
					    djw_stream *h);

#if XD3_ENCODER
static int             xd3_encode_huff     (xd3_stream   *stream,
					    djw_stream  *sec_stream,
					    xd3_output   *input,
					    xd3_output   *output,
					    xd3_sec_cfg  *cfg);
#endif

static int             xd3_decode_huff     (xd3_stream     *stream,
					    djw_stream    *sec_stream,
					    const uint8_t **input,
					    const uint8_t  *const input_end,
					    uint8_t       **output,
					    const uint8_t  *const output_end);

/*********************************************************************/
/*                             HUFFMAN                               */
/*********************************************************************/

static djw_stream*
djw_alloc (xd3_stream *stream)
{
  return (djw_stream*) xd3_alloc (stream, sizeof (djw_stream), 1);
}

static void
djw_init (djw_stream *h)
{
  /* Fields are initialized prior to use. */
}

static void
djw_destroy (xd3_stream *stream,
	     djw_stream *h)
{
  xd3_free (stream, h);
}


/*********************************************************************/
/*                               HEAP                                */
/*********************************************************************/

static inline int
heap_less (const djw_heapen *a, const djw_heapen *b)
{
  return a->freq   < b->freq ||
    (a->freq  == b->freq &&
     a->depth  < b->depth);
}

static inline void
heap_insert (usize_t *heap, const djw_heapen *ents, usize_t p, const usize_t e)
{
  /* Insert ents[e] into next slot heap[p] */
  usize_t pp = p/2; /* P's parent */

  while (heap_less (& ents[e], & ents[heap[pp]]))
    {
      heap[p] = heap[pp];
      p  = pp;
      pp = p/2;
    }

  heap[p] = e;
}

static inline djw_heapen*
heap_extract (usize_t *heap, const djw_heapen *ents, usize_t heap_last)
{
  usize_t smallest = heap[1];
  usize_t p, pc, t;

  /* Caller decrements heap_last, so heap_last+1 is the replacement elt. */
  heap[1] = heap[heap_last+1];

  /* Re-heapify */
  for (p = 1; ; p = pc)
    {
      pc = p*2;

      /* Reached bottom of heap */
      if (pc > heap_last) { break; }

      /* See if second child is smaller. */
      if (pc < heap_last && heap_less (& ents[heap[pc+1]], & ents[heap[pc]]))
	{
	  pc += 1;
	}

      /* If pc is not smaller than p, heap property re-established. */
      if (! heap_less (& ents[heap[pc]], & ents[heap[p]])) { break; }

      t = heap[pc];
      heap[pc] = heap[p];
      heap[p] = t;
    }

  return (djw_heapen*) & ents[smallest];
}

#if XD3_DEBUG
static void
heap_check (usize_t *heap, djw_heapen *ents, usize_t heap_last)
{
  usize_t i;
  for (i = 1; i <= heap_last; i += 1)
    {
      /* Heap property: child not less than parent */
      XD3_ASSERT (! heap_less (& ents[heap[i]], & ents[heap[i/2]]));

      IF_DEBUG2 (DP(RINT "heap[%d] = %u\n", i, ents[heap[i]].freq));
    }
}
#endif

/*********************************************************************/
/*                             MTF, 1/2                              */
/*********************************************************************/

static inline usize_t
djw_update_mtf (uint8_t *mtf, usize_t mtf_i)
{
  int k;
  usize_t sym = mtf[mtf_i];

  for (k = mtf_i; k != 0; k -= 1) { mtf[k] = mtf[k-1]; }

  mtf[0] = sym;
  return sym;
}

static inline void
djw_update_1_2 (int *mtf_run, usize_t *mtf_i,
		uint8_t *mtfsym, djw_weight *freq)
{
  int code;
  
  do
    {
      /* Offset by 1, since any number of RUN_ symbols implies run>0... */
      *mtf_run -= 1;

      code = (*mtf_run & 1) ? RUN_1 : RUN_0;

      mtfsym[(*mtf_i)++] = code;
      freq[code] += 1;
      *mtf_run >>= 1;
    }
  while (*mtf_run >= 1);

  *mtf_run = 0;
}

static void
djw_init_clen_mtf_1_2 (uint8_t *clmtf)
{
  usize_t i, cl_i = 0;

  clmtf[cl_i++] = 0;
  for (i = 0; i < DJW_BASIC_CODES; i += 1)
    {
      clmtf[cl_i++] = djw_encode_12basic[i];
    }
  for (i = 0; i < DJW_EXTRA_CODES; i += 1)
    {
      clmtf[cl_i++] = djw_encode_12extra[i];
    }
}

/*********************************************************************/
/*                           PREFIX CODES                            */
/*********************************************************************/
#if XD3_ENCODER
static usize_t
djw_build_prefix (const djw_weight *freq, uint8_t *clen, usize_t asize, usize_t maxlen)
{
  /* Heap with 0th entry unused, prefix tree with up to ALPHABET_SIZE-1
   * internal nodes, never more than ALPHABET_SIZE entries actually in the
   * heap (minimum weight subtrees during prefix construction).  First
   * ALPHABET_SIZE entries are the actual symbols, next ALPHABET_SIZE-1 are
   * internal nodes. */
  djw_heapen ents[ALPHABET_SIZE * 2];
  usize_t heap[ALPHABET_SIZE + 1];

  usize_t heap_last; /* Index of the last _valid_ heap entry. */
  usize_t ents_size; /* Number of entries, including 0th fake entry */
  usize_t  overflow;  /* Number of code lengths that overflow */
  uint32_t total_bits;
  usize_t i;

  IF_DEBUG (uint32_t first_bits = 0);

  /* Insert real symbol frequences. */
  for (i = 0; i < asize; i += 1)
    {
      ents[i+1].freq = freq[i];
      IF_DEBUG2 (DP(RINT "ents[%d] = freq[%d] = %d\n",
			i+1, i, freq[i]));
    }

 again:

  /* The loop is re-entered each time an overflow occurs.  Re-initialize... */
  heap_last = 0;
  ents_size = 1;
  overflow  = 0;
  total_bits = 0;

  /* 0th entry terminates the while loop in heap_insert (it's the parent of
   * the smallest element, always less-than) */
  heap[0] = 0;
  ents[0].depth = 0;
  ents[0].freq  = 0;

  /* Initial heap. */
  for (i = 0; i < asize; i += 1, ents_size += 1)
    {
      ents[ents_size].depth  = 0;
      ents[ents_size].parent = 0;

      if (ents[ents_size].freq != 0)
	{
	  heap_insert (heap, ents, ++heap_last, ents_size);
	}
    }

  IF_DEBUG (heap_check (heap, ents, heap_last));

  /* Must be at least one symbol, or else we can't get here. */
  XD3_ASSERT (heap_last != 0);

  /* If there is only one symbol, fake a second to prevent zero-length
   * codes. */
  if (heap_last == 1)
    {
      /* Pick either the first or last symbol. */
      int s = freq[0] ? asize-1 : 0;
      ents[s+1].freq = 1;
      goto again;
    }

  /* Build prefix tree. */
  while (heap_last > 1)
    {
      djw_heapen *h1 = heap_extract (heap, ents, --heap_last);
      djw_heapen *h2 = heap_extract (heap, ents, --heap_last);

      ents[ents_size].freq   = h1->freq + h2->freq;
      ents[ents_size].depth  = 1 + max (h1->depth, h2->depth);
      ents[ents_size].parent = 0;

      h1->parent = h2->parent = ents_size;

      heap_insert (heap, ents, ++heap_last, ents_size++);
    }

  IF_DEBUG (heap_check (heap, ents, heap_last));

  /* Now compute prefix code lengths, counting parents. */
  for (i = 1; i < asize+1; i += 1)
    {
      usize_t b = 0;

      if (ents[i].freq != 0)
	{
	  usize_t p = i;

	  while ((p = ents[p].parent) != 0) { b += 1; }

	  if (b > maxlen) { overflow = 1; }

	  total_bits += b * freq[i-1];
	}

      /* clen is 0-origin, unlike ents. */
      IF_DEBUG2 (DP(RINT "clen[%d] = %d\n", i-1, b));
      clen[i-1] = b;
    }

  IF_DEBUG (if (first_bits == 0) first_bits = total_bits);

  if (! overflow)
    {
      IF_DEBUG2 (if (first_bits != total_bits)
      {
	DP(RINT "code length overflow changed %u bits\n",
	   (usize_t)(total_bits - first_bits));
      });
      return total_bits;
    }

  /* OPT: There is a non-looping way to fix overflow shown in zlib, but this
   * is easier (for now), as done in bzip2. */
  for (i = 1; i < asize+1; i += 1)
    {
      ents[i].freq = ents[i].freq / 2 + 1;
    }

  goto again;
}

static void
djw_build_codes (usize_t *codes, const uint8_t *clen, usize_t asize, usize_t abs_max)
{
  usize_t i, l;
  usize_t min_clen = DJW_MAX_CODELEN;
  usize_t max_clen = 0;
  usize_t code = 0;

  /* Find the min and max code length */
  for (i = 0; i < asize; i += 1)
    {
      if (clen[i] > 0 && clen[i] < min_clen)
	{
	  min_clen = clen[i];
	}

      max_clen = max (max_clen, (usize_t) clen[i]);
    }

  XD3_ASSERT (max_clen <= abs_max);

  /* Generate a code for each symbol with the appropriate length. */
  for (l = min_clen; l <= max_clen; l += 1)
    {
      for (i = 0; i < asize; i += 1)
	{
	  if (clen[i] == l)
	    {
	      codes[i] = code++;
	    } 
	}

      code <<= 1;
    }

  IF_DEBUG2 ({
      for (i = 0; i < asize; i += 1)
	{
	  DP(RINT "code[%d] = %u\n", i, codes[i]);
	}
    });
}

/*********************************************************************/
/*			      MOVE-TO-FRONT                          */
/*********************************************************************/
static void
djw_compute_mtf_1_2 (djw_prefix  *prefix,
		     uint8_t     *mtf,
		     djw_weight  *freq_out,
		     usize_t      nsym)
{
  size_t i, j, k;
  usize_t sym;
  usize_t size = prefix->scount;
  usize_t mtf_i = 0;
  int mtf_run = 0;

  /* This +2 is for the RUN_0, RUN_1 codes */
  memset (freq_out, 0, sizeof (freq_out[0]) * (nsym+2));

  for (i = 0; i < size; )
    {
      /* OPT: Bzip optimizes this algorithm a little by effectively checking
       * j==0 before the MTF update. */
      sym = prefix->symbol[i++];

      for (j = 0; mtf[j] != sym; j += 1) { }

      XD3_ASSERT (j <= nsym);

      for (k = j; k >= 1; k -= 1) { mtf[k] = mtf[k-1]; }

      mtf[0] = sym;

      if (j == 0)
	{
	  mtf_run += 1;
	  continue;
	}

      if (mtf_run > 0)
	{
	  djw_update_1_2 (& mtf_run, & mtf_i, prefix->mtfsym, freq_out);
	}

      /* Non-zero symbols are offset by RUN_1 */
      prefix->mtfsym[mtf_i++] = j+RUN_1;
      freq_out[j+RUN_1] += 1;
    }

  if (mtf_run > 0)
    {
      djw_update_1_2 (& mtf_run, & mtf_i, prefix->mtfsym, freq_out);
    }

  prefix->mcount = mtf_i;
}

/* Counts character frequencies of the input buffer, returns the size. */
static usize_t
djw_count_freqs (djw_weight *freq, xd3_output *input)
{
  xd3_output *in;
  usize_t size = 0;

  memset (freq, 0, sizeof (freq[0]) * ALPHABET_SIZE);

  for (in = input; in; in = in->next_page)
    {
      const uint8_t *p     = in->base;
      const uint8_t *p_max = p + in->next;

      size += in->next;

      do
	{
	  ++freq[*p];
	}
      while (++p < p_max);
    }

  IF_DEBUG2 ({int i;
  DP(RINT "freqs: ");
  for (i = 0; i < ALPHABET_SIZE; i += 1)
    {
      DP(RINT "%u ", freq[i]);
    }
  DP(RINT "\n");});

  return size;
}

static void
djw_compute_multi_prefix (usize_t     groups,
			  uint8_t     clen[DJW_MAX_GROUPS][ALPHABET_SIZE],
			  djw_prefix *prefix)
{
  usize_t gp, i;
      
  prefix->scount = ALPHABET_SIZE;
  memcpy (prefix->symbol, clen[0], ALPHABET_SIZE);

  for (gp = 1; gp < groups; gp += 1)
    {
      for (i = 0; i < ALPHABET_SIZE; i += 1)
	{
	  if (clen[gp][i] == 0)
	    {
	      continue;
	    }

	  prefix->symbol[prefix->scount++] = clen[gp][i];
	}
    }
}

static void
djw_compute_prefix_1_2 (djw_prefix *prefix, djw_weight *freq)
{
  /* This +1 is for the 0 code-length. */
  uint8_t clmtf[DJW_MAX_CODELEN+1];

  djw_init_clen_mtf_1_2 (clmtf);

  djw_compute_mtf_1_2 (prefix, clmtf, freq, DJW_MAX_CODELEN);
}

static int
djw_encode_prefix (xd3_stream   *stream,
		   xd3_output  **output,
		   bit_state    *bstate,
		   djw_prefix   *prefix)
{
  int ret;
  size_t i;
  usize_t num_to_encode;
  djw_weight clfreq[DJW_TOTAL_CODES];
  uint8_t    clclen[DJW_TOTAL_CODES];
  usize_t    clcode[DJW_TOTAL_CODES];

  /* Move-to-front encode prefix symbols, count frequencies */
  djw_compute_prefix_1_2 (prefix, clfreq);

  /* Compute codes */
  djw_build_prefix (clfreq, clclen, DJW_TOTAL_CODES, DJW_MAX_CLCLEN);
  djw_build_codes  (clcode, clclen, DJW_TOTAL_CODES, DJW_MAX_CLCLEN);

  /* Compute number of extra codes beyond basic ones for this template. */
  num_to_encode = DJW_TOTAL_CODES;
  while (num_to_encode > DJW_EXTRA_12OFFSET && clclen[num_to_encode-1] == 0)
    {
      num_to_encode -= 1;
    }
  XD3_ASSERT (num_to_encode - DJW_EXTRA_12OFFSET < (1 << DJW_EXTRA_CODE_BITS));

  /* Encode: # of extra codes */
  if ((ret = xd3_encode_bits (stream, output, bstate, DJW_EXTRA_CODE_BITS,
			      num_to_encode - DJW_EXTRA_12OFFSET)))
    {
      return ret;
    }

  /* Encode: MTF code lengths */
  for (i = 0; i < num_to_encode; i += 1)
    {
      if ((ret = xd3_encode_bits (stream, output, bstate,
				  DJW_CLCLEN_BITS, clclen[i])))
	{
	  return ret;
	}
    }

  /* Encode: CLEN code lengths */
  for (i = 0; i < prefix->mcount; i += 1)
    {
      usize_t mtf_sym = prefix->mtfsym[i];
      usize_t bits    = clclen[mtf_sym];
      usize_t code    = clcode[mtf_sym];

      if ((ret = xd3_encode_bits (stream, output, bstate, bits, code)))
	{
	  return ret;
	}
    }

  return 0;
}

static void
djw_compute_selector_1_2 (djw_prefix *prefix,
			  usize_t     groups,
			  djw_weight *gbest_freq)
{
  uint8_t grmtf[DJW_MAX_GROUPS];
  usize_t i;

  for (i = 0; i < groups; i += 1) { grmtf[i] = i; }

  djw_compute_mtf_1_2 (prefix, grmtf, gbest_freq, groups);
}

static int
xd3_encode_howmany_groups (xd3_stream *stream,
			   xd3_sec_cfg *cfg,
			   usize_t input_size,
			   usize_t *ret_groups,
			   usize_t *ret_sector_size)
{
  usize_t cfg_groups = 0;
  usize_t cfg_sector_size = 0;
  usize_t sugg_groups = 0;
  usize_t sugg_sector_size = 0;

  if (cfg->ngroups != 0)
    {
      if (cfg->ngroups > DJW_MAX_GROUPS)
	{
	  stream->msg = "invalid secondary encoder group number";
	  return XD3_INTERNAL;
	}

      cfg_groups = cfg->ngroups;
    }

  if (cfg->sector_size != 0)
    {
      if (cfg->sector_size < DJW_SECTORSZ_MULT ||
	  cfg->sector_size > DJW_SECTORSZ_MAX ||
	  (cfg->sector_size % DJW_SECTORSZ_MULT) != 0)
	{
	  stream->msg = "invalid secondary encoder sector size";
	  return XD3_INTERNAL;
	}

      cfg_sector_size = cfg->sector_size;
    }

  if (cfg_groups == 0 || cfg_sector_size == 0)
    {
      /* These values were found empirically using xdelta3-tune around version
       * xdfs-0.256. */
      switch (cfg->data_type)
	{
	case DATA_SECTION:
	  if      (input_size < 1000)   { sugg_groups = 1; sugg_sector_size = 0; }
	  else if (input_size < 4000)   { sugg_groups = 2; sugg_sector_size = 10; }
	  else if (input_size < 7000)   { sugg_groups = 3; sugg_sector_size = 10; }
	  else if (input_size < 10000)  { sugg_groups = 4; sugg_sector_size = 10; }
	  else if (input_size < 25000)  { sugg_groups = 5; sugg_sector_size = 10; }
	  else if (input_size < 50000)  { sugg_groups = 7; sugg_sector_size = 20; }
	  else if (input_size < 100000) { sugg_groups = 8; sugg_sector_size = 30; }
	  else                          { sugg_groups = 8; sugg_sector_size = 70; }
	  break;
	case INST_SECTION:
	  if      (input_size < 7000)   { sugg_groups = 1; sugg_sector_size = 0; }
	  else if (input_size < 10000)  { sugg_groups = 2; sugg_sector_size = 50; }
	  else if (input_size < 25000)  { sugg_groups = 3; sugg_sector_size = 50; }
	  else if (input_size < 50000)  { sugg_groups = 6; sugg_sector_size = 40; }
	  else if (input_size < 100000) { sugg_groups = 8; sugg_sector_size = 40; }
	  else                          { sugg_groups = 8; sugg_sector_size = 40; }
	  break;
	case ADDR_SECTION:
	  if      (input_size < 9000)   { sugg_groups = 1; sugg_sector_size = 0; }
	  else if (input_size < 25000)  { sugg_groups = 2; sugg_sector_size = 130; }
	  else if (input_size < 50000)  { sugg_groups = 3; sugg_sector_size = 130; }
	  else if (input_size < 100000) { sugg_groups = 5; sugg_sector_size = 130; }
	  else                          { sugg_groups = 7; sugg_sector_size = 130; }
	  break;
	}

      if (cfg_groups == 0)
	{
	  cfg_groups = sugg_groups;
	}

      if (cfg_sector_size == 0)
	{
	  cfg_sector_size = sugg_sector_size;
	}
    }

  if (cfg_groups != 1 && cfg_sector_size == 0)
    {
      switch (cfg->data_type)
	{
	case DATA_SECTION:
	  cfg_sector_size = 20;
	  break;
	case INST_SECTION:
	  cfg_sector_size = 50;
	  break;
	case ADDR_SECTION:
	  cfg_sector_size = 130;
	  break;
	}
    }

  (*ret_groups)     = cfg_groups;
  (*ret_sector_size) = cfg_sector_size;

  XD3_ASSERT (cfg_groups > 0 && cfg_groups <= DJW_MAX_GROUPS);
  XD3_ASSERT (cfg_groups == 1 ||
	      (cfg_sector_size >= DJW_SECTORSZ_MULT &&
	       cfg_sector_size <= DJW_SECTORSZ_MAX));

  return 0;
}

static int
xd3_encode_huff (xd3_stream   *stream,
		 djw_stream   *h,
		 xd3_output   *input,
		 xd3_output   *output,
		 xd3_sec_cfg  *cfg)
{
  int         ret;
  usize_t     groups, sector_size;
  bit_state   bstate = BIT_STATE_ENCODE_INIT;
  xd3_output *in;
  int         output_bits;
  usize_t     input_bits;
  usize_t     input_bytes;
  usize_t     initial_offset = output->next;
  djw_weight  real_freq[ALPHABET_SIZE];
  uint8_t    *gbest = NULL;
  uint8_t    *gbest_mtf = NULL;

  input_bytes = djw_count_freqs (real_freq, input);
  input_bits  = input_bytes * 8;

  XD3_ASSERT (input_bytes > 0);

  if ((ret = xd3_encode_howmany_groups (stream, cfg, input_bytes,
					& groups, & sector_size)))
    {
      return ret;
    }

  if (0)
    {
    regroup:
      /* Sometimes we dynamically decide there are too many groups.  Arrive
       * here. */
      output->next = initial_offset;
      xd3_bit_state_encode_init (& bstate);
    }

  /* Encode: # of groups (3 bits) */
  if ((ret = xd3_encode_bits (stream, & output, & bstate,
			      DJW_GROUP_BITS, groups-1))) { goto failure; }

  if (groups == 1)
    {
      /* Single Huffman group. */
      usize_t    code[ALPHABET_SIZE]; /* Codes */
      uint8_t    clen[ALPHABET_SIZE];
      uint8_t    prefix_mtfsym[ALPHABET_SIZE];
      djw_prefix prefix;

      output_bits =
	djw_build_prefix (real_freq, clen, ALPHABET_SIZE, DJW_MAX_CODELEN);
      djw_build_codes (code, clen, ALPHABET_SIZE, DJW_MAX_CODELEN);

      if (output_bits + EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
	{
	  goto nosecond;
	}

      /* Encode: prefix */
      prefix.mtfsym = prefix_mtfsym;
      prefix.symbol = clen;
      prefix.scount = ALPHABET_SIZE;

      if ((ret = djw_encode_prefix (stream, & output, & bstate, & prefix)))
	{
	  goto failure;
	}

      if (output_bits + (8 * output->next) + EFFICIENCY_BITS >=
	  input_bits && ! cfg->inefficient)
	{
	  goto nosecond;
	}

      /* Encode: data */
      for (in = input; in; in = in->next_page)
	{
	  const uint8_t *p     = in->base;
	  const uint8_t *p_max = p + in->next;

	  do
	    {
	      usize_t sym  = *p++;
	      usize_t bits = clen[sym];

	      IF_DEBUG (output_bits -= bits);

	      if ((ret = xd3_encode_bits (stream, & output,
					  & bstate, bits, code[sym])))
		{
		  goto failure;
		}
	    }
	  while (p < p_max);
	}

      XD3_ASSERT (output_bits == 0);
    }
  else
    {
      /* DJW Huffman */
      djw_weight evolve_freq[DJW_MAX_GROUPS][ALPHABET_SIZE];
      uint8_t evolve_clen[DJW_MAX_GROUPS][ALPHABET_SIZE];
      djw_weight left = input_bytes;
      usize_t gp;
      usize_t niter = 0;
      usize_t select_bits;
      usize_t sym1 = 0, sym2 = 0, s;
      usize_t gcost[DJW_MAX_GROUPS];
      usize_t gbest_code[DJW_MAX_GROUPS+2];
      uint8_t gbest_clen[DJW_MAX_GROUPS+2];
      usize_t  gbest_max = 1 + (input_bytes - 1) / sector_size;
      usize_t best_bits = 0;
      usize_t  gbest_no;
      usize_t  gpcnt;
      const uint8_t *p;
      IF_DEBUG2 (usize_t gcount[DJW_MAX_GROUPS]);

      /* Encode: sector size (5 bits) */
      if ((ret = xd3_encode_bits (stream, & output, & bstate,
				  DJW_SECTORSZ_BITS,
				  (sector_size/DJW_SECTORSZ_MULT)-1)))
	{
	  goto failure;
	}

      /* Dynamic allocation. */
      if (gbest == NULL)
	{
	  if ((gbest = (uint8_t*) xd3_alloc (stream, gbest_max, 1)) == NULL)
	    {
	      ret = ENOMEM;
	      goto failure;
	    }
	}

      if (gbest_mtf == NULL)
	{
	  if ((gbest_mtf = (uint8_t*) xd3_alloc (stream, gbest_max, 1)) == NULL)
	    {
	      ret = ENOMEM;
	      goto failure;
	    }
	}

      /* OPT: Some of the inner loops can be optimized, as shown in bzip2 */

      /* Generate initial code length tables. */
      for (gp = 0; gp < groups; gp += 1)
	{
	  djw_weight sum  = 0;
	  djw_weight goal = left / (groups - gp);

	  IF_DEBUG2 (usize_t nz = 0);

	  /* Due to the single-code granularity of this distribution, it may
	   * be that we can't generate a distribution for each group.  In that
	   * case subtract one group and try again.  If (inefficient), we're
	   * testing group behavior, so don't mess things up. */
	  if (goal == 0 && !cfg->inefficient)
	    {
	      IF_DEBUG2 (DP(RINT "too many groups (%u), dropping one\n",
			    groups));
	      groups -= 1;
	      goto regroup;
	    }

	  /* Sum == goal is possible when (cfg->inefficient)... */
	  while (sum < goal)
	    {
	      XD3_ASSERT (sym2 < ALPHABET_SIZE);
	      IF_DEBUG2 (nz += real_freq[sym2] != 0);
	      sum += real_freq[sym2++];
	    }

	  IF_DEBUG2(DP(RINT "group %u has symbols %u..%u (%u non-zero) "
		       "(%u/%u = %.3f)\n",
		       gp, sym1, sym2, nz, sum,
		       input_bytes, sum / (double)input_bytes););

	  for (s = 0; s < ALPHABET_SIZE; s += 1)
	    {
	      evolve_clen[gp][s] = (s >= sym1 && s <= sym2) ? 1 : 16;
	    }

	  left -= sum;
	  sym1  = sym2+1;
	}

    repeat:

      niter += 1;
      gbest_no = 0;
      memset (evolve_freq, 0, sizeof (evolve_freq[0]) * groups);
      IF_DEBUG2 (memset (gcount, 0, sizeof (gcount[0]) * groups));

      /* For each input page (loop is irregular to allow non-pow2-size group
       * size. */
      in = input;
      p  = in->base;

      /* For each group-size sector. */
      do
	{
	  const uint8_t *p0  = p;
	  xd3_output    *in0 = in;
	  usize_t best   = 0;
	  usize_t winner = 0;

	  /* Select best group for each sector, update evolve_freq. */
	  memset (gcost, 0, sizeof (gcost[0]) * groups);

	  /* For each byte in sector. */
	  for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
	    {
	      /* For each group. */
	      for (gp = 0; gp < groups; gp += 1)
		{
		  gcost[gp] += evolve_clen[gp][*p];
		}

	      /* Check end-of-input-page. */
#             define GP_PAGE()                \
	      if ((usize_t)(++p - in->base) == in->next) \
		{                             \
		  in = in->next_page;         \
		  if (in == NULL) { break; }  \
		  p  = in->base;              \
		}

	      GP_PAGE ();
	    }

	  /* Find min cost group for this sector */
	  best = USIZE_T_MAX;
	  for (gp = 0; gp < groups; gp += 1)
	    {
	      if (gcost[gp] < best) 
		{ 
		  best = gcost[gp]; 
		  winner = gp; 
		}
	    }

	  XD3_ASSERT(gbest_no < gbest_max);
	  gbest[gbest_no++] = winner;
	  IF_DEBUG2 (gcount[winner] += 1);

	  p  = p0;
	  in = in0;

	  /* Update group frequencies. */
	  for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
	    {
	      evolve_freq[winner][*p] += 1;

	      GP_PAGE ();
	    }
	}
      while (in != NULL);

      XD3_ASSERT (gbest_no == gbest_max);

      /* Recompute code lengths. */
      output_bits = 0;
      for (gp = 0; gp < groups; gp += 1)
	{
	  int i;
	  uint8_t evolve_zero[ALPHABET_SIZE];
	  int any_zeros = 0;

	  memset (evolve_zero, 0, sizeof (evolve_zero));

	  /* Cannot allow a zero clen when the real frequency is non-zero.
	   * Note: this means we are going to encode a fairly long code for
	   * these unused entries.  An improvement would be to implement a
	   * NOTUSED code for when these are actually zero, but this requires
	   * another data structure (evolve_zero) since we don't know when
	   * evolve_freq[i] == 0...  Briefly tested, looked worse. */
	  for (i = 0; i < ALPHABET_SIZE; i += 1)
	    {
	      if (evolve_freq[gp][i] == 0 && real_freq[i] != 0)
		{
		  evolve_freq[gp][i] = 1;
		  evolve_zero[i] = 1;
		  any_zeros = 1;
		}
	    }

	  output_bits += djw_build_prefix (evolve_freq[gp], evolve_clen[gp],
					   ALPHABET_SIZE, DJW_MAX_CODELEN);

	  /* The above faking of frequencies does not matter for the last
	   * iteration, but we don't know when that is yet.  However, it also
	   * breaks the output_bits computation.  Necessary for accuracy, and
	   * for the (output_bits==0) assert after all bits are output. */
	  if (any_zeros)
	    {
	      IF_DEBUG2 (usize_t save_total = output_bits);

	      for (i = 0; i < ALPHABET_SIZE; i += 1)
		{
		  if (evolve_zero[i]) { output_bits -= evolve_clen[gp][i]; }
		}

	      IF_DEBUG2 (DP(RINT "evolve_zero reduced %u bits in group %u\n",
			    save_total - output_bits, gp));
	    }
	}

      IF_DEBUG2(
	DP(RINT "pass %u total bits: %u group uses: ", niter, output_bits);
	for (gp = 0; gp < groups; gp += 1) { DP(RINT "%u ", gcount[gp]); }
	DP(RINT "\n");
	);

      /* End iteration. */

      IF_DEBUG2 (if (niter > 1 && best_bits < output_bits) {
	DP(RINT "iteration lost %u bits\n", output_bits - best_bits); });

      if (niter == 1 || (niter < DJW_MAX_ITER &&
			 (best_bits - output_bits) >= DJW_MIN_IMPROVEMENT))
	{
	  best_bits = output_bits;
	  goto repeat;
	}

      /* Efficiency check. */
      if (output_bits + EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
	{
	  goto nosecond;
	}

      IF_DEBUG2 (DP(RINT "djw compression: %u -> %0.3f\n",
		    input_bytes, output_bits / 8.0));

      /* Encode: prefix */
      {
	uint8_t     prefix_symbol[DJW_MAX_GROUPS * ALPHABET_SIZE];
	uint8_t     prefix_mtfsym[DJW_MAX_GROUPS * ALPHABET_SIZE];
	uint8_t     prefix_repcnt[DJW_MAX_GROUPS * ALPHABET_SIZE];
	djw_prefix prefix;

	prefix.symbol = prefix_symbol;
	prefix.mtfsym = prefix_mtfsym;
	prefix.repcnt = prefix_repcnt;

	djw_compute_multi_prefix (groups, evolve_clen, & prefix);
	if ((ret = djw_encode_prefix (stream, & output, & bstate, & prefix)))
	  {
	    goto failure;
	  }
      }

      /* Encode: selector frequencies */
      {
	djw_weight gbest_freq[DJW_MAX_GROUPS+1];
	djw_prefix gbest_prefix;
	usize_t i;

	gbest_prefix.scount = gbest_no;
	gbest_prefix.symbol = gbest;
	gbest_prefix.mtfsym = gbest_mtf;

	djw_compute_selector_1_2 (& gbest_prefix, groups, gbest_freq);

	select_bits =
	  djw_build_prefix (gbest_freq, gbest_clen, groups+1, DJW_MAX_GBCLEN);
	djw_build_codes  (gbest_code, gbest_clen, groups+1, DJW_MAX_GBCLEN);

	for (i = 0; i < groups+1; i += 1)
	  {
	    if ((ret = xd3_encode_bits (stream, & output, & bstate,
					DJW_GBCLEN_BITS, gbest_clen[i])))
	      {
		goto failure;
	      }
	  }

	for (i = 0; i < gbest_prefix.mcount; i += 1)
	  {
	    usize_t gp_mtf      = gbest_mtf[i];
	    usize_t gp_sel_bits = gbest_clen[gp_mtf];
	    usize_t gp_sel_code = gbest_code[gp_mtf];

	    XD3_ASSERT (gp_mtf < groups+1);

	    if ((ret = xd3_encode_bits (stream, & output, & bstate,
					gp_sel_bits, gp_sel_code)))
	      {
		goto failure;
	      }

	    IF_DEBUG (select_bits -= gp_sel_bits);
	  }

	XD3_ASSERT (select_bits == 0);
      }

      /* Efficiency check. */
      if (output_bits + select_bits + (8 * output->next) +
	  EFFICIENCY_BITS >= input_bits && ! cfg->inefficient)
	{
	  goto nosecond;
	}

      /* Encode: data */
      {
	usize_t evolve_code[DJW_MAX_GROUPS][ALPHABET_SIZE];
	usize_t sector = 0;

	/* Build code tables for each group. */
	for (gp = 0; gp < groups; gp += 1)
	  {
	    djw_build_codes (evolve_code[gp], evolve_clen[gp],
			     ALPHABET_SIZE, DJW_MAX_CODELEN);
	  }

	/* Now loop over the input. */
	in = input;
	p  = in->base;

	do
	  {
	    /* For each sector. */
	    usize_t   gp_best  = gbest[sector];
	    usize_t *gp_codes = evolve_code[gp_best];
	    uint8_t *gp_clens = evolve_clen[gp_best];

	    XD3_ASSERT (sector < gbest_no);

	    sector += 1;

	    /* Encode the sector data. */
	    for (gpcnt = 0; gpcnt < sector_size; gpcnt += 1)
	      {
		usize_t sym  = *p;
		usize_t bits = gp_clens[sym];
		usize_t code = gp_codes[sym];

		IF_DEBUG (output_bits -= bits);

		if ((ret = xd3_encode_bits (stream, & output, & bstate,
					    bits, code)))
		  {
		    goto failure;
		  }

		GP_PAGE ();
	      }
	  }
	while (in != NULL);

	XD3_ASSERT (select_bits == 0);
	XD3_ASSERT (output_bits == 0);
      }
    }

  ret = xd3_flush_bits (stream, & output, & bstate);

  if (0)
    {
    nosecond:
      stream->msg = "secondary compression was inefficient";
      ret = XD3_NOSECOND;
    }

 failure:

  xd3_free (stream, gbest);
  xd3_free (stream, gbest_mtf);
  return ret;
}
#endif /* XD3_ENCODER */

/*********************************************************************/
/*                              DECODE                               */
/*********************************************************************/

static void
djw_build_decoder (xd3_stream    *stream,
		   usize_t        asize,
		   usize_t        abs_max,
		   const uint8_t *clen,
		   uint8_t       *inorder,
		   usize_t       *base,
		   usize_t       *limit,
		   usize_t       *min_clenp,
		   usize_t       *max_clenp)
{
  usize_t i, l;
  const uint8_t *ci;
  usize_t nr_clen [DJW_TOTAL_CODES];
  usize_t tmp_base[DJW_TOTAL_CODES];
  usize_t min_clen;
  usize_t max_clen;

  /* Assumption: the two temporary arrays are large enough to hold abs_max. */
  XD3_ASSERT (abs_max <= DJW_MAX_CODELEN);

  /* This looks something like the start of zlib's inftrees.c */
  memset (nr_clen, 0, sizeof (nr_clen[0]) * (abs_max+1));

  /* Count number of each code length */
  i  = asize;
  ci = clen;
  do
    {
      /* Caller _must_ check that values are in-range.  Most of the time the
       * caller decodes a specific number of bits, which imply the max value,
       * and the other time the caller decodes a huffman value, which must be
       * in-range.  Therefore, its an assertion and this function cannot
       * otherwise fail. */
      XD3_ASSERT (*ci <= abs_max);

      nr_clen[*ci++]++;
    }
  while (--i != 0);

  /* Compute min, max. */
  for (i = 1; i <= abs_max; i += 1) { if (nr_clen[i]) { break; } }
  min_clen = i;
  for (i = abs_max; i != 0; i -= 1) { if (nr_clen[i]) { break; } }
  max_clen = i;

  /* Fill the BASE, LIMIT table. */
  tmp_base[min_clen] = 0;
  base[min_clen]     = 0;
  limit[min_clen]    = nr_clen[min_clen] - 1;
  for (i = min_clen + 1; i <= max_clen; i += 1)
    {
      usize_t last_limit = ((limit[i-1] + 1) << 1);
      tmp_base[i] = tmp_base[i-1] + nr_clen[i-1];
      limit[i]    = last_limit + nr_clen[i] - 1;
      base[i]     = last_limit - tmp_base[i];
    }

  /* Fill the inorder array, canonically ordered codes. */
  ci = clen;
  for (i = 0; i < asize; i += 1)
    {
      if ((l = *ci++) != 0)
	{
	  inorder[tmp_base[l]++] = i;
	}
    }

  *min_clenp = min_clen;
  *max_clenp = max_clen;
}

static inline int
djw_decode_symbol (xd3_stream     *stream,
		   bit_state      *bstate,
		   const uint8_t **input,
		   const uint8_t  *input_end,
		   const uint8_t  *inorder,
		   const usize_t  *base,
		   const usize_t  *limit,
		   usize_t         min_clen,
		   usize_t         max_clen,
		   usize_t         *sym,
		   usize_t          max_sym)
{
  usize_t code = 0;
  usize_t bits = 0;

  /* OPT: Supposedly a small lookup table improves speed here... */

  /* Code outline is similar to xd3_decode_bits... */
  if (bstate->cur_mask == 0x100) { goto next_byte; }

  for (;;)
    {
      do
	{
	  if (bits == max_clen) { goto corrupt; }

	  bits += 1;
	  code  = (code << 1);

	  if (bstate->cur_byte & bstate->cur_mask) { code |= 1; }

	  bstate->cur_mask <<= 1;

	  if (bits >= min_clen && code <= limit[bits]) { goto done; }
	}
      while (bstate->cur_mask != 0x100);

    next_byte:

      if (*input == input_end)
	{
	  stream->msg = "secondary decoder end of input";
	  return XD3_INTERNAL;
	}

      bstate->cur_byte = *(*input)++;
      bstate->cur_mask = 1;
    }

 done:

  if (base[bits] <= code)
    {
      usize_t offset = code - base[bits];

      if (offset <= max_sym)
	{
	  IF_DEBUG2 (DP(RINT "(j) %u ", code));
	  *sym = inorder[offset];
	  return 0;
	}
    }

 corrupt:
  stream->msg = "secondary decoder invalid code";
  return XD3_INTERNAL;
}

static int
djw_decode_clclen (xd3_stream     *stream,
		   bit_state      *bstate,
		   const uint8_t **input,
		   const uint8_t  *input_end,
		   uint8_t        *cl_inorder,
		   usize_t        *cl_base,
		   usize_t        *cl_limit,
		   usize_t        *cl_minlen,
		   usize_t        *cl_maxlen,
		   uint8_t        *cl_mtf)
{
  int ret;
  uint8_t cl_clen[DJW_TOTAL_CODES];
  usize_t num_codes, value;
  usize_t i;

  /* How many extra code lengths to encode. */
  if ((ret = xd3_decode_bits (stream, bstate, input,
			      input_end, DJW_EXTRA_CODE_BITS, & num_codes)))
    {
      return ret;
    }

  num_codes += DJW_EXTRA_12OFFSET;

  /* Read num_codes. */
  for (i = 0; i < num_codes; i += 1)
    {
      if ((ret = xd3_decode_bits (stream, bstate, input,
				  input_end, DJW_CLCLEN_BITS, & value)))
	{
	  return ret;
	}

      cl_clen[i] = value;
    }

  /* Set the rest to zero. */
  for (; i < DJW_TOTAL_CODES; i += 1) { cl_clen[i] = 0; }

  /* No need to check for in-range clen values, because: */
  XD3_ASSERT (1 << DJW_CLCLEN_BITS == DJW_MAX_CLCLEN + 1);

  /* Build the code-length decoder. */
  djw_build_decoder (stream, DJW_TOTAL_CODES, DJW_MAX_CLCLEN,
		     cl_clen, cl_inorder, cl_base,
		     cl_limit, cl_minlen, cl_maxlen);

  /* Initialize the MTF state. */
  djw_init_clen_mtf_1_2 (cl_mtf);

  return 0;
}

static inline int
djw_decode_1_2 (xd3_stream     *stream,
		bit_state      *bstate,
		const uint8_t **input,
		const uint8_t  *input_end,
		const uint8_t  *inorder,
		const usize_t  *base,
		const usize_t  *limit,
		const usize_t  *minlen,
		const usize_t  *maxlen,
		uint8_t        *mtfvals,
		usize_t         elts,
		usize_t         skip_offset,
		uint8_t        *values)
{
  usize_t n = 0, rep = 0, mtf = 0, s = 0;
  int ret;
  
  while (n < elts)
    {
      /* Special case inside generic code: CLEN only: If not the first group,
       * we already know the zero frequencies. */
      if (skip_offset != 0 && n >= skip_offset && values[n-skip_offset] == 0)
	{
	  values[n++] = 0;
	  continue;
	}

      /* Repeat last symbol. */
      if (rep != 0)
	{
	  values[n++] = mtfvals[0];
	  rep -= 1;
	  continue;
	}

      /* Symbol following last repeat code. */
      if (mtf != 0)
	{
	  usize_t sym = djw_update_mtf (mtfvals, mtf);
	  values[n++] = sym;
	  mtf = 0;
	  continue;
	}

      /* Decode next symbol/repeat code. */
      if ((ret = djw_decode_symbol (stream, bstate, input, input_end,
				    inorder, base, limit, *minlen, *maxlen,
				    & mtf, DJW_TOTAL_CODES))) { return ret; }

      if (mtf <= RUN_1)
	{
	  /* Repetition. */
	  rep = ((mtf + 1) << s);
	  mtf = 0;
	  s += 1;
	}
      else
	{
	  /* Remove the RUN_1 MTF offset. */
	  mtf -= 1;
	  s = 0;
	}
    }

  /* If (rep != 0) there were too many codes received. */
  if (rep != 0)
    {
      stream->msg = "secondary decoder invalid repeat code";
      return XD3_INTERNAL;
    }
  
  return 0;
}

static inline int
djw_decode_prefix (xd3_stream     *stream,
		   bit_state      *bstate,
		   const uint8_t **input,
		   const uint8_t  *input_end,
		   const uint8_t  *cl_inorder,
		   const usize_t  *cl_base,
		   const usize_t  *cl_limit,
		   const usize_t  *cl_minlen,
		   const usize_t  *cl_maxlen,
		   uint8_t        *cl_mtf,
		   usize_t         groups,
		   uint8_t        *clen)
{
  return djw_decode_1_2 (stream, bstate, input, input_end,
			 cl_inorder, cl_base, cl_limit,
			 cl_minlen, cl_maxlen, cl_mtf,
			 ALPHABET_SIZE * groups, ALPHABET_SIZE, clen);
}

static int
xd3_decode_huff (xd3_stream     *stream,
		 djw_stream    *h,
		 const uint8_t **input_pos,
		 const uint8_t  *const input_end,
		 uint8_t       **output_pos,
		 const uint8_t  *const output_end)
{
  const uint8_t *input = *input_pos;
  uint8_t  *output = *output_pos;
  bit_state bstate = BIT_STATE_DECODE_INIT;
  uint8_t  *sel_group = NULL;
  usize_t    groups, gp;
  usize_t    output_bytes = (output_end - output);
  usize_t    sector_size;
  usize_t    sectors;
  int ret;

  /* Invalid input. */
  if (output_bytes == 0)
    {
      stream->msg = "secondary decoder invalid input";
      return XD3_INTERNAL;
    }

  /* Decode: number of groups */
  if ((ret = xd3_decode_bits (stream, & bstate, & input,
			      input_end, DJW_GROUP_BITS, & groups)))
    {
      goto fail;
    }

  groups += 1;

  if (groups > 1)
    {
      /* Decode: group size */
      if ((ret = xd3_decode_bits (stream, & bstate, & input,
				  input_end, DJW_SECTORSZ_BITS,
				  & sector_size))) { goto fail; }
      
      sector_size = (sector_size + 1) * DJW_SECTORSZ_MULT;
    }
  else
    {
      /* Default for groups == 1 */
      sector_size = output_bytes;
    }

  sectors = 1 + (output_bytes - 1) / sector_size;

  /* TODO: In the case of groups==1, lots of extra stack space gets used here.
   * Could dynamically allocate this memory, which would help with excess
   * parameter passing, too.  Passing too many parameters in this file,
   * simplify it! */

  /* Outer scope: per-group symbol decoder tables. */
  {
    uint8_t inorder[DJW_MAX_GROUPS][ALPHABET_SIZE];
    usize_t base   [DJW_MAX_GROUPS][DJW_TOTAL_CODES];
    usize_t limit  [DJW_MAX_GROUPS][DJW_TOTAL_CODES];
    usize_t minlen [DJW_MAX_GROUPS];
    usize_t maxlen [DJW_MAX_GROUPS];

    /* Nested scope: code length decoder tables. */
    {
      uint8_t clen      [DJW_MAX_GROUPS][ALPHABET_SIZE];
      uint8_t cl_inorder[DJW_TOTAL_CODES];
      usize_t cl_base   [DJW_MAX_CLCLEN+2];
      usize_t cl_limit  [DJW_MAX_CLCLEN+2];
      uint8_t cl_mtf    [DJW_TOTAL_CODES];
      usize_t cl_minlen;
      usize_t cl_maxlen;

      /* Compute the code length decoder. */
      if ((ret = djw_decode_clclen (stream, & bstate, & input, input_end,
				    cl_inorder, cl_base, cl_limit, & cl_minlen,
				    & cl_maxlen, cl_mtf))) { goto fail; }

      /* Now decode each group decoder. */
      if ((ret = djw_decode_prefix (stream, & bstate, & input, input_end,
				    cl_inorder, cl_base, cl_limit,
				    & cl_minlen, & cl_maxlen, cl_mtf,
				    groups, clen[0]))) { goto fail; }

      /* Prepare the actual decoding tables. */
      for (gp = 0; gp < groups; gp += 1)
	{
	  djw_build_decoder (stream, ALPHABET_SIZE, DJW_MAX_CODELEN,
			     clen[gp], inorder[gp], base[gp], limit[gp],
			     & minlen[gp], & maxlen[gp]);
	}
    }

    /* Decode: selector clens. */
    {
      uint8_t sel_inorder[DJW_MAX_GROUPS+2];
      usize_t sel_base   [DJW_MAX_GBCLEN+2];
      usize_t sel_limit  [DJW_MAX_GBCLEN+2];
      uint8_t sel_mtf    [DJW_MAX_GROUPS+2];
      usize_t sel_minlen;
      usize_t sel_maxlen;

      /* Setup group selection. */
      if (groups > 1)
	{
	  uint8_t sel_clen[DJW_MAX_GROUPS+1];

	  for (gp = 0; gp < groups+1; gp += 1)
	    {
	      usize_t value;

	      if ((ret = xd3_decode_bits (stream, & bstate, & input,
					  input_end, DJW_GBCLEN_BITS,
					  & value))) { goto fail; }

	      sel_clen[gp] = value;
	      sel_mtf[gp]  = gp;
	    }

	  if ((sel_group = (uint8_t*) xd3_alloc (stream, sectors, 1)) == NULL)
	    {
	      ret = ENOMEM;
	      goto fail;
	    }

	  djw_build_decoder (stream, groups+1, DJW_MAX_GBCLEN, sel_clen,
			     sel_inorder, sel_base, sel_limit,
			     & sel_minlen, & sel_maxlen);

	  if ((ret = djw_decode_1_2 (stream, & bstate, & input, input_end,
				     sel_inorder, sel_base,
				     sel_limit, & sel_minlen,
				     & sel_maxlen, sel_mtf,
				     sectors, 0, sel_group))) { goto fail; }
	}

      /* Now decode each sector. */
      {
	/* Initialize for (groups==1) case. */
	uint8_t *gp_inorder = inorder[0]; 
	usize_t *gp_base    = base[0];
	usize_t *gp_limit   = limit[0];
	usize_t  gp_minlen  = minlen[0];
	usize_t  gp_maxlen  = maxlen[0];
	usize_t c;

	for (c = 0; c < sectors; c += 1)
	  {
	    usize_t n;

	    if (groups >= 2)
	      {
		gp = sel_group[c];

		XD3_ASSERT (gp < groups);

		gp_inorder = inorder[gp];
		gp_base    = base[gp];
		gp_limit   = limit[gp];
		gp_minlen  = minlen[gp];
		gp_maxlen  = maxlen[gp];
	      }

	    XD3_ASSERT (output_end - output > 0);
	    
	    /* Decode next sector. */
	    n = min (sector_size, (usize_t) (output_end - output));

	    do
	      {
		usize_t sym;

		if ((ret = djw_decode_symbol (stream, & bstate,
					      & input, input_end,
					      gp_inorder, gp_base,
					      gp_limit, gp_minlen, gp_maxlen,
					      & sym, ALPHABET_SIZE)))
		  {
		    goto fail;
		  }

		*output++ = sym;
	      }
	    while (--n);
	  }
      }
    }
  }

  IF_REGRESSION (if ((ret = xd3_test_clean_bits (stream, & bstate)))
		   { goto fail; });
  XD3_ASSERT (ret == 0);

 fail:
  xd3_free (stream, sel_group);

  (*input_pos) = input;
  (*output_pos) = output;
  return ret;
}

#endif