Huffman Codec implementation [Java]

Eventual plans:

  • Support for codes between 2-32 bits.
  • Better tests (they are bad).
  • Function to create code tree.

Huffman is a entropy based lossless compression algorithm.

package net.github.hadyn;

import com.google.common.base.Preconditions;

import java.io.EOFException;
import java.io.IOException;

/**
 * This implementation is limited to 8 bit codewords. Code literal 255 is reserved for a null code
 * which is used for control flow. This codec encodes and decodes values that use MSB 0 numbering
 * meaning the MSB is bit 0.
 *
 * @author Hadyn Fitzgerald
 */
public class HuffmanCodec {

  public static final int MAXIMUM_SYMBOLS = 255;

  /**
   * Used to denote a node in the code tree that doesn't have a code value.
   */
  private static final byte NULL_CODE = -1;

  /**
   * The binary code tree used to encode and decode values.
   */
  private byte[] codes;

  /**
   * The indexes for all of the symbols in the code tree.
   */
  private byte[] indexes;

  public HuffmanCodec(byte[] codes, int length) {
    Preconditions.checkArgument(length < MAXIMUM_SYMBOLS);
    this.codes = codes;
    indexes = new byte[length];
    for (int i = 0; i < codes.length; i++) {
      if (codes[i] == NULL_CODE) {
        continue;
      }
      indexes[codes[i] & 0xff] = (byte) i;
    }
  }

  public int encode(byte[] src, int srcOff, int srcLen, byte[] dest, int destOff) {
    int bitOffset = destOff << 3;

    for (int i = 0; i < srcLen; i++) {
      int value = src[srcOff + i] & 0xff;

      int index = indexes[value] & 0xff;
      if (index == 0) {
        throw new IllegalArgumentException("Symbol '" + value + "' is not supported by the codec.");
      }

      // floor(log2(index + 1))
      int length = 31 - Integer.numberOfLeadingZeros(index + 1);

      // Write out the next symbol, we must write the codes in reverse order.
      for (int j = 0; j < length; j++) {
        int codeBitOffset = bitOffset + (length - j - 1);
        int byteOffset = codeBitOffset >> 3;
        int k = 7 - (codeBitOffset & 7);

        // Odd indexes are left, even indexes are right. Normalizes this to the bit we
        // need to write to the destination buffer, 0 for left and 1 for right.
        int bit = (~(index & 1) & 1);

        // Mask the bit that we are writing to the destination buffer so that
        // just in case the buffer needs to be overwritten.
        dest[byteOffset] = (byte) ((dest[byteOffset] & ~(1 << k)) | (bit << k));
        index = (index - bit - 1) >> 1;
      }

      if (index != 0) {
        throw new IllegalStateException("Invalid code tree.");
      }

      bitOffset += length;
    }

    return bitOffset - (destOff << 3);
  }

  public void decode(byte[] src, int srcOff, int srcLen,
                     byte[] dest, int destOff, int destLength) throws IOException {
    int bitOffset = srcOff << 3;
    int bitLength = srcLen << 3;

    for (int i = 0; i < destLength; i++) {
      // Read in the next symbol from the source buffer. A symbol is completed if the current
      // node being traversed in the binary tree is non-null. Null symbols are defined as literal
      // value 255.
      int n = 0;
      int length = 0;
      while (length < bitLength) {
        int bitPos = bitOffset + length;
        if (codes[n] != NULL_CODE) {
          break;
        }
        n = (n << 1) + (src[bitPos >>> 3] >>> (7 - (bitPos & 7)) & 1) + 1;
        length++;
      }

      byte code = codes[n];
      if (code == NULL_CODE) {
        throw new EOFException();
      }
      dest[destOff + i] = code;
      bitOffset += length;
      bitLength -= length;
    }
  }
}
package net.github.hadyn;

import org.junit.Test;

import java.io.IOException;

import static com.google.common.truth.Truth.assertThat;

/**
 * @author Hadyn Fitzgerald
 */
public class HuffmanCodecTest {

  @Test
  public void testHuffmanCodec() throws IOException {
    String symbols = "abc";
    byte[] codes = new byte[] {
            -1,                     //      |
            -1, 0,                  //    |   a
             1, 2                   //  b   c
    };

    HuffmanCodec codec = new HuffmanCodec(codes, symbols.length());

    String value = "abcabcbcabca";
    byte[] srcBuffer = new byte[value.length()];
    byte[] destBuffer = new byte[100];

    for(int i = 0; i < value.length(); i++) {
      srcBuffer[i] = (byte) symbols.indexOf(value.charAt(i));
    }

    int bitLength = codec.encode(srcBuffer, 0, srcBuffer.length, destBuffer, 0);
    codec.decode(destBuffer, 0, (bitLength >> 3) + 1, srcBuffer, 0, srcBuffer.length);

    String result = "";
    for(int i = 0; i < srcBuffer.length; i++) {
      result += symbols.charAt(srcBuffer[i] & 0xff);
    }

    assertThat(result).isEqualTo(value);
  }
}