Eventual plans:
- Support for codes between 2-32 bits.
- Better tests (they are bad).
- Function to create code tree.
Huffman is a entropy based lossless compression algorithm.
package net.github.hadyn;
import com.google.common.base.Preconditions;
import java.io.EOFException;
import java.io.IOException;
/**
* This implementation is limited to 8 bit codewords. Code literal 255 is reserved for a null code
* which is used for control flow. This codec encodes and decodes values that use MSB 0 numbering
* meaning the MSB is bit 0.
*
* @author Hadyn Fitzgerald
*/
public class HuffmanCodec {
public static final int MAXIMUM_SYMBOLS = 255;
/**
* Used to denote a node in the code tree that doesn't have a code value.
*/
private static final byte NULL_CODE = -1;
/**
* The binary code tree used to encode and decode values.
*/
private byte[] codes;
/**
* The indexes for all of the symbols in the code tree.
*/
private byte[] indexes;
public HuffmanCodec(byte[] codes, int length) {
Preconditions.checkArgument(length < MAXIMUM_SYMBOLS);
this.codes = codes;
indexes = new byte[length];
for (int i = 0; i < codes.length; i++) {
if (codes[i] == NULL_CODE) {
continue;
}
indexes[codes[i] & 0xff] = (byte) i;
}
}
public int encode(byte[] src, int srcOff, int srcLen, byte[] dest, int destOff) {
int bitOffset = destOff << 3;
for (int i = 0; i < srcLen; i++) {
int value = src[srcOff + i] & 0xff;
int index = indexes[value] & 0xff;
if (index == 0) {
throw new IllegalArgumentException("Symbol '" + value + "' is not supported by the codec.");
}
// floor(log2(index + 1))
int length = 31 - Integer.numberOfLeadingZeros(index + 1);
// Write out the next symbol, we must write the codes in reverse order.
for (int j = 0; j < length; j++) {
int codeBitOffset = bitOffset + (length - j - 1);
int byteOffset = codeBitOffset >> 3;
int k = 7 - (codeBitOffset & 7);
// Odd indexes are left, even indexes are right. Normalizes this to the bit we
// need to write to the destination buffer, 0 for left and 1 for right.
int bit = (~(index & 1) & 1);
// Mask the bit that we are writing to the destination buffer so that
// just in case the buffer needs to be overwritten.
dest[byteOffset] = (byte) ((dest[byteOffset] & ~(1 << k)) | (bit << k));
index = (index - bit - 1) >> 1;
}
if (index != 0) {
throw new IllegalStateException("Invalid code tree.");
}
bitOffset += length;
}
return bitOffset - (destOff << 3);
}
public void decode(byte[] src, int srcOff, int srcLen,
byte[] dest, int destOff, int destLength) throws IOException {
int bitOffset = srcOff << 3;
int bitLength = srcLen << 3;
for (int i = 0; i < destLength; i++) {
// Read in the next symbol from the source buffer. A symbol is completed if the current
// node being traversed in the binary tree is non-null. Null symbols are defined as literal
// value 255.
int n = 0;
int length = 0;
while (length < bitLength) {
int bitPos = bitOffset + length;
if (codes[n] != NULL_CODE) {
break;
}
n = (n << 1) + (src[bitPos >>> 3] >>> (7 - (bitPos & 7)) & 1) + 1;
length++;
}
byte code = codes[n];
if (code == NULL_CODE) {
throw new EOFException();
}
dest[destOff + i] = code;
bitOffset += length;
bitLength -= length;
}
}
}
package net.github.hadyn;
import org.junit.Test;
import java.io.IOException;
import static com.google.common.truth.Truth.assertThat;
/**
* @author Hadyn Fitzgerald
*/
public class HuffmanCodecTest {
@Test
public void testHuffmanCodec() throws IOException {
String symbols = "abc";
byte[] codes = new byte[] {
-1, // |
-1, 0, // | a
1, 2 // b c
};
HuffmanCodec codec = new HuffmanCodec(codes, symbols.length());
String value = "abcabcbcabca";
byte[] srcBuffer = new byte[value.length()];
byte[] destBuffer = new byte[100];
for(int i = 0; i < value.length(); i++) {
srcBuffer[i] = (byte) symbols.indexOf(value.charAt(i));
}
int bitLength = codec.encode(srcBuffer, 0, srcBuffer.length, destBuffer, 0);
codec.decode(destBuffer, 0, (bitLength >> 3) + 1, srcBuffer, 0, srcBuffer.length);
String result = "";
for(int i = 0; i < srcBuffer.length; i++) {
result += symbols.charAt(srcBuffer[i] & 0xff);
}
assertThat(result).isEqualTo(value);
}
}