refs #35 generic implementation of Huffman compression for variable bit length source alphabet
This commit is contained in:
parent
20181c63a1
commit
12b209c724
@ -84,22 +84,17 @@ data class ObjString(val value: String) : Obj() {
|
||||
override suspend fun lynonType(): LynonType = LynonType.String
|
||||
|
||||
override suspend fun serialize(scope: Scope, encoder: LynonEncoder, lynonType: LynonType?) {
|
||||
// if( lynonType == null )
|
||||
// encoder.encodeCached(this) { encoder.encodeBinaryData(value.encodeToByteArray()) }
|
||||
// else
|
||||
encoder.encodeBinaryData(value.encodeToByteArray())
|
||||
val data = value.encodeToByteArray()
|
||||
encoder.encodeCached(data) { encoder.encodeBinaryData(data) }
|
||||
}
|
||||
|
||||
|
||||
companion object {
|
||||
val type = object : ObjClass("String") {
|
||||
override suspend fun deserialize(scope: Scope, decoder: LynonDecoder, lynonType: LynonType?): Obj =
|
||||
// if( lynonType == null )
|
||||
// decoder.decodeCached {
|
||||
// ObjString(decoder.unpackBinaryData().decodeToString())
|
||||
// }
|
||||
// else
|
||||
decoder.decodeCached {
|
||||
ObjString(decoder.unpackBinaryData().decodeToString())
|
||||
}
|
||||
}.apply {
|
||||
addFn("toInt") {
|
||||
ObjInt(thisAs<ObjString>().value.toLong())
|
||||
|
@ -4,21 +4,21 @@ import net.sergeych.bintools.ByteChunk
|
||||
import net.sergeych.lyng.Scope
|
||||
import net.sergeych.lyng.obj.*
|
||||
|
||||
enum class LynonType(val objClass: ObjClass) {
|
||||
Null(ObjNull.objClass),
|
||||
Int0(ObjInt.type),
|
||||
IntNegative(ObjInt.type),
|
||||
IntPositive(ObjInt.type),
|
||||
String(ObjString.type),
|
||||
enum class LynonType(val objClass: ObjClass,val defaultFrequency: Int = 1) {
|
||||
Null(ObjNull.objClass, 80),
|
||||
Int0(ObjInt.type, 70),
|
||||
IntNegative(ObjInt.type, 50),
|
||||
IntPositive(ObjInt.type, 100),
|
||||
String(ObjString.type, 100),
|
||||
Real(ObjReal.type),
|
||||
Bool(ObjBool.type),
|
||||
List(ObjList.type),
|
||||
Map(ObjMap.type),
|
||||
Bool(ObjBool.type, 80),
|
||||
List(ObjList.type, 70),
|
||||
Map(ObjMap.type,40),
|
||||
Set(ObjSet.type),
|
||||
Buffer(ObjBuffer.type),
|
||||
Instant(ObjInstant.type),
|
||||
Buffer(ObjBuffer.type, 50),
|
||||
Instant(ObjInstant.type, 30),
|
||||
Duration(ObjDuration.type),
|
||||
Other(Obj.rootObjectType);
|
||||
Other(Obj.rootObjectType,60);
|
||||
}
|
||||
|
||||
open class LynonEncoder(val bout: BitOutput, val settings: LynonSettings = LynonSettings.default) {
|
||||
|
@ -1,38 +1,68 @@
|
||||
package net.sergeych.lynon
|
||||
|
||||
import net.sergeych.collections.SortedList
|
||||
import net.sergeych.lynon.Huffman.Alphabet
|
||||
|
||||
|
||||
/**
|
||||
* Experimental, reference implementation of Huffman trees and encoding.
|
||||
*
|
||||
* This is a reference huffman encoding implementation not yet ready;
|
||||
* it was used to experiment with LZW, at the moment, LZW won the competition
|
||||
* for compressed module format for its speed and sufficiently small size.
|
||||
*
|
||||
* This is a byte-based compressor, which makes it not too interesting.
|
||||
*
|
||||
* TODO: convert to use various source dictionary
|
||||
*
|
||||
* reason: a version that compresses bytes is not too interesting; particular alphabets
|
||||
* are often longer than byte bits and are often sparse, that requires another
|
||||
* codes serialization implementation
|
||||
* Generic huffman encoding implementation using bits input/output and abstract [Alphabet].
|
||||
*/
|
||||
object Huffman {
|
||||
|
||||
/**
|
||||
* Alphabet interface: source can be variable bit size codes, not just bytes,
|
||||
* so the Huffman encoding is not limited to bytes. It works with any alphabet
|
||||
* using its _ordinals_; the mapping between source symbols and ordinals is
|
||||
* performed by the alphabet. See [byteAlphabet] for example.
|
||||
*/
|
||||
interface Alphabet<T> {
|
||||
/**
* Exclusive upper bound of the ordinals (i.e. the number of ordinals), despite the name:
* the code in this file uses it as the size of the code and frequency tables and as the
* bound of `0..<maxOrdinal` loops; [byteAlphabet] reports 256 for ordinals 0..255.
*/
val maxOrdinal: Int
|
||||
|
||||
/**
|
||||
* Write the source symbol for the [ordinal] to the [bout]. This is
|
||||
* the inverse of [ordinalOf] but as [T] could be variable bit size,
|
||||
* we provide an output bit stream.
|
||||
*/
|
||||
fun decodeOrdinalTo(bout: BitOutput, ordinal: Int)
|
||||
|
||||
/**
|
||||
* Find the ordinal of the source symbol [value].
|
||||
*/
|
||||
fun ordinalOf(value: T): Int
|
||||
|
||||
/**
* Source symbol for the [ordinal]. In [byteAlphabet] this is the inverse of [ordinalOf];
* NOTE(review): presumably every implementation is expected to keep that property - confirm.
*/
operator fun get(ordinal: Int): T
|
||||
}
|
||||
|
||||
/**
|
||||
* Alphabet for unsigned bytes; makes it easy to Huffman-encode plain byte data.
|
||||
*/
|
||||
val byteAlphabet = object : Alphabet<UByte> {
|
||||
// 256 distinct byte values: ordinals are 0..255, so this is the table size, not the largest ordinal.
override val maxOrdinal: Int
|
||||
get() = 256
|
||||
|
||||
// The ordinal is the byte value itself; it is passed to putBits with a width argument of 8
// (presumably one byte per symbol - confirm against BitOutput.putBits).
override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) {
|
||||
bout.putBits(ordinal, 8)
|
||||
}
|
||||
|
||||
// Symbol -> ordinal: the unsigned byte value as Int.
override fun ordinalOf(value: UByte): Int = value.toInt()
|
||||
|
||||
// Ordinal -> symbol: inverse of ordinalOf for ordinals in 0..255.
override operator fun get(ordinal: Int): UByte = ordinal.toUByte()
|
||||
}
|
||||
|
||||
sealed class Node(val freq: Int) : Comparable<Node> {
|
||||
override fun compareTo(other: Node): Int {
|
||||
return freq.compareTo(other.freq)
|
||||
}
|
||||
|
||||
abstract fun decode(bin: BitInput): Int?
|
||||
abstract fun decodeOrdinal(bin: BitInput): Int?
|
||||
|
||||
class Leaf(val value: Int, freq: Int) : Node(freq) {
|
||||
class Leaf(val ordinal: Int, freq: Int) : Node(freq) {
|
||||
override fun toString(): String {
|
||||
return "[$value:$freq]"
|
||||
return "[$ordinal:$freq]"
|
||||
}
|
||||
|
||||
override fun decode(bin: BitInput): Int {
|
||||
return value//.also { println(": ${Char(value)}") }
|
||||
override fun decodeOrdinal(bin: BitInput): Int {
|
||||
return ordinal//.also { println(": ${Char(value)}") }
|
||||
}
|
||||
}
|
||||
|
||||
@ -41,33 +71,33 @@ object Huffman {
|
||||
return "[${left.freq}<- :<$freq>: ->${right.freq}]"
|
||||
}
|
||||
|
||||
override fun decode(bin: BitInput): Int? {
|
||||
override fun decodeOrdinal(bin: BitInput): Int? {
|
||||
return when (bin.getBitOrNull().also { print("$it") }) {
|
||||
1 -> left.decode(bin)
|
||||
0 -> right.decode(bin)
|
||||
1 -> left.decodeOrdinal(bin)
|
||||
0 -> right.decodeOrdinal(bin)
|
||||
else -> null
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
data class Code(val symbol: Int, val bits: TinyBits) {
|
||||
data class Code(val ordinal: Int, val bits: TinyBits) {
|
||||
|
||||
val size by bits::size
|
||||
|
||||
override fun toString(): String {
|
||||
return "[${Char(symbol)}:$size:$bits]"
|
||||
return "[$ordinal:$size:$bits]"
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private fun generateCanonicCodes(tree: Node): List<Code?> {
|
||||
val codes = MutableList<Code?>(256) { null }
|
||||
private fun generateCanonicCodes(tree: Node, alphabet: Alphabet<*>): List<Code?> {
|
||||
val codes = MutableList<Code?>(alphabet.maxOrdinal) { null }
|
||||
|
||||
fun traverse(node: Node, code: TinyBits) {
|
||||
when (node) {
|
||||
is Node.Leaf ->
|
||||
codes[node.value] = (Code(node.value, code))
|
||||
codes[node.ordinal] = (Code(node.ordinal, code))
|
||||
|
||||
is Node.Internal -> {
|
||||
traverse(node.left, code.insertBit(1))
|
||||
@ -77,17 +107,17 @@ object Huffman {
|
||||
}
|
||||
traverse(tree, TinyBits())
|
||||
|
||||
return makeCanonical(codes)
|
||||
return makeCanonical(codes, alphabet)
|
||||
}
|
||||
|
||||
private fun makeCanonical(source: List<Code?>): List<Code?> {
|
||||
private fun makeCanonical(source: List<Code?>,alphabet: Alphabet<*>): List<Code?> {
|
||||
val sorted = source.filterNotNull().sortedWith(canonicComparator)
|
||||
|
||||
val canonical = MutableList<Code?>(256) { null }
|
||||
val canonical = MutableList<Code?>(alphabet.maxOrdinal) { null }
|
||||
|
||||
val first = sorted[0]
|
||||
val prevValue = first.copy(bits = TinyBits(0UL, first.bits.size))
|
||||
canonical[first.symbol] = prevValue
|
||||
canonical[first.ordinal] = prevValue
|
||||
var prev = prevValue.bits
|
||||
|
||||
for (i in 1..<sorted.size) {
|
||||
@ -96,7 +126,7 @@ object Huffman {
|
||||
while (code.bits.size > bits.size) {
|
||||
bits = bits.insertBit(0)
|
||||
}
|
||||
canonical[code.symbol] = code.copy(bits = bits)//.also { println("$it") }
|
||||
canonical[code.ordinal] = code.copy(bits = bits)//.also { println("$it") }
|
||||
prev = bits
|
||||
}
|
||||
return canonical
|
||||
@ -104,18 +134,21 @@ object Huffman {
|
||||
|
||||
private val canonicComparator = { a: Code, b: Code ->
|
||||
if (a.bits.size == b.bits.size) {
|
||||
a.symbol.compareTo(b.symbol)
|
||||
a.ordinal.compareTo(b.ordinal)
|
||||
} else {
|
||||
a.bits.size.compareTo(b.bits.size)
|
||||
}
|
||||
}
|
||||
|
||||
private fun buildTree(data: UByteArray): Node {
|
||||
// println(data.toDump())
|
||||
val frequencies = Array(256) { 0 }
|
||||
data.forEach { frequencies[it.toInt()]++ }
|
||||
/**
* Build the Huffman tree for [data], given as a sequence of ordinals of the [alphabet]
* (the caller in `compress` maps source symbols through `alphabet.ordinalOf` first).
*
* Counts the occurrences with [buildFrequencies] and delegates to the
* frequency-array overload of `buildTree`.
*
* @return the root node of the tree
*/
private fun buildTree(data: Iterable<Int>,alphabet: Alphabet<*>): Node {
|
||||
val frequencies = buildFrequencies(alphabet, data)
|
||||
return buildTree(frequencies)
|
||||
}
|
||||
|
||||
val list = SortedList<Node>(*frequencies.mapIndexed { index, i -> Node.Leaf(index, i) }.filter { it.freq > 0 }
|
||||
private fun buildTree(frequencies: Array<Int>): Node {
|
||||
// println(data.toDump())
|
||||
|
||||
val list: SortedList<Node> = SortedList(*frequencies.mapIndexed { index, frequency -> Node.Leaf(index, frequency) }.filter { it.freq > 0 }
|
||||
.toTypedArray())
|
||||
|
||||
// build the tree
|
||||
@ -127,8 +160,18 @@ object Huffman {
|
||||
return list[0]
|
||||
}
|
||||
|
||||
fun decompressUsingCodes(bin: BitInput, codes: List<Code?>): UByteArray {
|
||||
val result = mutableListOf<UByte>()
|
||||
/**
* Count how many times each ordinal occurs in [data].
*
* No range check is done here: an ordinal outside `0..<alphabet.maxOrdinal` is an
* out-of-bounds array access.
*
* @param alphabet supplies `maxOrdinal`, which is used as the size of the result
* @param data ordinals to count
* @return array indexed by ordinal; entries for ordinals absent from [data] stay 0
*/
private fun buildFrequencies(
|
||||
alphabet: Alphabet<*>,
|
||||
data: Iterable<Int>
|
||||
): Array<Int> {
|
||||
val maxOrdinal = alphabet.maxOrdinal
|
||||
// one counter per possible ordinal, all starting at zero
val frequencies = Array(maxOrdinal) { 0 }
|
||||
data.forEach { frequencies[it]++ }
|
||||
return frequencies
|
||||
}
|
||||
|
||||
fun decompressUsingCodes(bin: BitInput, codes: List<Code?>, alphabet: Alphabet<*>): BitArray {
|
||||
val result = MemoryBitOutput()
|
||||
val table = codes.filterNotNull().associateBy { it.bits }
|
||||
|
||||
outer@ while (true) {
|
||||
@ -139,12 +182,12 @@ object Huffman {
|
||||
val data = table[input]
|
||||
if (data != null) {
|
||||
// println("Code found: ${data.bits} -> [${data.symbol.toChar()}]")
|
||||
result.add(data.symbol.toUByte())
|
||||
alphabet.decodeOrdinalTo(result,data.ordinal)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return result.toUByteArray()
|
||||
return result.toBitArray()
|
||||
}
|
||||
|
||||
private fun serializeCanonicCodes(bout: BitOutput, codes: List<Code?>) {
|
||||
@ -167,11 +210,11 @@ object Huffman {
|
||||
}
|
||||
}
|
||||
|
||||
fun deserializeCanonicCodes(bin: BitInput): List<Code?> {
|
||||
fun deserializeCanonicCodes(bin: BitInput, alphabet: Alphabet<*>): List<Code?> {
|
||||
val minSize = bin.unpackUnsigned().toInt()
|
||||
val sizeInBits = bin.unpackUnsigned().toInt()
|
||||
val sorted = mutableListOf<Code>().also { codes ->
|
||||
for (i in 0..<256) {
|
||||
for (i in 0..<alphabet.maxOrdinal) {
|
||||
val s = bin.getBits(sizeInBits).toInt()
|
||||
if (s > 0) {
|
||||
codes.add(Code(i, TinyBits(0U, s - 1 + minSize)))
|
||||
@ -179,66 +222,53 @@ object Huffman {
|
||||
}
|
||||
}.sortedWith(canonicComparator)
|
||||
|
||||
val result = MutableList<Code?>(256) { null }
|
||||
val result = MutableList<Code?>(alphabet.maxOrdinal) { null }
|
||||
var prev = sorted[0].copy(bits = TinyBits(0U, sorted[0].bits.size))
|
||||
result[prev.symbol] = prev
|
||||
result[prev.ordinal] = prev
|
||||
|
||||
for (i in 1..<sorted.size) {
|
||||
val code = sorted[i]
|
||||
var bits = TinyBits(prev.bits.value + 1u, prev.bits.size)
|
||||
while (bits.size < code.bits.size) bits = bits.insertBit(0)
|
||||
result[code.symbol] = code.copy(bits = bits).also {
|
||||
result[code.ordinal] = code.copy(bits = bits).also {
|
||||
prev = it
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
fun compress(data: UByteArray): BitArray {
|
||||
// fun generateCanonicalCodes(frequencies: Iterable<Int>): List<Code?> {
|
||||
//
|
||||
// }
|
||||
|
||||
val root = buildTree(data)
|
||||
/**
* Generate canonical Huffman codes from a ready-made frequency table instead of sample data.
*
* @param frequencies occurrence count per ordinal, indexed by ordinal; only entries
* greater than zero become leaves of the tree
* @param alphabet supplies `maxOrdinal`, the size of the returned list
* @return list indexed by ordinal; `null` where the ordinal received no code
*/
fun generateCanonicalCodes(frequencies: Array<Int>,alphabet: Alphabet<*>): List<Code?> =
|
||||
generateCanonicCodes(buildTree(frequencies), alphabet)
|
||||
|
||||
val codes = generateCanonicCodes(root)
|
||||
fun <T>compress(plain: Iterable<T>,alphabet: Alphabet<T>): BitArray {
|
||||
|
||||
val source = plain.map { alphabet.ordinalOf(it) }
|
||||
val root = buildTree(source,alphabet)
|
||||
|
||||
val codes = generateCanonicCodes(root, alphabet)
|
||||
|
||||
// serialize table
|
||||
|
||||
// test encode:
|
||||
val bout = MemoryBitOutput()
|
||||
serializeCanonicCodes(bout, codes)
|
||||
for (i in data) {
|
||||
val code = codes[i.toInt()]!!
|
||||
for (i in source) {
|
||||
val code = codes[i]!!
|
||||
// println(">> $code")
|
||||
bout.putBits(code.bits)
|
||||
}
|
||||
// println(bout.toBitArray().bytes.toDump())
|
||||
val compressed = bout.toBitArray()
|
||||
// println("Size: ${compressed.bytes.size / data.size.toDouble() }")
|
||||
// println("compression ratio: ${compressed.bytes.size / data.size.toDouble() }")
|
||||
|
||||
// test decompress
|
||||
// val bin = MemoryBitInput(compressed)
|
||||
// val codes2 = deserializeCanonicCodes(bin)
|
||||
// for ((a, b) in codes.zip(codes2)) {
|
||||
// if (a != b) {
|
||||
// println("Codes mismatch: $a != $b")
|
||||
// break
|
||||
// }
|
||||
// }
|
||||
// require(codes == codes2)
|
||||
// val result = decompressUsingCodes(bin, codes2)
|
||||
//
|
||||
//// println(result.toUByteArray().toDump())
|
||||
// check(data contentEquals result.toUByteArray())
|
||||
// if( !(data contentEquals result.toUByteArray()) )
|
||||
// throw RuntimeException("Data mismatch")
|
||||
// println(data.toDump())
|
||||
//
|
||||
return compressed
|
||||
}
|
||||
|
||||
fun decompress(bin: BitInput): UByteArray {
|
||||
val codes = deserializeCanonicCodes(bin)
|
||||
return decompressUsingCodes(bin, codes)
|
||||
fun <T>decompress(bin: BitInput,alphabet: Alphabet<T>): UByteArray {
|
||||
val codes = deserializeCanonicCodes(bin, alphabet)
|
||||
return decompressUsingCodes(bin, codes, alphabet).asUbyteArray()
|
||||
}
|
||||
|
||||
}
|
@ -382,16 +382,39 @@ class LynonTests {
|
||||
println("Original : ${x.size}")
|
||||
val lzw = LZW.compress(x).bytes
|
||||
println("LZW : ${lzw.size}")
|
||||
val ba = Huffman.compress(x)
|
||||
val ba = Huffman.compress(x, Huffman.byteAlphabet)
|
||||
val huff = ba.bytes
|
||||
println("Huffman : ${huff.size}")
|
||||
val lzwhuff = Huffman.compress(lzw).bytes
|
||||
val lzwhuff = Huffman.compress(lzw, Huffman.byteAlphabet).bytes
|
||||
println("LZW+HUFF : ${lzwhuff.size}")
|
||||
val compressed = Huffman.compress(x)
|
||||
val decompressed = Huffman.decompress(compressed.toBitInput())
|
||||
val compressed = Huffman.compress(x,Huffman.byteAlphabet)
|
||||
val decompressed = Huffman.decompress(compressed.toBitInput(),Huffman.byteAlphabet)
|
||||
assertContentEquals(x, decompressed)
|
||||
}
|
||||
|
||||
// Smoke test: builds canonical Huffman codes from the default frequencies declared on
// LynonType and prints them. It contains no assertions, so it only fails if code
// generation throws.
@Test
|
||||
fun testGenerateCanonicalHuffmanCodes() {
|
||||
// one frequency per enum entry, in entry order, so the array index equals LynonType.ordinal
val frequencies = LynonType.entries.map { it.defaultFrequency }.toTypedArray()
|
||||
val alphabet = object : Huffman.Alphabet<LynonType> {
|
||||
override val maxOrdinal = LynonType.entries.size
|
||||
|
||||
// val bitSize = sizeInBits(maxOrdinal)
|
||||
|
||||
// Left unimplemented: the code-generation path visible in this diff only reads maxOrdinal.
override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) {
|
||||
TODO("Not yet implemented")
|
||||
}
|
||||
|
||||
// Left unimplemented as well; the loop below looks entries up via LynonType.entries instead.
override fun get(ordinal: Int): LynonType {
|
||||
TODO("Not yet implemented")
|
||||
}
|
||||
|
||||
override fun ordinalOf(value: LynonType): Int = value.ordinal
|
||||
}
|
||||
// the result is indexed by ordinal; entries may be null, hence the safe calls
for(code in Huffman.generateCanonicalCodes(frequencies, alphabet)) {
|
||||
println("${code?.bits}: ${code?.ordinal?.let { LynonType.entries[it] }}")
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
fun testBitListSmall() {
|
||||
var t = TinyBits()
|
||||
|
Loading…
x
Reference in New Issue
Block a user