refs #35 generic implementation of Huffman compression for variable bit length source alphabet

This commit is contained in:
Sergey Chernov 2025-07-23 20:49:40 +03:00
parent 20181c63a1
commit 12b209c724
4 changed files with 149 additions and 101 deletions

View File

@ -84,22 +84,17 @@ data class ObjString(val value: String) : Obj() {
override suspend fun lynonType(): LynonType = LynonType.String
override suspend fun serialize(scope: Scope, encoder: LynonEncoder, lynonType: LynonType?) {
// if( lynonType == null )
// encoder.encodeCached(this) { encoder.encodeBinaryData(value.encodeToByteArray()) }
// else
encoder.encodeBinaryData(value.encodeToByteArray())
val data = value.encodeToByteArray()
encoder.encodeCached(data) { encoder.encodeBinaryData(data) }
}
companion object {
val type = object : ObjClass("String") {
override suspend fun deserialize(scope: Scope, decoder: LynonDecoder, lynonType: LynonType?): Obj =
// if( lynonType == null )
// decoder.decodeCached {
// ObjString(decoder.unpackBinaryData().decodeToString())
// }
// else
ObjString(decoder.unpackBinaryData().decodeToString())
decoder.decodeCached {
ObjString(decoder.unpackBinaryData().decodeToString())
}
}.apply {
addFn("toInt") {
ObjInt(thisAs<ObjString>().value.toLong())

View File

@ -4,21 +4,21 @@ import net.sergeych.bintools.ByteChunk
import net.sergeych.lyng.Scope
import net.sergeych.lyng.obj.*
enum class LynonType(val objClass: ObjClass) {
Null(ObjNull.objClass),
Int0(ObjInt.type),
IntNegative(ObjInt.type),
IntPositive(ObjInt.type),
String(ObjString.type),
enum class LynonType(val objClass: ObjClass,val defaultFrequency: Int = 1) {
Null(ObjNull.objClass, 80),
Int0(ObjInt.type, 70),
IntNegative(ObjInt.type, 50),
IntPositive(ObjInt.type, 100),
String(ObjString.type, 100),
Real(ObjReal.type),
Bool(ObjBool.type),
List(ObjList.type),
Map(ObjMap.type),
Bool(ObjBool.type, 80),
List(ObjList.type, 70),
Map(ObjMap.type,40),
Set(ObjSet.type),
Buffer(ObjBuffer.type),
Instant(ObjInstant.type),
Buffer(ObjBuffer.type, 50),
Instant(ObjInstant.type, 30),
Duration(ObjDuration.type),
Other(Obj.rootObjectType);
Other(Obj.rootObjectType,60);
}
open class LynonEncoder(val bout: BitOutput, val settings: LynonSettings = LynonSettings.default) {

View File

@ -1,38 +1,68 @@
package net.sergeych.lynon
import net.sergeych.collections.SortedList
import net.sergeych.lynon.Huffman.Alphabet
/**
* Experimental, reference implementation of Huffman trees and encoding.
*
* This is a reference huffman encoding implementation not yet ready;
* it was used to experiment with LZW, at the moment, LZW won the competition
* for compressed module format for its speed and sufficiently small size/
*
* This is byte-based compressor which makes it not too interesting.
*
* TODO: convert to use various source dictionary
*
* reason: version thant compress bytes is not too interesting; particular alphabets
* are often longer than byte bits and are often sparse, that requires another
* codes serialization implementation
* Generic huffman encoding implementation using bits input/output and abstract [Alphabet].
*/
object Huffman {
/**
* Alphabet interface: the source can consist of variable bit size codes, not just bytes,
* so the Huffman encoding is not limited to bytes. It works with any alphabet
* using its _ordinals_; the conversion between source symbols and ordinals is
* performed by the alphabet. See [byteAlphabet] for an example.
*/
interface Alphabet<T> {
/**
* Number of distinct ordinals in this alphabet. Despite the name this is an
* _exclusive_ upper bound: the code tables built by [Huffman] are sized with
* this value and iterate over `0..<maxOrdinal`, and [byteAlphabet] reports
* 256 for the ordinals 0..255.
*/
val maxOrdinal: Int
/**
* Write the source symbol that corresponds to the [ordinal] to the [bout]. This is
* the inverse of [ordinalOf], but as [T] could be of variable bit size,
* the symbol is written to an output bit stream instead of being returned.
*/
fun decodeOrdinalTo(bout: BitOutput, ordinal: Int)
/**
* Find the ordinal of the source symbol [value].
*/
fun ordinalOf(value: T): Int
/**
* Return the source symbol for the given [ordinal].
* NOTE(review): presumably the inverse of [ordinalOf], as it is in
* [byteAlphabet] - confirm for other implementations.
*/
operator fun get(ordinal: Int): T
}
/**
 * Ready-made [Alphabet] over unsigned bytes: a byte value is its own ordinal,
 * and every decoded ordinal is written back to the output as 8 bits.
 */
val byteAlphabet: Alphabet<UByte> = object : Alphabet<UByte> {
    // 256 distinct byte values, i.e. ordinals 0..255
    override val maxOrdinal: Int = 256

    override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) {
        bout.putBits(ordinal, 8)
    }

    override fun ordinalOf(value: UByte): Int {
        return value.toInt()
    }

    override operator fun get(ordinal: Int): UByte {
        return ordinal.toUByte()
    }
}
sealed class Node(val freq: Int) : Comparable<Node> {
override fun compareTo(other: Node): Int {
return freq.compareTo(other.freq)
}
abstract fun decode(bin: BitInput): Int?
abstract fun decodeOrdinal(bin: BitInput): Int?
class Leaf(val value: Int, freq: Int) : Node(freq) {
class Leaf(val ordinal: Int, freq: Int) : Node(freq) {
override fun toString(): String {
return "[$value:$freq]"
return "[$ordinal:$freq]"
}
override fun decode(bin: BitInput): Int {
return value//.also { println(": ${Char(value)}") }
override fun decodeOrdinal(bin: BitInput): Int {
return ordinal//.also { println(": ${Char(value)}") }
}
}
@ -41,33 +71,33 @@ object Huffman {
return "[${left.freq}<- :<$freq>: ->${right.freq}]"
}
override fun decode(bin: BitInput): Int? {
override fun decodeOrdinal(bin: BitInput): Int? {
return when (bin.getBitOrNull().also { print("$it") }) {
1 -> left.decode(bin)
0 -> right.decode(bin)
1 -> left.decodeOrdinal(bin)
0 -> right.decodeOrdinal(bin)
else -> null
}
}
}
}
data class Code(val symbol: Int, val bits: TinyBits) {
data class Code(val ordinal: Int, val bits: TinyBits) {
val size by bits::size
override fun toString(): String {
return "[${Char(symbol)}:$size:$bits]"
return "[$ordinal:$size:$bits]"
}
}
private fun generateCanonicCodes(tree: Node): List<Code?> {
val codes = MutableList<Code?>(256) { null }
private fun generateCanonicCodes(tree: Node, alphabet: Alphabet<*>): List<Code?> {
val codes = MutableList<Code?>(alphabet.maxOrdinal) { null }
fun traverse(node: Node, code: TinyBits) {
when (node) {
is Node.Leaf ->
codes[node.value] = (Code(node.value, code))
codes[node.ordinal] = (Code(node.ordinal, code))
is Node.Internal -> {
traverse(node.left, code.insertBit(1))
@ -77,17 +107,17 @@ object Huffman {
}
traverse(tree, TinyBits())
return makeCanonical(codes)
return makeCanonical(codes, alphabet)
}
private fun makeCanonical(source: List<Code?>): List<Code?> {
private fun makeCanonical(source: List<Code?>,alphabet: Alphabet<*>): List<Code?> {
val sorted = source.filterNotNull().sortedWith(canonicComparator)
val canonical = MutableList<Code?>(256) { null }
val canonical = MutableList<Code?>(alphabet.maxOrdinal) { null }
val first = sorted[0]
val prevValue = first.copy(bits = TinyBits(0UL, first.bits.size))
canonical[first.symbol] = prevValue
canonical[first.ordinal] = prevValue
var prev = prevValue.bits
for (i in 1..<sorted.size) {
@ -96,7 +126,7 @@ object Huffman {
while (code.bits.size > bits.size) {
bits = bits.insertBit(0)
}
canonical[code.symbol] = code.copy(bits = bits)//.also { println("$it") }
canonical[code.ordinal] = code.copy(bits = bits)//.also { println("$it") }
prev = bits
}
return canonical
@ -104,18 +134,21 @@ object Huffman {
private val canonicComparator = { a: Code, b: Code ->
if (a.bits.size == b.bits.size) {
a.symbol.compareTo(b.symbol)
a.ordinal.compareTo(b.ordinal)
} else {
a.bits.size.compareTo(b.bits.size)
}
}
private fun buildTree(data: UByteArray): Node {
// println(data.toDump())
val frequencies = Array(256) { 0 }
data.forEach { frequencies[it.toInt()]++ }
private fun buildTree(data: Iterable<Int>,alphabet: Alphabet<*>): Node {
val frequencies = buildFrequencies(alphabet, data)
return buildTree(frequencies)
}
val list = SortedList<Node>(*frequencies.mapIndexed { index, i -> Node.Leaf(index, i) }.filter { it.freq > 0 }
private fun buildTree(frequencies: Array<Int>): Node {
// println(data.toDump())
val list: SortedList<Node> = SortedList(*frequencies.mapIndexed { index, frequency -> Node.Leaf(index, frequency) }.filter { it.freq > 0 }
.toTypedArray())
// build the tree
@ -127,8 +160,18 @@ object Huffman {
return list[0]
}
fun decompressUsingCodes(bin: BitInput, codes: List<Code?>): UByteArray {
val result = mutableListOf<UByte>()
/**
 * Counts how often each ordinal occurs in [data].
 *
 * @param alphabet supplies the table size through its `maxOrdinal`
 * @param data source symbols already converted to ordinals; each one is used
 *             directly as an index into the table
 * @return an array of `alphabet.maxOrdinal` counters, indexed by ordinal
 */
private fun buildFrequencies(
    alphabet: Alphabet<*>,
    data: Iterable<Int>
): Array<Int> {
    val counters = Array(alphabet.maxOrdinal) { 0 }
    for (ordinal in data) {
        counters[ordinal] += 1
    }
    return counters
}
fun decompressUsingCodes(bin: BitInput, codes: List<Code?>, alphabet: Alphabet<*>): BitArray {
val result = MemoryBitOutput()
val table = codes.filterNotNull().associateBy { it.bits }
outer@ while (true) {
@ -139,12 +182,12 @@ object Huffman {
val data = table[input]
if (data != null) {
// println("Code found: ${data.bits} -> [${data.symbol.toChar()}]")
result.add(data.symbol.toUByte())
alphabet.decodeOrdinalTo(result,data.ordinal)
break
}
}
}
return result.toUByteArray()
return result.toBitArray()
}
private fun serializeCanonicCodes(bout: BitOutput, codes: List<Code?>) {
@ -167,11 +210,11 @@ object Huffman {
}
}
fun deserializeCanonicCodes(bin: BitInput): List<Code?> {
fun deserializeCanonicCodes(bin: BitInput, alphabet: Alphabet<*>): List<Code?> {
val minSize = bin.unpackUnsigned().toInt()
val sizeInBits = bin.unpackUnsigned().toInt()
val sorted = mutableListOf<Code>().also { codes ->
for (i in 0..<256) {
for (i in 0..<alphabet.maxOrdinal) {
val s = bin.getBits(sizeInBits).toInt()
if (s > 0) {
codes.add(Code(i, TinyBits(0U, s - 1 + minSize)))
@ -179,66 +222,53 @@ object Huffman {
}
}.sortedWith(canonicComparator)
val result = MutableList<Code?>(256) { null }
val result = MutableList<Code?>(alphabet.maxOrdinal) { null }
var prev = sorted[0].copy(bits = TinyBits(0U, sorted[0].bits.size))
result[prev.symbol] = prev
result[prev.ordinal] = prev
for (i in 1..<sorted.size) {
val code = sorted[i]
var bits = TinyBits(prev.bits.value + 1u, prev.bits.size)
while (bits.size < code.bits.size) bits = bits.insertBit(0)
result[code.symbol] = code.copy(bits = bits).also {
result[code.ordinal] = code.copy(bits = bits).also {
prev = it
}
}
return result
}
fun compress(data: UByteArray): BitArray {
// fun generateCanonicalCodes(frequencies: Iterable<Int>): List<Code?> {
//
// }
val root = buildTree(data)
/**
 * Builds the canonical Huffman codes for a ready-made frequency table,
 * without needing the source data itself.
 *
 * @param frequencies occurrence count per ordinal; the array index is the ordinal
 * @param alphabet alphabet the ordinals belong to, passed on to code generation
 * @return the codes produced by the code generator for the tree built from [frequencies]
 */
fun generateCanonicalCodes(frequencies: Array<Int>, alphabet: Alphabet<*>): List<Code?> {
    val root = buildTree(frequencies)
    return generateCanonicCodes(root, alphabet)
}
val codes = generateCanonicCodes(root)
fun <T>compress(plain: Iterable<T>,alphabet: Alphabet<T>): BitArray {
val source = plain.map { alphabet.ordinalOf(it) }
val root = buildTree(source,alphabet)
val codes = generateCanonicCodes(root, alphabet)
// serialize table
// test encode:
val bout = MemoryBitOutput()
serializeCanonicCodes(bout, codes)
for (i in data) {
val code = codes[i.toInt()]!!
for (i in source) {
val code = codes[i]!!
// println(">> $code")
bout.putBits(code.bits)
}
// println(bout.toBitArray().bytes.toDump())
val compressed = bout.toBitArray()
// println("Size: ${compressed.bytes.size / data.size.toDouble() }")
// println("compression ratio: ${compressed.bytes.size / data.size.toDouble() }")
// test decompress
// val bin = MemoryBitInput(compressed)
// val codes2 = deserializeCanonicCodes(bin)
// for ((a, b) in codes.zip(codes2)) {
// if (a != b) {
// println("Codes mismatch: $a != $b")
// break
// }
// }
// require(codes == codes2)
// val result = decompressUsingCodes(bin, codes2)
//
//// println(result.toUByteArray().toDump())
// check(data contentEquals result.toUByteArray())
// if( !(data contentEquals result.toUByteArray()) )
// throw RuntimeException("Data mismatch")
// println(data.toDump())
//
return compressed
}
fun decompress(bin: BitInput): UByteArray {
val codes = deserializeCanonicCodes(bin)
return decompressUsingCodes(bin, codes)
/**
 * Restores data from [bin]: first the canonical code table is deserialized
 * from the stream, then the remaining input is decoded with these codes and
 * the [alphabet].
 *
 * @param bin bit input positioned at the serialized code table
 * @param alphabet alphabet used to size the code table and to write decoded symbols
 * @return the decoded bits converted with `asUbyteArray()`
 */
fun <T> decompress(bin: BitInput, alphabet: Alphabet<T>): UByteArray {
    val codeTable = deserializeCanonicCodes(bin, alphabet)
    val decodedBits = decompressUsingCodes(bin, codeTable, alphabet)
    return decodedBits.asUbyteArray()
}
}

View File

@ -382,16 +382,39 @@ class LynonTests {
println("Original : ${x.size}")
val lzw = LZW.compress(x).bytes
println("LZW : ${lzw.size}")
val ba = Huffman.compress(x)
val ba = Huffman.compress(x, Huffman.byteAlphabet)
val huff = ba.bytes
println("Huffman : ${huff.size}")
val lzwhuff = Huffman.compress(lzw).bytes
val lzwhuff = Huffman.compress(lzw, Huffman.byteAlphabet).bytes
println("LZW+HUFF : ${lzwhuff.size}")
val compressed = Huffman.compress(x)
val decompressed = Huffman.decompress(compressed.toBitInput())
val compressed = Huffman.compress(x,Huffman.byteAlphabet)
val decompressed = Huffman.decompress(compressed.toBitInput(),Huffman.byteAlphabet)
assertContentEquals(x, decompressed)
}
/**
 * Generates canonical Huffman codes for the [LynonType] alphabet from its
 * default frequencies and checks the shape of the resulting table: one entry
 * per enum constant, none missing (every default frequency is positive), and
 * each entry stored at the index of its own ordinal.
 */
@Test
fun testGenerateCanonicalHuffmanCodes() {
    val frequencies = LynonType.entries.map { it.defaultFrequency }.toTypedArray()
    val alphabet = object : Huffman.Alphabet<LynonType> {
        override val maxOrdinal = LynonType.entries.size

        // Writing symbols back is not exercised here: this test calls only
        // generateCanonicalCodes, so the method stays unimplemented.
        override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) {
            TODO("Not yet implemented")
        }

        override fun get(ordinal: Int): LynonType = LynonType.entries[ordinal]

        override fun ordinalOf(value: LynonType): Int = value.ordinal
    }

    val codes = Huffman.generateCanonicalCodes(frequencies, alphabet)
    kotlin.test.assertEquals(alphabet.maxOrdinal, codes.size)

    for ((ordinal, entry) in codes.withIndex()) {
        // all frequencies are > 0, so no ordinal may be left without a code
        val code = kotlin.test.assertNotNull(entry, "no code generated for ${alphabet[ordinal]}")
        kotlin.test.assertEquals(ordinal, code.ordinal)
        println("${code.bits}: ${alphabet[code.ordinal]}")
    }
}
@Test
fun testBitListSmall() {
var t = TinyBits()