diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt index 4348c01..8e8f884 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt @@ -84,22 +84,17 @@ data class ObjString(val value: String) : Obj() { override suspend fun lynonType(): LynonType = LynonType.String override suspend fun serialize(scope: Scope, encoder: LynonEncoder, lynonType: LynonType?) { -// if( lynonType == null ) -// encoder.encodeCached(this) { encoder.encodeBinaryData(value.encodeToByteArray()) } -// else - encoder.encodeBinaryData(value.encodeToByteArray()) + val data = value.encodeToByteArray() + encoder.encodeCached(data) { encoder.encodeBinaryData(data) } } companion object { val type = object : ObjClass("String") { override suspend fun deserialize(scope: Scope, decoder: LynonDecoder, lynonType: LynonType?): Obj = -// if( lynonType == null ) -// decoder.decodeCached { -// ObjString(decoder.unpackBinaryData().decodeToString()) -// } -// else - ObjString(decoder.unpackBinaryData().decodeToString()) + decoder.decodeCached { + ObjString(decoder.unpackBinaryData().decodeToString()) + } }.apply { addFn("toInt") { ObjInt(thisAs().value.toLong()) diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt index 5dc9cf5..c136b1f 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt @@ -4,21 +4,21 @@ import net.sergeych.bintools.ByteChunk import net.sergeych.lyng.Scope import net.sergeych.lyng.obj.* -enum class LynonType(val objClass: ObjClass) { - Null(ObjNull.objClass), - Int0(ObjInt.type), - IntNegative(ObjInt.type), - IntPositive(ObjInt.type), - String(ObjString.type), +enum class LynonType(val objClass: ObjClass,val defaultFrequency: Int = 1) { + Null(ObjNull.objClass, 80), + Int0(ObjInt.type, 70), + IntNegative(ObjInt.type, 50), + IntPositive(ObjInt.type, 100), + String(ObjString.type, 100), Real(ObjReal.type), - Bool(ObjBool.type), - List(ObjList.type), - Map(ObjMap.type), + Bool(ObjBool.type, 80), + List(ObjList.type, 70), + Map(ObjMap.type,40), Set(ObjSet.type), - Buffer(ObjBuffer.type), - Instant(ObjInstant.type), + Buffer(ObjBuffer.type, 50), + Instant(ObjInstant.type, 30), Duration(ObjDuration.type), - Other(Obj.rootObjectType); + Other(Obj.rootObjectType,60); } open class LynonEncoder(val bout: BitOutput, val settings: LynonSettings = LynonSettings.default) { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt index a8c3aa8..56e1d12 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt @@ -1,38 +1,68 @@ package net.sergeych.lynon import net.sergeych.collections.SortedList +import net.sergeych.lynon.Huffman.Alphabet + /** - * Experimental, reference implementation of Huffman trees and encoding. - * - * This is a reference huffman encoding implementation not yet ready; - * it was used to experiment with LZW, at the moment, LZW won the competition - * for compressed module format for its speed and sufficiently small size/ - * - * This is byte-based compressor which makes it not too interesting. - * - * TODO: convert to use various source dictionary - * - * reason: version thant compress bytes is not too interesting; particular alphabets - * are often longer than byte bits and are often sparse, that requires another - * codes serialization implementation + * Generic huffman encoding implementation using bits input/output and abstract [Alphabet]. */ object Huffman { + /** + * Alphabet interface: source can be variable bit size codes, not just bytes, + * so the Huffman encoding is not limited to bytes. It works with any alphabet + * using its _ordinals_; encoding between source symbols and ordinals are + * performed by the alphabet. See [byteAlphabet] for example. + */ + interface Alphabet { + val maxOrdinal: Int + + /** + * Write correct symbol for the [ordinal] to the [bout]. This is + * the inverse of [ordinalOf] but as [T] could be variable bit size, + * we provide output bit stream. + */ + fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) + + /** + * Find the ordinal of the source symbol + */ + fun ordinalOf(value: T): Int + + operator fun get(ordinal: Int): T + } + + /** + * Alphabet for unsigned bytes, allows to encode bytes easily + */ + val byteAlphabet = object : Alphabet { + override val maxOrdinal: Int + get() = 256 + + override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) { + bout.putBits(ordinal, 8) + } + + override fun ordinalOf(value: UByte): Int = value.toInt() + + override operator fun get(ordinal: Int): UByte = ordinal.toUByte() + } + sealed class Node(val freq: Int) : Comparable { override fun compareTo(other: Node): Int { return freq.compareTo(other.freq) } - abstract fun decode(bin: BitInput): Int? + abstract fun decodeOrdinal(bin: BitInput): Int? - class Leaf(val value: Int, freq: Int) : Node(freq) { + class Leaf(val ordinal: Int, freq: Int) : Node(freq) { override fun toString(): String { - return "[$value:$freq]" + return "[$ordinal:$freq]" } - override fun decode(bin: BitInput): Int { - return value//.also { println(": ${Char(value)}") } + override fun decodeOrdinal(bin: BitInput): Int { + return ordinal//.also { println(": ${Char(value)}") } } } @@ -41,33 +71,33 @@ object Huffman { return "[${left.freq}<- :<$freq>: ->${right.freq}]" } - override fun decode(bin: BitInput): Int? { + override fun decodeOrdinal(bin: BitInput): Int? { return when (bin.getBitOrNull().also { print("$it") }) { - 1 -> left.decode(bin) - 0 -> right.decode(bin) + 1 -> left.decodeOrdinal(bin) + 0 -> right.decodeOrdinal(bin) else -> null } } } } - data class Code(val symbol: Int, val bits: TinyBits) { + data class Code(val ordinal: Int, val bits: TinyBits) { val size by bits::size override fun toString(): String { - return "[${Char(symbol)}:$size:$bits]" + return "[$ordinal:$size:$bits]" } } - private fun generateCanonicCodes(tree: Node): List { - val codes = MutableList(256) { null } + private fun generateCanonicCodes(tree: Node, alphabet: Alphabet<*>): List { + val codes = MutableList(alphabet.maxOrdinal) { null } fun traverse(node: Node, code: TinyBits) { when (node) { is Node.Leaf -> - codes[node.value] = (Code(node.value, code)) + codes[node.ordinal] = (Code(node.ordinal, code)) is Node.Internal -> { traverse(node.left, code.insertBit(1)) @@ -77,17 +107,17 @@ object Huffman { } traverse(tree, TinyBits()) - return makeCanonical(codes) + return makeCanonical(codes, alphabet) } - private fun makeCanonical(source: List): List { + private fun makeCanonical(source: List,alphabet: Alphabet<*>): List { val sorted = source.filterNotNull().sortedWith(canonicComparator) - val canonical = MutableList(256) { null } + val canonical = MutableList(alphabet.maxOrdinal) { null } val first = sorted[0] val prevValue = first.copy(bits = TinyBits(0UL, first.bits.size)) - canonical[first.symbol] = prevValue + canonical[first.ordinal] = prevValue var prev = prevValue.bits for (i in 1.. bits.size) { bits = bits.insertBit(0) } - canonical[code.symbol] = code.copy(bits = bits)//.also { println("$it") } + canonical[code.ordinal] = code.copy(bits = bits)//.also { println("$it") } prev = bits } return canonical @@ -104,18 +134,21 @@ object Huffman { private val canonicComparator = { a: Code, b: Code -> if (a.bits.size == b.bits.size) { - a.symbol.compareTo(b.symbol) + a.ordinal.compareTo(b.ordinal) } else { a.bits.size.compareTo(b.bits.size) } } - private fun buildTree(data: UByteArray): Node { -// println(data.toDump()) - val frequencies = Array(256) { 0 } - data.forEach { frequencies[it.toInt()]++ } + private fun buildTree(data: Iterable,alphabet: Alphabet<*>): Node { + val frequencies = buildFrequencies(alphabet, data) + return buildTree(frequencies) + } - val list = SortedList(*frequencies.mapIndexed { index, i -> Node.Leaf(index, i) }.filter { it.freq > 0 } + private fun buildTree(frequencies: Array): Node { +// println(data.toDump()) + + val list: SortedList = SortedList(*frequencies.mapIndexed { index, frequency -> Node.Leaf(index, frequency) }.filter { it.freq > 0 } .toTypedArray()) // build the tree @@ -127,8 +160,18 @@ object Huffman { return list[0] } - fun decompressUsingCodes(bin: BitInput, codes: List): UByteArray { - val result = mutableListOf() + private fun buildFrequencies( + alphabet: Alphabet<*>, + data: Iterable + ): Array { + val maxOrdinal = alphabet.maxOrdinal + val frequencies = Array(maxOrdinal) { 0 } + data.forEach { frequencies[it]++ } + return frequencies + } + + fun decompressUsingCodes(bin: BitInput, codes: List, alphabet: Alphabet<*>): BitArray { + val result = MemoryBitOutput() val table = codes.filterNotNull().associateBy { it.bits } outer@ while (true) { @@ -139,12 +182,12 @@ object Huffman { val data = table[input] if (data != null) { // println("Code found: ${data.bits} -> [${data.symbol.toChar()}]") - result.add(data.symbol.toUByte()) + alphabet.decodeOrdinalTo(result,data.ordinal) break } } } - return result.toUByteArray() + return result.toBitArray() } private fun serializeCanonicCodes(bout: BitOutput, codes: List) { @@ -167,11 +210,11 @@ object Huffman { } } - fun deserializeCanonicCodes(bin: BitInput): List { + fun deserializeCanonicCodes(bin: BitInput, alphabet: Alphabet<*>): List { val minSize = bin.unpackUnsigned().toInt() val sizeInBits = bin.unpackUnsigned().toInt() val sorted = mutableListOf().also { codes -> - for (i in 0..<256) { + for (i in 0.. 0) { codes.add(Code(i, TinyBits(0U, s - 1 + minSize))) @@ -179,66 +222,53 @@ object Huffman { } }.sortedWith(canonicComparator) - val result = MutableList(256) { null } + val result = MutableList(alphabet.maxOrdinal) { null } var prev = sorted[0].copy(bits = TinyBits(0U, sorted[0].bits.size)) - result[prev.symbol] = prev + result[prev.ordinal] = prev for (i in 1..): List { +// +// } - val root = buildTree(data) + fun generateCanonicalCodes(frequencies: Array,alphabet: Alphabet<*>): List = + generateCanonicCodes(buildTree(frequencies), alphabet) - val codes = generateCanonicCodes(root) + fun compress(plain: Iterable,alphabet: Alphabet): BitArray { + + val source = plain.map { alphabet.ordinalOf(it) } + val root = buildTree(source,alphabet) + + val codes = generateCanonicCodes(root, alphabet) // serializa table // test encode: val bout = MemoryBitOutput() serializeCanonicCodes(bout, codes) - for (i in data) { - val code = codes[i.toInt()]!! + for (i in source) { + val code = codes[i]!! // println(">> $code") bout.putBits(code.bits) } // println(bout.toBitArray().bytes.toDump()) val compressed = bout.toBitArray() -// println("Size: ${compressed.bytes.size / data.size.toDouble() }") -// println("compression ratio: ${compressed.bytes.size / data.size.toDouble() }") - - // test decompress -// val bin = MemoryBitInput(compressed) -// val codes2 = deserializeCanonicCodes(bin) -// for ((a, b) in codes.zip(codes2)) { -// if (a != b) { -// println("Codes mismatch: $a != $b") -// break -// } -// } -// require(codes == codes2) -// val result = decompressUsingCodes(bin, codes2) -// -//// println(result.toUByteArray().toDump()) -// check(data contentEquals result.toUByteArray()) -// if( !(data contentEquals result.toUByteArray()) ) -// throw RuntimeException("Data mismatch") -// println(data.toDump()) -// return compressed } - fun decompress(bin: BitInput): UByteArray { - val codes = deserializeCanonicCodes(bin) - return decompressUsingCodes(bin, codes) + fun decompress(bin: BitInput,alphabet: Alphabet): UByteArray { + val codes = deserializeCanonicCodes(bin, alphabet) + return decompressUsingCodes(bin, codes, alphabet).asUbyteArray() } } \ No newline at end of file diff --git a/lynglib/src/jvmTest/kotlin/LynonTests.kt b/lynglib/src/jvmTest/kotlin/LynonTests.kt index d0ba497..da762e2 100644 --- a/lynglib/src/jvmTest/kotlin/LynonTests.kt +++ b/lynglib/src/jvmTest/kotlin/LynonTests.kt @@ -382,16 +382,39 @@ class LynonTests { println("Original : ${x.size}") val lzw = LZW.compress(x).bytes println("LZW : ${lzw.size}") - val ba = Huffman.compress(x) + val ba = Huffman.compress(x, Huffman.byteAlphabet) val huff = ba.bytes println("Huffman : ${huff.size}") - val lzwhuff = Huffman.compress(lzw).bytes + val lzwhuff = Huffman.compress(lzw, Huffman.byteAlphabet).bytes println("LZW+HUFF : ${lzwhuff.size}") - val compressed = Huffman.compress(x) - val decompressed = Huffman.decompress(compressed.toBitInput()) + val compressed = Huffman.compress(x,Huffman.byteAlphabet) + val decompressed = Huffman.decompress(compressed.toBitInput(),Huffman.byteAlphabet) assertContentEquals(x, decompressed) } + @Test + fun testGenerateCanonicalHuffmanCodes() { + val frequencies = LynonType.entries.map { it.defaultFrequency }.toTypedArray() + val alphabet = object : Huffman.Alphabet { + override val maxOrdinal = LynonType.entries.size + +// val bitSize = sizeInBits(maxOrdinal) + + override fun decodeOrdinalTo(bout: BitOutput, ordinal: Int) { + TODO("Not yet implemented") + } + + override fun get(ordinal: Int): LynonType { + TODO("Not yet implemented") + } + + override fun ordinalOf(value: LynonType): Int = value.ordinal + } + for(code in Huffman.generateCanonicalCodes(frequencies, alphabet)) { + println("${code?.bits}: ${code?.ordinal?.let { LynonType.entries[it] }}") + } + } + @Test fun testBitListSmall() { var t = TinyBits()