diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitInput.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitInput.kt index dd58412..f1658e2 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitInput.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitInput.kt @@ -1,36 +1,9 @@ package net.sergeych.lynon -abstract class BitInput { +interface BitInput { - data class DataByte(val data: Int,val bits: Int) - /** - * Return next byte, int in 0..255 range, or -1 if end of stream reached - */ - abstract fun getByte(): DataByte - - private var accumulator = 0 - - var isEndOfStream: Boolean = false - private set - - private var mask = 0 - - fun getBitOrNull(): Int? { - if (isEndOfStream) return null - if (mask == 0) { - val ab = getByte() - accumulator = ab.data - if (accumulator == -1) { - isEndOfStream = true - return null - } - mask = 1 shl (ab.bits - 1) - } - val result = if (0 == accumulator and mask) 0 else 1 - mask = mask shr 1 - return result - } + fun getBitOrNull(): Int? fun getBitsOrNull(count: Int): ULong? { var result = 0UL @@ -54,8 +27,11 @@ abstract class BitInput { return getBitOrNull() ?: throw IllegalStateException("Unexpected end of stream") } - fun unpackUnsigned(): ULong { - val tetrades = getBits(4).toInt() + fun unpackUnsigned(): ULong = + unpackUnsignedOrNull() ?: throw IllegalStateException("Unexpected end of stream") + + fun unpackUnsignedOrNull(): ULong? { + val tetrades = getBitsOrNull(4)?.toInt() ?: return null var result = 0UL var shift = 0 for (i in 0.. tetrades) { @@ -84,5 +60,27 @@ abstract class BitInput { } return result } + + + fun decompress(): ByteArray = decompressOrNull() ?: throw DecompressionException("Unexpected end of stream") + + fun decompressOrNull(): ByteArray? 
{ + val originalSize = unpackUnsignedOrNull()?.toInt() ?: return null + return if( getBit() == 1) { + // data is compressed +// val expectedCRC = getBits(32).toUInt() + val method = getBits(2).toInt() + if( method != 0) throw DecompressionException("Unknown compression method") + LZW.decompress(this, originalSize).asByteArray() + } + else { + getBytes(originalSize) ?: throw DecompressionException("Unexpected end of stream in uncompressed data") + } + } + + @Suppress("unused") + fun decompressStringOrNull(): String? = decompressOrNull()?.decodeToString() + + fun decompressString(): String = decompress().decodeToString() } diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitList.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitList.kt new file mode 100644 index 0000000..9b89e36 --- /dev/null +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitList.kt @@ -0,0 +1,34 @@ +package net.sergeych.lynon + +@Suppress("unused") +interface BitList { + operator fun get(bitIndex: Long): Int + operator fun set(bitIndex: Long,value: Int) + val size: Long + val indices: LongRange + + fun toInput(): BitInput = object : BitInput { + private var index = 0L + + override fun getBitOrNull(): Int? 
= + if( index < size) this@BitList[index++] + else null + } +} + +fun bitListOf(vararg bits: Int): BitList { + return if( bits.size > 64) { + BitArray.ofBits(*bits) + } + else + TinyBits.of(*bits) +} + +@Suppress("unused") +fun bitListOfSize(sizeInBits: Long): BitList { + return if( sizeInBits > 64) { + BitArray.withBitSize(sizeInBits) + } + else + TinyBits() +} \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitOutput.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitOutput.kt index 5227768..7c1529a 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitOutput.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/BitOutput.kt @@ -1,27 +1,6 @@ package net.sergeych.lynon -abstract class BitOutput { - - abstract fun outputByte(byte: UByte) - - private var accumulator = 0 - - /** - * Number of bits in accumulator. After output is closed by [close] this value is - * not changed and represents the number of bits in the last byte; this should - * be used to properly calculate end of the bit stream - */ - private var accumulatorBits = 0 - private set - - /** - * When [close] is called, represents the number of used bits in the last byte; - * bits after this number are the garbage and should be ignored - */ - val lastByteBits: Int get() { - if( !isClosed ) throw IllegalStateException("BitOutput is not closed") - return accumulatorBits - } +interface BitOutput { fun putBits(bits: ULong, count: Int) { require(count <= 64) @@ -41,13 +20,11 @@ abstract class BitOutput { } } - fun putBit(bit: Int) { - accumulator = (accumulator shl 1) or bit - if (++accumulatorBits >= 8) { - outputByte(accumulator.toUByte()) - accumulator = accumulator shr 0 - accumulatorBits = 0 - } + fun putBit(bit: Int) + + fun putBits(bitList: BitList) { + for (i in bitList.indices) + putBit(bitList[i]) } fun packUnsigned(value: ULong) { @@ -71,23 +48,57 @@ abstract class BitOutput { } } - var isClosed = false - private set - - fun close(): 
BitOutput { - if (!isClosed) { - if (accumulatorBits > 0) { - outputByte(accumulator.toUByte()) - } else accumulatorBits = 8 - isClosed = true - } - return this - } - fun putBytes(data: ByteArray) { for (b in data) { putBits(b.toULong(), 8) } } + + /** + * Create compressed record with content and size check. Compression works with _bytes_. + * + * Structure: + * + * | size | meaning | + * |------|--------------------------------------------------| + * | packed unsigned | size of uncompressed content in bytes | + * | 1 | 0 - not compressed, 1 - compressed | + * + * __If compressed__, then: + * + * | size | meaning | + * |------|--------------------------------------| + * | 2 | 00 - LZW, other combinations reserved| + * + * After this header compressed bits follow. + * + * __If not compressed,__ then source data follows as bit stream. + * + * Compressed block overhead is 3 bits, uncompressed 1. + */ + fun compress(source: ByteArray) { + // size + packUnsigned(source.size.toULong()) + // check compression is effective? 
+ val compressed = LZW.compress(source.asUByteArray()) + // check that compression is effective including header bits size: + if( compressed.size + 2 < source.size * 8L) { + println("write compressed") + putBit(1) + // LZW algorithm + putBits(0, 2) + // compressed data + putBits(compressed) + } + else { + putBit(0) + putBytes(source) + } + } + + fun compress(source: String) { + compress(source.encodeToByteArray()) + } + } \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/DecompressionException.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/DecompressionException.kt new file mode 100644 index 0000000..d518673 --- /dev/null +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/DecompressionException.kt @@ -0,0 +1,3 @@ +package net.sergeych.lynon + +class DecompressionException(message: String) : IllegalArgumentException(message) {} \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonDecoder.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonDecoder.kt index 0894793..94c1d9e 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonDecoder.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonDecoder.kt @@ -28,10 +28,10 @@ open class LynonDecoder(val bin: BitInput,val settings: LynonSettings = LynonSet return decodeCached { type.deserialize(scope, this) } } - fun unpackBinaryData(): ByteArray? { - val size = bin.unpackUnsigned() - return bin.getBytes(size.toInt()) - } + fun unpackBinaryData(): ByteArray = bin.decompress() + + @Suppress("unused") + fun unpackBinaryDataOrNull(): ByteArray? 
= bin.decompressOrNull() fun unpackBoolean(): Boolean { return bin.getBit() == 1 diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt index 3dbb87a..a9e4e08 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/LynonEncoder.kt @@ -29,8 +29,7 @@ open class LynonEncoder(val bout: BitOutput,val settings: LynonSettings = LynonS } fun encodeBinaryData(data: ByteArray) { - bout.packUnsigned(data.size.toULong()) - bout.putBytes(data) + bout.compress(data) } fun encodeSigned(value: Long) { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitInput.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitInput.kt index d274057..5c5843f 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitInput.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitInput.kt @@ -1,20 +1,39 @@ package net.sergeych.lynon -class MemoryBitInput(val packedBits: UByteArray,val lastByteBits: Int): BitInput() { - constructor(bout: MemoryBitOutput): this(bout.toUByteArray(), bout.lastByteBits) +class MemoryBitInput(val packedBits: UByteArray, val lastByteBits: Int) : BitInput { + + constructor(ba: BitArray) : this(ba.bytes, ba.lastByteBits) {} + constructor(mba: MemoryBitOutput) : this(mba.toBitArray()) {} private var index = 0 - override fun getByte(): DataByte { - return if( index < packedBits.size ) { - DataByte( - packedBits[index++].toInt(), - if( index == packedBits.size ) lastByteBits else 8 - ) - } else { - DataByte(-1,0) + /** + * Return next byte, int in 0..255 range, or -1 if end of stream reached + */ + private var accumulator = 0 + + private var isEndOfStream: Boolean = false + private set + + private var mask = 0 + + override fun getBitOrNull(): Int? 
{ + if (isEndOfStream) return null + if (mask == 0) { + if (index < packedBits.size) { + accumulator = packedBits[index++].toInt() + val n = if (index == packedBits.size) lastByteBits else 8 + mask = 1 shl (n - 1) + } else { + isEndOfStream = true + return null + } } + val result = if (0 == accumulator and mask) 0 else 1 + mask = mask shr 1 + return result } + } \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitOutput.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitOutput.kt index d028e24..4af2520 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitOutput.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/MemoryBitOutput.kt @@ -1,14 +1,135 @@ package net.sergeych.lynon -class MemoryBitOutput: BitOutput() { - private val buffer = mutableListOf() +import kotlin.math.min - fun toUByteArray(): UByteArray { - close() - return buffer.toTypedArray().toUByteArray() +/** + * BitList implementation as a fixed-size array of bits; indexing works exactly the same as if + * [MemoryBitInput] is used with [MemoryBitInput.getBit]. + */ +class BitArray(val bytes: UByteArray, val lastByteBits: Int) : BitList { + + val bytesSize: Int get() = bytes.size + override val size by lazy { bytes.size * 8L - (8 - lastByteBits) } + + override val indices by lazy { 0.. 
{ + val byteIndex = (bitIndex / 8).toInt() + if (byteIndex !in bytes.indices) + throw IndexOutOfBoundsException("$bitIndex is out of bounds") + val i = (bitIndex % 8).toInt() + return byteIndex to ( + if (byteIndex == bytes.lastIndex) { + if (i >= lastByteBits) + throw IndexOutOfBoundsException("$bitIndex is out of bounds (last)") + 1 shl (lastByteBits - i - 1) + } else { + 1 shl (7 - i) + } + ) } - override fun outputByte(byte: UByte) { + override operator fun get(bitIndex: Long): Int = + getIndexAndMask(bitIndex).let { (byteIndex, mask) -> + if (bytes[byteIndex].toInt() and mask == 0) 0 else 1 + } + + override operator fun set(bitIndex: Long, value: Int) { + require(value == 0 || value == 1) + val (byteIndex, mask) = getIndexAndMask(bitIndex) + if (value == 1) + bytes[byteIndex] = bytes[byteIndex] or mask.toUByte() + else + bytes[byteIndex] = bytes[byteIndex] and mask.inv().toUByte() + } + + override fun toString(): String { + val result = StringBuilder() + val s = min(size, 64) + for (i in 0..() + + private var accumulator = 0 + + /** + * Number of bits in accumulator. 
After output is closed by [close] this value is + * not changed and represents the number of bits in the last byte; this should + * be used to properly calculate end of the bit stream + */ + private var accumulatorBits = 0 + private set + +// /** +// * When [close] is called, represents the number of used bits in the last byte; +// * bits after this number are the garbage and should be ignored +// */ +// val lastByteBits: Int +// get() { +// if (!isClosed) throw IllegalStateException("BitOutput is not closed") +// return accumulatorBits +// } + + override fun putBit(bit: Int) { + accumulator = (accumulator shl 1) or bit + if (++accumulatorBits >= 8) { + outputByte(accumulator.toUByte()) + accumulator = accumulator shr 8 + accumulatorBits = 0 + } + } + + var isClosed = false + private set + + fun close(): BitArray { + if (!isClosed) { + if (accumulatorBits > 0) { + outputByte(accumulator.toUByte()) + } else accumulatorBits = 8 + isClosed = true + } + return toBitArray() + } + + fun toBitArray(): BitArray { + if (!isClosed) { + close() + } + return BitArray(buffer.toTypedArray().toUByteArray(), accumulatorBits) + } + + fun toBitInput(): BitInput = toBitArray().toBitInput() + + private fun outputByte(byte: UByte) { buffer.add(byte) } } \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/TinyBits.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/TinyBits.kt new file mode 100644 index 0000000..d9b5b40 --- /dev/null +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/TinyBits.kt @@ -0,0 +1,71 @@ +package net.sergeych.lynon + + +/** + * Bit size-aware code, short [BitList] implementation, up to 64 bits (efficiency tradeoff). + * E.g `Bits(0, 3) != Bits(0, 2). For longer, use [BitArray]. + * + * Note that [bitListOf] creates [TinyBits] when possible. 
+ */ +class TinyBits(initValue: ULong = 0U, override val size: Long = 0): BitList { + + private var bits: ULong = initValue + + constructor(value: ULong, size: Int): this(value, size.toLong()) {} + + override val indices: LongRange by lazy { 0.. this[i.toLong()] = v } } + } + } +} \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt new file mode 100644 index 0000000..a8c3aa8 --- /dev/null +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/huffman.kt @@ -0,0 +1,244 @@ +package net.sergeych.lynon + +import net.sergeych.collections.SortedList + +/** + * Experimental reference implementation of Huffman trees and encoding. + * + * This reference Huffman encoder is not yet production-ready; + * it was used for experiments alongside LZW. For now, LZW won the competition + * for the compressed module format thanks to its speed and sufficiently small size. + * + * This is a byte-based compressor, which makes it not too interesting. + * + * TODO: convert to use various source dictionaries + * + * Reason: a version that compresses bytes is not too interesting; particular alphabets + * are often longer than byte bits and are often sparse, which requires another + * code serialization implementation + */ +object Huffman { + + sealed class Node(val freq: Int) : Comparable { + override fun compareTo(other: Node): Int { + return freq.compareTo(other.freq) + } + + abstract fun decode(bin: BitInput): Int? + + class Leaf(val value: Int, freq: Int) : Node(freq) { + override fun toString(): String { + return "[$value:$freq]" + } + + override fun decode(bin: BitInput): Int { + return value//.also { println(": ${Char(value)}") } + } + } + + class Internal(val left: Node, val right: Node) : Node(left.freq + right.freq) { + override fun toString(): String { + return "[${left.freq}<- :<$freq>: ->${right.freq}]" + } + + override fun decode(bin: BitInput): Int? 
{ + return when (bin.getBitOrNull().also { print("$it") }) { + 1 -> left.decode(bin) + 0 -> right.decode(bin) + else -> null + } + } + } + } + + data class Code(val symbol: Int, val bits: TinyBits) { + + val size by bits::size + + override fun toString(): String { + return "[${Char(symbol)}:$size:$bits]" + } + + } + + private fun generateCanonicCodes(tree: Node): List { + val codes = MutableList(256) { null } + + fun traverse(node: Node, code: TinyBits) { + when (node) { + is Node.Leaf -> + codes[node.value] = (Code(node.value, code)) + + is Node.Internal -> { + traverse(node.left, code.insertBit(1)) + traverse(node.right, code.insertBit(0)) + } + } + } + traverse(tree, TinyBits()) + + return makeCanonical(codes) + } + + private fun makeCanonical(source: List): List { + val sorted = source.filterNotNull().sortedWith(canonicComparator) + + val canonical = MutableList(256) { null } + + val first = sorted[0] + val prevValue = first.copy(bits = TinyBits(0UL, first.bits.size)) + canonical[first.symbol] = prevValue + var prev = prevValue.bits + + for (i in 1.. 
bits.size) { + bits = bits.insertBit(0) + } + canonical[code.symbol] = code.copy(bits = bits)//.also { println("$it") } + prev = bits + } + return canonical + } + + private val canonicComparator = { a: Code, b: Code -> + if (a.bits.size == b.bits.size) { + a.symbol.compareTo(b.symbol) + } else { + a.bits.size.compareTo(b.bits.size) + } + } + + private fun buildTree(data: UByteArray): Node { +// println(data.toDump()) + val frequencies = Array(256) { 0 } + data.forEach { frequencies[it.toInt()]++ } + + val list = SortedList(*frequencies.mapIndexed { index, i -> Node.Leaf(index, i) }.filter { it.freq > 0 } + .toTypedArray()) + + // build the tree + while (list.size > 1) { + val left = list.removeAt(0) + val right = list.removeAt(0) + list.add(Node.Internal(left, right)) + } + return list[0] + } + + fun decompressUsingCodes(bin: BitInput, codes: List): UByteArray { + val result = mutableListOf() + val table = codes.filterNotNull().associateBy { it.bits } + + outer@ while (true) { + var input = TinyBits() + while (true) { + bin.getBitOrNull()?.let { input = input.insertBit(it) } + ?: break@outer + val data = table[input] + if (data != null) { +// println("Code found: ${data.bits} -> [${data.symbol.toChar()}]") + result.add(data.symbol.toUByte()) + break + } + } + } + return result.toUByteArray() + } + + private fun serializeCanonicCodes(bout: BitOutput, codes: List) { + var minSize: Int? = null + var maxSize: Int? = null + for (i in 1.. maxSize) maxSize = s + } + val size = maxSize!! - minSize!! 
+ 1 + val sizeInBits = sizeInBits(size) + bout.packUnsigned(minSize.toULong()) + bout.packUnsigned(sizeInBits.toULong()) + for (c in codes) { + if (c != null) + bout.putBits(c.bits.size.toInt() - minSize + 1, sizeInBits) + else + bout.putBits(0, sizeInBits) + } + } + + fun deserializeCanonicCodes(bin: BitInput): List { + val minSize = bin.unpackUnsigned().toInt() + val sizeInBits = bin.unpackUnsigned().toInt() + val sorted = mutableListOf().also { codes -> + for (i in 0..<256) { + val s = bin.getBits(sizeInBits).toInt() + if (s > 0) { + codes.add(Code(i, TinyBits(0U, s - 1 + minSize))) + } + } + }.sortedWith(canonicComparator) + + val result = MutableList(256) { null } + var prev = sorted[0].copy(bits = TinyBits(0U, sorted[0].bits.size)) + result[prev.symbol] = prev + + for (i in 1..> $code") + bout.putBits(code.bits) + } +// println(bout.toBitArray().bytes.toDump()) + val compressed = bout.toBitArray() +// println("Size: ${compressed.bytes.size / data.size.toDouble() }") +// println("compression ratio: ${compressed.bytes.size / data.size.toDouble() }") + + // test decompress +// val bin = MemoryBitInput(compressed) +// val codes2 = deserializeCanonicCodes(bin) +// for ((a, b) in codes.zip(codes2)) { +// if (a != b) { +// println("Codes mismatch: $a != $b") +// break +// } +// } +// require(codes == codes2) +// val result = decompressUsingCodes(bin, codes2) +// +//// println(result.toUByteArray().toDump()) +// check(data contentEquals result.toUByteArray()) +// if( !(data contentEquals result.toUByteArray()) ) +// throw RuntimeException("Data mismatch") +// println(data.toDump()) +// + return compressed + } + + fun decompress(bin: BitInput): UByteArray { + val codes = deserializeCanonicCodes(bin) + return decompressUsingCodes(bin, codes) + } + +} \ No newline at end of file diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw.kt new file mode 100644 index 0000000..8fa4cfa --- /dev/null +++ 
b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw.kt @@ -0,0 +1,128 @@ +package net.sergeych.lynon + +import net.sergeych.bintools.ByteChunk +import kotlin.math.roundToInt + +/** + * LZW lightweight pure kotlin compression. + */ +object LZW { + + val MAX_CODE_SIZE = 17 + val STOP_CODE = (1 shl MAX_CODE_SIZE) - 1 + val MAX_DICT_SIZE = (STOP_CODE * 0.92).roundToInt() + + + fun compress(input: ByteArray, bitOutput: BitOutput) + = compress(input.asUByteArray(), bitOutput) + + /** + * Compresses the input bytes using the LZW algorithm + * @param input The bytes to compress + * @param bitOutput Receives the compressed codes + */ + fun compress(input: UByteArray, bitOutput: BitOutput) { + // Initialize dictionary with all possible single characters + val dictionary = mutableMapOf() + for (i in 0..255) { + // 23 + dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i + } + + var nextCode = 256 + var current = ByteChunk(ubyteArrayOf()) +// val result = mutableListOf() + + for (char in input) { + val combined = current + char + if (dictionary.containsKey(combined)) { + current = combined + } else { + val size = sizeInBits(dictionary.size) + bitOutput.putBits(dictionary[current]!!, size) + if (dictionary.size >= MAX_DICT_SIZE) { + bitOutput.putBits(STOP_CODE, size) + dictionary.clear() + nextCode = 256 + for (i in 0..255) { + dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i + } + } else + dictionary[combined] = nextCode++ + current = ByteChunk(ubyteArrayOf(char)) + } + } + + if (current.size > 0) { + val size = sizeInBits(dictionary.size) + bitOutput.putBits(dictionary[current]!!, size) + } + } + + fun compress(input: UByteArray): BitArray { + return MemoryBitOutput().apply { + compress(input, this) + }.toBitArray() + } + + /** + * Decompresses a stream of LZW codes back to the original bytes. Note that knowing the expected + * size in advance is crucial: it lets us repel explosion-style attacks. 
+ * + * @param compressed The list of compressed codes + * @param resultSize The expected size of the decompressed string + * + * @throws DecompressionException if something goes wrong + * @return The decompressed string + */ + fun decompress(compressed: BitInput, resultSize: Int): UByteArray { + // Initialize dictionary with all possible single characters + val dictionary = mutableMapOf() + for (i in 0..255) { + dictionary[i] = ubyteArrayOf(i.toUByte()) + } + + var nextCode = 256 + val firstCode = compressed.getBits(9).toInt() + var previous = dictionary[firstCode] + ?: throw DecompressionException("Invalid first compressed code: $firstCode") + val result = mutableListOf() + result += previous + + while (result.size < resultSize) { + val codeSize = sizeInBits(nextCode + 1) + val code = compressed.getBitsOrNull(codeSize)?.toInt() ?: break + + if (code == STOP_CODE) { + nextCode = 256 + dictionary.clear() + for (i in 0..255) + dictionary[i] = ubyteArrayOf(i.toUByte()) + previous = dictionary[compressed.getBits(9).toInt()]!! + } else { + + val current = if (code in dictionary) { + dictionary[code]!! 
+ } else if (code == nextCode) { + // Special case for pattern like cScSc + previous + previous[0] + } else { + throw DecompressionException("Invalid compressed code: $code") + } + + result += current + dictionary[nextCode++] = previous + current[0] + previous = current + } + } + + if (result.size != resultSize) + throw DecompressionException("Decompressed size is not equal to expected: real/expected = ${result.size}/$resultSize") + return result.toTypedArray().toUByteArray() + } +} + + +private operator fun ByteChunk.plus(byte: UByte): ByteChunk { + return ByteChunk(data + byte) +} diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw0.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw0.kt deleted file mode 100644 index 450ce6d..0000000 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw0.kt +++ /dev/null @@ -1,114 +0,0 @@ -package net.sergeych.lynon - -import net.sergeych.bintools.ByteChunk -import kotlin.math.roundToInt - -/** - * LZW compression algorithm: work in progress. - * - * Uses Lyng but input/output. Uses automatic code size. 
- */ -class LZW { - companion object { - - val MAX_CODE_SIZE = 17 - val STOP_CODE = (1 shl MAX_CODE_SIZE) - 1 - val MAX_DICT_SIZE = (STOP_CODE * 0.92).roundToInt() - - - /** - * Compresses the input string using LZW algorithm - * @param input The string to compress - * @return List of compressed codes - */ - fun compress(input: UByteArray,bitOutput: BitOutput) { - // Initialize dictionary with all possible single characters - val dictionary = mutableMapOf() - for (i in 0..255) { - // 23 - dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i - } - - var nextCode = 256 - var current = ByteChunk(ubyteArrayOf()) -// val result = mutableListOf() - - for (char in input) { - val combined = current + char - if (dictionary.containsKey(combined)) { - current = combined - } else { - val size = sizeInBits(dictionary.size) - bitOutput.putBits(dictionary[current]!!,size) - if( dictionary.size >= MAX_DICT_SIZE ) { - bitOutput.putBits(STOP_CODE,size) - dictionary.clear() - nextCode = 256 - for (i in 0..255) { - dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i - } - } - else - dictionary[combined] = nextCode++ - current = ByteChunk(ubyteArrayOf(char)) - } - } - - if (current.size > 0) { - val size = sizeInBits(dictionary.size) - bitOutput.putBits(dictionary[current]!!,size) - } - } - - /** - * Decompresses a list of LZW codes back to the original string - * @param compressed The list of compressed codes - * @return The decompressed string - */ - fun decompress(compressed: BitInput): UByteArray { - // Initialize dictionary with all possible single characters - val dictionary = mutableMapOf() - for (i in 0..255) { - dictionary[i] = ubyteArrayOf(i.toUByte()) - } - - var nextCode = 256 - var previous = dictionary[compressed.getBits(9).toInt()]!! 
- val result = mutableListOf() - - while( !compressed.isEndOfStream ) { - val codeSize = sizeInBits(nextCode + 1) - val code = compressed.getBitsOrNull(codeSize)?.toInt() ?: break - - if( code == STOP_CODE ) { - nextCode = 256 - dictionary.clear() - for (i in 0..255) - dictionary[i] = ubyteArrayOf(i.toUByte()) - previous = dictionary[compressed.getBits(9).toInt()]!! - } - else { - - val current = if (code in dictionary) { - dictionary[code]!! - } else if (code == nextCode) { - // Special case for pattern like cScSc - previous + previous[0] - } else { - throw IllegalArgumentException("Invalid compressed code: $code") - } - - result += current - dictionary[nextCode++] = previous + current[0] - previous = current - } - } - - return result.toTypedArray().toUByteArray() - } - } -} - -private operator fun ByteChunk.plus(byte: UByte): ByteChunk { - return ByteChunk(data + byte) -} diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/tools.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/tools.kt index 75f1559..4cb1118 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lynon/tools.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lynon/tools.kt @@ -11,5 +11,5 @@ class LynonPacker(bout: MemoryBitOutput = MemoryBitOutput(), settings: LynonSett * Variant of [LynonDecoder] that reads from a given `source` using [MemoryBitInput] */ class LynonUnpacker(source: BitInput) : LynonDecoder(source) { - constructor(packer: LynonPacker) : this(MemoryBitInput(packer.bout as MemoryBitOutput)) + constructor(packer: LynonPacker) : this((packer.bout as MemoryBitOutput).toBitInput()) } \ No newline at end of file diff --git a/lynglib/src/jvmTest/kotlin/LynonTests.kt b/lynglib/src/jvmTest/kotlin/LynonTests.kt index 3287013..4058eee 100644 --- a/lynglib/src/jvmTest/kotlin/LynonTests.kt +++ b/lynglib/src/jvmTest/kotlin/LynonTests.kt @@ -6,7 +6,7 @@ import net.sergeych.lynon.* import java.nio.file.Files import java.nio.file.Path import kotlin.test.Test - +import 
kotlin.test.assertContentEquals class LynonTests { @Test @@ -210,11 +210,13 @@ class LynonTests { assertEquals(null, bin.getBitsOrNull(3)) } + + val original = Files.readString(Path.of("../sample_texts/dikkens_hard_times.txt")) + @Test fun testLzw() { // Example usage // val original = "TOBEORNOTTOBEORTOBEORNOT" - val original = Files.readString(Path.of("../sample_texts/dikkens_hard_times.txt")) // println("Original: $original") println("Length: ${original.length}") @@ -222,14 +224,112 @@ class LynonTests { val out = MemoryBitOutput() LZW.compress(original.encodeToByteArray().toUByteArray(), out) // println("\nCompressed codes: ${out.toUByteArray().toDump()}") - println("Number of codes: ${out.toUByteArray().size}") - + println("Number of codes: ${out.toBitArray().bytesSize}") + println("Copression rate: ${out.toBitArray().bytesSize.toDouble() / original.length.toDouble()}") // // Decompress - val decompressed = LZW.decompress(MemoryBitInput(out)).toByteArray().decodeToString() + val decompressed = LZW.decompress(MemoryBitInput(out), original.length).toByteArray().decodeToString() // println("\nDecompressed: $decompressed") println("Length: ${decompressed.length}") // Verification println("\nOriginal and decompressed match: ${original == decompressed}") + assertEquals(original, decompressed) } -} \ No newline at end of file + + @Test + fun testTinyBits() { + var a0 = TinyBits() + + assertEquals(a0, a0) + a0 = a0.insertBit(0) + a0 = a0.insertBit(1) + a0 = a0.insertBit(1) + a0 = a0.insertBit(1) + a0 = a0.insertBit(0) + a0 = a0.insertBit(1) +// println(a0) + assertEquals("011101", a0.toString()) + val bin = MemoryBitInput(MemoryBitOutput().apply { putBits(a0) }) + var result = TinyBits() + for( i in a0.indices) result = result.insertBit(bin.getBit()) + assertEquals(a0, result) + } + + @Test + fun testHuffman() { + val x = original.encodeToByteArray().toUByteArray() +// val x ="hello, world!".toByteArray().asUByteArray()// original.encodeToByteArray().toUByteArray() + 
println("Original : ${x.size}") + val lzw = LZW.compress(x).bytes + println("LZW : ${lzw.size}") + val ba = Huffman.compress(x) + val huff = ba.bytes + println("Huffman : ${huff.size}") + val lzwhuff = Huffman.compress(lzw).bytes + println("LZW+HUFF : ${lzwhuff.size}") + val compressed = Huffman.compress(x) + val decompressed = Huffman.decompress(compressed.toBitInput()) + assertContentEquals(x, decompressed) + } + + @Test + fun testBitListSmall() { + var t = TinyBits() + for( i in listOf(1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1) ) + t = t.insertBit(i) + assertEquals(1, t[0]) + assertEquals(1, t[1]) + assertEquals(0, t[2]) + assertEquals("1101000111101",t.toString()) + t[0] = 0 + t[1] = 0 + t[2] = 1 + assertEquals("0011000111101",t.toString()) + t[12] = 0 + t[11] = 1 + assertEquals("0011000111110",t.toString()) + } + + @Test + fun testBitListSerialization() { + // this also tests bitArray with first and last bytes + val bout = MemoryBitOutput() + assertEquals("1101", bitListOf(1, 1, 0, 1).toString()) + bout.putBits(bitListOf(1, 1, 0, 1)) + bout.putBits(bitListOf( 0, 0)) + bout.putBits(bitListOf( 0, 1, 1, 1, 1, 0, 1)) + val x = bout.toBitArray() + assertEquals("1101000111101",x.toString()) + } + + + @Test + fun testCompressionWithOffsets() { + val src = "to be or not to be or not to be or not to be or not to be" + val bout = MemoryBitOutput() + bout.packUnsigned(1571UL) + LZW.compress(src.encodeToByteArray(), bout) + bout.packUnsigned(157108UL) + val bin = bout.toBitInput() + assertEquals(1571UL, bin.unpackUnsigned()) + assertEquals(src, LZW.decompress(bin, src.length).asByteArray().decodeToString()) + assertEquals(157108UL, bin.unpackUnsigned()) + } + + @Test + fun testCompressionRecord() { + val bout = MemoryBitOutput() + val src = "to be or not to be or not to be or not to be or not to be" + val src2 = "to be or not to be" + val src3 = "ababababab" + bout.compress(src) + bout.compress(src2) + bout.compress(src3) + val bin = bout.toBitInput() + assertEquals(src, 
bin.decompressString()) + assertEquals(src2, bin.decompressString()) + assertEquals(src3, bin.decompressString()) + } + +} +