2025-11-01 10:28:04 +01:00

185 lines
6.6 KiB
Kotlin

/*
* Copyright 2025 Sergey S. Chernov real.sergeych@gmail.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package net.sergeych.lynon
import net.sergeych.bintools.ByteChunk
import kotlin.math.roundToInt
/**
 * Lightweight pure-Kotlin LZW compression. LZW works with the bit streams [BitInput] and [BitOutput]
 * so it can be used effectively in Lynon or other bit-grained formats. To safely compress byte arrays and
 * strings without boilerplate use [lzwCompress], [lzwDecompress], [lzwCompressUtf8] and [lzwDecompressUtf8].
 */
object LZW {

    /** Bit width from which [STOP_CODE] is derived. */
    val MAX_CODE_SIZE = 17

    /** Reserved all-ones code; the encoder writes it right before it resets its dictionary. */
    val STOP_CODE = (1 shl MAX_CODE_SIZE) - 1

    /** Dictionary size at which the encoder writes [STOP_CODE] and starts over with single-byte entries. */
    val MAX_DICT_SIZE = (STOP_CODE * 0.92).roundToInt()

    /** First code assigned to a multi-byte sequence; codes 0..255 are the single bytes. */
    private const val FIRST_FREE_CODE = 256

    /** Width of the first code of the stream and of the first code after every dictionary reset. */
    private const val INITIAL_CODE_BITS = 9

    /**
     * Compress a byte array using the LZW algorithm, writing the result to [bitOutput].
     */
    fun compress(input: ByteArray, bitOutput: BitOutput) = compress(input.asUByteArray(), bitOutput)

    /**
     * Compress [input] using the LZW algorithm, writing variable-width codes to [bitOutput].
     *
     * Every code is written with `sizeInBits(dictionary.size)` bits. Once the dictionary holds
     * [MAX_DICT_SIZE] entries, [STOP_CODE] is written and the dictionary is reset to the 256
     * single-byte entries. Nothing is written for empty input, and the input length is not
     * stored: the caller has to supply it to [decompress].
     *
     * @param input the bytes to compress
     * @param bitOutput the sink that receives the compressed codes
     */
    fun compress(input: UByteArray, bitOutput: BitOutput) {
        val dictionary = mutableMapOf<ByteChunk, Int>()
        resetEncoderDictionary(dictionary)
        var nextCode = FIRST_FREE_CODE
        var current = ByteChunk(ubyteArrayOf())
        for (char in input) {
            val combined = current + char
            if (dictionary.containsKey(combined)) {
                // Keep extending the match while the dictionary knows the sequence.
                current = combined
            } else {
                val size = sizeInBits(dictionary.size)
                bitOutput.putBits(dictionary.getValue(current), size)
                if (dictionary.size >= MAX_DICT_SIZE) {
                    // Dictionary is full: tell the decoder to reset, then start over.
                    bitOutput.putBits(STOP_CODE, size)
                    resetEncoderDictionary(dictionary)
                    nextCode = FIRST_FREE_CODE
                } else {
                    dictionary[combined] = nextCode++
                }
                // The unmatched byte starts the next sequence; it has not been written yet.
                current = ByteChunk(ubyteArrayOf(char))
            }
        }
        if (current.size > 0) {
            // Flush the pending sequence.
            bitOutput.putBits(dictionary.getValue(current), sizeInBits(dictionary.size))
        }
    }

    /**
     * Compress [input] into a freshly allocated [BitArray].
     *
     * @param input the bytes to compress
     * @return the compressed bits
     */
    fun compress(input: UByteArray): BitArray {
        val out = MemoryBitOutput()
        compress(input, out)
        return out.toBitArray()
    }

    /**
     * Decompress LZW codes read from [compressed] back into the original bytes. Knowing the
     * expected size in advance is crucial: it bounds the output and so helps to repel
     * decompression-bomb style attacks.
     *
     * @param compressed the bit stream holding the compressed codes
     * @param resultSize the expected size of the decompressed data, in bytes
     *
     * @throws DecompressionException if [resultSize] is negative, a code is invalid, or the
     *      decompressed size differs from [resultSize]
     * @return the decompressed bytes
     */
    fun decompress(compressed: BitInput, resultSize: Int): UByteArray {
        if (resultSize < 0)
            throw DecompressionException("Invalid expected size: $resultSize")
        // compress() writes no codes at all for empty input, so there is no first code to read.
        if (resultSize == 0) return ubyteArrayOf()

        val dictionary = mutableMapOf<Int, UByteArray>()
        resetDecoderDictionary(dictionary)
        var nextCode = FIRST_FREE_CODE
        val firstCode = compressed.getBits(INITIAL_CODE_BITS).toInt()
        var previous = dictionary[firstCode]
            ?: throw DecompressionException("Invalid first compressed code: $firstCode")
        val result = mutableListOf<UByte>()
        result += previous
        while (result.size < resultSize) {
            val codeSize = sizeInBits(nextCode + 1)
            val code = compressed.getBitsOrNull(codeSize)?.toInt() ?: break
            if (code == STOP_CODE) {
                nextCode = FIRST_FREE_CODE
                resetDecoderDictionary(dictionary)
                val restartCode = compressed.getBits(INITIAL_CODE_BITS).toInt()
                previous = dictionary[restartCode]
                    ?: throw DecompressionException("Invalid compressed code after reset: $restartCode")
                // The code that follows a reset carries data exactly like the very first code
                // of the stream, so it must be emitted too.
                result += previous
            } else {
                val current = dictionary[code]
                    ?: if (code == nextCode) {
                        // Special case for patterns like cScSc: the code refers to the entry
                        // that is about to be created.
                        previous + previous[0]
                    } else {
                        throw DecompressionException("Invalid compressed code: $code")
                    }
                result += current
                dictionary[nextCode++] = previous + current[0]
                previous = current
            }
        }
        if (result.size != resultSize)
            throw DecompressionException("Decompressed size is not equal to expected: real/expected = ${result.size}/$resultSize")
        return result.toUByteArray()
    }

    /** Reset the encoder [dictionary] so that it maps every single byte to its own value. */
    private fun resetEncoderDictionary(dictionary: MutableMap<ByteChunk, Int>) {
        dictionary.clear()
        for (i in 0..255) {
            dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i
        }
    }

    /** Reset the decoder [dictionary] so that it maps every code 0..255 to the matching single byte. */
    private fun resetDecoderDictionary(dictionary: MutableMap<Int, UByteArray>) {
        dictionary.clear()
        for (i in 0..255) {
            dictionary[i] = ubyteArrayOf(i.toUByte())
        }
    }
}
/**
 * Build a new [ByteChunk] holding this chunk's data followed by [byte].
 */
private operator fun ByteChunk.plus(byte: UByte): ByteChunk = ByteChunk(data + byte)
/**
 * Safely compress binary data with the LZW algorithm. The result can take up to one byte more space
 * than using [LZW.compress] with a [BitOutput] directly, but byte arrays are often what you need.
 * The source size is encoded in front of the compressed codes to prevent file-bomb-like attacks.
 * Note that content protection is not included (we assume LZW is robust).
 *
 * @param source the data to compress
 * @return the packed size followed by the compressed codes, as bytes
 */
fun lzwCompress(source: UByteArray): UByteArray =
    MemoryBitOutput().apply {
        // The size goes first so that the reader knows how many bytes to expect.
        packUnsigned(source.size.toULong())
        LZW.compress(source, this)
    }.toBitArray().asUByteArray()
/**
 * Safely decompress data compressed with [lzwCompress], checking the size. Content checks are not
 * implemented here as we assume LZW is robust.
 *
 * @param packed the compressed data
 * @throws DecompressionException if something goes wrong, like a size mismatch, a declared size
 *      that does not fit into an [Int], or bad compressed data
 * @return the decompressed bytes
 */
fun lzwDecompress(packed: UByteArray): UByteArray {
    val inp = MemoryBitInput(packed, 8)
    val declaredSize = inp.unpackUnsigned()
    // A plain toInt() would silently wrap an oversized length around, letting a crafted header
    // pass as a small (or negative) size; reject such input explicitly instead.
    val size = declaredSize.toLong()
    if (size < 0 || size > Int.MAX_VALUE)
        throw DecompressionException("Declared decompressed size is out of range: $declaredSize")
    return LZW.decompress(inp, size.toInt())
}
/**
 * Encode [text] as UTF-8 and compress the resulting bytes with [lzwCompress].
 */
fun lzwCompressUtf8(text: String): UByteArray {
    val utf8 = text.encodeToByteArray()
    return lzwCompress(utf8.toUByteArray())
}
/**
 * Decompress data produced by [lzwCompressUtf8] and decode it as UTF-8 text. See also [lzwDecompress].
 */
fun lzwDecompressUtf8(packed: UByteArray): String {
    val utf8 = lzwDecompress(packed)
    return utf8.asByteArray().decodeToString()
}