lyng/lynglib/src/commonMain/kotlin/net/sergeych/lynon/lzw.kt

/*
 * Copyright 2025 Sergey S. Chernov real.sergeych@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package net.sergeych.lynon

import net.sergeych.bintools.ByteChunk
import kotlin.math.roundToInt

/**
 * LZW lightweight pure kotlin compression. LZW works with but streams [BitInput] and [BitOutput]
 * to be used effectively in Lynon or other bit-grained formats. To safely comress byte arresy and
 * strings without boilerplate use [lzwCompress], [lzwDecompress], [lzwCompressUtf8] and [lzwDecompressUtf8].
 */
object LZW {

    val MAX_CODE_SIZE = 17
    val STOP_CODE = (1 shl MAX_CODE_SIZE) - 1
    val MAX_DICT_SIZE = (STOP_CODE * 0.92).roundToInt()


    /**
     * Compress a byte array using LZW algorithm writing the result to [bitOutput]
     */
    fun compress(input: ByteArray, bitOutput: BitOutput) = compress(input.asUByteArray(), bitOutput)

    /**
     * Compresses the input string using LZW algorithm
     * @param input The string to compress
     * @return List of compressed codes
     */
    fun compress(input: UByteArray, bitOutput: BitOutput) {
        // Initialize dictionary with all possible single characters
        val dictionary = mutableMapOf<ByteChunk, Int>()
        for (i in 0..255) {
            // 23
            dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i
        }

        var nextCode = 256
        var current = ByteChunk(ubyteArrayOf())
//            val result = mutableListOf<Int>()

        for (char in input) {
            val combined = current + char
            if (dictionary.containsKey(combined)) {
                current = combined
            } else {
                val size = sizeInBits(dictionary.size)
                bitOutput.putBits(dictionary[current]!!, size)
                if (dictionary.size >= MAX_DICT_SIZE) {
                    bitOutput.putBits(STOP_CODE, size)
                    dictionary.clear()
                    nextCode = 256
                    for (i in 0..255) {
                        dictionary[ByteChunk(ubyteArrayOf(i.toUByte()))] = i
                    }
                } else
                    dictionary[combined] = nextCode++
                current = ByteChunk(ubyteArrayOf(char))
            }
        }

        if (current.size > 0) {
            val size = sizeInBits(dictionary.size)
            bitOutput.putBits(dictionary[current]!!, size)
        }
    }

    fun compress(input: UByteArray): BitArray {
        return MemoryBitOutput().apply {
            compress(input, this)
        }.toBitArray()
    }

    /**
     * Decompresses a list of LZW codes back to the original string. Note that usage of apriori existing
     * size is crucial: it let repeal explosion style attacks.
     *
     * @param compressed The list of compressed codes
     * @param resultSize The expected size of the decompressed string
     *
     * @throws DecompressionException if something goes wrong
     * @return The decompressed string
     */
    fun decompress(compressed: BitInput, resultSize: Int): UByteArray {
        // Initialize dictionary with all possible single characters
        val dictionary = mutableMapOf<Int, UByteArray>()
        for (i in 0..255) {
            dictionary[i] = ubyteArrayOf(i.toUByte())
        }

        var nextCode = 256
        val firstCode = compressed.getBits(9).toInt()
        var previous = dictionary[firstCode]
            ?: throw DecompressionException("Invalid first compressed code: $firstCode")
        val result = mutableListOf<UByte>()
        result += previous

        while (result.size < resultSize) {
            val codeSize = sizeInBits(nextCode + 1)
            val code = compressed.getBitsOrNull(codeSize)?.toInt() ?: break

            if (code == STOP_CODE) {
                nextCode = 256
                dictionary.clear()
                for (i in 0..255)
                    dictionary[i] = ubyteArrayOf(i.toUByte())
                previous = dictionary[compressed.getBits(9).toInt()]!!
            } else {

                val current = if (code in dictionary) {
                    dictionary[code]!!
                } else if (code == nextCode) {
                    // Special case for pattern like cScSc
                    previous + previous[0]
                } else {
                    throw DecompressionException("Invalid compressed code: $code")
                }

                result += current
                dictionary[nextCode++] = previous + current[0]
                previous = current
            }
        }

        if (result.size != resultSize)
            throw DecompressionException("Decompressed size is not equal to expected: real/expected = ${result.size}/$resultSize")
        return result.toTypedArray().toUByteArray()
    }
}

private operator fun ByteChunk.plus(byte: UByte): ByteChunk {
    return ByteChunk(data + byte)
}

/**
 * Safely compress binary byte data using the LZW algorithm. This can use up to one byte more space than
 * with [LZW.compress] and [BitOutput] but you often need byte array operations. Source data size is also
 * encoded to prevent file-bomb-like attacks. Note that content protection is not included (we assume
 * LZW is robust).
 * @param source the data to compress
 */
fun lzwCompress(source: UByteArray): UByteArray {
    val out = MemoryBitOutput()
    out.packUnsigned(source.size.toULong())
    LZW.compress(source, out)
    return out.toBitArray().asUByteArray()
}

/**
 * Safely decompress data compressed with [lzwCompress], checking size. Contect checks are not implemented
 * here as we assume LZW is robust.
 * @param packed the compressed data
 * @throws DecompressionException if something goes wrong, like size mismatch or bad compressed data
 */
fun lzwDecompress(packed: UByteArray): UByteArray {
    val inp = MemoryBitInput(packed, 8)
    val size = inp.unpackUnsigned()
    return LZW.decompress(inp, size.toInt())
}

/**
 * Compress a text using utf-8 encoding and [lzwCompress]
 */
fun lzwCompressUtf8(text: String) = lzwCompress(text.encodeToByteArray().toUByteArray())

/**
 * Decompress a text compressed with [lzwCompressUtf8]. See also [lzwDecompress]
 */
fun lzwDecompressUtf8(packed: UByteArray): String = lzwDecompress(packed).asByteArray().decodeToString()