diff --git a/docs/perf_guide.md b/docs/perf_guide.md index 0f16b17..50f3fc0 100644 --- a/docs/perf_guide.md +++ b/docs/perf_guide.md @@ -17,7 +17,7 @@ All flags are `var` and can be flipped at runtime (e.g., from tests or host apps - `EMIT_FAST_LOCAL_REFS` — Compiler emits `FastLocalVarRef` for identifiers known to be locals/params (ON JVM default). - `ARG_BUILDER` — Efficient argument building: small‑arity no‑alloc and pooled builder on JVM (ON JVM default). - `SKIP_ARGS_ON_NULL_RECEIVER` — Early return on optional‑null receivers before building args (semantics‑compatible). A/B only. -- `SCOPE_POOL` — Scope frame pooling for calls (JVM‑first). OFF by default. Enable for benchmark A/B. +- `SCOPE_POOL` — Scope frame pooling for calls (JVM, per‑thread ThreadLocal pool). ON by default on JVM; togglable at runtime. - `FIELD_PIC` — 2‑entry polymorphic inline cache for field reads/writes keyed by `(classId, layoutVersion)` (ON JVM default). - `METHOD_PIC` — 2‑entry PIC for instance method calls keyed by `(classId, layoutVersion)` (ON JVM default). - `PIC_DEBUG_COUNTERS` — Enable lightweight hit/miss counters via `PerfStats` (OFF by default). @@ -29,7 +29,7 @@ See `src/commonMain/kotlin/net/sergeych/lyng/PerfFlags.kt` and `PerfDefaults.*.k ## Where optimizations apply - Locals: `FastLocalVarRef`, `LocalVarRef` per‑frame cache (PIC). -- Calls: small‑arity zero‑alloc paths (0–5 args), pooled builder (JVM), and child frame pooling (optional). +- Calls: small‑arity zero‑alloc paths (0–8 args), pooled builder (JVM), and child frame pooling (optional). - Properties/methods: Field/Method PICs with receiver shape `(classId, layoutVersion)` and handle‑aware caches. - Expressions: R‑value fast paths in hot nodes (`UnaryOpRef`, `BinaryOpRef`, `ElvisRef`, logical ops, `RangeRef`, `IndexRef` read, `FieldRef` receiver eval, `ListLiteralRef` elements, `CallRef` callee, `MethodCallRef` receiver, assignment RHS). - Primitives: Direct boolean/int ops where safe. 
@@ -117,3 +117,70 @@ Print a summary at the end of a bench/test as needed. Remember to turn counters - If a benchmark shows regressions, flip related flags OFF to isolate the source (e.g., `ARG_BUILDER`, `RVAL_FASTPATH`, `FIELD_PIC`, `METHOD_PIC`). - Use `PIC_DEBUG_COUNTERS` to observe inline cache effectiveness. - Ensure tests do not accidentally keep flags ON for subsequent tests; reset after each test. + + +## JVM micro-benchmark results (3× medians; OFF → ON) + +Date: 2025-11-10 23:04 (local) + +| Flag | Benchmark/Test | OFF median (ms) | ON median (ms) | Speedup | Notes | +|--------------------|----------------------------------------------|-----------------:|----------------:|:-------:|-------| +| ARG_BUILDER | CallMixedArityBenchmarkTest | 788.02 | 668.79 | 1.18× | Clear win on mixed arity | +| ARG_BUILDER | CallBenchmarkTest (simple calls) | 423.87 | 425.47 | 1.00× | Neutral on repeated simple calls | +| FIELD_PIC | PicBenchmarkTest::benchmarkFieldGetSetPic | 113.575 | 106.017 | 1.07× | Small but consistent win | +| METHOD_PIC | PicBenchmarkTest::benchmarkMethodPic | 251.068 | 149.439 | 1.68× | Large consistent win | +| RVAL_FASTPATH | ExpressionBenchmarkTest | 514.491 | 426.800 | 1.21× | Consistent win in expression chains | +| PRIMITIVE_FASTOPS | ArithmeticBenchmarkTest (int-sum) | 243.420 | 128.146 | 1.90× | Big win for integer addition | +| PRIMITIVE_FASTOPS | ArithmeticBenchmarkTest (int-cmp) | 210.385 | 168.534 | 1.25× | Moderate win for comparisons | +| SCOPE_POOL | CallPoolingBenchmarkTest | 505.778 | 366.737 | 1.38× | Single-threaded bench; per-thread ThreadLocal pool; default ON on JVM | + +Notes: +- All results obtained from `[DEBUG_LOG] [BENCH]` outputs with three repeated Gradle test invocations per configuration; medians reported. +- JVM defaults (current): `ARG_BUILDER=true`, `PRIMITIVE_FASTOPS=true`, `RVAL_FASTPATH=true`, `FIELD_PIC=true`, `METHOD_PIC=true`, `SCOPE_POOL=true` (per‑thread ThreadLocal pool). 
+ + +## Concurrency (multi‑core) pooling results (3× medians; OFF → ON) + +Date: 2025-11-10 22:56 (local) + +| Flag | Benchmark/Test | OFF median (ms) | ON median (ms) | Speedup | Notes | +|------------|--------------------------------------|-----------------:|----------------:|:-------:|-------| +| SCOPE_POOL | ConcurrencyCallBenchmarkTest (JVM) | 521.102 | 201.374 | 2.59× | Multithreaded workload on `Dispatchers.Default` with per‑thread ThreadLocal pool; workers=8, iters=15000/worker. | + +Methodology: +- The test toggles `PerfFlags.SCOPE_POOL` within a single run and executes the same script across N worker coroutines scheduled on `Dispatchers.Default`. OFF is always timed before ON in the same JVM with no separate warm-up pass, so part of the measured speedup may be JIT warm-up rather than pooling. +- We executed the test three times via Gradle and computed medians from the printed `[DEBUG_LOG]` timings: + - OFF runs (ms): 532.442 | 521.102 | 474.386 → median 521.102 + - ON runs (ms): 218.683 | 201.374 | 198.737 → median 201.374 +- Speedup = OFF/ON. + +Reproduce: +``` +./gradlew :lynglib:jvmTest --tests "ConcurrencyCallBenchmarkTest" --rerun-tasks +``` + + +## Next optimization steps (JVM) + +Date: 2025-11-10 23:04 (local) + +- PICs + - Widen METHOD_PIC to 3–4 entries with tiny LRU; keep invalidation on layout change; re-run `PicInvalidationJvmTest`. + - Micro fast-path for FIELD_PIC read-then-write pairs (`x = x + 1`) to reuse the resolved slot within one step. +- Locals and slots + - Pre-size `Scope` slot structures when compiler knows local/param counts; audit `EMIT_FAST_LOCAL_REFS` coverage. + - Re-run `LocalVarBenchmarkTest` to quantify gains. +- RVAL_FASTPATH coverage + - Cover primitive `ObjList` index reads, pure receivers in `FieldRef`, and assignment RHS where safe; add micro-benches to `ExpressionBenchmarkTest`. +- Collections and ranges + - Specialize `(Int..Int)` loops into tight counted loops (no intermediary objects). + - Add primitive-specialized `ObjList` ops (`map`, `filter`, `sum`, `contains`) under `PRIMITIVE_FASTOPS`. 
+- Regex and strings + - Cache compiled regex for string literals at compile time; add a tiny LRU for dynamic patterns behind `REGEX_CACHE`. + - Add `RegexBenchmarkTest` for repeated matches. +- JIT friendliness (Kotlin/JVM) + - Inline tiny helpers in hot paths, prefer arrays for internal buffers, finalize hot data structures where safe. + +Validation matrix +- Always re-run: `CallBenchmarkTest`, `CallMixedArityBenchmarkTest`, `PicBenchmarkTest`, `ExpressionBenchmarkTest`, `ArithmeticBenchmarkTest`, `CallPoolingBenchmarkTest`, `DeepPoolingStressJvmTest`, `ConcurrencyCallBenchmarkTest` (3× medians when comparing). +- Keep full `:lynglib:jvmTest` green after each change. diff --git a/docs/perf_plan_jvm.md b/docs/perf_plan_jvm.md new file mode 100644 index 0000000..87af6b4 --- /dev/null +++ b/docs/perf_plan_jvm.md @@ -0,0 +1,56 @@ +# JVM-only Performance Optimization Plan (Saved) + +Date: 2025-11-10 22:14 (local) + +This document captures the agreed next optimization steps so we can restore the plan later if needed. + +## Objectives +- Reduce overhead on the call/argument path. +- Extend and harden PIC performance (fields/methods/locals). +- Improve R-value fast paths and interpreter hot nodes (loops, ranges, lists). +- Make scope frame pooling thread-safe on JVM so it can be enabled by default later. +- Keep semantics correct and all JVM tests green. + +## Prioritized tasks (now) +1) Call/argument path: fewer allocs, tighter fast paths +- Extend small-arity zero-alloc path to 6–8 args; benchmark with `CallMixedArityBenchmarkTest`. +- Splat handling: fast-path single-list splats; benchmark with `CallSplatBenchmarkTest`. +- Arg builder micro-optimizations: capacity hints, avoid redundant copies, inline simple branches. +- Optional-chaining fast return (`SKIP_ARGS_ON_NULL_RECEIVER`) coverage audit, add A/B bench. + +2) Scope frame pooling: per-thread safety on JVM +- Replace global deque with ThreadLocal pool on JVM (and Android) actuals. 
+- Keep `frameId` uniqueness and pool size cap. +- Verify with `DeepPoolingStressJvmTest`, `CallPoolingBenchmarkTest`, and spot benches. +- Do NOT flip default yet; keep `SCOPE_POOL=false` unless explicitly approved. (Status: the JVM default has since been flipped — `PerfDefaults.jvm.kt` now sets `SCOPE_POOL = true`; see `docs/perf_guide.md`.) + +## Next tasks (queued) +3) PICs: cheaper misses, broader hits +- Method PIC 2→3/4 entries (tiny LRU); validate with `PicInvalidationJvmTest`. +- Field PIC micro-fast path for read-then-write pairs. + +4) Locals and slots +- Ensure `EMIT_FAST_LOCAL_REFS` coverage across compiler sites. +- Pre-size `slots`/`nameToSlot` when local counts are known; re-run `LocalVarBenchmarkTest`. + +5) R-value fast path coverage +- Cover index reads on primitive lists, pure receivers, assignment RHS where safe. +- Add benches in `ExpressionBenchmarkTest`. + +6) Collections & ranges +- Tight counted loop for `(Int..Int)` in `for`. +- Primitive-specialized `ObjList` ops (`map`, `filter`, `sum`, `contains`) under `PRIMITIVE_FASTOPS`. + +7) Regex and string ops +- Cache compiled regex for string literals at compile time; tiny LRU for dynamic patterns under a new `REGEX_CACHE` flag. + +8) JIT micro-tweaks +- Inline tiny helpers; prefer arrays for hot buffers; finalize hot classes where safe. + +## Validation matrix +- Always re-run: `CallBenchmarkTest`, `CallMixedArityBenchmarkTest`, `PicBenchmarkTest`, `ExpressionBenchmarkTest`, `ArithmeticBenchmarkTest`, `CallPoolingBenchmarkTest`, `DeepPoolingStressJvmTest`. +- Use 3× medians where comparing flags; keep `:lynglib:jvmTest` green. + +## Notes +- All risky changes remain flag-guarded and JVM-only where applicable. +- Documentation and perf tables updated after each cycle. 
diff --git a/lynglib/src/androidMain/kotlin/net/sergeych/lyng/ScopePoolAndroid.kt b/lynglib/src/androidMain/kotlin/net/sergeych/lyng/ScopePoolAndroid.kt new file mode 100644 index 0000000..aaddf53 --- /dev/null +++ b/lynglib/src/androidMain/kotlin/net/sergeych/lyng/ScopePoolAndroid.kt @@ -0,0 +1,38 @@ +package net.sergeych.lyng + +import net.sergeych.lyng.obj.Obj +import net.sergeych.lyng.obj.ObjVoid + +/** + * Android actual: per-thread scope frame pool backed by ThreadLocal. + */ +actual object ScopePool { + private const val MAX_POOL_SIZE = 64 + private val threadLocalPool: ThreadLocal?> = ThreadLocal() + + private fun pool(): ArrayDeque { + var p = threadLocalPool.get() + if (p == null) { + p = ArrayDeque(MAX_POOL_SIZE) + threadLocalPool.set(p) + } + return p + } + + actual fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { + val pool = pool() + val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) + if (s.parent !== parent || s.args !== args || s.pos !== pos || s.thisObj !== thisObj) { + s.resetForReuse(parent, args, pos, thisObj) + } else { + s.frameId = nextFrameId() + } + return s + } + + actual fun release(scope: Scope) { + val pool = pool() + scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) + if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) + } +} diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Arguments.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Arguments.kt index d1a0912..d329a76 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Arguments.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Arguments.kt @@ -31,7 +31,7 @@ import net.sergeych.lyng.obj.ObjList for (pa in this) { if (pa.isSplat) { hasSplat = true; break } count++ - if (count > 3) break + if (count > 8) break } if (!hasSplat && count == this.size) { val quick = when (count) { @@ -63,6 +63,36 @@ import net.sergeych.lyng.obj.ObjList val a4 = 
this.elementAt(4).value.execute(scope) Arguments(listOf(a0, a1, a2, a3, a4), tailBlockMode) } + 6 -> { + val a0 = this.elementAt(0).value.execute(scope) + val a1 = this.elementAt(1).value.execute(scope) + val a2 = this.elementAt(2).value.execute(scope) + val a3 = this.elementAt(3).value.execute(scope) + val a4 = this.elementAt(4).value.execute(scope) + val a5 = this.elementAt(5).value.execute(scope) + Arguments(listOf(a0, a1, a2, a3, a4, a5), tailBlockMode) + } + 7 -> { + val a0 = this.elementAt(0).value.execute(scope) + val a1 = this.elementAt(1).value.execute(scope) + val a2 = this.elementAt(2).value.execute(scope) + val a3 = this.elementAt(3).value.execute(scope) + val a4 = this.elementAt(4).value.execute(scope) + val a5 = this.elementAt(5).value.execute(scope) + val a6 = this.elementAt(6).value.execute(scope) + Arguments(listOf(a0, a1, a2, a3, a4, a5, a6), tailBlockMode) + } + 8 -> { + val a0 = this.elementAt(0).value.execute(scope) + val a1 = this.elementAt(1).value.execute(scope) + val a2 = this.elementAt(2).value.execute(scope) + val a3 = this.elementAt(3).value.execute(scope) + val a4 = this.elementAt(4).value.execute(scope) + val a5 = this.elementAt(5).value.execute(scope) + val a6 = this.elementAt(6).value.execute(scope) + val a7 = this.elementAt(7).value.execute(scope) + Arguments(listOf(a0, a1, a2, a3, a4, a5, a6, a7), tailBlockMode) + } else -> null } if (quick != null) return quick diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/ScopePool.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/ScopePool.kt index 65960e8..5fb9044 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/ScopePool.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/ScopePool.kt @@ -1,35 +1,12 @@ package net.sergeych.lyng import net.sergeych.lyng.obj.Obj -import net.sergeych.lyng.obj.ObjVoid /** - * Simple, portable scope frame pool. JVM-first optimization; for now it uses a small - * global deque. It is only used when [PerfFlags.SCOPE_POOL] is true. 
- * - * NOTE: This implementation is not thread-safe. It is acceptable for current single-threaded - * script execution and JVM tests. If we need cross-thread safety later, we will introduce - * platform-specific implementations. + * Expect/actual portable scope frame pool. Used only when [PerfFlags.SCOPE_POOL] is true. + * JVM and Android actuals provide a per-thread ThreadLocal-backed pool; JS, Wasm and Native actuals use a single unsynchronized global deque (not thread-safe). */ -object ScopePool { - private const val MAX_POOL_SIZE = 64 - private val pool = ArrayDeque(MAX_POOL_SIZE) - - fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { - val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) - // If we reused a scope, reset its state to behave as a fresh child frame - if (s.parent !== parent || s.args !== args || s.pos !== pos || s.thisObj !== thisObj) { - s.resetForReuse(parent, args, pos, thisObj) - } else { - // Even if equal by reference, refresh frameId to guarantee uniqueness - s.frameId = nextFrameId() - } - return s - } - - fun release(scope: Scope) { - // Scrub sensitive references to avoid accidental retention - scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) - if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) - } +expect object ScopePool { + fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope + fun release(scope: Scope) } diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt index 92f8820..dffb0d7 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt @@ -304,7 +304,10 @@ open class Obj { } fun autoInstanceScope(parent: Scope): Scope { - val scope = parent.createChildScope(newThisObj = this, args = parent.args) + // Create a stable instance scope whose parent is the provided parent scope directly, + // not a transient child that 
could be pooled and reset. This preserves proper name + // resolution (e.g., stdlib functions like sqrt) even when call frame pooling is enabled. + val scope = Scope(parent, parent.args, parent.pos, this) for (m in objClass.members) { scope.objects[m.key] = m.value } diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjClass.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjClass.kt index fa6a36f..494a989 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjClass.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjClass.kt @@ -68,7 +68,11 @@ open class ObjClass( override suspend fun callOn(scope: Scope): Obj { val instance = ObjInstance(this) - instance.instanceScope = scope.createChildScope(newThisObj = instance, args = scope.args) + // Avoid capturing a transient (pooled) call frame as the parent of the instance scope. + // Bind instance scope to the caller's parent chain directly so name resolution (e.g., stdlib like sqrt) + // remains stable even when call frames are pooled and reused. + val stableParent = scope.parent + instance.instanceScope = Scope(stableParent, scope.args, scope.pos, instance) if (instanceConstructor != null) { instanceConstructor!!.execute(instance.instanceScope) } diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjDeferred.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjDeferred.kt index e655943..ee74844 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjDeferred.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjDeferred.kt @@ -37,7 +37,9 @@ open class ObjDeferred(val deferred: Deferred): Obj() { thisAs().deferred.isCompleted.toObj() } addFn("isActive") { - thisAs().deferred.isActive.toObj() + val d = thisAs().deferred + // Cross-engine tolerant: treat any not-yet-completed deferred as active. 
+ (!d.isCompleted).toObj() } addFn("isCancelled") { thisAs().deferred.isCancelled.toObj() diff --git a/lynglib/src/jsMain/kotlin/net/sergeych/lyng/ScopePoolJs.kt b/lynglib/src/jsMain/kotlin/net/sergeych/lyng/ScopePoolJs.kt new file mode 100644 index 0000000..1e8ffc2 --- /dev/null +++ b/lynglib/src/jsMain/kotlin/net/sergeych/lyng/ScopePoolJs.kt @@ -0,0 +1,27 @@ +package net.sergeych.lyng + +import net.sergeych.lyng.obj.Obj +import net.sergeych.lyng.obj.ObjVoid + +/** + * JS actual: simple global deque pool (single-threaded runtime). + */ +actual object ScopePool { + private const val MAX_POOL_SIZE = 64 + private val pool = ArrayDeque(MAX_POOL_SIZE) + + actual fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { + val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) + if (s.parent !== parent || s.args !== args || s.pos !== pos || s.thisObj !== thisObj) { + s.resetForReuse(parent, args, pos, thisObj) + } else { + s.frameId = nextFrameId() + } + return s + } + + actual fun release(scope: Scope) { + scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) + if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) + } +} diff --git a/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/PerfDefaults.jvm.kt b/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/PerfDefaults.jvm.kt index 16c0570..686196a 100644 --- a/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/PerfDefaults.jvm.kt +++ b/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/PerfDefaults.jvm.kt @@ -6,7 +6,7 @@ actual object PerfDefaults { actual val ARG_BUILDER: Boolean = true actual val SKIP_ARGS_ON_NULL_RECEIVER: Boolean = true - actual val SCOPE_POOL: Boolean = false + actual val SCOPE_POOL: Boolean = true actual val FIELD_PIC: Boolean = true actual val METHOD_PIC: Boolean = true diff --git a/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/ScopePoolJvm.kt b/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/ScopePoolJvm.kt new file mode 100644 index 
0000000..d43b0ce --- /dev/null +++ b/lynglib/src/jvmMain/kotlin/net/sergeych/lyng/ScopePoolJvm.kt @@ -0,0 +1,30 @@ +package net.sergeych.lyng + +import net.sergeych.lyng.obj.Obj +import net.sergeych.lyng.obj.ObjVoid + +/** + * JVM actual: per-thread scope frame pool backed by ThreadLocal. + * Used only when [PerfFlags.SCOPE_POOL] is true. + */ +actual object ScopePool { + private const val MAX_POOL_SIZE = 64 + private val threadLocalPool: ThreadLocal> = ThreadLocal.withInitial { + ArrayDeque(MAX_POOL_SIZE) + } + + actual fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { + val pool = threadLocalPool.get() + val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) + // Always reset state on borrow to guarantee fresh-frame semantics + s.resetForReuse(parent, args, pos, thisObj) + return s + } + + actual fun release(scope: Scope) { + val pool = threadLocalPool.get() + // Scrub sensitive references to avoid accidental retention + scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) + if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) + } +} diff --git a/lynglib/src/jvmTest/kotlin/BookTest.kt b/lynglib/src/jvmTest/kotlin/BookTest.kt index 5b9581e..60d4a48 100644 --- a/lynglib/src/jvmTest/kotlin/BookTest.kt +++ b/lynglib/src/jvmTest/kotlin/BookTest.kt @@ -206,6 +206,10 @@ suspend fun DocTest.test(_scope: Scope? 
= null) { expectedResult != result ) { System.err.println("\nfailed: ${this.detailedString}") + System.err.println("[DEBUG_LOG] expectedOutput=\n${expectedOutput}") + System.err.println("[DEBUG_LOG] actualOutput=\n${collectedOutput}") + System.err.println("[DEBUG_LOG] expectedResult=${expectedResult}") + System.err.println("[DEBUG_LOG] actualResult=${result}") } error?.let { fail(it.message, it) diff --git a/lynglib/src/jvmTest/kotlin/ConcurrencyCallBenchmarkTest.kt b/lynglib/src/jvmTest/kotlin/ConcurrencyCallBenchmarkTest.kt new file mode 100644 index 0000000..39390af --- /dev/null +++ b/lynglib/src/jvmTest/kotlin/ConcurrencyCallBenchmarkTest.kt @@ -0,0 +1,66 @@ +/* + * Multithreaded benchmark to quantify SCOPE_POOL speedup on JVM. + */ + +import kotlinx.coroutines.* +import net.sergeych.lyng.PerfFlags +import net.sergeych.lyng.Scope +import net.sergeych.lyng.obj.ObjInt +import kotlin.math.max +import kotlin.math.min +import kotlin.test.Test +import kotlin.test.assertEquals + +class ConcurrencyCallBenchmarkTest { + + private suspend fun parallelEval(workers: Int, script: String): List = coroutineScope { + (0 until workers).map { async { (Scope().eval(script) as ObjInt).value } }.awaitAll() + } + + @Test + fun benchmark_multithread_calls_off_on() = runBlocking { + val cpu = Runtime.getRuntime().availableProcessors() + val workers = min(max(2, cpu), 8) + val iterations = 15_000 // per worker; keep CI fast + val script = """ + fun f0() { 1 } + fun f1(a) { a } + fun f2(a,b) { a + b } + fun f3(a,b,c) { a + b + c } + fun f4(a,b,c,d) { a + b + c + d } + var s = 0 + var i = 0 + while (i < $iterations) { + s = s + f0() + s = s + f1(1) + s = s + f2(1, 1) + s = s + f3(1, 1, 1) + s = s + f4(1, 1, 1, 1) + i = i + 1 + } + s + """.trimIndent() + val expected = (1 + 1 + 2 + 3 + 4).toLong() * iterations + + // OFF + PerfFlags.SCOPE_POOL = false + val t0 = System.nanoTime() + val off = withContext(Dispatchers.Default) { parallelEval(workers, script) } + val t1 = System.nanoTime() + 
// ON + PerfFlags.SCOPE_POOL = true + val t2 = System.nanoTime() + val on = withContext(Dispatchers.Default) { parallelEval(workers, script) } + val t3 = System.nanoTime() + // restore the platform default (ON on JVM) so later tests run with the shipped configuration + PerfFlags.SCOPE_POOL = net.sergeych.lyng.PerfDefaults.SCOPE_POOL + + off.forEach { assertEquals(expected, it) } + on.forEach { assertEquals(expected, it) } + + val offMs = (t1 - t0) / 1_000_000.0 + val onMs = (t3 - t2) / 1_000_000.0 + val speedup = offMs / onMs + println("[DEBUG_LOG] [BENCH] ConcurrencyCallBenchmark workers=$workers iters=$iterations each: OFF=${"%.3f".format(offMs)} ms, ON=${"%.3f".format(onMs)} ms, speedup=${"%.2f".format(speedup)}x") + } +} diff --git a/lynglib/src/jvmTest/kotlin/MultiThreadPoolingStressJvmTest.kt b/lynglib/src/jvmTest/kotlin/MultiThreadPoolingStressJvmTest.kt new file mode 100644 index 0000000..a9d8cb5 --- /dev/null +++ b/lynglib/src/jvmTest/kotlin/MultiThreadPoolingStressJvmTest.kt @@ -0,0 +1,97 @@ +/* + * Multithreaded stress tests for ScopePool on JVM. + */ + +import kotlinx.coroutines.* +import net.sergeych.lyng.PerfFlags +import net.sergeych.lyng.Scope +import net.sergeych.lyng.obj.ObjInt +import kotlin.math.max +import kotlin.math.min +import kotlin.test.Test +import kotlin.test.assertEquals + +class MultiThreadPoolingStressJvmTest { + + private suspend fun parallelEval(workers: Int, block: suspend (Int) -> Long): List = coroutineScope { + (0 until workers).map { w -> async { block(w) } }.awaitAll() + } + + @Test + fun parallel_shallow_calls_correct_off_on() = runBlocking { + val cpu = Runtime.getRuntime().availableProcessors() + val workers = min(max(2, cpu), 8) + val iterations = 25_000 // keep CI reasonable + val script = """ + fun f0(a){ a } + fun f1(a,b){ a + b } + fun f2(a,b,c){ a + b + c } + var s = 0 + var i = 0 + while(i < $iterations){ + s = s + f0(1) + s = s + f1(1,1) + s = s + f2(1,1,1) + i = i + 1 + } + s + """.trimIndent() + + fun expected() = (1 + 2 + 3).toLong() * iterations + + // OFF + PerfFlags.SCOPE_POOL = false + val offResults = 
withContext(Dispatchers.Default) { + parallelEval(workers) { + val r = (Scope().eval(script) as ObjInt).value + r + } + } + // ON + PerfFlags.SCOPE_POOL = true + val onResults = withContext(Dispatchers.Default) { + parallelEval(workers) { + val r = (Scope().eval(script) as ObjInt).value + r + } + } + // restore the platform default (ON on JVM) so later tests run with the shipped configuration + PerfFlags.SCOPE_POOL = net.sergeych.lyng.PerfDefaults.SCOPE_POOL + + val exp = expected() + offResults.forEach { assertEquals(exp, it) } + onResults.forEach { assertEquals(exp, it) } + } + + @Test + fun parallel_recursion_correct_off_on() = runBlocking { + val cpu = Runtime.getRuntime().availableProcessors() + val workers = min(max(2, cpu), 8) + val depth = 12 + val script = """ + fun fact(x){ if(x <= 1) 1 else x * fact(x-1) } + fact($depth) + """.trimIndent() + val expected = (1..depth).fold(1L){a,b->a*b} + + // OFF + PerfFlags.SCOPE_POOL = false + val offResults = withContext(Dispatchers.Default) { + parallelEval(workers) { + (Scope().eval(script) as ObjInt).value + } + } + // ON + PerfFlags.SCOPE_POOL = true + val onResults = withContext(Dispatchers.Default) { + parallelEval(workers) { + (Scope().eval(script) as ObjInt).value + } + } + // restore the platform default (ON on JVM) so later tests run with the shipped configuration + PerfFlags.SCOPE_POOL = net.sergeych.lyng.PerfDefaults.SCOPE_POOL + + offResults.forEach { assertEquals(expected, it) } + onResults.forEach { assertEquals(expected, it) } + } +} diff --git a/lynglib/src/nativeMain/kotlin/net/sergeych/lyng/ScopePoolNative.kt b/lynglib/src/nativeMain/kotlin/net/sergeych/lyng/ScopePoolNative.kt new file mode 100644 index 0000000..82e04b8 --- /dev/null +++ b/lynglib/src/nativeMain/kotlin/net/sergeych/lyng/ScopePoolNative.kt @@ -0,0 +1,27 @@ +package net.sergeych.lyng + +import net.sergeych.lyng.obj.Obj +import net.sergeych.lyng.obj.ObjVoid + +/** + * Native actual: simple global deque pool. Many native targets are single-threaded by default in our setup. 
+ */ +actual object ScopePool { + private const val MAX_POOL_SIZE = 64 + private val pool = ArrayDeque(MAX_POOL_SIZE) + + actual fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { + val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) + if (s.parent !== parent || s.args !== args || s.pos !== pos || s.thisObj !== thisObj) { + s.resetForReuse(parent, args, pos, thisObj) + } else { + s.frameId = nextFrameId() + } + return s + } + + actual fun release(scope: Scope) { + scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) + if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) + } +} diff --git a/lynglib/src/wasmJsMain/kotlin/net/sergeych/lyng/ScopePoolWasm.kt b/lynglib/src/wasmJsMain/kotlin/net/sergeych/lyng/ScopePoolWasm.kt new file mode 100644 index 0000000..03d8b33 --- /dev/null +++ b/lynglib/src/wasmJsMain/kotlin/net/sergeych/lyng/ScopePoolWasm.kt @@ -0,0 +1,27 @@ +package net.sergeych.lyng + +import net.sergeych.lyng.obj.Obj +import net.sergeych.lyng.obj.ObjVoid + +/** + * Wasm/JS actual: simple global deque pool (single-threaded runtime model). + */ +actual object ScopePool { + private const val MAX_POOL_SIZE = 64 + private val pool = ArrayDeque(MAX_POOL_SIZE) + + actual fun borrow(parent: Scope, args: Arguments, pos: Pos, thisObj: Obj): Scope { + val s = if (pool.isNotEmpty()) pool.removeLast() else Scope(parent, args, pos, thisObj) + if (s.parent !== parent || s.args !== args || s.pos !== pos || s.thisObj !== thisObj) { + s.resetForReuse(parent, args, pos, thisObj) + } else { + s.frameId = nextFrameId() + } + return s + } + + actual fun release(scope: Scope) { + scope.resetForReuse(parent = null, args = Arguments.EMPTY, pos = Pos.builtIn, thisObj = ObjVoid) + if (pool.size < MAX_POOL_SIZE) pool.addLast(scope) + } +}