From 064b927b1ae8804da08aea78c55a8fddc8703f02 Mon Sep 17 00:00:00 2001 From: sergeych Date: Fri, 3 Apr 2026 23:16:08 +0300 Subject: [PATCH] Stabilize pi benchmark optimizations for release --- examples/pi-bench.lyng | 72 +++ examples/pi-test.lyng | 49 ++ .../lyng/bytecode/BytecodeCompiler.kt | 473 +++++++++++++----- .../net/sergeych/lyng/bytecode/CmdRuntime.kt | 179 +++++-- .../jvmTest/kotlin/PiSpigotBenchmarkTest.kt | 145 ++++++ ...pi_spigot_benchmark_baseline_2026-04-03.md | 172 +++++++ 6 files changed, 912 insertions(+), 178 deletions(-) create mode 100644 examples/pi-bench.lyng create mode 100644 examples/pi-test.lyng create mode 100644 lynglib/src/jvmTest/kotlin/PiSpigotBenchmarkTest.kt create mode 100644 notes/pi_spigot_benchmark_baseline_2026-04-03.md diff --git a/examples/pi-bench.lyng b/examples/pi-bench.lyng new file mode 100644 index 0000000..c95ca3e --- /dev/null +++ b/examples/pi-bench.lyng @@ -0,0 +1,72 @@ +import lyng.time + +val WORK_SIZE = 200 +val TASK_COUNT = 10 + +fn piSpigot(iThread: Int, n: Int) { + var pi = [] + val boxes = n * 10 / 3 + var reminders = List.fill(boxes) { 2 } + var heldDigits = 0 + for (i in 0..n) { + var carriedOver = 0 + var sum = 0 + for (k in 1..boxes) { + val j = boxes - k + val denom = j * 2 + 1 + reminders[j] *= 10 + sum = reminders[j] + carriedOver + val quotient = sum / denom + reminders[j] = sum % denom + carriedOver = quotient * j + } + reminders[0] = sum % 10 + var q = sum / 10 + if (q == 9) { + ++heldDigits + } else if (q == 10) { + q = 0 + for (k in 1..heldDigits) { + var replaced = pi[i - k] + if (replaced == 9) { + replaced = 0 + } else { + ++replaced + } + pi[i - k] = replaced + } + heldDigits = 1 + } else { + heldDigits = 1 + } + pi.add(q) + } + + var s = "" + for (i in (n - 8).. 
NumericKind.INT SlotType.REAL -> NumericKind.REAL - else -> NumericKind.UNKNOWN + else -> when (slotObjClass[slot]) { + ObjInt.type -> NumericKind.INT + ObjReal.type -> NumericKind.REAL + else -> NumericKind.UNKNOWN + } } } @@ -1800,7 +1804,21 @@ class BytecodeCompiler( is ConstRef -> numericKindFromConst(ref.constValue) is LocalVarRef -> resolveDirectNameSlot(ref.name)?.let { numericKindFromSlot(it.slot) } ?: NumericKind.UNKNOWN is FastLocalVarRef -> resolveDirectNameSlot(ref.name)?.let { numericKindFromSlot(it.slot) } ?: NumericKind.UNKNOWN - is LocalSlotRef -> resolveSlot(ref)?.let { numericKindFromSlot(it) } ?: NumericKind.UNKNOWN + is LocalSlotRef -> resolveLocalSlotByRefOrName(ref)?.let { numericKindFromSlot(it) } ?: NumericKind.UNKNOWN + is IndexRef -> { + val receiver = when (val target = ref.targetRef) { + is LocalSlotRef -> resolveLocalSlotByRefOrName(target) + is LocalVarRef -> resolveDirectNameSlot(target.name)?.slot + is FastLocalVarRef -> resolveDirectNameSlot(target.name)?.slot + else -> null + } + val elementClass = receiver?.let { listElementClassBySlot[it] } ?: listElementClassFromReceiverRef(ref.targetRef) + when (elementClass) { + ObjInt.type -> NumericKind.INT + ObjReal.type -> NumericKind.REAL + else -> NumericKind.UNKNOWN + } + } is UnaryOpRef -> inferNumericKind(ref.a) is BinaryOpRef -> { val op = ref.op @@ -2431,7 +2449,7 @@ class BytecodeCompiler( updateSlotType(slot, SlotType.OBJ) return value } - val value = compileRef(assignValue(ref)) ?: return null + var value = compileRef(assignValue(ref)) ?: return null if (isLoopVarRef(localTarget)) { emitLoopVarReassignError(localTarget.name, localTarget.pos()) return value @@ -2473,7 +2491,7 @@ class BytecodeCompiler( else -> null } if (nameTarget != null) { - val value = compileRef(assignValue(ref)) ?: return null + var value = compileRef(assignValue(ref)) ?: return null val resolved = resolveAssignableSlotByName(nameTarget) ?: return null val slot = resolved.first val isMutable = 
resolved.second @@ -2698,6 +2716,7 @@ class BytecodeCompiler( if (!target.optionalRef) { val index = compileRefWithFallback(target.indexRef, null, Pos.builtIn) ?: return null builder.emit(Opcode.SET_INDEX, receiver.slot, index.slot, value.slot) + noteListElementClassMutation(receiver.slot, value) } else { val nullSlot = allocSlot() builder.emit(Opcode.CONST_NULL, nullSlot) @@ -2710,6 +2729,7 @@ class BytecodeCompiler( ) val index = compileRefWithFallback(target.indexRef, null, Pos.builtIn) ?: return null builder.emit(Opcode.SET_INDEX, receiver.slot, index.slot, value.slot) + noteListElementClassMutation(receiver.slot, value) builder.mark(endLabel) } return value @@ -3026,9 +3046,32 @@ class BytecodeCompiler( val receiver = compileRefWithFallback(indexTarget.targetRef, null, Pos.builtIn) ?: return null val current = allocSlot() val result = allocSlot() - val rhs = compileRef(ref.value) ?: return compileEvalRef(ref) + var rhs = compileRef(ref.value) ?: return compileEvalRef(ref) + val elementClass = listElementClassBySlot[receiver.slot] ?: listElementClassFromReceiverRef(indexTarget.targetRef) if (!indexTarget.optionalRef) { val index = compileRefWithFallback(indexTarget.indexRef, null, Pos.builtIn) ?: return null + if (elementClass == ObjInt.type) { + builder.emit(Opcode.GET_INDEX, receiver.slot, index.slot, current) + val currentInt = allocSlot() + builder.emit(Opcode.UNBOX_INT_OBJ, current, currentInt) + updateSlotType(currentInt, SlotType.INT) + if (rhs.type != SlotType.INT) { + coerceToArithmeticInt(ref.value, rhs)?.let { rhs = it } + } + val typed = when (ref.op) { + BinOp.PLUS -> compileAssignOpBinary(SlotType.INT, rhs, currentInt, Opcode.ADD_INT, Opcode.ADD_REAL, Opcode.ADD_OBJ) + BinOp.MINUS -> compileAssignOpBinary(SlotType.INT, rhs, currentInt, Opcode.SUB_INT, Opcode.SUB_REAL, Opcode.SUB_OBJ) + BinOp.STAR -> compileAssignOpBinary(SlotType.INT, rhs, currentInt, Opcode.MUL_INT, Opcode.MUL_REAL, Opcode.MUL_OBJ) + BinOp.SLASH -> 
compileAssignOpBinary(SlotType.INT, rhs, currentInt, Opcode.DIV_INT, Opcode.DIV_REAL, Opcode.DIV_OBJ) + BinOp.PERCENT -> compileAssignOpBinary(SlotType.INT, rhs, currentInt, Opcode.MOD_INT, null, Opcode.MOD_OBJ) + else -> null + } + if (typed != null && typed.type == SlotType.INT) { + builder.emit(Opcode.SET_INDEX, receiver.slot, index.slot, currentInt) + noteListElementClassMutation(receiver.slot, typed) + return CompiledValue(currentInt, SlotType.INT) + } + } builder.emit(Opcode.GET_INDEX, receiver.slot, index.slot, current) builder.emit(objOp, current, rhs.slot, result) builder.emit(Opcode.SET_INDEX, receiver.slot, index.slot, result) @@ -3586,7 +3629,7 @@ class BytecodeCompiler( val elementClass = listElementClassBySlot[receiver.slot] ?: listElementClassFromReceiverRef(ref.targetRef) if (elementClass != null) { slotObjClass[dst] = elementClass - if (elementClass == ObjString.type && elementClass.isClosed) { + if (elementClass.isClosed) { stableObjSlots.add(dst) } else { stableObjSlots.remove(dst) @@ -4649,6 +4692,9 @@ class BytecodeCompiler( val encodedCount = encodeCallArgCount(args) ?: return null setPos(callPos) builder.emit(Opcode.CALL_MEMBER_SLOT, receiver.slot, encodedMethodId, args.base, encodedCount, dst) + if (receiverClass == ObjList.type && ref.name == "add" && ref.args.size == 1 && !ref.args.first().isSplat) { + noteListElementClassMutation(receiver.slot, CompiledValue(args.base, SlotType.OBJ)) + } return CompiledValue(dst, SlotType.OBJ) } val nullSlot = allocSlot() @@ -4815,7 +4861,7 @@ class BytecodeCompiler( " receiver=$kind(${ref.name}) slot=$slot slotClass=$slotCls nameClass=$nameCls" } is LocalSlotRef -> { - val slot = resolveSlot(ref) + val slot = resolveLocalSlotByRefOrName(ref) val slotCls = slot?.let { slotObjClass[it]?.className } val nameCls = nameObjClass[ref.name]?.className val scopeId = refScopeId(ref) @@ -4971,9 +5017,10 @@ class BytecodeCompiler( val specs = if (needPlan) ArrayList(args.size) else null for ((index, arg) in 
args.withIndex()) { val compiled = compileArgValue(arg.value) ?: return null + val objValue = ensureObjSlot(compiled) val dst = argSlots[index] - if (compiled.slot != dst || compiled.type != SlotType.OBJ) { - builder.emit(Opcode.BOX_OBJ, compiled.slot, dst) + if (objValue.slot != dst) { + emitMove(objValue, dst) } updateSlotType(dst, SlotType.OBJ) specs?.add(BytecodeConst.CallArgSpec(arg.name, arg.isSplat)) @@ -5833,7 +5880,8 @@ class BytecodeCompiler( emitMove(value, localSlot) } updateSlotType(localSlot, value.type) - updateSlotObjClass(localSlot, stmt.initializer, stmt.initializerObjClass) + slotObjClass[value.slot]?.let { slotObjClass[localSlot] = it } + ?: updateSlotObjClass(localSlot, stmt.initializer, stmt.initializerObjClass) updateListElementClassFromDecl(localSlot, scopeId, stmt.slotIndex) updateListElementClassFromInitializer(localSlot, stmt.initializer) updateNameObjClassFromSlot(stmt.name, localSlot) @@ -5865,7 +5913,8 @@ class BytecodeCompiler( } updateSlotType(scopeSlot, value.type) updateNameObjClassFromSlot(stmt.name, scopeSlot) - updateSlotObjClass(scopeSlot, stmt.initializer, stmt.initializerObjClass) + slotObjClass[value.slot]?.let { slotObjClass[scopeSlot] = it } + ?: updateSlotObjClass(scopeSlot, stmt.initializer, stmt.initializerObjClass) updateListElementClassFromDecl(scopeSlot, scopeId, stmt.slotIndex) updateListElementClassFromInitializer(scopeSlot, stmt.initializer) val declId = builder.addConst( @@ -5901,7 +5950,9 @@ class BytecodeCompiler( updateSlotTypeByName(stmt.name, value.type) } updateNameObjClassFromSlot(stmt.name, value.slot) - updateSlotObjClass(value.slot, stmt.initializer, stmt.initializerObjClass) + if (slotObjClass[value.slot] == null) { + updateSlotObjClass(value.slot, stmt.initializer, stmt.initializerObjClass) + } updateListElementClassFromDecl(value.slot, scopeId, stmt.slotIndex) updateListElementClassFromInitializer(value.slot, stmt.initializer) return value @@ -5991,6 +6042,16 @@ class BytecodeCompiler( 
listElementClassBySlot[slot] = elementClass } + private fun noteListElementClassMutation(receiverSlot: Int, value: CompiledValue) { + val newClass = elementClassFromValue(value) ?: return + val current = listElementClassBySlot[receiverSlot] + if (current == null || current == newClass) { + listElementClassBySlot[receiverSlot] = newClass + } else { + listElementClassBySlot.remove(receiverSlot) + } + } + private fun updateNameObjClassFromSlot(name: String, slot: Int) { val cls = slotObjClass[slot] ?: return nameObjClass[name] = cls @@ -6086,9 +6147,6 @@ class BytecodeCompiler( if (range == null && rangeRef == null) { rangeRef = extractRangeFromLocal(stmt.source) } - if (rangeRef != null && !isConstIntRange(rangeRef)) { - rangeRef = null - } val typedRangeLocal = if (range == null && rangeRef == null) extractTypedRangeLocal(stmt.source) else null val loopSlotPlan = stmt.loopSlotPlan val loopSlotIndex = stmt.loopSlotPlan[stmt.loopVarName] @@ -6129,120 +6187,16 @@ class BytecodeCompiler( val breakFlagSlot = allocSlot() if (range == null && rangeRef == null && typedRangeLocal == null) { val sourceValue = compileStatementValueOrFallback(stmt.source) ?: return null - val sourceObj = ensureObjSlot(sourceValue) - val typeId = builder.addConst(BytecodeConst.ObjRef(ObjIterable)) - val typeSlot = allocSlot() - builder.emit(Opcode.CONST_OBJ, typeId, typeSlot) - builder.emit(Opcode.ASSERT_IS, sourceObj.slot, typeSlot) - - val iterableMethods = ObjIterable.instanceMethodIdMap(includeAbstract = true) - val iteratorMethodId = iterableMethods["iterator"] - if (iteratorMethodId == null) { - throw BytecodeCompileException("Missing member id for Iterable.iterator", stmt.pos) - } - val iteratorMethods = ObjIterator.instanceMethodIdMap(includeAbstract = true) - val hasNextMethodId = iteratorMethods["hasNext"] - if (hasNextMethodId == null) { - throw BytecodeCompileException("Missing member id for Iterator.hasNext", stmt.pos) - } - val nextMethodId = iteratorMethods["next"] - if 
(nextMethodId == null) { - throw BytecodeCompileException("Missing member id for Iterator.next", stmt.pos) - } - - val iterSlot = allocSlot() - builder.emit(Opcode.CALL_MEMBER_SLOT, sourceObj.slot, iteratorMethodId, 0, 0, iterSlot) - builder.emit(Opcode.ITER_PUSH, iterSlot) - - if (needsBreakFlag) { - val falseId = builder.addConst(BytecodeConst.Bool(false)) - builder.emit(Opcode.CONST_BOOL, falseId, breakFlagSlot) - } - val resultSlot = if (wantResult) { - val slot = allocSlot() - val voidId = builder.addConst(BytecodeConst.ObjRef(ObjVoid)) - builder.emit(Opcode.CONST_OBJ, voidId, slot) - slot - } else { - null - } - - val loopLabel = builder.label() - val continueLabel = builder.label() - val endLabel = builder.label() - builder.mark(loopLabel) - - val hasNextSlot = allocSlot() - builder.emit(Opcode.CALL_MEMBER_SLOT, iterSlot, hasNextMethodId, 0, 0, hasNextSlot) - val condSlot = allocSlot() - builder.emit(Opcode.OBJ_TO_BOOL, hasNextSlot, condSlot) - builder.emit( - Opcode.JMP_IF_FALSE, - listOf(CmdBuilder.Operand.IntVal(condSlot), CmdBuilder.Operand.LabelRef(endLabel)) + return emitIterableForIn( + stmt = stmt, + sourceValue = sourceValue, + wantResult = wantResult, + loopSlotId = loopSlotId, + breakFlagSlot = breakFlagSlot, + needsBreakFlag = needsBreakFlag, + hasRealWiden = hasRealWiden, + realWidenSlots = realWidenSlots, ) - - val nextSlot = allocSlot() - builder.emit(Opcode.CALL_MEMBER_SLOT, iterSlot, nextMethodId, 0, 0, nextSlot) - val nextObj = ensureObjSlot(CompiledValue(nextSlot, SlotType.UNKNOWN)) - emitMove(CompiledValue(nextObj.slot, SlotType.OBJ), loopSlotId) - updateSlotType(loopSlotId, SlotType.OBJ) - updateSlotTypeByName(stmt.loopVarName, SlotType.OBJ) - - loopStack.addLast( - LoopContext( - stmt.label, - endLabel, - continueLabel, - breakFlagSlot, - resultSlot, - hasIterator = true - ) - ) - val bodyValue = compileLoopBody(stmt.body, wantResult) ?: return null - if (hasRealWiden) { - applySlotTypes(realWidenSlots, SlotType.UNKNOWN) - } - 
loopStack.removeLast() - if (wantResult) { - val bodyObj = ensureObjSlot(bodyValue) - builder.emit(Opcode.MOVE_OBJ, bodyObj.slot, resultSlot!!) - } - builder.mark(continueLabel) - if (hasRealWiden) { - emitLoopRealCoercions(realWidenSlots) - } - builder.emit(Opcode.JMP, listOf(CmdBuilder.Operand.LabelRef(loopLabel))) - - builder.mark(endLabel) - if (needsBreakFlag) { - val afterPop = builder.label() - builder.emit( - Opcode.JMP_IF_TRUE, - listOf(CmdBuilder.Operand.IntVal(breakFlagSlot), CmdBuilder.Operand.LabelRef(afterPop)) - ) - builder.emit(Opcode.ITER_POP) - builder.mark(afterPop) - } else { - builder.emit(Opcode.ITER_POP) - } - if (stmt.elseStatement != null) { - val afterElse = if (needsBreakFlag) builder.label() else null - if (needsBreakFlag) { - builder.emit( - Opcode.JMP_IF_TRUE, - listOf(CmdBuilder.Operand.IntVal(breakFlagSlot), CmdBuilder.Operand.LabelRef(afterElse!!)) - ) - } - val elseValue = compileStatementValueOrFallback(stmt.elseStatement, wantResult) ?: return null - if (wantResult) { - val elseObj = ensureObjSlot(elseValue) - builder.emit(Opcode.MOVE_OBJ, elseObj.slot, resultSlot!!) - } - if (needsBreakFlag) { - builder.mark(afterElse!!) 
- } - } - return resultSlot ?: breakFlagSlot } val iSlot = loopSlotId @@ -6261,9 +6215,23 @@ class BytecodeCompiler( if (rangeRef != null) { val left = rangeRef.left ?: return null val right = rangeRef.right ?: return null - val startValue = compileRef(left) ?: return null - val endValue = compileRef(right) ?: return null - if (startValue.type != SlotType.INT || endValue.type != SlotType.INT) return null + val startCompiled = compileRef(left) ?: return null + val endCompiled = compileRef(right) ?: return null + val startValue = coerceToLoopInt(startCompiled) + val endValue = coerceToLoopInt(endCompiled) + if (startValue == null || endValue == null) { + val rangeValue = emitRangeObject(startCompiled, endCompiled, rangeRef) + return emitIterableForIn( + stmt = stmt, + sourceValue = rangeValue, + wantResult = wantResult, + loopSlotId = loopSlotId, + breakFlagSlot = breakFlagSlot, + needsBreakFlag = needsBreakFlag, + hasRealWiden = hasRealWiden, + realWidenSlots = realWidenSlots, + ) + } val descendingId = builder.addConst(BytecodeConst.Bool(rangeRef.isDescending)) emitMove(startValue, iSlot) emitMove(endValue, endSlot) @@ -7308,7 +7276,8 @@ class BytecodeCompiler( is LocalSlotRef -> { val ownerScopeId = ref.captureOwnerScopeId ?: ref.scopeId val ownerSlot = ref.captureOwnerSlot ?: ref.slot - slotTypeByScopeId[ownerScopeId]?.get(ownerSlot) + resolveLocalSlotByRefOrName(ref)?.let { slotObjClass[it] } + ?: slotTypeByScopeId[ownerScopeId]?.get(ownerSlot) ?: slotInitClassByKey[ScopeSlotKey(ownerScopeId, ownerSlot)] ?: nameObjClass[ref.name] ?: resolveTypeNameClass(ref.name) @@ -7721,6 +7690,11 @@ class BytecodeCompiler( return resolved } + private fun resolveLocalSlotByRefOrName(ref: LocalSlotRef): Int? { + return resolveSlot(ref) + ?: ref.name.takeIf { it.isNotEmpty() }?.let { name -> resolveDirectNameSlot(name)?.slot } + } + private fun resolveCapturedOwnerScopeSlot(ref: LocalSlotRef): Int? 
{ val ownerScopeId = ref.captureOwnerScopeId ?: return null val ownerSlot = ref.captureOwnerSlot ?: return null @@ -8698,10 +8672,237 @@ class BytecodeCompiler( return if (ref.step != null) null else ref } - private fun isConstIntRange(ref: RangeRef): Boolean { - val left = ref.left as? ConstRef ?: return false - val right = ref.right as? ConstRef ?: return false - return left.constValue is ObjInt && right.constValue is ObjInt + private fun emitIterableForIn( + stmt: net.sergeych.lyng.ForInStatement, + sourceValue: CompiledValue, + wantResult: Boolean, + loopSlotId: Int, + breakFlagSlot: Int, + needsBreakFlag: Boolean, + hasRealWiden: Boolean, + realWidenSlots: Set, + ): Int? { + val sourceObj = ensureObjSlot(sourceValue) + val typeId = builder.addConst(BytecodeConst.ObjRef(ObjIterable)) + val typeSlot = allocSlot() + builder.emit(Opcode.CONST_OBJ, typeId, typeSlot) + builder.emit(Opcode.ASSERT_IS, sourceObj.slot, typeSlot) + + val iterableMethods = ObjIterable.instanceMethodIdMap(includeAbstract = true) + val iteratorMethodId = iterableMethods["iterator"] + ?: throw BytecodeCompileException("Missing member id for Iterable.iterator", stmt.pos) + val iteratorMethods = ObjIterator.instanceMethodIdMap(includeAbstract = true) + val hasNextMethodId = iteratorMethods["hasNext"] + ?: throw BytecodeCompileException("Missing member id for Iterator.hasNext", stmt.pos) + val nextMethodId = iteratorMethods["next"] + ?: throw BytecodeCompileException("Missing member id for Iterator.next", stmt.pos) + + val iterSlot = allocSlot() + builder.emit(Opcode.CALL_MEMBER_SLOT, sourceObj.slot, iteratorMethodId, 0, 0, iterSlot) + builder.emit(Opcode.ITER_PUSH, iterSlot) + + if (needsBreakFlag) { + val falseId = builder.addConst(BytecodeConst.Bool(false)) + builder.emit(Opcode.CONST_BOOL, falseId, breakFlagSlot) + } + val resultSlot = if (wantResult) { + val slot = allocSlot() + val voidId = builder.addConst(BytecodeConst.ObjRef(ObjVoid)) + builder.emit(Opcode.CONST_OBJ, voidId, slot) + 
slot + } else { + null + } + + val loopLabel = builder.label() + val continueLabel = builder.label() + val endLabel = builder.label() + builder.mark(loopLabel) + + val hasNextSlot = allocSlot() + builder.emit(Opcode.CALL_MEMBER_SLOT, iterSlot, hasNextMethodId, 0, 0, hasNextSlot) + val condSlot = allocSlot() + builder.emit(Opcode.OBJ_TO_BOOL, hasNextSlot, condSlot) + builder.emit( + Opcode.JMP_IF_FALSE, + listOf(CmdBuilder.Operand.IntVal(condSlot), CmdBuilder.Operand.LabelRef(endLabel)) + ) + + val nextSlot = allocSlot() + builder.emit(Opcode.CALL_MEMBER_SLOT, iterSlot, nextMethodId, 0, 0, nextSlot) + val nextObj = ensureObjSlot(CompiledValue(nextSlot, SlotType.UNKNOWN)) + emitMove(CompiledValue(nextObj.slot, SlotType.OBJ), loopSlotId) + updateSlotType(loopSlotId, SlotType.OBJ) + updateSlotTypeByName(stmt.loopVarName, SlotType.OBJ) + + loopStack.addLast( + LoopContext( + stmt.label, + endLabel, + continueLabel, + breakFlagSlot, + resultSlot, + hasIterator = true + ) + ) + val bodyValue = compileLoopBody(stmt.body, wantResult) ?: return null + if (hasRealWiden) { + applySlotTypes(realWidenSlots, SlotType.UNKNOWN) + } + loopStack.removeLast() + if (wantResult) { + val bodyObj = ensureObjSlot(bodyValue) + builder.emit(Opcode.MOVE_OBJ, bodyObj.slot, resultSlot!!) 
+ } + builder.mark(continueLabel) + if (hasRealWiden) { + emitLoopRealCoercions(realWidenSlots) + } + builder.emit(Opcode.JMP, listOf(CmdBuilder.Operand.LabelRef(loopLabel))) + + builder.mark(endLabel) + if (needsBreakFlag) { + val afterPop = builder.label() + builder.emit( + Opcode.JMP_IF_TRUE, + listOf(CmdBuilder.Operand.IntVal(breakFlagSlot), CmdBuilder.Operand.LabelRef(afterPop)) + ) + builder.emit(Opcode.ITER_POP) + builder.mark(afterPop) + } else { + builder.emit(Opcode.ITER_POP) + } + if (stmt.elseStatement != null) { + val afterElse = if (needsBreakFlag) builder.label() else null + if (needsBreakFlag) { + builder.emit( + Opcode.JMP_IF_TRUE, + listOf(CmdBuilder.Operand.IntVal(breakFlagSlot), CmdBuilder.Operand.LabelRef(afterElse!!)) + ) + } + val elseValue = compileStatementValueOrFallback(stmt.elseStatement, wantResult) ?: return null + if (wantResult) { + val elseObj = ensureObjSlot(elseValue) + builder.emit(Opcode.MOVE_OBJ, elseObj.slot, resultSlot!!) + } + if (needsBreakFlag) { + builder.mark(afterElse!!) 
+ } + } + return resultSlot ?: breakFlagSlot + } + + private fun emitRangeObject(startValue: CompiledValue, endValue: CompiledValue, ref: RangeRef): CompiledValue { + val startObj = ensureObjSlot(startValue) + val endObj = ensureObjSlot(endValue) + val inclusiveSlot = allocSlot() + val inclusiveId = builder.addConst(BytecodeConst.Bool(ref.isEndInclusive)) + builder.emit(Opcode.CONST_BOOL, inclusiveId, inclusiveSlot) + val descendingSlot = allocSlot() + val descendingId = builder.addConst(BytecodeConst.Bool(ref.isDescending)) + builder.emit(Opcode.CONST_BOOL, descendingId, descendingSlot) + val stepSlot = allocSlot() + builder.emit(Opcode.CONST_NULL, stepSlot) + updateSlotType(stepSlot, SlotType.OBJ) + val dst = allocSlot() + builder.emit(Opcode.MAKE_RANGE, startObj.slot, endObj.slot, inclusiveSlot, descendingSlot, stepSlot, dst) + updateSlotType(dst, SlotType.OBJ) + slotObjClass[dst] = ObjRange.type + return CompiledValue(dst, SlotType.OBJ) + } + + private fun isDynamicIntRangeCandidate(ref: RangeRef): Boolean { + val left = ref.left ?: return false + val right = ref.right ?: return false + return isIntLikeRef(left) && isIntLikeRef(right) + } + + private fun isIntLikeRef(ref: ObjRef): Boolean { + if (inferNumericKind(ref) == NumericKind.INT) { + return true + } + return when (ref) { + is ConstRef -> ref.constValue is ObjInt + is LocalSlotRef, + is LocalVarRef, + is FastLocalVarRef, + is BoundLocalVarRef, + is CallRef, + is MethodCallRef, + is FieldRef, + is CastRef, + is StatementRef -> resolveReceiverClass(ref) == ObjInt.type + is ThisMethodSlotCallRef, + is ImplicitThisMethodCallRef, + is ThisFieldSlotRef, + is ImplicitThisMemberRef -> resolveReceiverClassForScopeCollection(ref) == ObjInt.type + is UnaryOpRef -> ref.op == UnaryOp.NEGATE && isIntLikeRef(unaryOperand(ref)) + is BinaryOpRef -> when (binaryOp(ref)) { + BinOp.PLUS, + BinOp.MINUS, + BinOp.STAR, + BinOp.SLASH, + BinOp.PERCENT, + BinOp.BAND, + BinOp.BXOR, + BinOp.BOR, + BinOp.SHL, + BinOp.SHR -> 
isIntLikeRef(binaryLeft(ref)) && isIntLikeRef(binaryRight(ref)) + else -> false + } + else -> false + } + } + + private fun coerceToLoopInt(value: CompiledValue): CompiledValue? { + return when (value.type) { + SlotType.INT -> value + SlotType.OBJ -> { + val isExactInt = isExactNonNullSlotClassOrTemp(value.slot, ObjInt.type) + val isStableIntObj = slotObjClass[value.slot] == ObjInt.type && isStablePrimitiveSourceSlot(value.slot) + if (!isExactInt && !isStableIntObj && !isStablePrimitiveSourceSlot(value.slot)) return null + val objSlot = if (isExactInt || isStableIntObj) { + value.slot + } else { + val boxed = allocSlot() + builder.emit(Opcode.BOX_OBJ, value.slot, boxed) + updateSlotType(boxed, SlotType.OBJ) + emitAssertObjSlotIsInt(boxed) + } + val intSlot = allocSlot() + builder.emit(Opcode.UNBOX_INT_OBJ, objSlot, intSlot) + updateSlotType(intSlot, SlotType.INT) + CompiledValue(intSlot, SlotType.INT) + } + SlotType.UNKNOWN -> { + if (!isStablePrimitiveSourceSlot(value.slot)) return null + val boxed = allocSlot() + builder.emit(Opcode.BOX_OBJ, value.slot, boxed) + updateSlotType(boxed, SlotType.OBJ) + val checked = emitAssertObjSlotIsInt(boxed) + val intSlot = allocSlot() + builder.emit(Opcode.UNBOX_INT_OBJ, checked, intSlot) + updateSlotType(intSlot, SlotType.INT) + CompiledValue(intSlot, SlotType.INT) + } + else -> null + } + } + + private fun coerceToArithmeticInt(ref: ObjRef, value: CompiledValue): CompiledValue? 
{ + if (value.type == SlotType.INT) return value + val refSuggestsInt = inferNumericKind(ref) == NumericKind.INT + val stableNonTemp = !isTempSlot(value.slot) && isStablePrimitiveSourceSlot(value.slot) + if (!refSuggestsInt && !stableNonTemp) return null + return coerceToLoopInt(value) + } + + private fun emitAssertObjSlotIsInt(slot: Int): Int { + val typeId = builder.addConst(BytecodeConst.ObjRef(ObjInt.type)) + val typeSlot = allocSlot() + builder.emit(Opcode.CONST_OBJ, typeId, typeSlot) + builder.emit(Opcode.ASSERT_IS, slot, typeSlot) + return slot } private fun extractDeclaredRange(stmt: Statement?): RangeRef? { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/bytecode/CmdRuntime.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/bytecode/CmdRuntime.kt index ee42d92..22c81e7 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/bytecode/CmdRuntime.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/bytecode/CmdRuntime.kt @@ -314,8 +314,13 @@ class CmdUnboxIntObj(internal val src: Int, internal val dst: Int) : Cmd() { class CmdUnboxIntObjLocal(internal val src: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val value = frame.frame.getRawObj(src) as ObjInt - frame.setLocalInt(dst, value.value) + when (frame.frame.getSlotTypeCode(src)) { + SlotType.INT.code -> frame.setLocalInt(dst, frame.frame.getInt(src)) + else -> { + val value = frame.frame.getRawObj(src) as ObjInt + frame.setLocalInt(dst, value.value) + } + } return } } @@ -331,8 +336,13 @@ class CmdUnboxRealObj(internal val src: Int, internal val dst: Int) : Cmd() { class CmdUnboxRealObjLocal(internal val src: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val value = frame.frame.getRawObj(src) as ObjReal - frame.setLocalReal(dst, value.value) + when (frame.frame.getSlotTypeCode(src)) { + SlotType.REAL.code -> frame.setLocalReal(dst, 
frame.frame.getReal(src)) + else -> { + val value = frame.frame.getRawObj(src) as ObjReal + frame.setLocalReal(dst, value.value) + } + } return } } @@ -1540,9 +1550,15 @@ class CmdCmpEqIntObj(internal val a: Int, internal val b: Int, internal val dst: class CmdCmpEqIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value == right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left == right) return } } @@ -1563,9 +1579,15 @@ class CmdCmpNeqIntObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpNeqIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value != right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left != right) return } } @@ -1586,9 +1608,15 @@ class CmdCmpLtIntObj(internal val a: Int, internal val b: Int, internal val dst: class CmdCmpLtIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun 
performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value < right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left < right) return } } @@ -1609,9 +1637,15 @@ class CmdCmpLteIntObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpLteIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value <= right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left <= right) return } } @@ -1632,9 +1666,15 @@ class CmdCmpGtIntObj(internal val a: Int, internal val b: Int, internal val dst: class CmdCmpGtIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value > right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + 
SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left > right) return } } @@ -1655,9 +1695,15 @@ class CmdCmpGteIntObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpGteIntObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjInt - val right = frame.frame.getRawObj(b) as ObjInt - frame.setLocalBool(dst, left.value >= right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.INT.code -> frame.frame.getInt(a) + else -> (frame.frame.getRawObj(a) as ObjInt).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.INT.code -> frame.frame.getInt(b) + else -> (frame.frame.getRawObj(b) as ObjInt).value + } + frame.setLocalBool(dst, left >= right) return } } @@ -1678,9 +1724,15 @@ class CmdCmpEqRealObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpEqRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value == right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left == right) return } } @@ -1701,9 +1753,15 @@ class CmdCmpNeqRealObj(internal val a: Int, internal val b: Int, internal val ds class CmdCmpNeqRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true 
override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value != right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left != right) return } } @@ -1724,9 +1782,15 @@ class CmdCmpLtRealObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpLtRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value < right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left < right) return } } @@ -1747,9 +1811,15 @@ class CmdCmpLteRealObj(internal val a: Int, internal val b: Int, internal val ds class CmdCmpLteRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value <= right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when 
(frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left <= right) return } } @@ -1770,9 +1840,15 @@ class CmdCmpGtRealObj(internal val a: Int, internal val b: Int, internal val dst class CmdCmpGtRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value > right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left > right) return } } @@ -1793,9 +1869,15 @@ class CmdCmpGteRealObj(internal val a: Int, internal val b: Int, internal val ds class CmdCmpGteRealObjLocal(internal val a: Int, internal val b: Int, internal val dst: Int) : Cmd() { override val isFast: Boolean = true override fun performFast(frame: CmdFrame) { - val left = frame.frame.getRawObj(a) as ObjReal - val right = frame.frame.getRawObj(b) as ObjReal - frame.setLocalBool(dst, left.value >= right.value) + val left = when (frame.frame.getSlotTypeCode(a)) { + SlotType.REAL.code -> frame.frame.getReal(a) + else -> (frame.frame.getRawObj(a) as ObjReal).value + } + val right = when (frame.frame.getSlotTypeCode(b)) { + SlotType.REAL.code -> frame.frame.getReal(b) + else -> (frame.frame.getRawObj(b) as ObjReal).value + } + frame.setLocalBool(dst, left >= right) return } } @@ -3624,7 +3706,13 @@ class CmdGetIndex( internal val dst: Int, ) : Cmd() { override suspend fun perform(frame: CmdFrame) { - val result = frame.slotToObj(targetSlot).getAt(frame.ensureScope(), 
frame.slotToObj(indexSlot)) + val target = frame.storedSlotObj(targetSlot) + val index = frame.storedSlotObj(indexSlot) + if (target is ObjList && target::class == ObjList::class && index is ObjInt) { + frame.storeObjResult(dst, target.list[index.toInt()]) + return + } + val result = target.getAt(frame.ensureScope(), index) frame.storeObjResult(dst, result) return } @@ -3636,7 +3724,14 @@ class CmdSetIndex( internal val valueSlot: Int, ) : Cmd() { override suspend fun perform(frame: CmdFrame) { - frame.slotToObj(targetSlot).putAt(frame.ensureScope(), frame.slotToObj(indexSlot), frame.slotToObj(valueSlot)) + val target = frame.storedSlotObj(targetSlot) + val index = frame.storedSlotObj(indexSlot) + val value = frame.slotToObj(valueSlot) + if (target is ObjList && target::class == ObjList::class && index is ObjInt) { + target.list[index.toInt()] = value + return + } + target.putAt(frame.ensureScope(), index, value) return } } diff --git a/lynglib/src/jvmTest/kotlin/PiSpigotBenchmarkTest.kt b/lynglib/src/jvmTest/kotlin/PiSpigotBenchmarkTest.kt new file mode 100644 index 0000000..294372f --- /dev/null +++ b/lynglib/src/jvmTest/kotlin/PiSpigotBenchmarkTest.kt @@ -0,0 +1,145 @@ +/* + * Copyright 2026 Sergey S. Chernov real.sergeych@gmail.com + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +import kotlinx.coroutines.test.runTest +import net.sergeych.lyng.Benchmarks +import net.sergeych.lyng.BytecodeBodyProvider +import net.sergeych.lyng.PerfFlags +import net.sergeych.lyng.PerfProfiles +import net.sergeych.lyng.Script +import net.sergeych.lyng.Statement +import net.sergeych.lyng.bytecode.BytecodeStatement +import net.sergeych.lyng.bytecode.CmdCallMemberSlot +import net.sergeych.lyng.bytecode.CmdFunction +import net.sergeych.lyng.bytecode.CmdGetIndex +import net.sergeych.lyng.bytecode.CmdIterPush +import net.sergeych.lyng.bytecode.CmdMakeRange +import net.sergeych.lyng.bytecode.CmdSetIndex +import net.sergeych.lyng.obj.ObjString +import java.nio.file.Files +import java.nio.file.Path +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertTrue +import kotlin.time.TimeSource + +class PiSpigotBenchmarkTest { + @Test + fun benchmarkPiSpigot() = runTest { + if (!Benchmarks.enabled) return@runTest + + val source = Files.readString(resolveExample("pi-test.lyng")) + val legacySource = source.replace( + "val quotient = sum / denom", + "var quotient = floor((sum / (denom * 1.0))).toInt()" + ) + assertTrue(legacySource != source, "failed to build legacy piSpigot benchmark case") + + val digits = 200 + val expectedSuffix = "49303819" + + val legacyElapsed = runCase("legacy-real-division", legacySource, digits, expectedSuffix, dumpBytecode = true) + val saved = PerfProfiles.snapshot() + PerfFlags.RVAL_FASTPATH = false + val optimizedRvalOffElapsed = runCase( + "optimized-int-division-rval-off", + source, + digits, + expectedSuffix, + dumpBytecode = false + ) + PerfProfiles.restore(saved) + val optimizedElapsed = runCase("optimized-int-division-rval-on", source, digits, expectedSuffix, dumpBytecode = true) + val sourceSpeedup = legacyElapsed.toDouble() / optimizedRvalOffElapsed.toDouble() + val runtimeSpeedup = optimizedRvalOffElapsed.toDouble() / optimizedElapsed.toDouble() + val totalSpeedup = legacyElapsed.toDouble() / 
optimizedElapsed.toDouble() + println( + "[DEBUG_LOG] [BENCH] pi-spigot compare n=$digits legacy=${legacyElapsed} ms " + + "intDiv=${optimizedRvalOffElapsed} ms rvalOn=${optimizedElapsed} ms " + + "intDivSpeedup=${"%.2f".format(sourceSpeedup)}x " + + "rvalSpeedup=${"%.2f".format(runtimeSpeedup)}x " + + "total=${"%.2f".format(totalSpeedup)}x" + ) + } + + private suspend fun runCase( + name: String, + source: String, + digits: Int, + expectedSuffix: String, + dumpBytecode: Boolean, + ): Long { + val scope = Script.newScope() + scope.eval(source) + + if (dumpBytecode) { + println("[DEBUG_LOG] [BENCH] pi-spigot cmd:\n${scope.disassembleSymbol("piSpigot")}") + dumpHotOps(scope, "piSpigot") + } + + val first = scope.eval("piSpigot($digits)") as ObjString + assertEquals(expectedSuffix, first.value) + + repeat(2) { + val warm = scope.eval("piSpigot($digits)") as ObjString + assertEquals(expectedSuffix, warm.value) + } + + val iterations = 3 + val start = TimeSource.Monotonic.markNow() + repeat(iterations) { + val result = scope.eval("piSpigot($digits)") as ObjString + assertEquals(expectedSuffix, result.value) + } + val elapsedMs = start.elapsedNow().inWholeMilliseconds + val avgMs = elapsedMs.toDouble() / iterations.toDouble() + println( + "[DEBUG_LOG] [BENCH] pi-spigot $name n=$digits iterations=$iterations " + + "elapsed=${elapsedMs} ms avg=${"%.2f".format(avgMs)} ms" + ) + return elapsedMs + } + + private fun dumpHotOps(scope: net.sergeych.lyng.Scope, name: String) { + val fn = resolveBytecodeFunction(scope, name) ?: return + val makeRange = fn.cmds.count { it is CmdMakeRange } + val callMemberSlot = fn.cmds.count { it is CmdCallMemberSlot } + val iterPush = fn.cmds.count { it is CmdIterPush } + val getIndex = fn.cmds.count { it is CmdGetIndex } + val setIndex = fn.cmds.count { it is CmdSetIndex } + println( + "[DEBUG_LOG] [BENCH] pi-spigot hot-ops " + + "makeRange=$makeRange callMemberSlot=$callMemberSlot iterPush=$iterPush " + + "getIndex=$getIndex setIndex=$setIndex 
total=${fn.cmds.size}" + ) + } + + private fun resolveBytecodeFunction(scope: net.sergeych.lyng.Scope, name: String): CmdFunction? { + val record = scope.get(name) ?: return null + val stmt = record.value as? Statement ?: return null + return (stmt as? BytecodeStatement)?.bytecodeFunction() + ?: (stmt as? BytecodeBodyProvider)?.bytecodeBody()?.bytecodeFunction() + } + + private fun resolveExample(name: String): Path { + val direct = Path.of("examples", name) + if (Files.exists(direct)) return direct + val parent = Path.of("..", "examples", name) + if (Files.exists(parent)) return parent + error("example not found: $name") + } +} diff --git a/notes/pi_spigot_benchmark_baseline_2026-04-03.md b/notes/pi_spigot_benchmark_baseline_2026-04-03.md new file mode 100644 index 0000000..083dc9c --- /dev/null +++ b/notes/pi_spigot_benchmark_baseline_2026-04-03.md @@ -0,0 +1,172 @@ +# Pi Spigot Benchmark Baseline + +Date: 2026-04-03 +Command: +`./gradlew :lynglib:jvmTest -Pbenchmarks=true --tests 'PiSpigotBenchmarkTest' --rerun-tasks` + +Results for `n=200`: +- legacy-real-division: 1108 ms (3 iters, avg 369.33 ms) +- optimized-int-division-rval-off: 756 ms (3 iters, avg 252.00 ms) +- optimized-int-division-rval-on: 674 ms (3 iters, avg 224.67 ms) + +Derived speedups: +- intDivSpeedup: 1.47x +- rvalSpeedup: 1.12x +- total: 1.64x + +Notes: +- Bytecode still shows generic range iteration (`MAKE_RANGE`, `CALL_MEMBER_SLOT`, `ITER_PUSH`) for loop constructs in the legacy benchmark case. +- This baseline is captured before enabling counted-loop lowering for dynamic inline int ranges. + +Optimization #1 follow-up: +- Attempt: broaden compiler loop lowering for dynamic int ranges and validate with `PiSpigotBenchmarkTest` bytecode dumps. +- Final result: success after switching loop-bound coercion to a runtime-checked int path for stable slots with missing metadata. 
+- Latest measured run after the working compiler change: + - legacy-real-division: 783 ms (3 iters, avg 261.00 ms) + - optimized-int-division-rval-off: 729 ms (3 iters, avg 243.00 ms) + - optimized-int-division-rval-on: 593 ms (3 iters, avg 197.67 ms) +- Hot-op counts for optimized bytecode now show the generic range iterator path is gone from the main loops: + - `makeRange=0` + - `callMemberSlot=2` + - `iterPush=0` + - `getIndex=4` + - `setIndex=4` +- The remaining member calls are non-loop overhead; the main improvement came from lowering `for` ranges to counted int loops. + +Optimization #2 follow-up: +- Attempt: coerce stable integer operands into `INT` arithmetic during binary-op lowering so hot expressions stop falling back to `OBJ` math. +- Latest measured run after the arithmetic change: + - legacy-real-division: 593 ms (3 iters, avg 197.67 ms) + - optimized-int-division-rval-off: 542 ms (3 iters, avg 180.67 ms) + - optimized-int-division-rval-on: 516 ms (3 iters, avg 172.00 ms) +- Compiled-code impact in the optimized case: + - `boxes = n * 10 / 3` is now `UNBOX_INT_OBJ` + `MUL_INT` + `DIV_INT` + - `j = boxes - k` is now `SUB_INT` + - `denom = j * 2 + 1` is now `MUL_INT` + `ADD_INT` + - `carriedOver = quotient * j` is now `MUL_INT` +- Remaining hot object arithmetic is centered on the list-backed `reminders` values and derived sums: + - `reminders[j] * 10` + - `reminders[j] + carriedOver` + - `sum / denom`, `sum % denom`, `sum / 10` +- Conclusion: loop lowering is fixed; the next likely win is preserving `List<Int>` element typing for `reminders` so indexed loads stay in int space. + +Optimization #3 follow-up: +- Attempt: teach numeric-kind inference that `IndexRef` can be `INT`/`REAL` when the receiver list has a known element class. +- Compiler change: + - `inferNumericKind()` now handles `IndexRef` and resolves the receiver slot or receiver-declared list element class before choosing `INT`/`REAL`.
+- Latest measured run after the indexed-load inference change: + - legacy-real-division: 656 ms (3 iters, avg 218.67 ms) + - optimized-int-division-rval-off: 509 ms (3 iters, avg 169.67 ms) + - optimized-int-division-rval-on: 403 ms (3 iters, avg 134.33 ms) +- Derived speedups vs legacy in this run: + - intDivSpeedup: 1.29x + - rvalSpeedup: 1.26x + - total: 1.63x +- Compiled-code impact in the optimized case: + - `carriedOver = quotient * j` stays in `INT` space (`ASSERT_IS` + `UNBOX_INT_OBJ` + `MUL_INT`) instead of plain object multiply. + - Counted int loops remain intact (`MAKE_RANGE=0`, `ITER_PUSH=0`). +- Remaining bottlenecks in the optimized bytecode: + - `GET_INDEX reminders[j]` still feeds `MUL_OBJ` / `ADD_OBJ` + - `sum / denom`, `sum % denom`, and `sum / 10` still compile to object arithmetic + - `suffix += pi[i]` remains `ADD_OBJ`, which is expected because it is string/object concatenation +- Conclusion: + - The new inference produced a real VM-speed gain, especially with `RVAL_FASTPATH` enabled. + - The next compiler win is stronger propagation from `List<Int>` indexed loads into the produced temporary slot so `sum` can stay typed as `INT` across the inner loop. + +Optimization #4 follow-up: +- Attempt: preserve boxed-argument metadata through `compileCallArgs()` so `list.add(x)` retains `ObjInt` / `ObjReal` element typing. +- Compiler/runtime fixes: + - `compileCallArgs()` now routes arguments through `ensureObjSlot()` + `emitMove()` instead of raw `BOX_OBJ`, preserving `slotObjClass` and `stableObjSlots`. + - `CmdSetIndex` now reads `valueSlot` via `slotToObj()` so `SET_INDEX` can safely accept primitive slots. + - Fast local unbox ops (`CmdUnboxIntObjLocal`, `CmdUnboxRealObjLocal`) now handle already-primitive source slots directly instead of assuming a raw object payload. + - Plain assignment now coerces object-int RHS back into `INT` when the destination slot is currently compiled as `INT`, keeping loop-carried locals type-consistent.
+- Latest measured run after the propagation + VM fixes: + - legacy-real-division: 438 ms (3 iters, avg 146.00 ms) + - optimized-int-division-rval-off: 238 ms (3 iters, avg 79.33 ms) + - optimized-int-division-rval-on: 201 ms (3 iters, avg 67.00 ms) +- Derived speedups vs legacy in this run: + - intDivSpeedup: 1.84x + - rvalSpeedup: 1.18x + - total: 2.18x +- Compiled-code impact in the optimized case: + - `sum = reminders[j] + carriedOver` is now `GET_INDEX` + `UNBOX_INT_OBJ` + `ADD_INT` + - `reminders[j] = sum % denom` is now `MOD_INT` + `SET_INDEX` + - `q = sum / 10` is now `DIV_INT` + - `carriedOver = quotient * j` is now `MUL_INT` +- Remaining hot object arithmetic in the optimized case: + - `reminders[j] *= 10` still compiles as `GET_INDEX` + `MUL_OBJ` + `SET_INDEX` + - `suffix += pi[i]` remains `ADD_OBJ`, which is expected string/object concatenation +- Conclusion: + - The main remaining arithmetic bottleneck is the compound index assignment path for `reminders[j] *= 10`. + - The next direct win is to specialize `AssignOpRef` on typed list elements so indexed compound assignment can lower to `UNBOX_INT_OBJ` + `MUL_INT` + boxed `SET_INDEX`. + +Optimization #5 follow-up: +- Attempt: specialize typed `IndexRef` compound assignment so `List<Int>` element updates avoid object arithmetic. +- Compiler change: + - `compileAssignOp()` now detects non-optional typed `List<Int>` index targets and lowers arithmetic assign-ops through `UNBOX_INT_OBJ` + `*_INT` + `SET_INDEX`.
+- Latest measured run after the indexed compound-assignment change: + - legacy-real-division: 394 ms (3 iters, avg 131.33 ms) + - optimized-int-division-rval-off: 216 ms (3 iters, avg 72.00 ms) + - optimized-int-division-rval-on: 184 ms (3 iters, avg 61.33 ms) +- Derived speedups vs legacy in this run: + - intDivSpeedup: 1.82x + - rvalSpeedup: 1.17x + - total: 2.14x +- Compiled-code impact in the optimized case: + - `reminders[j] *= 10` is now: + - `GET_INDEX` + - `UNBOX_INT_OBJ` + - `MUL_INT` + - `SET_INDEX` + - The optimized inner loop no longer contains object arithmetic for the `reminders` state update path. +- Remaining hot object work in the optimized case: + - `suffix += pi[i]` remains `ADD_OBJ` and is expected string/object concatenation + - The legacy benchmark case still carries real/object work because it intentionally keeps the original `floor(sum / (denom * 1.0))` path +- Conclusion: + - The inner arithmetic hot loop is now effectively int-lowered end-to-end in the optimized benchmark path. + - Further wins will likely require reducing list access overhead itself (`GET_INDEX` / `SET_INDEX`) or changing the source algorithm/data layout, not more basic arithmetic lowering. + +Optimization #6 follow-up: +- Attempt: move the direct `ObjList` index fast path out from behind `RVAL_FASTPATH` so the common plain-list case is fast by default. +- Runtime change: + - `CmdGetIndex` and `CmdSetIndex` now always use direct `target.list[index]` / `target.list[index] = value` for exact `ObjList` receivers with `ObjInt` indices. + - Subclasses such as `ObjObservableList` still use their overridden `getAt` / `putAt` logic, so semantics stay intact. 
+- Latest measured run after the default plain-list path: + - legacy-real-division: 397 ms (3 iters, avg 132.33 ms) + - optimized-int-division-rval-off: 138 ms (3 iters, avg 46.00 ms) + - optimized-int-division-rval-on: 164 ms (3 iters, avg 54.67 ms) +- Derived speedups vs legacy in this run: + - intDivSpeedup: 2.88x + - rvalSpeedup: 0.84x + - total: 2.42x +- Interpretation: + - The stable fast baseline is now the `rval-off` case, because the direct plain-`ObjList` path no longer depends on `RVAL_FASTPATH`. + - `RVAL_FASTPATH` no longer improves this benchmark and only reflects remaining unrelated runtime variance. +- Conclusion: + - For `piSpigot`, the main VM list-access bottleneck is addressed in the default runtime path. + - Further work on this benchmark should target algorithm/data-layout changes or string-result construction, not the old `RVAL_FASTPATH` gate. + +Remaining optimization candidates: +- `suffix += pi[i]` still compiles as repeated `ADD_OBJ` string/object concatenation. + - Best next option: build the suffix through a dedicated buffer/list-join path instead of per-iteration concatenation. +- The benchmark still performs many `GET_INDEX` / `SET_INDEX` operations even after the direct plain-`ObjList` fast path. + - Best next option: reduce indexed access count at the source level or introduce a more specialized typed-list storage layout if this benchmark matters enough. +- The legacy benchmark variant intentionally keeps the real-number `floor(sum / (denom * 1.0))` path. + - No release optimization needed there; it remains only as a regression/control case. +- `RVAL_FASTPATH` is no longer a useful tuning knob for this workload after the plain-list VM fast path. + - Best next option: profile other workloads before changing or removing it globally. 
+ +Release stabilization note: +- The broad assignment-side `INT` coercion and subclass-bypassing list fast path were rolled back/narrowed to restore correctness across numeric-mix, decimal, list, observable-list, and wasm tests. +- Full release gates now pass: + - `./gradlew test` + - `./gradlew :lynglib:wasmJsNodeTest` +- Current release-safe benchmark on the stabilized tree: + - legacy-real-division: 732 ms (3 iters, avg 244.00 ms) + - optimized-int-division-rval-off: 545 ms (3 iters, avg 181.67 ms) + - optimized-int-division-rval-on: 697 ms (3 iters, avg 232.33 ms) +- Interpretation: + - The release baseline is now `optimized-int-division-rval-off` at 545 ms for the current correct/stable tree. + - The removed coercion had been masking a real compiler typing gap; reintroducing it broadly is not release-safe. +- Highest-value remaining compiler optimization after release: + - Recover typed int lowering for `j = boxes - k`, `denom = j * 2 + 1`, `sum = reminders[j] + carriedOver`, and `carriedOver = quotient * j` using a narrower proof than the removed generic arithmetic coercion.