From f45fa7f7a0fd5b0632abd22852e6caff1edb5f30 Mon Sep 17 00:00:00 2001 From: sergeych Date: Fri, 5 Sep 2025 17:53:19 +0400 Subject: [PATCH] fix #49 regular expressions, operator match and docs --- docs/Regex.md | 91 +++++++++++++++++++ docs/tutorial.md | 37 +++++++- lynglib/build.gradle.kts | 2 +- .../kotlin/net/sergeych/lyng/Compiler.kt | 5 +- .../kotlin/net/sergeych/lyng/Parser.kt | 70 +++++++++----- .../kotlin/net/sergeych/lyng/Token.kt | 2 +- .../kotlin/net/sergeych/lyng/obj/Obj.kt | 20 ++++ .../kotlin/net/sergeych/lyng/obj/ObjRegex.kt | 28 +++++- .../kotlin/net/sergeych/lyng/obj/ObjString.kt | 49 ++++++++-- lynglib/src/commonTest/kotlin/ScriptTest.kt | 41 ++++++++- lynglib/src/jvmTest/kotlin/BookTest.kt | 4 + 11 files changed, 304 insertions(+), 45 deletions(-) create mode 100644 docs/Regex.md diff --git a/docs/Regex.md b/docs/Regex.md new file mode 100644 index 0000000..d811ab0 --- /dev/null +++ b/docs/Regex.md @@ -0,0 +1,91 @@ +# Regular expressions + +In lyng, you create regular expressions using class `Regex` or `String.re` methods: + + assert( "\d*".re is Regex ) + assert( Regex("\d*") is Regex ) + >>> void + +We plan to add slash syntax at some point. + +To check that some string matches some regex as a whole: + + assert( "123".matches("\d{3}".re) ) + assert( !"123".matches("\d{4}".re) ) + assert( !"1234".matches("\d".re) ) + >>> void + +To check that _part of the string_ matches some regular expression, use _match operator_ `=~` just like in Ruby, and its +counterpart, _not match_ operator `!~`: + + assert( "abc123def" =~ "\d\d\d".re ) + assert( "abc" !~ "\d\d\d".re ) + >>> void + +When you need to find groups, and more detailed match information, use `Regex.find`: + + val result = Regex("abc(\d)(\d)(\d)").find( "bad456 good abc123") + assert( result != null ) + assertEquals( 12 .. 
17, result.range ) + assertEquals( "abc123", result[0] ) + assertEquals( "1", result[1] ) + assertEquals( "2", result[2] ) + assertEquals( "3", result[3] ) + >>> void + +Note that the object `RegexMatch`, returned by [Regex.find], behaves much like in many other languages: it provides the +index range and group matches by index. + +Match operator actually also provides `RegexMatch` in `$~` reserved variable (borrowed from Ruby too): + + assert( "bad456 good abc123" =~ "abc(\d)(\d)(\d)".re ) + assertEquals( 12 .. 17, $~.range ) + assertEquals( "abc123", $~[0] ) + assertEquals( "1", $~[1] ) + assertEquals( "2", $~[2] ) + assertEquals( "3", $~[3] ) + >>> void + +This is often more readable than calling `find`. + +Note that `=~` and `!~` operators against strings and regular expressions are commutative, i.e. the regular expression and the +string can each be either the left or the right operand, but both operands can't be of the same kind: + + assert( "abc" =~ "\wc".re ) + assert( "abc" !~ "\w1c".re ) + assert( "a\wc".re =~ "abcd" ) + assert( "a[a-z]c".re !~ "a2cd" ) + >>> void + +Also, string indexing is Regex-aware, and works like `Regex.find` (_not findAll!_): + + assert( "cd" == "abcdef"[ "c.".re ].value ) + >>> void + + +# Regex class reference + +| name | description | notes | +|--------------|-------------------------------------|-------| +| matches(str) | true if the whole `str` matches | | +| find(str) | find first match in `str` or null | (1) | +| findAll(str) | find all matches in `str` as [List] | (1) | + +(1) +:: See `RegexMatch` class description below + +# RegexMatch + +| name | description | notes | +|-------|-------------------------------------------|-------| +| range | the [Range] of the match in source string | | +| value | the value that matches | | +| [n] | [List] of group matches | (1) | + +(1) +:: the [0] element is always the whole matched value, [1] is the group 1 match if any, etc. 
+ +[List]: List.md + +[Range]: Range.md + diff --git a/docs/tutorial.md b/docs/tutorial.md index 32261f1..274da2c 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -275,6 +275,8 @@ Logical operation could be used the same | === | | Any | (2) | | !== | | Any | (2) | | != | | Any | (1) | +| =~ | | | (3) | +| !~ | | | (3) | | ++a, a++ | | Int | | | --a, a-- | | Int | | @@ -286,6 +288,9 @@ Logical operation could be used the same singleton object, like `null`, are referentially equal too, while string different literals even being equal are most likely referentially not equal +(3) +: Implemented now in String and Regex as regular expression match and not match, see [Regex]. + Reference quality and object equality example: assert( null == null) // singletons @@ -1285,9 +1290,26 @@ Open-ended ranges could be used to get start and end too: assertEquals( "pult", "catapult"[ 4.. ]) >>> void + ### String operations -Concatenation is a `+`: `"hello " + name` works as expected. No confusion. +Concatenation is a `+`: `"hello " + name` works as expected. No confusion. There is also +[Regex] support for strings, see the link, for example, whole string match: + + assert( !"123".matches( "\d\d".re ) ) + assert( "123".matches( "\d\d\d".re ) ) + >>> void + +Extraction: + + "abcd42def"[ "\d+".re ].value + >>> "42" + +Part match: + + assert( "abc foo def" =~ "f[oO]+".re ) + assert( "foo" == $~.value ) + >>> void Typical set of String functions includes: @@ -1305,17 +1327,24 @@ Typical set of String functions includes: | size | size in characters like `length` because String is [Array] | | (args...) 
| sprintf-like formatting, see [string formatting] | | [index] | character at index | -| [Range] | substring at range | +| [Range] | substring at range (2) | +| [Regex] | find first match of regex, like [Regex.find] (2) | | s1 + s2 | concatenation | | s1 += s2 | self-modifying concatenation | | toReal() | attempts to parse string as a Real value | | toInt() | parse string to Int value | | characters() | create [List] of characters (1) | | encodeUtf8() | returns [Buffer] with characters encoded to utf8 | +| matches(re) | matches the regular expression (2) | +| | | + (1) : List is mutable therefore a new copy is created on each call. +(2) +: See [Regex] + ### Literals String literal could be multiline: @@ -1390,4 +1419,6 @@ See [math functions](math.md). Other general purpose functions are: [Collection]: Collection.md -[Array]: Array.md \ No newline at end of file +[Array]: Array.md + +[Regex]: Regex.md diff --git a/lynglib/build.gradle.kts b/lynglib/build.gradle.kts index 61ad726..21dd306 100644 --- a/lynglib/build.gradle.kts +++ b/lynglib/build.gradle.kts @@ -21,7 +21,7 @@ import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl import org.jetbrains.kotlin.gradle.dsl.JvmTarget group = "net.sergeych" -version = "0.8.15-SNAPSHOT" +version = "0.9.0-SNAPSHOT" buildscript { repositories { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Compiler.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Compiler.kt index d0eabcb..51fc564 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Compiler.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Compiler.kt @@ -1000,6 +1000,7 @@ class Compiler( // condition could be a value, in and is clauses: // parse several conditions for one then clause + // loop cases outer@ while (true) { @@ -1466,7 +1467,7 @@ class Compiler( } } } else { - for (i in start ..< end) { + for (i in start.. 
ObjBool(a.compareTo(c, b) != 0) }, Operator.simple(Token.Type.REF_EQ, lastPriority) { _, a, b -> ObjBool(a === b) }, Operator.simple(Token.Type.REF_NEQ, lastPriority) { _, a, b -> ObjBool(a !== b) }, + Operator.simple(Token.Type.MATCH, lastPriority) { s, a, b -> a.operatorMatch(s,b) }, + Operator.simple(Token.Type.NOTMATCH, lastPriority) { s, a, b -> a.operatorNotMatch(s,b) }, // relational <=,... 5 Operator.simple(Token.Type.LTE, ++lastPriority) { c, a, b -> ObjBool(a.compareTo(c, b) <= 0) }, Operator.simple(Token.Type.LT, lastPriority) { c, a, b -> ObjBool(a.compareTo(c, b) < 0) }, diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Parser.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Parser.kt index b326127..b03659d 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Parser.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Parser.kt @@ -20,10 +20,10 @@ package net.sergeych.lyng val digitsSet = ('0'..'9').toSet() val digits = { d: Char -> d in digitsSet } val hexDigits = digitsSet + ('a'..'f') + ('A'..'F') -val idNextChars = { d: Char -> d.isLetter() || d == '_' || d.isDigit() } +val idNextChars = { d: Char -> d.isLetter() || d == '_' || d.isDigit() || d == '$' || d == '~' } @Suppress("unused") -val idFirstChars = { d: Char -> d.isLetter() || d == '_' } +val idFirstChars = { d: Char -> d.isLetter() || d == '_' || d == '$' } fun parseLyng(source: Source): List { val p = Parser(fromPos = source.startPos) @@ -67,13 +67,16 @@ private class Parser(fromPos: Pos) { pos.advance() Token("===", from, Token.Type.REF_EQ) } + else -> Token("==", from, Token.Type.EQ) } - } else if( currentChar == '>' ) { + } else if (currentChar == '>') { pos.advance() Token("=>", from, Token.Type.EQARROW) - } - else + } else if (currentChar == '~') { + pos.advance() + Token("=~", from, Token.Type.MATCH) + } else Token("=", from, Token.Type.ASSIGN) } @@ -227,6 +230,9 @@ private class Parser(fromPos: Pos) { Token("!==", from, Token.Type.REF_NEQ) } else 
Token("!=", from, Token.Type.NEQ) + } else if (currentChar == '~') { + pos.advance() + Token("!~", from, Token.Type.NOTMATCH) } else Token("!", from, Token.Type.NOT) } @@ -267,7 +273,7 @@ private class Parser(fromPos: Pos) { in digitsSet -> { pos.back() - decodeNumber(loadChars { it in digitsSet || it == '_'}, from) + decodeNumber(loadChars { it in digitsSet || it == '_' }, from) } '\'' -> { @@ -291,7 +297,7 @@ private class Parser(fromPos: Pos) { } '?' -> { - when(currentChar.also { pos.advance() }) { + when (currentChar.also { pos.advance() }) { ':' -> Token("??", from, Token.Type.ELVIS) '?' -> Token("??", from, Token.Type.ELVIS) '.' -> Token("?.", from, Token.Type.NULL_COALESCE) @@ -310,7 +316,7 @@ private class Parser(fromPos: Pos) { // Labels processing is complicated! // some@ statement: label 'some', ID 'statement' // statement@some: ID 'statement', LABEL 'some'! - if (ch.isLetter() || ch == '_') { + if (idNextChars(ch)) { val text = ch + loadChars(idNextChars) if (currentChar == '@') { pos.advance() @@ -395,25 +401,24 @@ private class Parser(fromPos: Pos) { private fun fixMultilineStringLiteral(source: String): String { val sizes = mutableListOf() val lines = source.lines().toMutableList() - if( lines.size == 0 ) return "" - if( lines[0].isBlank() ) lines.removeFirst() - if( lines.isEmpty()) return "" - if( lines.last().isBlank() ) lines.removeLast() + if (lines.size == 0) return "" + if (lines[0].isBlank()) lines.removeFirst() + if (lines.isEmpty()) return "" + if (lines.last().isBlank()) lines.removeLast() val normalized = lines.map { l -> - if( l.isBlank() ) { + if (l.isBlank()) { sizes.add(-1) "" - } - else { + } else { val margin = leftMargin(l) sizes += margin " ".repeat(margin) + l.trim() } } val commonMargin = sizes.filter { it >= 0 }.min() - val fixed = if( commonMargin < 1 ) lines else normalized.map { - if( it.isBlank() ) "" else it.drop(commonMargin) + val fixed = if (commonMargin < 1) lines else normalized.map { + if (it.isBlank()) "" else 
it.drop(commonMargin) } return fixed.joinToString("\n") } @@ -433,11 +438,26 @@ private class Parser(fromPos: Pos) { '\\' -> { pos.advance() ?: raise("unterminated string") when (currentChar) { - 'n' -> {sb.append('\n'); pos.advance()} - 'r' -> {sb.append('\r'); pos.advance()} - 't' -> {sb.append('\t'); pos.advance()} - '"' -> {sb.append('"'); pos.advance()} - '\\' -> {sb.append('\\'); pos.advance()} + 'n' -> { + sb.append('\n'); pos.advance() + } + + 'r' -> { + sb.append('\r'); pos.advance() + } + + 't' -> { + sb.append('\t'); pos.advance() + } + + '"' -> { + sb.append('"'); pos.advance() + } + + '\\' -> { + sb.append('\\'); pos.advance() + } + else -> { sb.append('\\').append(currentChar) pos.advance() @@ -445,7 +465,7 @@ private class Parser(fromPos: Pos) { } } - '\n', '\r'-> { + '\n', '\r' -> { newlineDetected = true sb.append(currentChar) pos.advance() @@ -459,7 +479,7 @@ private class Parser(fromPos: Pos) { } pos.advance() - val result = sb.toString().let { if( newlineDetected ) fixMultilineStringLiteral(it) else it } + val result = sb.toString().let { if (newlineDetected) fixMultilineStringLiteral(it) else it } return Token(result, start, Token.Type.STRING) } @@ -538,7 +558,7 @@ private class Parser(fromPos: Pos) { init { // skip shebang - if( pos.readFragment("#!") ) + if (pos.readFragment("#!")) loadToEndOfLine() } diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Token.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Token.kt index 6d5bc97..9470913 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Token.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/Token.kt @@ -37,7 +37,7 @@ data class Token(val value: String, val pos: Pos, val type: Type) { ASSIGN, PLUSASSIGN, MINUSASSIGN, STARASSIGN, SLASHASSIGN, PERCENTASSIGN, PLUS2, MINUS2, IN, NOTIN, IS, NOTIS, - EQ, NEQ, LT, LTE, GT, GTE, REF_EQ, REF_NEQ, + EQ, NEQ, LT, LTE, GT, GTE, REF_EQ, REF_NEQ, MATCH, NOTMATCH, SHUTTLE, AND, BITAND, OR, BITOR, NOT, BITNOT, DOT, ARROW, 
EQARROW, QUESTION, COLONCOLON, SINLGE_LINE_COMMENT, MULTILINE_COMMENT, diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt index 8c075fc..19cd6be 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/Obj.kt @@ -178,6 +178,14 @@ open class Obj { scope.raiseNotImplemented() } + open suspend fun operatorMatch(scope: Scope, other: Obj): Obj { + scope.raiseNotImplemented() + } + + open suspend fun operatorNotMatch(scope: Scope, other: Obj): Obj { + return operatorMatch(scope,other).logicalNot(scope) + } + open suspend fun assign(scope: Scope, other: Obj): Obj? = null open fun getValue(scope: Scope) = this @@ -301,6 +309,18 @@ open class Obj { return scope } + inline fun cast(scope: Scope): R { + castOrNull()?.let { return it } + scope.raiseClassCastError("can't cast ${this::class.simpleName} to ${R::class.simpleName}") + + } + + inline fun castOrNull(): R? { + (this as? 
R)?.let { return it } + // todo: check for subclasses + return null + } + companion object { val rootObjectType = ObjClass("Obj").apply { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjRegex.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjRegex.kt index 8fbc43b..2eaa08d 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjRegex.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjRegex.kt @@ -22,11 +22,20 @@ import net.sergeych.lyng.Scope class ObjRegex(val regex: Regex) : Obj() { override val objClass = type + override suspend fun operatorMatch(scope: Scope, other: Obj): Obj { + return regex.find(other.cast(scope).value)?.let { + scope.addConst("$~", ObjRegexMatch(it)) + ObjTrue + } ?: ObjFalse + } + + fun find(s: ObjString): Obj = + regex.find(s.value)?.let { ObjRegexMatch(it) } ?: ObjNull + companion object { val type by lazy { object : ObjClass("Regex") { override suspend fun callOn(scope: Scope): Obj { - println(scope.requireOnlyArg().value) return ObjRegex( scope.requireOnlyArg().value.toRegex() ) @@ -36,8 +45,7 @@ class ObjRegex(val regex: Regex) : Obj() { ObjBool(args.firstAndOnly().toString().matches(thisAs().regex)) } addFn("find") { - val s = requireOnlyArg().value - thisAs().regex.find(s)?.let { ObjRegexMatch(it) } ?: ObjNull + thisAs().find(requireOnlyArg()) } addFn("findAll") { val s = requireOnlyArg().value @@ -61,6 +69,7 @@ class ObjRegexMatch(val match: MatchResult) : Obj() { val objRange: ObjRange by lazy { val r = match.range + ObjRange( ObjInt(r.first.toLong()), ObjInt(r.last.toLong()), @@ -68,6 +77,19 @@ class ObjRegexMatch(val match: MatchResult) : Obj() { ) } + override suspend fun toString(scope: Scope,calledFromLyng: Boolean): ObjString { + return ObjString("RegexMath(${objRange.toString(scope)},${objGroups.toString(scope)})") + } + + override suspend fun getAt(scope: Scope, index: Obj): Obj { + return objGroups.getAt(scope, index) + } + + override suspend fun compareTo(scope: 
Scope, other: Obj): Int { + if( other === this) return 0 + return -2 + } + companion object { val type by lazy { object : ObjClass("RegexMatch") { diff --git a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt index 7277714..122063d 100644 --- a/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt +++ b/lynglib/src/commonMain/kotlin/net/sergeych/lyng/obj/ObjString.kt @@ -56,16 +56,23 @@ data class ObjString(val value: String) : Obj() { } override suspend fun getAt(scope: Scope, index: Obj): Obj { - if (index is ObjInt) return ObjChar(value[index.toInt()]) - if (index is ObjRange) { - val start = if (index.start == null || index.start.isNull) 0 else index.start.toInt() - val end = if (index.end == null || index.end.isNull) value.length else { - val e = index.end.toInt() - if (index.isEndInclusive) e + 1 else e + when (index) { + is ObjInt -> return ObjChar(value[index.toInt()]) + is ObjRange -> { + val start = if (index.start == null || index.start.isNull) 0 else index.start.toInt() + val end = if (index.end == null || index.end.isNull) value.length else { + val e = index.end.toInt() + if (index.isEndInclusive) e + 1 else e + } + return ObjString(value.substring(start, end)) } - return ObjString(value.substring(start, end)) + + is ObjRegex -> { + return index.find(this) + } + + else -> scope.raiseIllegalArgument("String index must be Int, Regex or Range") } - scope.raiseIllegalArgument("String index must be Int or Range") } override fun hashCode(): Int { @@ -96,6 +103,11 @@ data class ObjString(val value: String) : Obj() { return value == other.value } + override suspend fun operatorMatch(scope: Scope, other: Obj): Obj { + val re = other.cast(scope) + return re.operatorMatch(scope, this) + } + override suspend fun lynonType(): LynonType = LynonType.String override suspend fun serialize(scope: Scope, encoder: LynonEncoder, lynonType: LynonType?) 
{ @@ -108,8 +120,9 @@ data class ObjString(val value: String) : Obj() { ObjString(decoder.unpackBinaryData().decodeToString()) }.apply { addFn("toInt") { - ObjInt(thisAs().value.toLongOrNull() - ?: raiseIllegalArgument("can't convert to int: $thisObj") + ObjInt( + thisAs().value.toLongOrNull() + ?: raiseIllegalArgument("can't convert to int: $thisObj") ) } addFn("startsWith") { @@ -160,6 +173,22 @@ data class ObjString(val value: String) : Obj() { addFn("trim") { thisAs().value.trim().let(::ObjString) } + addFn("matches") { + val s = requireOnlyArg() + val self = thisAs().value + ObjBool( + when (s) { + is ObjRegex -> self.matches(s.regex) + is ObjString -> { + if (s.value == ".*") true + else self.matches(s.value.toRegex()) + } + + else -> + raiseIllegalArgument("can't match ${s.objClass.className}: required Regex or String") + } + ) + } } } } \ No newline at end of file diff --git a/lynglib/src/commonTest/kotlin/ScriptTest.kt b/lynglib/src/commonTest/kotlin/ScriptTest.kt index e029cdd..8752538 100644 --- a/lynglib/src/commonTest/kotlin/ScriptTest.kt +++ b/lynglib/src/commonTest/kotlin/ScriptTest.kt @@ -2242,6 +2242,45 @@ class ScriptTest { ) } + @Test + fun testParseSpecialVars() { + val l = parseLyng("$~".toSource("test$~")) + println(l) + assertEquals(Token.Type.ID, l[0].type) + assertEquals("$~", l[0].value) + } + + @Test + fun testMatchOperator() = runTest { + eval(""" + assert( "abc123".matches(".*\d{3}") ) + assert( ".*\d{3}".re =~ "abc123" ) + assert( "abc123" =~ ".*\d{3}".re ) + assert( "abc123" !~ ".*\d{4}".re ) + + "abc123" =~ ".*(\d)(\d)(\d)$".re + println($~) + assertEquals("1", $~[1]) + assertEquals("2", $~[2]) + assertEquals("3", $~[3]) + assertEquals("abc123", $~[0]) + """.trimIndent()) + } + +// @Test +// fun testWhenMatch() = runTest { +// eval( +// """ +// when("abc123") { +// ".*(\d)(\d)(\d)".re -> { x -> +// assertEquals("123", x[0]) +// } +// else -> assert(false) +// } +// """.trimIndent() +// ) +// } + @Test fun testWhenSample1() = runTest 
{ eval( @@ -3247,7 +3286,7 @@ class ScriptTest { } -// @Test + // @Test fun testMinimumOptimization() = runTest { val x = Scope().eval( """ diff --git a/lynglib/src/jvmTest/kotlin/BookTest.kt b/lynglib/src/jvmTest/kotlin/BookTest.kt index 07e2b65..5b9581e 100644 --- a/lynglib/src/jvmTest/kotlin/BookTest.kt +++ b/lynglib/src/jvmTest/kotlin/BookTest.kt @@ -332,4 +332,8 @@ class BookTest { runDocTests("../docs/Array.md") } + @Test + fun testRegex() = runBlocking { + runDocTests("../docs/Regex.md") + } } \ No newline at end of file