fix #49 regular expressions, operator match and docs

This commit is contained in:
Sergey Chernov 2025-09-05 17:53:19 +04:00
parent ead2f7168e
commit f45fa7f7a0
11 changed files with 304 additions and 45 deletions

91
docs/Regex.md Normal file
View File

@ -0,0 +1,91 @@
# Regular expressions
In lyng, you create regular expressions using class `Regex` or `String.re` methods:
assert( "\d*".re is Regex )
assert( Regex("\d*") is Regex )
>>> void
We plan to add slash syntax at some point.
To check that a whole string matches a regex:
assert( "123".matches("\d{3}".re) )
assert( !"123".matches("\d{4}".re) )
assert( !"1234".matches("\d".re) )
>>> void
To check that _part of the string_ matches some regular expression, use the _match operator_ `=~`, just like in Ruby, and its
counterpart, the _not match_ operator `!~`:
assert( "abc123def" =~ "\d\d\d".re )
assert( "abc" !~ "\d\d\d".re )
>>> void
When you need to find groups, and more detailed match information, use `Regex.find`:
val result = Regex("abc(\d)(\d)(\d)").find( "bad456 good abc123")
assert( result != null )
assertEquals( 12 .. 17, result.range )
assertEquals( "abc123", result[0] )
assertEquals( "1", result[1] )
assertEquals( "2", result[2] )
assertEquals( "3", result[3] )
>>> void
Note that the `RegexMatch` object returned by [Regex.find] behaves much like in many other languages: it provides the
index range of the match and the group matches by index.
The match operator also stores the `RegexMatch` in the reserved variable `$~` (borrowed from Ruby too):
assert( "bad456 good abc123" =~ "abc(\d)(\d)(\d)".re )
assertEquals( 12 .. 17, $~.range )
assertEquals( "abc123", $~[0] )
assertEquals( "1", $~[1] )
assertEquals( "2", $~[2] )
assertEquals( "3", $~[3] )
>>> void
This is often more readable than calling `find`.
Note that the `=~` and `!~` operators on strings and regular expressions are commutative, i.e. the regular expression and
the string can each be either the left or the right operand, but one operand must be a string and the other a regular expression:
assert( "abc" =~ "\wc".re )
assert( "abc" !~ "\w1c".re )
assert( "a\wc".re =~ "abcd" )
assert( "a[a-z]c".re !~ "a2cd" )
>>> void
Also, string indexing is Regex-aware, and works like `Regex.find` (_not `findAll`!_):
assert( "cd" == "abcdef"[ "c.".re ].value )
>>> void
# Regex class reference
| name | description | notes |
|--------------|-------------------------------------|-------|
| matches(str) | true if the whole `str` matches | |
| find(str) | find first match in `str` or null | (1) |
| findAll(str) | find all matches in `str` as [List] | (1) |
(1)
:: See `RegexMatch` class description below
# RegexMatch
| name | description | notes |
|-------|-------------------------------------------|-------|
| range | the [Range] of the match in source string | |
| value | the value that matches | |
| [n] | [List] of group matches | (1) |
(1)
:: the [0] element is always value, [1] is group 1 match if any, etc.
[List]: List.md
[Range]: Range.md

View File

@ -275,6 +275,8 @@ Logical operation could be used the same
| === | | Any | (2) |
| !== | | Any | (2) |
| != | | Any | (1) |
| =~ | | | (3) |
| !~ | | | (3) |
| ++a, a++ | | Int | |
| --a, a-- | | Int | |
@ -286,6 +288,9 @@ Logical operation could be used the same
singleton object, like `null`, are referentially equal too, while different string literals, even when equal, are most
likely not referentially equal
(3)
: Implemented now in String and Regex as regular expression match and not match, see [Regex].
Reference equality and object equality example:
assert( null == null) // singletons
@ -1285,9 +1290,26 @@ Open-ended ranges could be used to get start and end too:
assertEquals( "pult", "catapult"[ 4.. ])
>>> void
### String operations
Concatenation is a `+`: `"hello " + name` works as expected. No confusion.
Concatenation is a `+`: `"hello " + name` works as expected. No confusion. There is also
[Regex] support for strings, see the link, for example, whole string match:
assert( !"123".matches( "\d\d".re ) )
assert( "123".matches( "\d\d\d".re ) )
>>> void
Extraction:
"abcd42def"[ "\d+".re ].value
>>> "42"
Part match:
assert( "abc foo def" =~ "f[oO]+".re )
assert( "foo" == $~.value )
>>> void
Typical set of String functions includes:
@ -1305,17 +1327,24 @@ Typical set of String functions includes:
| size | size in characters like `length` because String is [Array] |
| (args...) | sprintf-like formatting, see [string formatting] |
| [index] | character at index |
| [Range] | substring at range |
| [Range] | substring at range (2) |
| [Regex] | find first match of regex, like [Regex.find] (2) |
| s1 + s2 | concatenation |
| s1 += s2 | self-modifying concatenation |
| toReal() | attempts to parse string as a Real value |
| toInt() | parse string to Int value |
| characters() | create [List] of characters (1) |
| encodeUtf8() | returns [Buffer] with characters encoded to utf8 |
| matches(re) | matches the regular expression (2) |
| | |
(1)
: List is mutable therefore a new copy is created on each call.
(2)
: See [Regex]
### Literals
String literal could be multiline:
@ -1390,4 +1419,6 @@ See [math functions](math.md). Other general purpose functions are:
[Collection]: Collection.md
[Array]: Array.md
[Array]: Array.md
[Regex]: Regex.md

View File

@ -21,7 +21,7 @@ import org.jetbrains.kotlin.gradle.ExperimentalWasmDsl
import org.jetbrains.kotlin.gradle.dsl.JvmTarget
group = "net.sergeych"
version = "0.8.15-SNAPSHOT"
version = "0.9.0-SNAPSHOT"
buildscript {
repositories {

View File

@ -1000,6 +1000,7 @@ class Compiler(
// condition could be a value, in and is clauses:
// parse several conditions for one then clause
// loop cases
outer@ while (true) {
@ -1466,7 +1467,7 @@ class Compiler(
}
}
} else {
for (i in start ..< end) {
for (i in start..<end) {
iVar.value = i
result = body.execute(forScope)
}
@ -2007,6 +2008,8 @@ class Compiler(
Operator.simple(Token.Type.NEQ, lastPriority) { c, a, b -> ObjBool(a.compareTo(c, b) != 0) },
Operator.simple(Token.Type.REF_EQ, lastPriority) { _, a, b -> ObjBool(a === b) },
Operator.simple(Token.Type.REF_NEQ, lastPriority) { _, a, b -> ObjBool(a !== b) },
Operator.simple(Token.Type.MATCH, lastPriority) { s, a, b -> a.operatorMatch(s,b) },
Operator.simple(Token.Type.NOTMATCH, lastPriority) { s, a, b -> a.operatorNotMatch(s,b) },
// relational <=,... 5
Operator.simple(Token.Type.LTE, ++lastPriority) { c, a, b -> ObjBool(a.compareTo(c, b) <= 0) },
Operator.simple(Token.Type.LT, lastPriority) { c, a, b -> ObjBool(a.compareTo(c, b) < 0) },

View File

@ -20,10 +20,10 @@ package net.sergeych.lyng
val digitsSet = ('0'..'9').toSet()
val digits = { d: Char -> d in digitsSet }
val hexDigits = digitsSet + ('a'..'f') + ('A'..'F')
val idNextChars = { d: Char -> d.isLetter() || d == '_' || d.isDigit() }
val idNextChars = { d: Char -> d.isLetter() || d == '_' || d.isDigit() || d == '$' || d == '~' }
@Suppress("unused")
val idFirstChars = { d: Char -> d.isLetter() || d == '_' }
val idFirstChars = { d: Char -> d.isLetter() || d == '_' || d == '$' }
fun parseLyng(source: Source): List<Token> {
val p = Parser(fromPos = source.startPos)
@ -67,13 +67,16 @@ private class Parser(fromPos: Pos) {
pos.advance()
Token("===", from, Token.Type.REF_EQ)
}
else -> Token("==", from, Token.Type.EQ)
}
} else if( currentChar == '>' ) {
} else if (currentChar == '>') {
pos.advance()
Token("=>", from, Token.Type.EQARROW)
}
else
} else if (currentChar == '~') {
pos.advance()
Token("=~", from, Token.Type.MATCH)
} else
Token("=", from, Token.Type.ASSIGN)
}
@ -227,6 +230,9 @@ private class Parser(fromPos: Pos) {
Token("!==", from, Token.Type.REF_NEQ)
} else
Token("!=", from, Token.Type.NEQ)
} else if (currentChar == '~') {
pos.advance()
Token("!~", from, Token.Type.NOTMATCH)
} else
Token("!", from, Token.Type.NOT)
}
@ -267,7 +273,7 @@ private class Parser(fromPos: Pos) {
in digitsSet -> {
pos.back()
decodeNumber(loadChars { it in digitsSet || it == '_'}, from)
decodeNumber(loadChars { it in digitsSet || it == '_' }, from)
}
'\'' -> {
@ -291,7 +297,7 @@ private class Parser(fromPos: Pos) {
}
'?' -> {
when(currentChar.also { pos.advance() }) {
when (currentChar.also { pos.advance() }) {
':' -> Token("??", from, Token.Type.ELVIS)
'?' -> Token("??", from, Token.Type.ELVIS)
'.' -> Token("?.", from, Token.Type.NULL_COALESCE)
@ -310,7 +316,7 @@ private class Parser(fromPos: Pos) {
// Labels processing is complicated!
// some@ statement: label 'some', ID 'statement'
// statement@some: ID 'statement', LABEL 'some'!
if (ch.isLetter() || ch == '_') {
if (idNextChars(ch)) {
val text = ch + loadChars(idNextChars)
if (currentChar == '@') {
pos.advance()
@ -395,25 +401,24 @@ private class Parser(fromPos: Pos) {
private fun fixMultilineStringLiteral(source: String): String {
val sizes = mutableListOf<Int>()
val lines = source.lines().toMutableList()
if( lines.size == 0 ) return ""
if( lines[0].isBlank() ) lines.removeFirst()
if( lines.isEmpty()) return ""
if( lines.last().isBlank() ) lines.removeLast()
if (lines.size == 0) return ""
if (lines[0].isBlank()) lines.removeFirst()
if (lines.isEmpty()) return ""
if (lines.last().isBlank()) lines.removeLast()
val normalized = lines.map { l ->
if( l.isBlank() ) {
if (l.isBlank()) {
sizes.add(-1)
""
}
else {
} else {
val margin = leftMargin(l)
sizes += margin
" ".repeat(margin) + l.trim()
}
}
val commonMargin = sizes.filter { it >= 0 }.min()
val fixed = if( commonMargin < 1 ) lines else normalized.map {
if( it.isBlank() ) "" else it.drop(commonMargin)
val fixed = if (commonMargin < 1) lines else normalized.map {
if (it.isBlank()) "" else it.drop(commonMargin)
}
return fixed.joinToString("\n")
}
@ -433,11 +438,26 @@ private class Parser(fromPos: Pos) {
'\\' -> {
pos.advance() ?: raise("unterminated string")
when (currentChar) {
'n' -> {sb.append('\n'); pos.advance()}
'r' -> {sb.append('\r'); pos.advance()}
't' -> {sb.append('\t'); pos.advance()}
'"' -> {sb.append('"'); pos.advance()}
'\\' -> {sb.append('\\'); pos.advance()}
'n' -> {
sb.append('\n'); pos.advance()
}
'r' -> {
sb.append('\r'); pos.advance()
}
't' -> {
sb.append('\t'); pos.advance()
}
'"' -> {
sb.append('"'); pos.advance()
}
'\\' -> {
sb.append('\\'); pos.advance()
}
else -> {
sb.append('\\').append(currentChar)
pos.advance()
@ -445,7 +465,7 @@ private class Parser(fromPos: Pos) {
}
}
'\n', '\r'-> {
'\n', '\r' -> {
newlineDetected = true
sb.append(currentChar)
pos.advance()
@ -459,7 +479,7 @@ private class Parser(fromPos: Pos) {
}
pos.advance()
val result = sb.toString().let { if( newlineDetected ) fixMultilineStringLiteral(it) else it }
val result = sb.toString().let { if (newlineDetected) fixMultilineStringLiteral(it) else it }
return Token(result, start, Token.Type.STRING)
}
@ -538,7 +558,7 @@ private class Parser(fromPos: Pos) {
init {
// skip shebang
if( pos.readFragment("#!") )
if (pos.readFragment("#!"))
loadToEndOfLine()
}

View File

@ -37,7 +37,7 @@ data class Token(val value: String, val pos: Pos, val type: Type) {
ASSIGN, PLUSASSIGN, MINUSASSIGN, STARASSIGN, SLASHASSIGN, PERCENTASSIGN,
PLUS2, MINUS2,
IN, NOTIN, IS, NOTIS,
EQ, NEQ, LT, LTE, GT, GTE, REF_EQ, REF_NEQ,
EQ, NEQ, LT, LTE, GT, GTE, REF_EQ, REF_NEQ, MATCH, NOTMATCH,
SHUTTLE,
AND, BITAND, OR, BITOR, NOT, BITNOT, DOT, ARROW, EQARROW, QUESTION, COLONCOLON,
SINLGE_LINE_COMMENT, MULTILINE_COMMENT,

View File

@ -178,6 +178,14 @@ open class Obj {
scope.raiseNotImplemented()
}
/**
 * Hook for the match operator (`=~`), invoked on the left operand with the right operand
 * passed as [other].
 *
 * The base implementation provides no matching: it only calls `scope.raiseNotImplemented()`.
 * Classes that support matching override it.
 */
open suspend fun operatorMatch(scope: Scope, other: Obj): Obj = scope.raiseNotImplemented()
/**
 * Hook for the not-match operator (`!~`): evaluates [operatorMatch] with the same arguments
 * and returns the logical negation of its result.
 */
open suspend fun operatorNotMatch(scope: Scope, other: Obj): Obj {
    val matched = operatorMatch(scope, other)
    return matched.logicalNot(scope)
}
open suspend fun assign(scope: Scope, other: Obj): Obj? = null
open fun getValue(scope: Scope) = this
@ -301,6 +309,18 @@ open class Obj {
return scope
}
/**
 * Returns this object as [R]. When the conversion is not possible, reports a class cast error
 * through [scope], naming both the actual and the requested class.
 */
inline fun <reified R : Obj> cast(scope: Scope): R =
    castOrNull<R>()
        ?: scope.raiseClassCastError("can't cast ${this::class.simpleName} to ${R::class.simpleName}")
/**
 * Returns this object as [R] when it is an instance of that Kotlin type, otherwise `null`.
 */
inline fun <reified R : Obj> castOrNull(): R? {
    // todo: check for subclasses
    return this as? R
}
companion object {
val rootObjectType = ObjClass("Obj").apply {

View File

@ -22,11 +22,20 @@ import net.sergeych.lyng.Scope
class ObjRegex(val regex: Regex) : Obj() {
override val objClass = type
/**
 * Match operator for a regex on the left side: [other] must be an [ObjString], and the
 * regex is searched for anywhere inside it (partial match, not whole-string match).
 *
 * On success the match is stored in [scope] under the name `$~` and [ObjTrue] is returned.
 * On failure [ObjFalse] is returned and nothing is written to the scope.
 */
override suspend fun operatorMatch(scope: Scope, other: Obj): Obj {
    val subject = other.cast<ObjString>(scope).value
    val found = regex.find(subject) ?: return ObjFalse
    scope.addConst("$~", ObjRegexMatch(found))
    return ObjTrue
}
/**
 * Finds the first occurrence of this regex in [s].
 *
 * @return an [ObjRegexMatch] wrapping the first match, or [ObjNull] when there is none
 */
fun find(s: ObjString): Obj {
    val first = regex.find(s.value) ?: return ObjNull
    return ObjRegexMatch(first)
}
companion object {
val type by lazy {
object : ObjClass("Regex") {
override suspend fun callOn(scope: Scope): Obj {
println(scope.requireOnlyArg<ObjString>().value)
return ObjRegex(
scope.requireOnlyArg<ObjString>().value.toRegex()
)
@ -36,8 +45,7 @@ class ObjRegex(val regex: Regex) : Obj() {
ObjBool(args.firstAndOnly().toString().matches(thisAs<ObjRegex>().regex))
}
addFn("find") {
val s = requireOnlyArg<ObjString>().value
thisAs<ObjRegex>().regex.find(s)?.let { ObjRegexMatch(it) } ?: ObjNull
thisAs<ObjRegex>().find(requireOnlyArg<ObjString>())
}
addFn("findAll") {
val s = requireOnlyArg<ObjString>().value
@ -61,6 +69,7 @@ class ObjRegexMatch(val match: MatchResult) : Obj() {
val objRange: ObjRange by lazy {
val r = match.range
ObjRange(
ObjInt(r.first.toLong()),
ObjInt(r.last.toLong()),
@ -68,6 +77,19 @@ class ObjRegexMatch(val match: MatchResult) : Obj() {
)
}
/**
 * Builds the string form of the match: the class name followed by the match range and
 * the group list, each rendered with its own `toString(scope)`.
 */
override suspend fun toString(scope: Scope, calledFromLyng: Boolean): ObjString {
    // The label must agree with the registered class name "RegexMatch"; it used to be
    // misspelled as "RegexMath".
    return ObjString("RegexMatch(${objRange.toString(scope)},${objGroups.toString(scope)})")
}
/**
 * Indexing a match is delegated to its group list, so `match[n]` yields whatever
 * `objGroups` holds at [index].
 */
override suspend fun getAt(scope: Scope, index: Obj): Obj = objGroups.getAt(scope, index)
/**
 * Identity-only comparison: returns 0 when [other] is this very instance and -2 for
 * anything else.
 */
// NOTE(review): -2 is presumably the project-wide "not comparable" result — confirm.
override suspend fun compareTo(scope: Scope, other: Obj): Int =
    if (other === this) 0 else -2
companion object {
val type by lazy {
object : ObjClass("RegexMatch") {

View File

@ -56,16 +56,23 @@ data class ObjString(val value: String) : Obj() {
}
override suspend fun getAt(scope: Scope, index: Obj): Obj {
if (index is ObjInt) return ObjChar(value[index.toInt()])
if (index is ObjRange) {
val start = if (index.start == null || index.start.isNull) 0 else index.start.toInt()
val end = if (index.end == null || index.end.isNull) value.length else {
val e = index.end.toInt()
if (index.isEndInclusive) e + 1 else e
when (index) {
is ObjInt -> return ObjChar(value[index.toInt()])
is ObjRange -> {
val start = if (index.start == null || index.start.isNull) 0 else index.start.toInt()
val end = if (index.end == null || index.end.isNull) value.length else {
val e = index.end.toInt()
if (index.isEndInclusive) e + 1 else e
}
return ObjString(value.substring(start, end))
}
return ObjString(value.substring(start, end))
is ObjRegex -> {
return index.find(this)
}
else -> scope.raiseIllegalArgument("String index must be Int, Regex or Range")
}
scope.raiseIllegalArgument("String index must be Int or Range")
}
override fun hashCode(): Int {
@ -96,6 +103,11 @@ data class ObjString(val value: String) : Obj() {
return value == other.value
}
/**
 * Match operator for a string on the left side: [other] must be an [ObjRegex], and the
 * work is handed over to that regex's own `operatorMatch` with this string as the subject.
 */
override suspend fun operatorMatch(scope: Scope, other: Obj): Obj =
    other.cast<ObjRegex>(scope).operatorMatch(scope, this)
override suspend fun lynonType(): LynonType = LynonType.String
override suspend fun serialize(scope: Scope, encoder: LynonEncoder, lynonType: LynonType?) {
@ -108,8 +120,9 @@ data class ObjString(val value: String) : Obj() {
ObjString(decoder.unpackBinaryData().decodeToString())
}.apply {
addFn("toInt") {
ObjInt(thisAs<ObjString>().value.toLongOrNull()
?: raiseIllegalArgument("can't convert to int: $thisObj")
ObjInt(
thisAs<ObjString>().value.toLongOrNull()
?: raiseIllegalArgument("can't convert to int: $thisObj")
)
}
addFn("startsWith") {
@ -160,6 +173,22 @@ data class ObjString(val value: String) : Obj() {
addFn("trim") {
thisAs<ObjString>().value.trim().let(::ObjString)
}
// `matches(pattern)`: true when the whole string matches the pattern. The pattern may be
// a Regex object or a String holding the pattern source; anything else is rejected as an
// illegal argument.
addFn("matches") {
    val pattern = requireOnlyArg<Obj>()
    val subject = thisAs<ObjString>().value
    val matched = when (pattern) {
        is ObjRegex -> subject.matches(pattern.regex)
        // The literal ".*" is answered without compiling a regex.
        // NOTE(review): this shortcut also answers true for subjects containing line
        // breaks, which a compiled ".*" would presumably reject — confirm this is intended.
        is ObjString -> pattern.value == ".*" || subject.matches(pattern.value.toRegex())
        else ->
            raiseIllegalArgument("can't match ${pattern.objClass.className}: required Regex or String")
    }
    ObjBool(matched)
}
}
}
}

View File

@ -2242,6 +2242,45 @@ class ScriptTest {
)
}
/** The lexer must read the special variable `$~` as one identifier token. */
@Test
fun testParseSpecialVars() {
    val tokens = parseLyng("$~".toSource("test$~"))
    println(tokens)
    val first = tokens[0]
    assertEquals(Token.Type.ID, first.type)
    assertEquals("$~", first.value)
}
// Exercises the match operators from a Lyng script: whole-string `matches` with a String
// pattern, `=~` with the regex on either side, `!~`, and the groups exposed through `$~`
// after a successful match ([0] is the whole match, [1]..[3] are the captured digits).
@Test
fun testMatchOperator() = runTest {
eval("""
assert( "abc123".matches(".*\d{3}") )
assert( ".*\d{3}".re =~ "abc123" )
assert( "abc123" =~ ".*\d{3}".re )
assert( "abc123" !~ ".*\d{4}".re )
"abc123" =~ ".*(\d)(\d)(\d)$".re
println($~)
assertEquals("1", $~[1])
assertEquals("2", $~[2])
assertEquals("3", $~[3])
assertEquals("abc123", $~[0])
""".trimIndent())
}
// @Test
// fun testWhenMatch() = runTest {
// eval(
// """
// when("abc123") {
// ".*(\d)(\d)(\d)".re -> { x ->
// assertEquals("123", x[0])
// }
// else -> assert(false)
// }
// """.trimIndent()
// )
// }
@Test
fun testWhenSample1() = runTest {
eval(
@ -3247,7 +3286,7 @@ class ScriptTest {
}
// @Test
// @Test
fun testMinimumOptimization() = runTest {
val x = Scope().eval(
"""

View File

@ -332,4 +332,8 @@ class BookTest {
runDocTests("../docs/Array.md")
}
// Passes the regular-expression guide, docs/Regex.md, to runDocTests.
@Test
fun testRegex() = runBlocking {
runDocTests("../docs/Regex.md")
}
}