From 70f63b12612e3a3bbeeeb2e29ee772d65b961b8e Mon Sep 17 00:00:00 2001 From: Lexy Plt Date: Tue, 31 Mar 2026 18:49:56 +0200 Subject: [PATCH] fix(builtins): ARK-350, string:ord need to check it get only one utf8 char ; add string:utf8len --- CHANGELOG.md | 2 + include/Ark/Builtins/Builtins.hpp | 1 + include/utf8.hpp | 81 ++++++++++++------- lib/std | 2 +- src/arkreactor/Builtins/Builtins.cpp | 1 + src/arkreactor/Builtins/String.cpp | 20 ++++- .../ir/operators_as_builtins.expected | 2 +- .../optimized_ir/builtins.expected | 2 +- .../runtime/string_ord_empty_str.ark | 1 + .../runtime/string_ord_empty_str.expected | 6 ++ .../runtime/string_ord_str_too_long.ark | 1 + .../runtime/string_ord_str_too_long.expected | 6 ++ .../typeChecking/utf8len_num_num.ark | 1 + .../typeChecking/utf8len_num_num.expected | 13 +++ 14 files changed, 106 insertions(+), 33 deletions(-) create mode 100644 tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark create mode 100644 tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.expected create mode 100644 tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark create mode 100644 tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.expected create mode 100644 tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark create mode 100644 tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.expected diff --git a/CHANGELOG.md b/CHANGELOG.md index 88314f202..d3d2a5db9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,11 +12,13 @@ - `append`, `concat`, and `pop` can be used as values - new `ptr` command for the debugger, printing the VM pointers (ip, pp, sp) - compile time arity check when performing a tail call +- `string:utf8len` to compute the number of codepoints in a string ### Changed - all paths inside `if` should return a value, when used as an expression. 
/**
 * @brief Count the number of UTF8 codepoints in a null-terminated string
 * @details The string is walked one encoded sequence at a time. Counting stops
 *          at the terminator, or at the first malformed sequence (invalid lead
 *          byte, stray continuation byte, or a multi-byte sequence that is
 *          truncated or whose trailing bytes are not of the form 10xxxxxx).
 *          Only the well-formed sequences seen before that point are counted.
 * @param str null-terminated string, may be nullptr
 * @return the number of well-formed UTF8 sequences found, 0 if str is nullptr or empty
 */
inline std::size_t length(const char* str)
{
    if (str == nullptr)
        return 0;

    std::size_t count = 0;
    const char* s = str;

    while (*s != 0)
    {
        // work on the unsigned value so that the masks behave the same whether char is signed or not
        const auto lead = static_cast<unsigned char>(*s);
        std::size_t width = 0;

        if ((lead & 0x80) == 0x00)
            width = 1;
        else if ((lead & 0xe0) == 0xc0)
            width = 2;
        else if ((lead & 0xf0) == 0xe0)
            width = 3;
        else if ((lead & 0xf8) == 0xf0)
            width = 4;
        else
            break;  // stray continuation byte or invalid lead byte

        // Each trailing byte must look like 10xxxxxx. The terminator (0x00) fails this
        // check, and bytes are inspected in order, so a truncated sequence is detected
        // before anything past the end of the string is read.
        bool well_formed = true;
        for (std::size_t i = 1; i < width; ++i)
        {
            if ((static_cast<unsigned char>(s[i]) & 0xc0) != 0x80)
            {
                well_formed = false;
                break;
            }
        }
        if (!well_formed)
            break;

        ++count;
        s += width;
    }

    return count;
}
(0xf0 & *s)) - { - codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]); - s += 3; - } - else if (0xc0 == (0xe0 & *s)) - { - codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]); - s += 2; - } - else if (0x00 == (0x80 & *s)) - { - codepoint = s[0]; - ++s; - } - else - return -1; - } - } + int32_t c = 0; - return codepoint; + if (0xf0 == (0xf8 & *s)) + c = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]); + else if (0xe0 == (0xf0 & *s)) + c = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]); + else if (0xc0 == (0xe0 & *s)) + c = ((0x1f & s[0]) << 6) | (0x3f & s[1]); + else if (0x00 == (0x80 & *s)) + c = s[0]; + else + return -1; + return c; + } + return -1; } /** diff --git a/lib/std b/lib/std index 7eb3f2a3b..0faef4071 160000 --- a/lib/std +++ b/lib/std @@ -1 +1 @@ -Subproject commit 7eb3f2a3bd6e6561e2e213504ee5168a9407b059 +Subproject commit 0faef4071c91b62d84cfca0629e306e6c95da8b3 diff --git a/src/arkreactor/Builtins/Builtins.cpp b/src/arkreactor/Builtins/Builtins.cpp index 5e8dd6c0c..4d499632b 100644 --- a/src/arkreactor/Builtins/Builtins.cpp +++ b/src/arkreactor/Builtins/Builtins.cpp @@ -63,6 +63,7 @@ namespace Ark::internal::Builtins { "format", Value(String::format) }, { "builtin__string:find", Value(String::findSubStr) }, { "builtin__string:removeAt", Value(String::removeAtStr) }, + { "builtin__string:utf8len", Value(String::utf8len) }, { "builtin__string:ord", Value(String::ord) }, { "builtin__string:chr", Value(String::chr) }, { "builtin__string:setAt", Value(String::setStringAt) }, diff --git a/src/arkreactor/Builtins/String.cpp b/src/arkreactor/Builtins/String.cpp index 352b2ac3a..f3d1b2bc3 100644 --- a/src/arkreactor/Builtins/String.cpp +++ b/src/arkreactor/Builtins/String.cpp @@ -288,6 +288,18 @@ namespace Ark::internal::Builtins::String throw std::runtime_error(fmt::format("string:removeAt: index {} out of range (length: {})", num, n[0].stringRef().size())); } + Value utf8len(std::vector& 
n, VM* vm [[maybe_unused]]) + { + if (!types::check(n, ValueType::String)) + throw types::TypeCheckingError( + "string:utf8len", + { { types::Contract { { types::Typedef("string", ValueType::String) } } } }, + n); + + const std::size_t len = utf8::length(n[0].stringRef().c_str()); + return Value(static_cast(len)); + } + Value ord(std::vector& n, VM* vm [[maybe_unused]]) { if (!types::check(n, ValueType::String)) @@ -296,7 +308,13 @@ namespace Ark::internal::Builtins::String { { types::Contract { { types::Typedef("string", ValueType::String) } } } }, n); - return Value(utf8::codepoint(n[0].stringRef().c_str())); + if (const std::size_t len = utf8::length(n[0].stringRef().c_str()); len != 1) + throw std::runtime_error(fmt::format("string:ord: invalid string '{}', expected a single character, got {}", n[0].string(), len)); + + const int32_t codepoint = utf8::codepoint(n[0].stringRef().c_str()); + if (codepoint == -1) + throw std::runtime_error(fmt::format("string:ord: invalid string '{}'", n[0].string())); + return Value(codepoint); } // cppcheck-suppress constParameterReference diff --git a/tests/unittests/resources/CompilerSuite/ir/operators_as_builtins.expected b/tests/unittests/resources/CompilerSuite/ir/operators_as_builtins.expected index e0c95fe03..0377c5568 100644 --- a/tests/unittests/resources/CompilerSuite/ir/operators_as_builtins.expected +++ b/tests/unittests/resources/CompilerSuite/ir/operators_as_builtins.expected @@ -1,6 +1,5 @@ page_0 PUSH_RETURN_ADDRESS L0 - BUILTIN 69 BUILTIN 70 BUILTIN 71 BUILTIN 72 @@ -25,6 +24,7 @@ page_0 BUILTIN 91 BUILTIN 92 BUILTIN 93 + BUILTIN 94 CALL_BUILTIN 9, 25 .L0: POP 0 diff --git a/tests/unittests/resources/CompilerSuite/optimized_ir/builtins.expected b/tests/unittests/resources/CompilerSuite/optimized_ir/builtins.expected index a8c4a2277..766179fe6 100644 --- a/tests/unittests/resources/CompilerSuite/optimized_ir/builtins.expected +++ b/tests/unittests/resources/CompilerSuite/optimized_ir/builtins.expected @@ -5,7 
+5,7 @@ page_0 HALT 0 page_1 - CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 55, 1 + CALL_BUILTIN_WITHOUT_RETURN_ADDRESS 56, 1 .L0: RET 0 HALT 0 diff --git a/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark new file mode 100644 index 000000000..c9ebe3a0c --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark @@ -0,0 +1 @@ +(print (builtin__string:ord "")) diff --git a/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.expected b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.expected new file mode 100644 index 000000000..0f3f97d7c --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.expected @@ -0,0 +1,6 @@ +string:ord: invalid string '', expected a single character, got 0 + +In file tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_empty_str.ark:1 + 1 | (print (builtin__string:ord "")) + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 2 | diff --git a/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark new file mode 100644 index 000000000..d21ccaffa --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark @@ -0,0 +1 @@ +(print (builtin__string:ord "abc")) diff --git a/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.expected b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.expected new file mode 100644 index 000000000..06fd44117 --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.expected @@ -0,0 +1,6 @@ +string:ord: invalid string 'abc', expected a single character, got 3 + +In file tests/unittests/resources/DiagnosticsSuite/runtime/string_ord_str_too_long.ark:1 + 1 | (print (builtin__string:ord "abc")) 
+ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 2 | diff --git a/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark b/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark new file mode 100644 index 000000000..fd0da9ea4 --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark @@ -0,0 +1 @@ +(print (builtin__string:utf8len 1 2)) diff --git a/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.expected b/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.expected new file mode 100644 index 000000000..bb7f781d8 --- /dev/null +++ b/tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.expected @@ -0,0 +1,13 @@ +Function string:utf8len expected 1 argument but got 2 +Call + ↳ (string:utf8len 1 2) +Signature + ↳ (string:utf8len string) +Arguments + → `string' (expected String), got 1 (Number) + → unexpected additional args: 2 (Number) + +In file tests/unittests/resources/DiagnosticsSuite/typeChecking/utf8len_num_num.ark:1 + 1 | (print (builtin__string:utf8len 1 2)) + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 2 |