diff --git a/PolyEngine/Core/CMakeLists.txt b/PolyEngine/Core/CMakeLists.txt index 70c11654..7e6b1e58 100644 --- a/PolyEngine/Core/CMakeLists.txt +++ b/PolyEngine/Core/CMakeLists.txt @@ -6,7 +6,10 @@ file(GLOB_RECURSE POLYCORE_SRCS RELATIVE ${CMAKE_CURRENT_LIST_DIR} ${POLYCORE_INCLUDE}/*.h) GenerateSourceGoups("${POLYCORE_SRCS}") +find_package(ICU 65.1 COMPONENTS uc i18n REQUIRED) + add_library(${CORE_TARGET} SHARED ${POLYCORE_SRCS}) +target_link_libraries(${CORE_TARGET} PUBLIC ICU::uc ICU::i18n) target_compile_options(${CORE_TARGET} PRIVATE $<$:${SIMD_FLAGS}>) target_compile_definitions(${CORE_TARGET} PRIVATE _CORE DISABLE_SIMD=$>) target_include_directories(${CORE_TARGET} PUBLIC ${POLYCORE_INCLUDE} PRIVATE ${RapidJSON_INCLUDE_DIRS}) diff --git a/PolyEngine/Core/Src/pe/core/CorePCH.hpp b/PolyEngine/Core/Src/pe/core/CorePCH.hpp index fe97973f..b40dc95b 100644 --- a/PolyEngine/Core/Src/pe/core/CorePCH.hpp +++ b/PolyEngine/Core/Src/pe/core/CorePCH.hpp @@ -7,4 +7,13 @@ SILENCE_GCC_WARNING(-Wclass-memaccess, "Rapidjson has no release containing fix #include #include #include -UNSILENCE_GCC_WARNING() \ No newline at end of file +UNSILENCE_GCC_WARNING() + +// ICU +//#include +#include +#include +//#include +//#include +#include +#include \ No newline at end of file diff --git a/PolyEngine/Core/Src/pe/core/storage/String.cpp b/PolyEngine/Core/Src/pe/core/storage/String.cpp index 4085aee8..7dd6fbf4 100644 --- a/PolyEngine/Core/Src/pe/core/storage/String.cpp +++ b/PolyEngine/Core/Src/pe/core/storage/String.cpp @@ -12,13 +12,50 @@ static const std::vector WHITESPACES { ' ', '\t', '\r', '\n', '\0' }; namespace pe::core::storage { -size_t StrLen(const char* str) { +size_t StrLen(const char* str) +{ size_t len = 0; while (str[len] != 0) ++len; return len; } +bool isValidASCIIString(const char* str) +{ + size_t pos = 0; + unsigned char c = 0; + while (c = str[pos], c != 0) + if (c > 0x7f) + return false; + else + ++pos; + return true; +} + +} + +String String::fromASCII(const char* data) // can still be invalid but better than nothing +{ + ASSERTE(isValidASCIIString(data), "Passed string is not valid ASCII, please use fromUTF8 factory method instead!"); + return String(data); +} + +String String::fromUTF8(const char* data) +{ + String ret{}; + const size_t len = StrLen(data); + UErrorCode success = UErrorCode::U_ZERO_ERROR; + icu::UnicodeString dst, src(data, len); + auto normalizer = icu::Normalizer2::getNFCInstance(success); + normalizer->normalize(src, dst, success); + ret.Data.reserve(dst.length()); + dst.extract(0, dst.length(), ret.Data.data(), dst.length()); + return ret; +} + +String String::fromCodePoint(const char* data) +{ + return String(); } String::String(const char* data) { @@ -86,6 +123,19 @@ String String::ToUpper() const return ret; } +String String::toASCII() const // C-api is very unwieldy for this one, copying and duplicating is unavoidable +{ + String ret{}; + ret.Data.reserve(Data.size()); + icu::UnicodeString str(GetCStr(), Data.size()); + UErrorCode success = UErrorCode::U_ZERO_ERROR; + UParseError parseError; + auto trans = icu::Transliterator::createInstance("Any-Latin; Latin-ASCII", UTRANS_FORWARD, parseError, success); + trans->transliterate(str); + str.extract(0, str.length(), ret.Data.data(), Data.size()); + return ret; +} + bool String::IsEmpty() const { return GetLength() == 0; } @@ -100,8 +150,8 @@ String String::Replace(char what, char with) const return ret; } -String String::Replace(const String& what, const String& with) const { - +String String::Replace(const String& what, const String& with) const +{ std::vector splitted = Split(what); return Join(splitted.data(), splitted.size(), with); } @@ -122,28 +172,28 @@ std::vector String::Split(const String& delimiter) const { return elements; } -String String::Join(const String* vars, size_t size, const String& separator) { - //TODO replace using stringbuilder - String s = String(""); - for (size_t i = 0; i < size; i++) { - s = s + vars[i]; - if (i != size - 1) { - s = s + separator; - } +String String::Join(const String* vars, size_t size, const String& separator) +{ + StringBuilder sb; + for (size_t i = 0; i < size; ++i) + { + sb.Append(vars[i]); + if (i != size - 1) + sb.Append(separator); } - return s; + return sb.StealString(); } -String String::Join(const String* vars, size_t size, char separator) { - //TODO replace using stringbuilder - String s = String(""); - for (size_t i = 0; i < size; i++) { - s = s + vars[i]; - if (i != size - 1) { - s = s + separator; - } +String String::Join(const String* vars, size_t size, char separator) +{ + StringBuilder sb; + for (size_t i = 0; i < size; ++i) + { + sb.Append(vars[i]); + if (i != size - 1) + sb.Append(separator); } - return s; + return sb.StealString(); } bool String::StartsWith(char var) const { @@ -202,7 +252,8 @@ String& String::operator=(String&& rhs) { return *this; } -bool String::operator==(const char* str) const { +bool String::CmpBytes(const char* str) const +{ if (GetLength() != StrLen(str)) return false; for (size_t k = 0; k < GetLength(); ++k) @@ -211,10 +262,25 @@ bool String::operator==(const char* str) const { return true; } -bool String::operator==(const String& str) const { +bool String::CmpBytes(const String& str) const +{ return Data == str.Data; } +bool String::operator==(const char* str) const +{ + UErrorCode success = U_ZERO_ERROR; + auto coll = icu::Collator::createInstance(success); + return coll->compareUTF8(Data.data(), str, success) == UCOL_EQUAL; +} + +bool String::operator==(const String& str) const +{ + UErrorCode success = U_ZERO_ERROR; + auto coll = icu::Collator::createInstance(success); + return coll->compareUTF8(Data.data(), str.Data.data(), success) == UCOL_EQUAL; +} + bool String::operator<(const String& rhs) const { if (GetLength() < rhs.GetLength()) return true; @@ -261,6 +327,11 @@ size_t String::GetLength() const return Data.size() - 1; } +size_t String::GetLogicalLength() const +{ + return 0; // TODO: count it on demand? +} + size_t String::FindSubstrFromPoint(size_t startPoint, const String& str) const { for (size_t idx = startPoint; idx < GetLength(); ++idx) diff --git a/PolyEngine/Core/Src/pe/core/storage/String.hpp b/PolyEngine/Core/Src/pe/core/storage/String.hpp index 4f83bee4..f9b300b1 100644 --- a/PolyEngine/Core/Src/pe/core/storage/String.hpp +++ b/PolyEngine/Core/Src/pe/core/storage/String.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include namespace pe::core::storage { @@ -12,6 +13,12 @@ namespace pe::core::storage public: static const String EMPTY; + static String fromASCII(const char* data); + + static String fromUTF8(const char* data); + + static String fromCodePoint(const char* data); + /// Basic String costructor that creates empty String String() : String("") {}; @@ -27,7 +34,6 @@ namespace pe::core::storage /// Reference to String instance which state should be moved String(String&& rhs); - /// Casts int to String /// Integer value which should be used to make String instance /// String containing integer value @@ -72,7 +78,6 @@ namespace pe::core::storage /// String containing given std::string static String From(const std::string& var); - /// Checks if String instance contains another String instance /// String reference which should be contained in another String instance bool Contains(const String& var) const; @@ -89,6 +94,8 @@ namespace pe::core::storage /// Upper-case String instance String ToUpper() const; + String toASCII() const; + /// Checks if String is empty bool IsEmpty() const; @@ -164,6 +171,10 @@ namespace pe::core::storage /// Moved String reference String& operator=(String&& rhs); + bool CmpBytes(const char* str) const; + + bool CmpBytes(const String& str) const; + /// Compares String with Cstring /// Cstring to be compared with bool operator==(const char* str) const; @@ -193,10 +204,85 @@ namespace pe::core::storage char operator[](size_t idx) const; size_t GetLength() const; + + size_t GetLogicalLength() const; + const char* GetCStr() const { return Data.data(); } friend std::ostream& operator<< (std::ostream& stream, const String& rhs) { return stream << rhs.GetCStr(); } + class StringIteratorMemory final : public BaseObjectLiteralType<> + { + friend class String; + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = char; + using difference_type = std::ptrdiff_t; + using pointer = char*; + using reference = char&; + + bool operator==(const StringIteratorMemory& rhs) const { return idx == rhs.idx; } + bool operator!=(const StringIteratorMemory& rhs) const { return idx != rhs.idx; } + + const char& operator*() const { return s->Data.at(idx); } + //const char* operator->() const { return s->Data.data() + idx * sizeof(char); } //are they even useful? + + StringIteratorMemory& operator++() { ++idx; return *this; } + StringIteratorMemory operator++(int) { StringIteratorMemory ret(s, idx); ++idx; return ret; } + StringIteratorMemory& operator--() { ASSERTE(idx > 0, "Index cannot be negative"); --idx; return *this; } + StringIteratorMemory operator--(int) { ASSERTE(idx > 0, "Index cannot be negative"); StringIteratorMemory ret(s, idx); --idx; return ret; } + private: + StringIteratorMemory(const String* string, size_t index) : s(string), idx(index) {}; + const String* s; + size_t idx; + }; + + class StringIteratorGlyph final : public BaseObjectLiteralType<>// add implementation from numeria + { + friend class String; + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = char; + using difference_type = std::ptrdiff_t; + using pointer = char*; + using reference = char&; + + bool operator==(const StringIteratorGlyph& rhs) const { return idx == rhs.idx; } + bool operator!=(const StringIteratorGlyph& rhs) const { return idx != rhs.idx; } + + const char& operator*() const { return s->Data.at(idx); } + //const char* operator->() const { return s->Data.data() + idx * sizeof(char); } + + StringIteratorGlyph& operator++() { ++idx; return *this; } + StringIteratorGlyph operator++(int) { StringIteratorGlyph ret(s, idx); ++idx; return ret; } + StringIteratorGlyph& operator--() { ASSERTE(idx > 0, "Index cannot be negative"); --idx; return *this; } + StringIteratorGlyph operator--(int) { ASSERTE(idx > 0, "Index cannot be negative"); StringIteratorGlyph ret(s, idx); --idx; return ret; } + private: + StringIteratorGlyph(const String* string, size_t index) : s(string), idx(index) {}; + const String* s; + size_t idx; + }; + + // FIXME: default begin and end are memory for now, please scrutinize this, const added as well, they are const anyway, should user only use const ones? disallow mutating? + StringIteratorMemory begin() { return StringIteratorMemory(this, 0); } + StringIteratorMemory end() { return StringIteratorMemory(this, Data.size()); } + StringIteratorMemory cbegin() const { return StringIteratorMemory(this, 0); } + StringIteratorMemory cend() const { return StringIteratorMemory(this, Data.size()); } + StringIteratorGlyph beginGlyph() { return StringIteratorGlyph(this, 0); } + StringIteratorGlyph endGlyph() { return StringIteratorGlyph(this, Data.size()); } + + //iteratememory + ::pe::core::utils::Range IterateMemory() + { + return ::pe::core::utils::Range(begin(), end()); + } + + // iterateglyph + ::pe::core::utils::Range IterateGlyphs() + { + return ::pe::core::utils::Range(beginGlyph(), endGlyph()); + } + private: String(std::vector rawData) : Data(std::move(rawData)) { Data.push_back('\0'); } diff --git a/PolyEngine/Tests/CoreTests/Src/StringTests.cpp b/PolyEngine/Tests/CoreTests/Src/StringTests.cpp index 1c166197..047d4eb2 100644 --- a/PolyEngine/Tests/CoreTests/Src/StringTests.cpp +++ b/PolyEngine/Tests/CoreTests/Src/StringTests.cpp @@ -54,6 +54,8 @@ TEST_CASE("String operations", "[String]") { ::pe::core::storage::String empty = ::pe::core::storage::String(""); REQUIRE(!test.IsEmpty()); REQUIRE(empty.IsEmpty()); + REQUIRE(!empty.StartsWith('?')); + REQUIRE(!empty.EndsWith('?')); ::pe::core::storage::String replace = ::pe::core::storage::String("@ALZ[aWWzD{"); ::pe::core::storage::String replaced = test.Replace('l', 'W').Replace('\'', 'D'); @@ -104,4 +106,121 @@ TEST_CASE("String operations", "[String]") { ::pe::core::storage::String notContainsTest = ::pe::core::storage::String("Z[allz'/"); REQUIRE(test.Contains(notContainsTest) == false); +} + +TEST_CASE("UTF-8 string normalization and collation tests", "[String]") +{ + ::pe::core::storage::String utf8Literal("śląsk"); + ::pe::core::storage::String uft8Escaped("\xC5\x9B\x6C\xC4\x85\x73\x6B"); + REQUIRE(utf8Literal == uft8Escaped); + + auto codepointUTF8One = ::pe::core::storage::String::fromUTF8("\u4eba\u53e3\u3058\u3093\u3053\u3046\u306b\u81be\u7099\u304b\u3044\u3057\u3083\u3059\u308b"); + auto codepointUTF8Two = ::pe::core::storage::String::fromUTF8("\u4eba\u53e3\u3058\u3093\u3053\u3046\u306b\u81be\uf9fb\u304b\u3044\u3057\u3083\u3059\u308b"); + REQUIRE(codepointUTF8One == codepointUTF8Two); + REQUIRE(codepointUTF8One.CmpBytes(codepointUTF8Two) == true); + + auto normalizedUTF8One = ::pe::core::storage::String::fromUTF8("\xC3\xA4\x00"); + auto normalizedUTF8Two = ::pe::core::storage::String::fromUTF8("\x61\xCC\x88\x00"); + REQUIRE(normalizedUTF8One == normalizedUTF8Two); + REQUIRE(normalizedUTF8One.CmpBytes(normalizedUTF8Two) == true); + + ::pe::core::storage::String regularStringOne("\xC3\xA4\x00"); // from regular string they will not have proper representation as bytes, no normalization + ::pe::core::storage::String regularStringTwo("\x61\xCC\x88\x00"); + REQUIRE(regularStringTwo == regularStringTwo); + REQUIRE(regularStringOne.CmpBytes(regularStringTwo) == false); + + //auto invalidASCII = ::pe::core::storage::String::fromASCII("\xC3\xA4\x00"); +} + +TEST_CASE("UTF-8 string conversion tests", "[String]") +{ + ::pe::core::storage::String utf8polish("śląsk"); + auto asciiConverted = utf8polish.toASCII(); + REQUIRE(asciiConverted == "slask"); + + ::pe::core::storage::String utf8capital("ŚPIĄC"); + asciiConverted = utf8capital.toASCII(); + REQUIRE(asciiConverted == "SPIAC"); + + ::pe::core::storage::String hangulTest("김, 국삼"); + asciiConverted = hangulTest.toASCII(); + REQUIRE(asciiConverted == "gim, gugsam"); // does not preserve capital letters + + ::pe::core::storage::String kanjiTest("たけだ, まさゆき"); + asciiConverted = kanjiTest.toASCII(); + REQUIRE(asciiConverted == "takeda, masayuki"); // does not preserve capital letters + + ::pe::core::storage::String cyrilicTest("Θεοδωράτου, Ελένη"); + asciiConverted = cyrilicTest.toASCII(); + REQUIRE(asciiConverted == "Theodoratou, Elene"); // this preserves it somehow +} + +TEST_CASE("STL iterator tests", "[String]") +{ + ::pe::core::storage::String s1("regular string"); + ::pe::core::storage::String s2; + + size_t i = 0; + for(auto c : s1) + { + UNUSED(c); + ++i; + } + REQUIRE(i == s1.GetLength() + 1);// +1 for the \0 + + i = 0; + for(auto c : s2) + { + UNUSED(c); + ++i; + } + REQUIRE(i == s2.GetLength() + 1); +} + +TEST_CASE("Memory iterator tests", "[String]") +{ + ::pe::core::storage::String s1("regular string"); + ::pe::core::storage::String s2; + + size_t i = 0; + for(auto c : s1.IterateMemory()) + { + UNUSED(c); + ++i; + } + REQUIRE(i == s1.GetLength() + 1); + + i = 0; + for(auto c : s2.IterateMemory()) + { + UNUSED(c); + ++i; + } + REQUIRE(i == s2.GetLength() + 1); + + auto it = s1.begin(); + auto it2 = s1.end(); + //--it; + ++it; + --it2; + REQUIRE(*it == 'e'); + REQUIRE(*it2 == '\0'); + it++; + REQUIRE(*it == 'g'); + --it; + REQUIRE(*it == 'e'); + it--; + REQUIRE(*it == 'r'); + it2 = s1.begin(); + REQUIRE(it == it2); + ++it2; + REQUIRE(it != it2); +} + +TEST_CASE("Glyph iterator tests", "[String]") +{ + //::pe::core::storage::String s1("regular string"); +// ::pe::core::storage::String s2(""); + + } \ No newline at end of file