# HG changeset patch
# User Sebastien Jodogne
# Date 1552667871 -3600
# Node ID b32b7c44a2232ce6148f85d791704d19ffda0e4c
# Parent  e54ca78059bdc4559b39c4a6c5791ba0b4db6f10
Toolbox::Utf8ToUnicodeCharacter()

diff -r e54ca78059bd -r b32b7c44a223 Core/Toolbox.cpp
--- a/Core/Toolbox.cpp	Wed Mar 06 12:23:14 2019 +0100
+++ b/Core/Toolbox.cpp	Fri Mar 15 17:37:51 2019 +0100
@@ -1789,6 +1789,93 @@
       }
     }
   }
+
+
+  // Decodes the single UTF-8 character that starts at byte offset
+  // "position" of "utf8". On success, "unicode" receives the code
+  // point and "utf8Length" the number of bytes (1 to 4) of its
+  // encoding. Throws ErrorCode_ParameterOutOfRange if "position" is
+  // outside the string, and ErrorCode_BadFileFormat if the bytes at
+  // "position" are not a lead byte followed by the expected number
+  // of continuation bytes (this includes truncated sequences).
+  // Overlong encodings, surrogates and values above U+10FFFF are
+  // not rejected.
+  void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode,
+                                       size_t& utf8Length,
+                                       const std::string& utf8,
+                                       size_t position)
+  {
+    // https://en.wikipedia.org/wiki/UTF-8
+
+    static const uint8_t MASK_IS_1_BYTE = 0x80;    // printf '0x%x\n' "$((2#10000000))"
+    static const uint8_t TEST_IS_1_BYTE = 0x00;
+
+    static const uint8_t MASK_IS_2_BYTES = 0xe0;   // printf '0x%x\n' "$((2#11100000))"
+    static const uint8_t TEST_IS_2_BYTES = 0xc0;   // printf '0x%x\n' "$((2#11000000))"
+
+    static const uint8_t MASK_IS_3_BYTES = 0xf0;   // printf '0x%x\n' "$((2#11110000))"
+    static const uint8_t TEST_IS_3_BYTES = 0xe0;   // printf '0x%x\n' "$((2#11100000))"
+
+    static const uint8_t MASK_IS_4_BYTES = 0xf8;   // printf '0x%x\n' "$((2#11111000))"
+    static const uint8_t TEST_IS_4_BYTES = 0xf0;   // printf '0x%x\n' "$((2#11110000))"
+
+    static const uint8_t MASK_CONTINUATION = 0xc0; // printf '0x%x\n' "$((2#11000000))"
+    static const uint8_t TEST_CONTINUATION = 0x80; // printf '0x%x\n' "$((2#10000000))"
+
+    if (position >= utf8.size())
+    {
+      throw OrthancException(ErrorCode_ParameterOutOfRange);
+    }
+
+    // The character to decode starts at "position", not at the
+    // beginning of the string
+    const uint8_t* buffer =
+      reinterpret_cast<const uint8_t*>(utf8.c_str()) + position;
+
+    if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE)
+    {
+      utf8Length = 1;
+      unicode = buffer[0] & ~MASK_IS_1_BYTE;
+    }
+    else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES &&
+             position + 1 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      utf8Length = 2;
+      uint32_t a = buffer[0] & ~MASK_IS_2_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      unicode = (a << 6) | b;
+    }
+    else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES &&
+             position + 2 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      utf8Length = 3;
+      uint32_t a = buffer[0] & ~MASK_IS_3_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      uint32_t c = buffer[2] & ~MASK_CONTINUATION;
+      unicode = (a << 12) | (b << 6) | c;
+    }
+    else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES &&
+             position + 3 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      utf8Length = 4;
+      uint32_t a = buffer[0] & ~MASK_IS_4_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      uint32_t c = buffer[2] & ~MASK_CONTINUATION;
+      uint32_t d = buffer[3] & ~MASK_CONTINUATION;
+      unicode = (a << 18) | (b << 12) | (c << 6) | d;
+    }
+    else
+    {
+      // This is not a valid UTF-8 encoding
+      throw OrthancException(ErrorCode_BadFileFormat);
+    }
+  }
 }
 
 
diff -r e54ca78059bd -r b32b7c44a223 Core/Toolbox.h
--- a/Core/Toolbox.h	Wed Mar 06 12:23:14 2019 +0100
+++ b/Core/Toolbox.h	Fri Mar 15 17:37:51 2019 +0100
@@ -252,6 +252,11 @@
 
     void RemoveIso2022EscapeSequences(std::string& dest,
                                       const std::string& src);
+
+    void Utf8ToUnicodeCharacter(uint32_t& unicode,
+                                size_t& utf8Length,
+                                const std::string& utf8,
+                                size_t position);
   }
 }
 
diff -r e54ca78059bd -r b32b7c44a223 UnitTestsSources/UnitTestsMain.cpp
--- a/UnitTestsSources/UnitTestsMain.cpp	Wed Mar 06 12:23:14 2019 +0100
+++ b/UnitTestsSources/UnitTestsMain.cpp	Fri Mar 15 17:37:51 2019 +0100
@@ -482,6 +482,94 @@
 }
 
 
+// Decodes the UTF-8 character at the start of the "size" bytes of
+// "data". Returns its code point, or -1 if the number of decoded bytes
+// differs from "expectedLength".
+static int32_t GetUnicode(const uint8_t* data,
+                          size_t size,
+                          size_t expectedLength)
+{
+  const std::string s(reinterpret_cast<const char*>(data), size);
+  uint32_t unicode;
+  size_t length;
+  Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, 0);
+  if (length != expectedLength)
+  {
+    return -1;  // Error case
+  }
+  else
+  {
+    return static_cast<int32_t>(unicode);
+  }
+}
+
+
+TEST(Toolbox, Utf8ToUnicode)
+{
+  // https://en.wikipedia.org/wiki/UTF-8
+
+  ASSERT_EQ(1, sizeof(char));
+  ASSERT_EQ(1, sizeof(uint8_t));
+
+  {
+    const uint8_t data[] = { 0x24 };
+    ASSERT_EQ(0x24, GetUnicode(data, 1, 1));
+    ASSERT_THROW(GetUnicode(data, 0, 1), OrthancException);
+  }
+
+  {
+    const uint8_t data[] = { 0xc2, 0xa2 };
+    ASSERT_EQ(0xa2, GetUnicode(data, 2, 2));
+    ASSERT_THROW(GetUnicode(data, 1, 2), OrthancException);
+  }
+
+  {
+    const uint8_t data[] = { 0xe0, 0xa4, 0xb9 };
+    ASSERT_EQ(0x0939, GetUnicode(data, 3, 3));
+    ASSERT_THROW(GetUnicode(data, 2, 3), OrthancException);
+  }
+
+  {
+    const uint8_t data[] = { 0xe2, 0x82, 0xac };
+    ASSERT_EQ(0x20ac, GetUnicode(data, 3, 3));
+    ASSERT_THROW(GetUnicode(data, 2, 3), OrthancException);
+  }
+
+  {
+    const uint8_t data[] = { 0xf0, 0x90, 0x8d, 0x88 };
+    ASSERT_EQ(0x010348, GetUnicode(data, 4, 4));
+    ASSERT_THROW(GetUnicode(data, 3, 4), OrthancException);
+  }
+
+  {
+    const uint8_t data[] = { 0xe0 };
+    ASSERT_THROW(GetUnicode(data, 1, 1), OrthancException);
+  }
+
+  {
+    // Decoding must start at "position", not at the beginning of the string
+    const uint8_t data[] = { 0x24, 0xc2, 0xa2, 0xe2, 0x82, 0xac };
+    const std::string s(reinterpret_cast<const char*>(data), sizeof(data));
+    uint32_t unicode;
+    size_t length;
+
+    Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, 1);
+    ASSERT_EQ(0xa2u, unicode);
+    ASSERT_EQ(2u, length);
+
+    Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, 3);
+    ASSERT_EQ(0x20acu, unicode);
+    ASSERT_EQ(3u, length);
+
+    // A continuation byte cannot start a character
+    ASSERT_THROW(Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, 2), OrthancException);
+
+    // "position" must lie inside the string
+    ASSERT_THROW(Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, sizeof(data)), OrthancException);
+  }
+}
+
+
 TEST(Toolbox, UrlDecode)
 {
   std::string s;