Mercurial > hg > orthanc
changeset 3322:b32b7c44a223
Toolbox::Utf8ToUnicodeCharacter()
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Fri, 15 Mar 2019 17:37:51 +0100 |
parents | e54ca78059bd |
children | a15a4b9d8c00 |
files | Core/Toolbox.cpp Core/Toolbox.h UnitTestsSources/UnitTestsMain.cpp |
diffstat | 3 files changed, 143 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/Core/Toolbox.cpp Wed Mar 06 12:23:14 2019 +0100 +++ b/Core/Toolbox.cpp Fri Mar 15 17:37:51 2019 +0100 @@ -1789,6 +1789,81 @@ } } } + + + void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode, + size_t& length, + const std::string& utf8, + size_t position) + { + // https://en.wikipedia.org/wiki/UTF-8 + + static const uint8_t MASK_IS_1_BYTE = 0x80; // printf '0x%x\n' "$((2#10000000))" + static const uint8_t TEST_IS_1_BYTE = 0x00; + + static const uint8_t MASK_IS_2_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" + static const uint8_t TEST_IS_2_BYTES = 0xc0; // printf '0x%x\n' "$((2#11000000))" + + static const uint8_t MASK_IS_3_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" + static const uint8_t TEST_IS_3_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" + + static const uint8_t MASK_IS_4_BYTES = 0xf8; // printf '0x%x\n' "$((2#11111000))" + static const uint8_t TEST_IS_4_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" + + static const uint8_t MASK_CONTINUATION = 0xc0; // printf '0x%x\n' "$((2#11000000))" + static const uint8_t TEST_CONTINUATION = 0x80; // printf '0x%x\n' "$((2#10000000))" + + if (position >= utf8.size()) + { + throw OrthancException(ErrorCode_ParameterOutOfRange); + } + + const uint8_t* buffer = reinterpret_cast<const uint8_t*>(utf8.c_str()); + + if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE) + { + length = 1; + unicode = buffer[0] & ~MASK_IS_1_BYTE; + } + else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES && + position + 1 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 2; + uint32_t a = buffer[0] & ~MASK_IS_2_BYTES; + uint32_t b = buffer[1] & ~MASK_CONTINUATION; + unicode = (a << 6) | b; + } + else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES && + position + 2 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 3; + uint32_t a = buffer[0] & ~MASK_IS_3_BYTES; + uint32_t b = 
buffer[1] & ~MASK_CONTINUATION; + uint32_t c = buffer[2] & ~MASK_CONTINUATION; + unicode = (a << 12) | (b << 6) | c; + } + else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES && + position + 3 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 4; + uint32_t a = buffer[0] & ~MASK_IS_4_BYTES; + uint32_t b = buffer[1] & ~MASK_CONTINUATION; + uint32_t c = buffer[2] & ~MASK_CONTINUATION; + uint32_t d = buffer[3] & ~MASK_CONTINUATION; + unicode = (a << 18) | (b << 12) | (c << 6) | d; + } + else + { + // This is not a valid UTF-8 encoding + throw OrthancException(ErrorCode_BadFileFormat); + } + } }
// Hunk for Core/Toolbox.h: adds the declaration of Utf8ToUnicodeCharacter()
// after the existing RemoveIso2022EscapeSequences() declaration.
//
// Contract of Utf8ToUnicodeCharacter(), as established by its definition in
// the Core/Toolbox.cpp hunk of this changeset:
//  - "unicode" (output) receives the decoded code point.
//  - "utf8Length" (output) receives the number of bytes of the encoded
//    character, between 1 and 4. The definition names this parameter
//    "length".
//  - "utf8" is the UTF-8 encoded input string.
//  - "position" is a byte offset into "utf8". It must be strictly smaller
//    than utf8.size(), otherwise an OrthancException carrying
//    ErrorCode_ParameterOutOfRange is thrown.
//  - An OrthancException carrying ErrorCode_BadFileFormat is thrown if no
//    1-, 2-, 3- or 4-byte pattern matches (bad lead byte, missing
//    continuation marker, or not enough bytes left in the string).
--- a/Core/Toolbox.h Wed Mar 06 12:23:14 2019 +0100 +++ b/Core/Toolbox.h Fri Mar 15 17:37:51 2019 +0100 @@ -252,6 +252,11 @@ void RemoveIso2022EscapeSequences(std::string& dest, const std::string& src); + + void Utf8ToUnicodeCharacter(uint32_t& unicode, + size_t& utf8Length, + const std::string& utf8, + size_t position); } }
// Hunk for UnitTestsSources/UnitTestsMain.cpp: adds the static helper
// GetUnicode() and the unit test TEST(Toolbox, Utf8ToUnicode).
//
// GetUnicode(data, size, expectedLength):
//  - copies the first "size" bytes of "data" into a std::string;
//  - calls Toolbox::Utf8ToUnicodeCharacter() on that string, always with
//    position 0;
//  - returns -1 if the reported length differs from "expectedLength",
//    otherwise returns the decoded code point (converted from uint32_t to
//    the int32_t return type);
//  - does not catch exceptions thrown by the decoder, which is what the
//    ASSERT_THROW() checks below rely on.
//
// TEST(Toolbox, Utf8ToUnicode) checks the following sequences:
//  - 24          -> 0x24     (1 byte);  empty input must throw
//  - c2 a2       -> 0xa2     (2 bytes); truncated to 1 byte must throw
//  - e0 a4 b9    -> 0x0939   (3 bytes); truncated to 2 bytes must throw
//  - e2 82 ac    -> 0x20ac   (3 bytes); truncated to 2 bytes must throw
//  - f0 90 8d 88 -> 0x010348 (4 bytes); truncated to 3 bytes must throw
//  - e0 alone must throw (lead byte without its continuation bytes)
//
// NOTE(review): every call goes through GetUnicode() and therefore uses
// position 0, so decoding at a non-zero position is not covered by this test.
//
// The trailing "TEST(Toolbox, UrlDecode)" is unchanged diff context whose
// body continues beyond this excerpt.
--- a/UnitTestsSources/UnitTestsMain.cpp Wed Mar 06 12:23:14 2019 +0100 +++ b/UnitTestsSources/UnitTestsMain.cpp Fri Mar 15 17:37:51 2019 +0100 @@ -482,6 +482,69 @@ } +static int32_t GetUnicode(const uint8_t* data, + size_t size, + size_t expectedLength) +{ + std::string s((char*) &data[0], size); + uint32_t unicode; + size_t length; + Toolbox::Utf8ToUnicodeCharacter(unicode, length, s, 0); + if (length != expectedLength) + { + return -1; // Error case + } + else + { + return unicode; + } +} + + +TEST(Toolbox, Utf8ToUnicode) +{ + // https://en.wikipedia.org/wiki/UTF-8 + + ASSERT_EQ(1, sizeof(char)); + ASSERT_EQ(1, sizeof(uint8_t)); + + { + const uint8_t data[] = { 0x24 }; + ASSERT_EQ(0x24, GetUnicode(data, 1, 1)); + ASSERT_THROW(GetUnicode(data, 0, 1), OrthancException); + } + + { + const uint8_t data[] = { 0xc2, 0xa2 }; + ASSERT_EQ(0xa2, GetUnicode(data, 2, 2)); + ASSERT_THROW(GetUnicode(data, 1, 2), OrthancException); + } + + { + const uint8_t data[] = { 0xe0, 0xa4, 0xb9 }; + ASSERT_EQ(0x0939, GetUnicode(data, 3, 3)); + ASSERT_THROW(GetUnicode(data, 2, 3), OrthancException); + } + + { + const uint8_t data[] = { 0xe2, 0x82, 0xac }; + ASSERT_EQ(0x20ac, GetUnicode(data, 3, 3)); + ASSERT_THROW(GetUnicode(data, 2, 3), OrthancException); + } + + { + const uint8_t data[] = { 0xf0, 0x90, 0x8d, 0x88 }; + ASSERT_EQ(0x010348, GetUnicode(data, 4, 4)); + ASSERT_THROW(GetUnicode(data, 3, 4), OrthancException); + } + + { + const uint8_t data[] = { 0xe0 }; + ASSERT_THROW(GetUnicode(data, 1, 1), OrthancException); + } +} + + TEST(Toolbox, UrlDecode) { std::string s;