Mercurial > hg > orthanc
diff Core/Toolbox.cpp @ 3322:b32b7c44a223
Toolbox::Utf8ToUnicodeCharacter()
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Fri, 15 Mar 2019 17:37:51 +0100 |
parents | 4b042ec734c1 |
children | 87396c571109 2e7c5c15ba25 |
line wrap: on
line diff
--- a/Core/Toolbox.cpp Wed Mar 06 12:23:14 2019 +0100 +++ b/Core/Toolbox.cpp Fri Mar 15 17:37:51 2019 +0100 @@ -1789,6 +1789,81 @@ } } } + + + void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode, + size_t& length, + const std::string& utf8, + size_t position) + { + // https://en.wikipedia.org/wiki/UTF-8 + + static const uint8_t MASK_IS_1_BYTE = 0x80; // printf '0x%x\n' "$((2#10000000))" + static const uint8_t TEST_IS_1_BYTE = 0x00; + + static const uint8_t MASK_IS_2_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" + static const uint8_t TEST_IS_2_BYTES = 0xc0; // printf '0x%x\n' "$((2#11000000))" + + static const uint8_t MASK_IS_3_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" + static const uint8_t TEST_IS_3_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" + + static const uint8_t MASK_IS_4_BYTES = 0xf8; // printf '0x%x\n' "$((2#11111000))" + static const uint8_t TEST_IS_4_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" + + static const uint8_t MASK_CONTINUATION = 0xc0; // printf '0x%x\n' "$((2#11000000))" + static const uint8_t TEST_CONTINUATION = 0x80; // printf '0x%x\n' "$((2#10000000))" + + if (position >= utf8.size()) + { + throw OrthancException(ErrorCode_ParameterOutOfRange); + } + + const uint8_t* buffer = reinterpret_cast<const uint8_t*>(utf8.c_str()); + + if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE) + { + length = 1; + unicode = buffer[0] & ~MASK_IS_1_BYTE; + } + else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES && + position + 1 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 2; + uint32_t a = buffer[0] & ~MASK_IS_2_BYTES; + uint32_t b = buffer[1] & ~MASK_CONTINUATION; + unicode = (a << 6) | b; + } + else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES && + position + 2 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 3; + uint32_t a = buffer[0] & ~MASK_IS_3_BYTES; + uint32_t b = buffer[1] & ~MASK_CONTINUATION; + uint32_t c = buffer[2] & ~MASK_CONTINUATION; + unicode = (a << 12) | (b << 6) | c; + } + else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES && + position + 3 < utf8.size() && + (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION && + (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION) + { + length = 4; + uint32_t a = buffer[0] & ~MASK_IS_4_BYTES; + uint32_t b = buffer[1] & ~MASK_CONTINUATION; + uint32_t c = buffer[2] & ~MASK_CONTINUATION; + uint32_t d = buffer[3] & ~MASK_CONTINUATION; + unicode = (a << 18) | (b << 12) | (c << 6) | d; + } + else + { + // This is not a valid UTF-8 encoding + throw OrthancException(ErrorCode_BadFileFormat); + } + } }