changeset 3322:b32b7c44a223

Toolbox::Utf8ToUnicodeCharacter()
author Sebastien Jodogne <s.jodogne@gmail.com>
date Fri, 15 Mar 2019 17:37:51 +0100
parents e54ca78059bd
children a15a4b9d8c00
files Core/Toolbox.cpp Core/Toolbox.h UnitTestsSources/UnitTestsMain.cpp
diffstat 3 files changed, 143 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/Core/Toolbox.cpp	Wed Mar 06 12:23:14 2019 +0100
+++ b/Core/Toolbox.cpp	Fri Mar 15 17:37:51 2019 +0100
@@ -1789,6 +1789,81 @@
       }
     }
   }
+
+
+  // Decodes the UTF-8 character starting at byte offset "position" of "utf8":
+  // "unicode" receives its code point, "length" its size in bytes (1 to 4).
+  // Throws ErrorCode_ParameterOutOfRange if "position" is past the end of the
+  // string, and ErrorCode_BadFileFormat if the sequence is invalid or truncated.
+  void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode,
+                                       size_t& length,
+                                       const std::string& utf8,
+                                       size_t position)
+  {
+    // https://en.wikipedia.org/wiki/UTF-8 -- a byte is classified by "(byte & MASK) == TEST"
+    static const uint8_t MASK_IS_1_BYTE = 0x80;     // 10000000
+    static const uint8_t TEST_IS_1_BYTE = 0x00;     // 0xxxxxxx
+    static const uint8_t MASK_IS_2_BYTES = 0xe0;    // 11100000
+    static const uint8_t TEST_IS_2_BYTES = 0xc0;    // 110xxxxx
+    static const uint8_t MASK_IS_3_BYTES = 0xf0;    // 11110000
+    static const uint8_t TEST_IS_3_BYTES = 0xe0;    // 1110xxxx
+    static const uint8_t MASK_IS_4_BYTES = 0xf8;    // 11111000
+    static const uint8_t TEST_IS_4_BYTES = 0xf0;    // 11110xxx
+    static const uint8_t MASK_CONTINUATION = 0xc0;  // 11000000
+    static const uint8_t TEST_CONTINUATION = 0x80;  // 10xxxxxx
+
+    if (position >= utf8.size())
+    {
+      throw OrthancException(ErrorCode_ParameterOutOfRange);
+    }
+
+    // Start at the requested offset: buffer[i] must be byte "position + i", as the bounds checks below assume
+    const uint8_t* buffer = reinterpret_cast<const uint8_t*>(utf8.c_str()) + position;
+
+    if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE)
+    {
+      length = 1;
+      unicode = buffer[0] & ~MASK_IS_1_BYTE;
+    }
+    else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES &&
+             position + 1 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      length = 2;
+      uint32_t a = buffer[0] & ~MASK_IS_2_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      unicode = (a << 6) | b;
+    }
+    else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES &&
+             position + 2 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      length = 3;
+      uint32_t a = buffer[0] & ~MASK_IS_3_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      uint32_t c = buffer[2] & ~MASK_CONTINUATION;
+      unicode = (a << 12) | (b << 6) | c;
+    }
+    else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES &&
+             position + 3 < utf8.size() &&
+             (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION &&
+             (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION)
+    {
+      length = 4;
+      uint32_t a = buffer[0] & ~MASK_IS_4_BYTES;
+      uint32_t b = buffer[1] & ~MASK_CONTINUATION;
+      uint32_t c = buffer[2] & ~MASK_CONTINUATION;
+      uint32_t d = buffer[3] & ~MASK_CONTINUATION;
+      unicode = (a << 18) | (b << 12) | (c << 6) | d;
+    }
+    else
+    {
+      // This is not a valid UTF-8 encoding
+      throw OrthancException(ErrorCode_BadFileFormat);
+    }
+  }
 }
 
 
--- a/Core/Toolbox.h	Wed Mar 06 12:23:14 2019 +0100
+++ b/Core/Toolbox.h	Fri Mar 15 17:37:51 2019 +0100
@@ -252,6 +252,11 @@
 
     void RemoveIso2022EscapeSequences(std::string& dest,
                                       const std::string& src);
+
+    void Utf8ToUnicodeCharacter(uint32_t& unicode,        // Out: code point decoded from "utf8"
+                                size_t& utf8Length,       // Out: number of bytes of the encoding (1 to 4)
+                                const std::string& utf8,  // In: UTF-8 encoded bytes
+                                size_t position);         // In: byte offset into "utf8"; throws if >= utf8.size()
   }
 }
 
--- a/UnitTestsSources/UnitTestsMain.cpp	Wed Mar 06 12:23:14 2019 +0100
+++ b/UnitTestsSources/UnitTestsMain.cpp	Fri Mar 15 17:37:51 2019 +0100
@@ -482,6 +482,69 @@
 }
 
 
+// Decodes the first UTF-8 character of the "size" bytes at "data".
+// Returns its code point, or -1 if the decoded length differs from
+// "expectedLength". Decoding errors propagate as exceptions.
+static int32_t GetUnicode(const uint8_t* data,
+                          size_t size,
+                          size_t expectedLength)
+{
+  // Const-correct cast: the bytes are only copied, never modified
+  const std::string encoded(reinterpret_cast<const char*>(data), size);
+
+  uint32_t codePoint = 0;
+  size_t decodedLength = 0;
+  Toolbox::Utf8ToUnicodeCharacter(codePoint, decodedLength, encoded, 0);
+
+  // -1 cannot collide with a decoded code point (at most 21 bits)
+  return (decodedLength == expectedLength ? static_cast<int32_t>(codePoint) : -1);
+}
+
+
+TEST(Toolbox, Utf8ToUnicode)
+{
+  // Sample encodings taken from https://en.wikipedia.org/wiki/UTF-8
+  // Each group decodes a full character, then checks that a truncated copy throws
+  ASSERT_EQ(1, sizeof(char));
+  ASSERT_EQ(1, sizeof(uint8_t));
+
+  {
+    const uint8_t dollar[] = { 0x24 };  // U+0024, 1 byte
+    ASSERT_EQ(0x24, GetUnicode(dollar, sizeof(dollar), 1));
+    ASSERT_THROW(GetUnicode(dollar, sizeof(dollar) - 1, 1), OrthancException);
+  }
+
+  {
+    const uint8_t cent[] = { 0xc2, 0xa2 };  // U+00A2, 2 bytes
+    ASSERT_EQ(0xa2, GetUnicode(cent, sizeof(cent), 2));
+    ASSERT_THROW(GetUnicode(cent, sizeof(cent) - 1, 2), OrthancException);
+  }
+
+  {
+    const uint8_t devanagariHa[] = { 0xe0, 0xa4, 0xb9 };  // U+0939, 3 bytes
+    ASSERT_EQ(0x0939, GetUnicode(devanagariHa, sizeof(devanagariHa), 3));
+    ASSERT_THROW(GetUnicode(devanagariHa, sizeof(devanagariHa) - 1, 3), OrthancException);
+  }
+
+  {
+    const uint8_t euro[] = { 0xe2, 0x82, 0xac };  // U+20AC, 3 bytes
+    ASSERT_EQ(0x20ac, GetUnicode(euro, sizeof(euro), 3));
+    ASSERT_THROW(GetUnicode(euro, sizeof(euro) - 1, 3), OrthancException);
+  }
+
+  {
+    const uint8_t hwair[] = { 0xf0, 0x90, 0x8d, 0x88 };  // U+10348, 4 bytes
+    ASSERT_EQ(0x010348, GetUnicode(hwair, sizeof(hwair), 4));
+    ASSERT_THROW(GetUnicode(hwair, sizeof(hwair) - 1, 4), OrthancException);
+  }
+
+  {
+    const uint8_t loneLeadByte[] = { 0xe0 };  // 3-byte lead without continuation bytes
+    ASSERT_THROW(GetUnicode(loneLeadByte, sizeof(loneLeadByte), 1), OrthancException);
+  }
+}
+
+
 TEST(Toolbox, UrlDecode)
 {
   std::string s;