diff Core/Toolbox.cpp @ 3217:cf8cbeb35f33

preliminary support of Korean character set
author Sebastien Jodogne <s.jodogne@gmail.com>
date Wed, 13 Feb 2019 17:46:12 +0100
parents 810772486249
children 9a83d94b2a1e
line wrap: on
line diff
--- a/Core/Toolbox.cpp	Tue Feb 12 17:27:33 2019 +0100
+++ b/Core/Toolbox.cpp	Wed Feb 13 17:46:12 2019 +0100
@@ -514,6 +514,10 @@
         return "TIS620.2533-0";
         break;
 
+      case Encoding_Korean:
+        return "ISO-IR-149";
+        break;
+
       default:
         throw OrthancException(ErrorCode_NotImplemented);
     }
@@ -522,27 +526,46 @@
 
 
 #if ORTHANC_ENABLE_LOCALE == 1
+  // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2
   std::string Toolbox::ConvertToUtf8(const std::string& source,
-                                     Encoding sourceEncoding)
+                                     Encoding sourceEncoding,
+                                     bool hasCodeExtensions)
   {
     // The "::skip" flag makes boost skip invalid UTF-8
     // characters. This can occur in badly-encoded DICOM files.
     
     try
     {
-      if (sourceEncoding == Encoding_Utf8)
-      {
-        // Already in UTF-8: No conversion is required
-        return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
-      }
-      else if (sourceEncoding == Encoding_Ascii)
+      if (sourceEncoding == Encoding_Ascii)
       {
         return ConvertToAscii(source);
       }
-      else
+      else 
       {
-        const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
-        return boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
+        std::string s;
+        
+        if (sourceEncoding == Encoding_Utf8)
+        {
+          // Already in UTF-8: No conversion is required, but we ensure
+          // the output is correctly encoded
+          s = boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
+        }
+        else
+        {
+          const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
+          s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
+        }
+
+        if (hasCodeExtensions)
+        {
+          std::string t;
+          RemoveIso2022EscapeSequences(t, s);
+          return t;
+        }
+        else
+        {
+          return s;
+        }        
       }
     }
     catch (std::runtime_error&)
@@ -1593,6 +1616,182 @@
 
     return boost::regex_replace(source, pattern, formatter);
   }
+
+
+  namespace Iso2022
+  {
+    /**
+       Returns whether the string s contains a single-byte control message
+       at index i
+    **/
+    static inline bool IsControlMessage1(const std::string& s, size_t i)
+    {
+      if (i < s.size())
+      {
+        char c = s[i];
+        return
+          (c == '\x0f') || // Locking shift zero
+          (c == '\x0e');   // Locking shift one
+      }
+      else
+      {
+        return false;
+      }
+    }
+
+    /**
+       Returns whether the string s contains a double-byte control message
+       at index i
+    **/
+    static inline size_t IsControlMessage2(const std::string& s, size_t i)
+    {
+      if (i + 1 < s.size())
+      {
+        char c1 = s[i];
+        char c2 = s[i + 1];
+        return (c1 == 0x1b) && (
+          (c2 == '\x6e') || // Locking shift two
+          (c2 == '\x6f') || // Locking shift three
+          (c2 == '\x4e') || // Single shift two (alt)
+          (c2 == '\x4f') || // Single shift three (alt)
+          (c2 == '\x7c') || // Locking shift three right
+          (c2 == '\x7d') || // Locking shift two right
+          (c2 == '\x7e')    // Locking shift one right
+          );
+      }
+      else
+      {
+        return false;
+      }
+    }
+
+    /**
+       Returns whether the string s contains a triple-byte control message
+       at index i
+    **/
+    static inline size_t IsControlMessage3(const std::string& s, size_t i)
+    {
+      if (i + 2 < s.size())
+      {
+        char c1 = s[i];
+        char c2 = s[i + 1];
+        char c3 = s[i + 2];
+        return ((c1 == '\x8e' && c2 == 0x1b && c3 == '\x4e') ||
+                (c1 == '\x8f' && c2 == 0x1b && c3 == '\x4f'));
+      }
+      else
+      {
+        return false;
+      }
+    }
+
+    /**
+       This function returns true if the index i in the supplied string s:
+       - is valid
+       - contains the c character
+       This function returns false otherwise.
+    **/
+    static inline bool TestCharValue(
+      const std::string& s, size_t i, char c)
+    {
+      if (i < s.size())
+        return s[i] == c;
+      else
+        return false;
+    }
+
+    /**
+       This function returns true if the index i in the supplied string s:
+       - is valid
+       - has a c character that is >= cMin and <= cMax (included)
+       This function returns false otherwise.
+    **/
+    static inline bool TestCharRange(
+      const std::string& s, size_t i, char cMin, char cMax)
+    {
+      if (i < s.size())
+        return (s[i] >= cMin) && (s[i] <= cMax);
+      else
+        return false;
+    }
+
+    /**
+       This function returns the total length in bytes of the escape sequence
+       located in string s at index i, if there is one, or 0 otherwise.
+    **/
+    static inline size_t GetEscapeSequenceLength(const std::string& s, size_t i)
+    {
+      if (TestCharValue(s, i, 0x1b))
+      {
+        size_t j = i+1;
+
+        // advance reading cursor while we are in a sequence 
+        while (TestCharRange(s, j, '\x20', '\x2f'))
+          ++j;
+
+        // check there is a valid termination byte AND we're long enough (there
+        // must be at least one byte between 0x20 and 0x2f
+        if (TestCharRange(s, j, '\x30', '\x7f') && (j - i) >= 2)
+          return j - i + 1;
+        else
+          return 0;
+      }
+      else
+        return 0;
+    }
+  }
+
+  
+
+  /**
+     This function will strip all ISO/IEC 2022 control codes and escape
+     sequences.
+     Please see https://en.wikipedia.org/wiki/ISO/IEC_2022 (as of 2019-02)
+     for a list of those.
+
+     Please note that this operation is potentially destructive, because
+     it removes the character set information from the byte stream.
+
+     However, in the case where the encoding is unique, then suppressing
+     the escape sequences allows to provide us with a clean string after
+     conversion to utf-8 with boost.
+  **/
+  void Toolbox::RemoveIso2022EscapeSequences(std::string& dest, const std::string& src)
+  {
+    // we need AT MOST the same size as the source string in the output
+    dest.clear();
+    if (dest.capacity() < src.size())
+      dest.reserve(src.size());
+
+    size_t i = 0;
+
+    // uint8_t view to the string
+    while (i < src.size())
+    {
+      size_t j = i;
+
+      // The i index will only be incremented if a message is detected
+      // in that case, the message is skipped and the index is set to the
+      // next position to read
+      if (Iso2022::IsControlMessage1(src, i))
+        i += 1;
+      else if (Iso2022::IsControlMessage2(src, i))
+        i += 2;
+      else if (Iso2022::IsControlMessage3(src, i))
+        i += 3;
+      else
+        i += Iso2022::GetEscapeSequenceLength(src, i);
+
+      // if the index was NOT incremented, this means there was no message at
+      // this location: we then may copy the character at this index and 
+      // increment the index to point to the next read position
+      if (j == i)
+      {
+        dest.push_back(src[i]);
+        i++;
+      }
+    }
+  }
 }