changeset 2907:0204af4ece6a

Remove invalid characters from badly-encoded UTF-8 strings
author Sebastien Jodogne <s.jodogne@gmail.com>
date Tue, 30 Oct 2018 13:53:29 +0100
parents 2a504fef4ed7
children 9d277f8ad698
files Core/Toolbox.cpp NEWS UnitTestsSources/UnitTestsMain.cpp
diffstat 3 files changed, 52 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/Core/Toolbox.cpp	Tue Oct 30 12:29:55 2018 +0100
+++ b/Core/Toolbox.cpp	Tue Oct 30 13:53:29 2018 +0100
@@ -519,22 +519,25 @@
   std::string Toolbox::ConvertToUtf8(const std::string& source,
                                      Encoding sourceEncoding)
   {
-    if (sourceEncoding == Encoding_Utf8)
-    {
-      // Already in UTF-8: No conversion is required
-      return source;
-    }
-
-    if (sourceEncoding == Encoding_Ascii)
-    {
-      return ConvertToAscii(source);
-    }
-
-    const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
-
+    // The "::skip" flag makes boost skip invalid UTF-8
+    // characters. This can occur in badly-encoded DICOM files.
+    
     try
     {
-      return boost::locale::conv::to_utf<char>(source, encoding);
+      if (sourceEncoding == Encoding_Utf8)
+      {
+        // Already in UTF-8: No charset conversion is required, only invalid byte sequences are skipped
+        return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
+      }
+      else if (sourceEncoding == Encoding_Ascii)
+      {
+        return ConvertToAscii(source);
+      }
+      else
+      {
+        const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
+        return boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
+      }
     }
     catch (std::runtime_error&)
     {
@@ -549,22 +552,25 @@
   std::string Toolbox::ConvertFromUtf8(const std::string& source,
                                        Encoding targetEncoding)
   {
-    if (targetEncoding == Encoding_Utf8)
-    {
-      // Already in UTF-8: No conversion is required
-      return source;
-    }
-
-    if (targetEncoding == Encoding_Ascii)
-    {
-      return ConvertToAscii(source);
-    }
-
-    const char* encoding = GetBoostLocaleEncoding(targetEncoding);
-
+    // The "::skip" flag makes boost skip invalid UTF-8
+    // characters. This can occur in badly-encoded DICOM files.
+    
     try
     {
-      return boost::locale::conv::from_utf<char>(source, encoding);
+      if (targetEncoding == Encoding_Utf8)
+      {
+        // Already in UTF-8: No charset conversion is required, only invalid byte sequences are skipped.
+        return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
+      }
+      else if (targetEncoding == Encoding_Ascii)
+      {
+        return ConvertToAscii(source);
+      }
+      else
+      {
+        const char* encoding = GetBoostLocaleEncoding(targetEncoding);
+        return boost::locale::conv::from_utf<char>(source, encoding, boost::locale::conv::skip);
+      }
     }
     catch (std::runtime_error&)
     {
@@ -1427,9 +1433,9 @@
      * "utf_to_utf" in order to convert to/from std::wstring.
      **/
 
-    std::wstring w = boost::locale::conv::utf_to_utf<wchar_t>(source);
+    std::wstring w = boost::locale::conv::utf_to_utf<wchar_t>(source, boost::locale::conv::skip);
     w = boost::algorithm::to_upper_copy<std::wstring>(w, *globalLocale_);
-    return boost::locale::conv::utf_to_utf<char>(w);
+    return boost::locale::conv::utf_to_utf<char>(w, boost::locale::conv::skip);
   }
 #endif
 
--- a/NEWS	Tue Oct 30 12:29:55 2018 +0100
+++ b/NEWS	Tue Oct 30 13:53:29 2018 +0100
@@ -26,7 +26,9 @@
 * New modality manufacturer: "GE" for GE Healthcare EA and AW
 * Executing a query/retrieve from the REST API now creates a job
 * Fix: Closing DICOM associations after running query/retrieve from REST API
-* Fix: Allow creation of MONOCHROME1 greyscale images in tools/create-dicom
+* Fix: Allow creation of MONOCHROME1 grayscale images in tools/create-dicom
+* Remove invalid characters from badly-encoded UTF-8 strings (impacts PostgreSQL)
+
 
 Version 1.4.2 (2018-09-20)
 ==========================
--- a/UnitTestsSources/UnitTestsMain.cpp	Tue Oct 30 12:29:55 2018 +0100
+++ b/UnitTestsSources/UnitTestsMain.cpp	Tue Oct 30 13:53:29 2018 +0100
@@ -444,6 +444,19 @@
   ASSERT_EQ(0x00, static_cast<unsigned char>(utf8[14]));  // Null-terminated string
 }
 
+
+TEST(Toolbox, FixUtf8)
+{
+  // This is a Latin-1 test string: "crane" with a circumflex accent
+  const unsigned char latin1[] = { 0x63, 0x72, 0xe2, 0x6e, 0x65 };
+
+  std::string s((char*) &latin1[0], sizeof(latin1) / sizeof(char));
+
+  ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1), Encoding_Latin1));
+  ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8));
+}
+
+
 TEST(Toolbox, UrlDecode)
 {
   std::string s;