changeset 6233:a93598f96cc1

fix handling of backslashes if ISO_IR 13 encoding
author Sebastien Jodogne <s.jodogne@gmail.com>
date Wed, 09 Jul 2025 12:28:12 +0200
parents 46cd2a84ffdf
children 51a0e464e898
files NEWS OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp OrthancFramework/Sources/Toolbox.cpp OrthancFramework/Sources/Toolbox.h OrthancFramework/UnitTestsSources/FrameworkTests.cpp OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp OrthancFramework/UnitTestsSources/ToolboxTests.cpp OrthancServer/Sources/Search/DatabaseLookup.cpp OrthancServer/Sources/Search/HierarchicalMatcher.cpp OrthancServer/UnitTestsSources/UnitTestsMain.cpp
diffstat 13 files changed, 132 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS	Wed Jul 09 08:21:25 2025 +0200
+++ b/NEWS	Wed Jul 09 12:28:12 2025 +0200
@@ -49,7 +49,7 @@
 * If the "RegisteredUsers" configuration option is present but empty,
   Orthanc does not create the default user "orthanc" anymore.
 * Added new CMake option "-DBUILD_UNIT_TESTS=ON" to disable the building of unit tests
-
+* Fix handling of backslashes in DICOM elements if encoding is ISO_IR 13
 
 
 Version 1.12.8 (2025-06-13)
--- a/OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -167,7 +167,8 @@
         {
           if (s != NULL)
           {
-            result = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions);
+            const bool skipBackslashes = (encoding == Encoding_Japanese);  // cf. "ISO_IR 13": In this method, the VR will never be UT, ST, or LT
+            result = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions, skipBackslashes);
           }
           
           return true;
--- a/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -632,8 +632,8 @@
       {
         target.SetValueInternal(element->getTag().getGTag(),
                                 element->getTag().getETag(),
-                                ConvertLeafElement(*element, DicomToJsonFlags_Default,
-                                                   maxStringLength, encoding, hasCodeExtensions, ignoreTagLength));
+                                ConvertLeafElement(*element, DicomToJsonFlags_Default, maxStringLength, encoding,
+                                                   hasCodeExtensions, ignoreTagLength, Convert(element->getVR())));
       }
       else
       {
@@ -694,7 +694,8 @@
                                                   unsigned int maxStringLength,
                                                   Encoding encoding,
                                                   bool hasCodeExtensions,
-                                                  const std::set<DicomTag>& ignoreTagLength)
+                                                  const std::set<DicomTag>& ignoreTagLength,
+                                                  ValueRepresentation vr)
   {
     if (!element.isLeaf())
     {
@@ -714,7 +715,7 @@
         else
         {
           const std::string s(c);
-          const std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions);
+          const std::string utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, Convert(element.getVR()));
           return CreateValueFromUtf8String(GetTag(element), utf8, maxStringLength, ignoreTagLength);
         }
       }
@@ -782,7 +783,7 @@
             // "SpecificCharacterSet" tag, if present. This branch is
             // new in Orthanc 1.9.1 (cf. DICOM CP 246).
             const std::string s(reinterpret_cast<const char*>(data), length);
-            const std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions);
+            const std::string utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, Convert(element.getVR()));
             return CreateValueFromUtf8String(GetTag(element), utf8, maxStringLength, ignoreTagLength);
           }
         }
@@ -1102,7 +1103,7 @@
     {
       // The "0" below lets "LeafValueToJson()" take care of "TooLong" values
       std::unique_ptr<DicomValue> v(FromDcmtkBridge::ConvertLeafElement
-                                    (element, flags, 0, encoding, hasCodeExtensions, ignoreTagLength));
+                                    (element, flags, 0, encoding, hasCodeExtensions, ignoreTagLength, Convert(element.getVR())));
 
       if (ignoreTagLength.find(GetTag(element)) == ignoreTagLength.end())
       {
@@ -2594,7 +2595,7 @@
               element->getString(c).good() && 
               c != NULL)
           {
-            std::string a = Toolbox::ConvertToUtf8(c, source, hasSourceCodeExtensions);
+            std::string a = Toolbox::ConvertDicomStringToUtf8(c, source, hasSourceCodeExtensions, Convert(element->getVR()));
             std::string b = Toolbox::ConvertFromUtf8(a, target);
             element->putString(b.c_str());
           }
@@ -2848,7 +2849,7 @@
         else
         {
           std::string s(c);
-          utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions);
+          utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, FromDcmtkBridge::Convert(element.getVR()));
         }
       }
 
@@ -2924,8 +2925,8 @@
 
             std::string ignored;
             std::string s(reinterpret_cast<const char*>(data), l);
-            action = visitor.VisitString(ignored, parentTags, parentIndexes, tag, vr,
-                                         Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions));
+            std::string utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, FromDcmtkBridge::Convert(element.getVR()));
+            action = visitor.VisitString(ignored, parentTags, parentIndexes, tag, vr, utf8);
           }
           else
           {
--- a/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h	Wed Jul 09 12:28:12 2025 +0200
@@ -192,7 +192,8 @@
                                           unsigned int maxStringLength,
                                           Encoding encoding,
                                           bool hasCodeExtensions,
-                                          const std::set<DicomTag>& ignoreTagLength);
+                                          const std::set<DicomTag>& ignoreTagLength,
+                                          ValueRepresentation vr);
 
     static void ExtractHeaderAsJson(Json::Value& target, 
                                     DcmMetaInfo& header,
--- a/OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -893,7 +893,7 @@
       std::set<DicomTag> tmp;
       std::unique_ptr<DicomValue> v(FromDcmtkBridge::ConvertLeafElement
                                     (*element, DicomToJsonFlags_Default, 
-                                     0, encoding, hasCodeExtensions, tmp));
+                                     0, encoding, hasCodeExtensions, tmp, FromDcmtkBridge::Convert(element->getVR())));
       
       if (v.get() == NULL ||
           v->IsNull())
--- a/OrthancFramework/Sources/Toolbox.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/Toolbox.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -724,7 +724,8 @@
   // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2
   std::string Toolbox::ConvertToUtf8(const std::string& source,
                                      Encoding sourceEncoding,
-                                     bool hasCodeExtensions)
+                                     bool hasCodeExtensions,
+                                     bool skipBackslashes)
   {
 #if ORTHANC_STATIC_ICU == 1
 #  if ORTHANC_ENABLE_ICU == 0
@@ -760,7 +761,26 @@
         else
         {
           const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
-          s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
+
+          if (skipBackslashes)
+          {
+            /**
+             * This is to deal with the fact that in Japanese coding
+             * (ISO_IR 13), backslashes will be converted to the Yen
+             * character.
+             **/
+            std::vector<std::string> tokens;
+            TokenizeString(tokens, source, '\\');
+            for (size_t i = 0; i < tokens.size(); i++)
+            {
+              tokens[i] = boost::locale::conv::to_utf<char>(tokens[i], encoding, boost::locale::conv::skip);
+            }
+            JoinStrings(s, tokens, "\\");
+          }
+          else
+          {
+            s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
+          }
         }
 
         if (hasCodeExtensions)
@@ -830,6 +850,50 @@
 #endif
 
 
+#if ORTHANC_ENABLE_LOCALE == 1
+  std::string Toolbox::ConvertDicomStringToUtf8(const std::string& source,
+                                                Encoding sourceEncoding,
+                                                bool hasCodeExtensions,
+                                                ValueRepresentation vr)
+  {
+    /**
+     * VR-aware wrapper around "ConvertToUtf8()", new in Orthanc 1.12.9. Origin:
+     * https://discourse.orthanc-server.org/t/issue-with-special-characters-when-scans-where-uploaded-with-specificcharacterset-dicom-tag-value-as-iso-ir-13/5962
+     *
+     * From the DICOM standard: "Two character codes of the
+     * single-byte character sets invoked in the GL area of the code
+     * table, 02/00 and 05/12, have special significance in the DICOM
+     * Standard. The character SPACE, represented by bit combination
+     * 02/00, shall be used for the padding of Data Element Values
+     * that are character strings. The Graphic Character represented
+     * by the bit combination 05/12, "\" (BACKSLASH) (reverse solidus)
+     * in the repertoire ISO-IR 6, shall only be used in character
+     * strings with Value Representations of UT, ST and LT (see
+     * Section 6.2). Otherwise the character code 05/12 is used as a
+     * separator for multi-valued Data Elements (see Section
+     * 6.4). [...] When the Value of Specific Character Set
+     * (0008,0005) is either "ISO_IR 13" or "ISO 2022 IR 13", the
+     * graphic character represented by the bit combination 05/12 is a
+     * "¥" (YEN SIGN) in the character set of ISO-IR 14."
+     * https://www.dicomstandard.org/standards/view/data-structures-and-encoding
+     *
+     * Consequence: if "sourceEncoding" (which is derived from the
+     * DICOM Specific Character Set) corresponds to "ISO_IR 13" or
+     * "ISO 2022 IR 13", AND if the value representation is *not*
+     * UT, ST, or LT, then the backslashes are value separators and
+     * must be skipped (kept as such) by the conversion to UTF-8.
+     **/
+
+    const bool isFreeText = (vr == ValueRepresentation_UnlimitedText ||  // UT
+                             vr == ValueRepresentation_ShortText ||      // ST
+                             vr == ValueRepresentation_LongText);        // LT
+    const bool skipBackslashes = (sourceEncoding == Encoding_Japanese && !isFreeText);
+
+    return ConvertToUtf8(source, sourceEncoding, hasCodeExtensions, skipBackslashes);
+  }
+#endif
+
+
   static bool IsAsciiCharacter(uint8_t c)
   {
     return (c != 0 &&
--- a/OrthancFramework/Sources/Toolbox.h	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/Sources/Toolbox.h	Wed Jul 09 12:28:12 2025 +0200
@@ -186,7 +186,13 @@
 #if ORTHANC_ENABLE_LOCALE == 1
     static std::string ConvertToUtf8(const std::string& source,
                                      Encoding sourceEncoding,
-                                     bool hasCodeExtensions);
+                                     bool hasCodeExtensions,
+                                     bool skipBackslashes /* was always "false" in Orthanc <= 1.12.8 */);
+
+    static std::string ConvertDicomStringToUtf8(const std::string& source,
+                                                Encoding sourceEncoding,
+                                                bool hasCodeExtensions,
+                                                ValueRepresentation vr);
 
     static std::string ConvertFromUtf8(const std::string& source,
                                        Encoding targetEncoding);
--- a/OrthancFramework/UnitTestsSources/FrameworkTests.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/UnitTestsSources/FrameworkTests.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -500,7 +500,7 @@
   ASSERT_EQ("&abc", Toolbox::ConvertToAscii(s));
 
   // Open in Emacs, then save with UTF-8 encoding, then "hexdump -C"
-  std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1, false);
+  std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1, false, false);
   ASSERT_EQ(15u, utf8.size());
   ASSERT_EQ(0xc3, static_cast<unsigned char>(utf8[0]));
   ASSERT_EQ(0xa0, static_cast<unsigned char>(utf8[1]));
@@ -527,8 +527,8 @@
 
   std::string s((char*) &latin1[0], sizeof(latin1) / sizeof(char));
 
-  ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1, false), Encoding_Latin1));
-  ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8, false));
+  ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1, false, false), Encoding_Latin1));
+  ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8, false, false));
 }
 
 
--- a/OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -264,7 +264,7 @@
   {
     std::string source(testEncodingsEncoded[i]);
     std::string expected(testEncodingsExpected[i]);
-    std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i], false);
+    std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i], false, false);
     //std::cout << EnumerationToString(testEncodings[i]) << std::endl;
     EXPECT_EQ(expected, s);
   }
@@ -334,7 +334,7 @@
       ParsedDicomFile f(true);
       f.SetEncoding(testEncodings[i]);
 
-      std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false);
+      std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false, false);
       f.Insert(DICOM_TAG_PATIENT_NAME, s, false, "");
       f.SaveToMemoryBuffer(dicom);
     }
@@ -571,7 +571,7 @@
         ASSERT_FALSE(hasCodeExtensions);
       }
 
-      Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false);
+      Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false, false);
       f.Replace(DICOM_TAG_PATIENT_NAME, s, false, DicomReplaceMode_InsertIfAbsent, "");
 
       Json::Value v;
@@ -1172,7 +1172,7 @@
         // Sanity check to test the proper behavior of "EncodingTests.py"
         std::string encoded = Toolbox::ConvertFromUtf8(testEncodingsExpected[i], testEncodings[i]);
         ASSERT_STREQ(testEncodingsEncoded[i], encoded.c_str());
-        std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i], false);
+        std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i], false, false);
         ASSERT_STREQ(testEncodingsExpected[i], decoded.c_str());
 
         if (testEncodings[i] != Encoding_Chinese)
@@ -1181,7 +1181,7 @@
           // test against Chinese, it is normal that it does not correspond to UTF8
 
           const std::string tmp = Toolbox::ConvertToUtf8(
-            Toolbox::ConvertFromUtf8(utf8, testEncodings[i]), testEncodings[i], false);
+            Toolbox::ConvertFromUtf8(utf8, testEncodings[i]), testEncodings[i], false, false);
           ASSERT_STREQ(testEncodingsExpected[i], tmp.c_str());
         }
       }
--- a/OrthancFramework/UnitTestsSources/ToolboxTests.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancFramework/UnitTestsSources/ToolboxTests.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -407,4 +407,32 @@
   ASSERT_EQ("8.59Gbps", Toolbox::GetHumanTransferSpeed(false, 1024*1024*1024, 1000000000));
   ASSERT_EQ("1.00GB in 1.00s = 8.59Gbps", Toolbox::GetHumanTransferSpeed(true, 1024*1024*1024, 1000000000));
   ASSERT_EQ("976.56KB in 1.00s = 8.00Mbps", Toolbox::GetHumanTransferSpeed(true, 1000*1000, 1000000000));
-}
\ No newline at end of file
+}
+
+TEST(Toolbox, JapaneseBackslashes)  // Backslash vs. Yen sign when converting Encoding_Japanese strings to UTF-8
+{
+  std::string s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, false);
+  ASSERT_EQ("ORIGINAL\302\245PRIMARY", s);  // NB: The Yen symbol is encoded as 0xC2 0xA5 in UTF-8
+
+  s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, true);  // skipBackslashes == true
+  ASSERT_EQ("ORIGINAL\\PRIMARY", s);  // The backslash is kept as such
+
+  s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_PersonName);
+  ASSERT_EQ("ORIGINAL\\PRIMARY", s);  // PN is neither ST, LT, nor UT: the backslash is kept as such
+
+  // Backslashes should only be interpreted as the Yen symbol if VR is ST, LT, or UT
+  s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_ShortText);
+  ASSERT_EQ("ORIGINAL\302\245PRIMARY", s);
+
+  s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_LongText);
+  ASSERT_EQ("ORIGINAL\302\245PRIMARY", s);
+
+  s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_UnlimitedText);
+  ASSERT_EQ("ORIGINAL\302\245PRIMARY", s);
+
+  s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Latin1, false, false);  // Non-Japanese encoding
+  ASSERT_EQ("ORIGINAL\\PRIMARY", s);
+
+  s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Latin1, false, ValueRepresentation_ShortText);
+  ASSERT_EQ("ORIGINAL\\PRIMARY", s);  // With Latin1, the backslash stays a backslash even for ST
+}
--- a/OrthancServer/Sources/Search/DatabaseLookup.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancServer/Sources/Search/DatabaseLookup.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -111,8 +111,8 @@
 
       std::set<DicomTag> ignoreTagLength;
       std::unique_ptr<DicomValue> value(FromDcmtkBridge::ConvertLeafElement
-                                        (*element, DicomToJsonFlags_None, 
-                                         0, encoding, hasCodeExtensions, ignoreTagLength));
+                                        (*element, DicomToJsonFlags_None, 0, encoding, hasCodeExtensions,
+                                         ignoreTagLength, FromDcmtkBridge::Convert(element->getVR())));
 
       // WARNING: Also modify "HierarchicalMatcher::Setup()" if modifying this code
       if (value.get() == NULL ||
--- a/OrthancServer/Sources/Search/HierarchicalMatcher.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancServer/Sources/Search/HierarchicalMatcher.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -114,8 +114,8 @@
 
         std::set<DicomTag> ignoreTagLength;
         std::unique_ptr<DicomValue> value(FromDcmtkBridge::ConvertLeafElement
-                                          (*element, DicomToJsonFlags_None, 
-                                           0, encoding, hasCodeExtensions, ignoreTagLength));
+                                          (*element, DicomToJsonFlags_None, 0, encoding, hasCodeExtensions,
+                                           ignoreTagLength, FromDcmtkBridge::Convert(element->getVR())));
 
         // WARNING: Also modify "DatabaseLookup::IsMatch()" if modifying this code
         if (value.get() == NULL ||
--- a/OrthancServer/UnitTestsSources/UnitTestsMain.cpp	Wed Jul 09 08:21:25 2025 +0200
+++ b/OrthancServer/UnitTestsSources/UnitTestsMain.cpp	Wed Jul 09 12:28:12 2025 +0200
@@ -194,7 +194,7 @@
   const unsigned char raw[] = { 0x63, 0x72, 0xe2, 0x6e, 0x65 };
   std::string latin1((char*) &raw[0], sizeof(raw) / sizeof(char));
 
-  std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1, false);
+  std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1, false, false);
 
   ParsedDicomFile dicom(false);
   dicom.SetEncoding(Encoding_Latin1);