Mercurial > hg > orthanc
changeset 6233:a93598f96cc1
fix handling of backslashes in DICOM elements if the encoding is ISO_IR 13
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Wed, 09 Jul 2025 12:28:12 +0200 |
parents | 46cd2a84ffdf |
children | 51a0e464e898 |
files | NEWS OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp OrthancFramework/Sources/Toolbox.cpp OrthancFramework/Sources/Toolbox.h OrthancFramework/UnitTestsSources/FrameworkTests.cpp OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp OrthancFramework/UnitTestsSources/ToolboxTests.cpp OrthancServer/Sources/Search/DatabaseLookup.cpp OrthancServer/Sources/Search/HierarchicalMatcher.cpp OrthancServer/UnitTestsSources/UnitTestsMain.cpp |
diffstat | 13 files changed, 132 insertions(+), 31 deletions(-) [+] |
line wrap: on
line diff
--- a/NEWS Wed Jul 09 08:21:25 2025 +0200 +++ b/NEWS Wed Jul 09 12:28:12 2025 +0200 @@ -49,7 +49,7 @@ * If the "RegisteredUsers" configuration option is present but empty, Orthanc does not create the default user "orthanc" anymore. * Added new CMake option "-DBUILD_UNIT_TESTS=ON" to disable the building of unit tests - +* Fix handling of backslashes in DICOM elements if encoding is ISO_IR 13 Version 1.12.8 (2025-06-13)
--- a/OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/DicomParsing/DicomDirWriter.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -167,7 +167,8 @@ { if (s != NULL) { - result = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); + const bool skipBacklashes = true; // cf. "ISO_IR 13": In this method, the VR will never be UT, ST, or LT + result = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions, skipBacklashes); } return true;
--- a/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -632,8 +632,8 @@ { target.SetValueInternal(element->getTag().getGTag(), element->getTag().getETag(), - ConvertLeafElement(*element, DicomToJsonFlags_Default, - maxStringLength, encoding, hasCodeExtensions, ignoreTagLength)); + ConvertLeafElement(*element, DicomToJsonFlags_Default, maxStringLength, encoding, + hasCodeExtensions, ignoreTagLength, Convert(element->getVR()))); } else { @@ -694,7 +694,8 @@ unsigned int maxStringLength, Encoding encoding, bool hasCodeExtensions, - const std::set<DicomTag>& ignoreTagLength) + const std::set<DicomTag>& ignoreTagLength, + ValueRepresentation vr) { if (!element.isLeaf()) { @@ -714,7 +715,7 @@ else { const std::string s(c); - const std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); + const std::string utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, Convert(element.getVR())); return CreateValueFromUtf8String(GetTag(element), utf8, maxStringLength, ignoreTagLength); } } @@ -782,7 +783,7 @@ // "SpecificCharacterSet" tag, if present. This branch is // new in Orthanc 1.9.1 (cf. DICOM CP 246). 
const std::string s(reinterpret_cast<const char*>(data), length); - const std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); + const std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions, Convert(element.getVR())); return CreateValueFromUtf8String(GetTag(element), utf8, maxStringLength, ignoreTagLength); } } @@ -1102,7 +1103,7 @@ { // The "0" below lets "LeafValueToJson()" take care of "TooLong" values std::unique_ptr<DicomValue> v(FromDcmtkBridge::ConvertLeafElement - (element, flags, 0, encoding, hasCodeExtensions, ignoreTagLength)); + (element, flags, 0, encoding, hasCodeExtensions, ignoreTagLength, Convert(element.getVR()))); if (ignoreTagLength.find(GetTag(element)) == ignoreTagLength.end()) { @@ -2594,7 +2595,7 @@ element->getString(c).good() && c != NULL) { - std::string a = Toolbox::ConvertToUtf8(c, source, hasSourceCodeExtensions); + std::string a = Toolbox::ConvertToUtf8(c, source, hasSourceCodeExtensions, Convert(element->getVR())); std::string b = Toolbox::ConvertFromUtf8(a, target); element->putString(b.c_str()); } @@ -2848,7 +2849,7 @@ else { std::string s(c); - utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); + utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, FromDcmtkBridge::Convert(element.getVR())); } } @@ -2924,8 +2925,8 @@ std::string ignored; std::string s(reinterpret_cast<const char*>(data), l); - action = visitor.VisitString(ignored, parentTags, parentIndexes, tag, vr, - Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions)); + std::string utf8 = Toolbox::ConvertDicomStringToUtf8(s, encoding, hasCodeExtensions, FromDcmtkBridge::Convert(element.getVR())); + action = visitor.VisitString(ignored, parentTags, parentIndexes, tag, vr, utf8); } else {
--- a/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/DicomParsing/FromDcmtkBridge.h Wed Jul 09 12:28:12 2025 +0200 @@ -192,7 +192,8 @@ unsigned int maxStringLength, Encoding encoding, bool hasCodeExtensions, - const std::set<DicomTag>& ignoreTagLength); + const std::set<DicomTag>& ignoreTagLength, + ValueRepresentation vr); static void ExtractHeaderAsJson(Json::Value& target, DcmMetaInfo& header,
--- a/OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/DicomParsing/ParsedDicomFile.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -893,7 +893,7 @@ std::set<DicomTag> tmp; std::unique_ptr<DicomValue> v(FromDcmtkBridge::ConvertLeafElement (*element, DicomToJsonFlags_Default, - 0, encoding, hasCodeExtensions, tmp)); + 0, encoding, hasCodeExtensions, tmp, FromDcmtkBridge::Convert(element->getVR()))); if (v.get() == NULL || v->IsNull())
--- a/OrthancFramework/Sources/Toolbox.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/Toolbox.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -724,7 +724,8 @@ // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2 std::string Toolbox::ConvertToUtf8(const std::string& source, Encoding sourceEncoding, - bool hasCodeExtensions) + bool hasCodeExtensions, + bool skipBackslashes) { #if ORTHANC_STATIC_ICU == 1 # if ORTHANC_ENABLE_ICU == 0 @@ -760,7 +761,26 @@ else { const char* encoding = GetBoostLocaleEncoding(sourceEncoding); - s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); + + if (skipBackslashes) + { + /** + * This is to deal with the fact that in Japanese coding + * (ISO_IR 13), backslashes will be converted to the Yen + * character. + **/ + std::vector<std::string> tokens; + TokenizeString(tokens, source, '\\'); + for (size_t i = 0; i < tokens.size(); i++) + { + tokens[i] = boost::locale::conv::to_utf<char>(tokens[i], encoding, boost::locale::conv::skip); + } + JoinStrings(s, tokens, "\\"); + } + else + { + s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); + } } if (hasCodeExtensions) @@ -830,6 +850,50 @@ #endif +#if ORTHANC_ENABLE_LOCALE == 1 + std::string Toolbox::ConvertDicomStringToUtf8(const std::string& source, + Encoding sourceEncoding, + bool hasCodeExtensions, + ValueRepresentation vr) + { + /** + * This method was added in Orthanc 1.12.9, as a consequence of: + * https://discourse.orthanc-server.org/t/issue-with-special-characters-when-scans-where-uploaded-with-specificcharacterset-dicom-tag-value-as-iso-ir-13/5962 + * + * From the DICOM standard: "Two character codes of the + * single-byte character sets invoked in the GL area of the code + * table, 02/00 and 05/12, have special significance in the DICOM + * Standard. 
The character SPACE, represented by bit combination + * 02/00, shall be used for the padding of Data Element Values + * that are character strings. The Graphic Character represented + * by the bit combination 05/12, "\" (BACKSLASH) (reverse solidus) + * in the repertoire ISO-IR 6, shall only be used in character + * strings with Value Representations of UT, ST and LT (see + * Section 6.2). Otherwise the character code 05/12 is used as a + * separator for multi-valued Data Elements (see Section + * 6.4). [...] When the Value of Specific Character Set + * (0008,0005) is either "ISO_IR 13" or "ISO 2022 IR 13", the + * graphic character represented by the bit combination 05/12 is a + * "¥" (YEN SIGN) in the character set of ISO-IR 14." + * https://www.dicomstandard.org/standards/view/data-structures-and-encoding + * + * This description implies that if "sourceEncoding" (which is + * derived from the value of the DICOM Specific Character Set) + * corresponds to "ISO_IR 13" or "ISO 2022 IR 13", AND if the value + * representation is *not* UT, ST, or LT, then backslashes should + * be ignored during the conversion to UTF-8. + **/ + + const bool skipBackslashes = (sourceEncoding == Encoding_Japanese && + vr != ValueRepresentation_UnlimitedText && // UT + vr != ValueRepresentation_ShortText && // ST + vr != ValueRepresentation_LongText); // LT + + return ConvertToUtf8(source, sourceEncoding, hasCodeExtensions, skipBackslashes); + } +#endif + + static bool IsAsciiCharacter(uint8_t c) { return (c != 0 &&
--- a/OrthancFramework/Sources/Toolbox.h Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/Sources/Toolbox.h Wed Jul 09 12:28:12 2025 +0200 @@ -186,7 +186,13 @@ #if ORTHANC_ENABLE_LOCALE == 1 static std::string ConvertToUtf8(const std::string& source, Encoding sourceEncoding, - bool hasCodeExtensions); + bool hasCodeExtensions, + bool skipBackslashes /* was always "false" in Orthanc <= 1.12.8 */); + + static std::string ConvertDicomStringToUtf8(const std::string& source, + Encoding sourceEncoding, + bool hasCodeExtensions, + ValueRepresentation vr); static std::string ConvertFromUtf8(const std::string& source, Encoding targetEncoding);
--- a/OrthancFramework/UnitTestsSources/FrameworkTests.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/UnitTestsSources/FrameworkTests.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -500,7 +500,7 @@ ASSERT_EQ("&abc", Toolbox::ConvertToAscii(s)); // Open in Emacs, then save with UTF-8 encoding, then "hexdump -C" - std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1, false); + std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1, false, false); ASSERT_EQ(15u, utf8.size()); ASSERT_EQ(0xc3, static_cast<unsigned char>(utf8[0])); ASSERT_EQ(0xa0, static_cast<unsigned char>(utf8[1])); @@ -527,8 +527,8 @@ std::string s((char*) &latin1[0], sizeof(latin1) / sizeof(char)); - ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1, false), Encoding_Latin1)); - ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8, false)); + ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1, false, false), Encoding_Latin1)); + ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8, false, false)); }
--- a/OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/UnitTestsSources/FromDcmtkTests.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -264,7 +264,7 @@ { std::string source(testEncodingsEncoded[i]); std::string expected(testEncodingsExpected[i]); - std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i], false); + std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i], false, false); //std::cout << EnumerationToString(testEncodings[i]) << std::endl; EXPECT_EQ(expected, s); } @@ -334,7 +334,7 @@ ParsedDicomFile f(true); f.SetEncoding(testEncodings[i]); - std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false); + std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false, false); f.Insert(DICOM_TAG_PATIENT_NAME, s, false, ""); f.SaveToMemoryBuffer(dicom); } @@ -571,7 +571,7 @@ ASSERT_FALSE(hasCodeExtensions); } - Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false); + Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false, false); f.Replace(DICOM_TAG_PATIENT_NAME, s, false, DicomReplaceMode_InsertIfAbsent, ""); Json::Value v; @@ -1172,7 +1172,7 @@ // Sanity check to test the proper behavior of "EncodingTests.py" std::string encoded = Toolbox::ConvertFromUtf8(testEncodingsExpected[i], testEncodings[i]); ASSERT_STREQ(testEncodingsEncoded[i], encoded.c_str()); - std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i], false); + std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i], false, false); ASSERT_STREQ(testEncodingsExpected[i], decoded.c_str()); if (testEncodings[i] != Encoding_Chinese) @@ -1181,7 +1181,7 @@ // test against Chinese, it is normal that it does not correspond to UTF8 const std::string tmp = Toolbox::ConvertToUtf8( - Toolbox::ConvertFromUtf8(utf8, testEncodings[i]), testEncodings[i], false); + Toolbox::ConvertFromUtf8(utf8, 
testEncodings[i]), testEncodings[i], false, false); ASSERT_STREQ(testEncodingsExpected[i], tmp.c_str()); } }
--- a/OrthancFramework/UnitTestsSources/ToolboxTests.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancFramework/UnitTestsSources/ToolboxTests.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -407,4 +407,32 @@ ASSERT_EQ("8.59Gbps", Toolbox::GetHumanTransferSpeed(false, 1024*1024*1024, 1000000000)); ASSERT_EQ("1.00GB in 1.00s = 8.59Gbps", Toolbox::GetHumanTransferSpeed(true, 1024*1024*1024, 1000000000)); ASSERT_EQ("976.56KB in 1.00s = 8.00Mbps", Toolbox::GetHumanTransferSpeed(true, 1000*1000, 1000000000)); -} \ No newline at end of file +} + +TEST(Toolbox, JapaneseBackslashes) +{ + std::string s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, false); + ASSERT_EQ("ORIGINAL\302\245PRIMARY", s); // NB: The Yen symbol is encoded as 0xC2 0xA5 in UTF-8 + + s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, true); + ASSERT_EQ("ORIGINAL\\PRIMARY", s); + + s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_PersonName); + ASSERT_EQ("ORIGINAL\\PRIMARY", s); + + // Backslashes should only be interpreted as the Yen symbol if VR is ST, LT, or UT + s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_ShortText); + ASSERT_EQ("ORIGINAL\302\245PRIMARY", s); + + s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_LongText); + ASSERT_EQ("ORIGINAL\302\245PRIMARY", s); + + s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Japanese, false, ValueRepresentation_UnlimitedText); + ASSERT_EQ("ORIGINAL\302\245PRIMARY", s); + + s = Orthanc::Toolbox::ConvertToUtf8("ORIGINAL\\PRIMARY", Encoding_Latin1, false, false); + ASSERT_EQ("ORIGINAL\\PRIMARY", s); + + s = Orthanc::Toolbox::ConvertDicomStringToUtf8("ORIGINAL\\PRIMARY", Encoding_Latin1, false, ValueRepresentation_ShortText); + ASSERT_EQ("ORIGINAL\\PRIMARY", s); +}
--- a/OrthancServer/Sources/Search/DatabaseLookup.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancServer/Sources/Search/DatabaseLookup.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -111,8 +111,8 @@ std::set<DicomTag> ignoreTagLength; std::unique_ptr<DicomValue> value(FromDcmtkBridge::ConvertLeafElement - (*element, DicomToJsonFlags_None, - 0, encoding, hasCodeExtensions, ignoreTagLength)); + (*element, DicomToJsonFlags_None, 0, encoding, hasCodeExtensions, + ignoreTagLength, FromDcmtkBridge::Convert(element->getVR()))); // WARNING: Also modify "HierarchicalMatcher::Setup()" if modifying this code if (value.get() == NULL ||
--- a/OrthancServer/Sources/Search/HierarchicalMatcher.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancServer/Sources/Search/HierarchicalMatcher.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -114,8 +114,8 @@ std::set<DicomTag> ignoreTagLength; std::unique_ptr<DicomValue> value(FromDcmtkBridge::ConvertLeafElement - (*element, DicomToJsonFlags_None, - 0, encoding, hasCodeExtensions, ignoreTagLength)); + (*element, DicomToJsonFlags_None, 0, encoding, hasCodeExtensions, + ignoreTagLength, FromDcmtkBridge::Convert(element->getVR()))); // WARNING: Also modify "DatabaseLookup::IsMatch()" if modifying this code if (value.get() == NULL ||
--- a/OrthancServer/UnitTestsSources/UnitTestsMain.cpp Wed Jul 09 08:21:25 2025 +0200 +++ b/OrthancServer/UnitTestsSources/UnitTestsMain.cpp Wed Jul 09 12:28:12 2025 +0200 @@ -194,7 +194,7 @@ const unsigned char raw[] = { 0x63, 0x72, 0xe2, 0x6e, 0x65 }; std::string latin1((char*) &raw[0], sizeof(raw) / sizeof(char)); - std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1, false); + std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1, false, false); ParsedDicomFile dicom(false); dicom.SetEncoding(Encoding_Latin1);