# HG changeset patch
# User Sebastien Jodogne <s.jodogne@gmail.com>
# Date 1550149444 -3600
# Node ID 4be505c2ac565df203c7b4a2991fd7f13ac9cca6
# Parent  1a0b4db799e85563810a783c1f27113429511497
Separation of ideographic and phonetic characters in DICOMweb JSON and XML

diff -r 1a0b4db799e8 -r 4be505c2ac56 Core/DicomParsing/DicomWebJsonVisitor.cpp
--- a/Core/DicomParsing/DicomWebJsonVisitor.cpp	Thu Feb 14 12:15:28 2019 +0100
+++ b/Core/DicomParsing/DicomWebJsonVisitor.cpp	Thu Feb 14 14:04:04 2019 +0100
@@ -43,9 +43,12 @@
 
 
 static const char* const KEY_ALPHABETIC = "Alphabetic";
+static const char* const KEY_IDEOGRAPHIC = "Ideographic";  // Person name (PN) group
+static const char* const KEY_PHONETIC = "Phonetic";        // Person name (PN) group
 static const char* const KEY_BULK_DATA_URI = "BulkDataURI";
 static const char* const KEY_INLINE_BINARY = "InlineBinary";
 static const char* const KEY_SQ = "SQ";
+static const char* const KEY_TAG = "tag";  // Attribute of the XML "DicomAttribute" elements
 static const char* const KEY_VALUE = "Value";
 static const char* const KEY_VR = "vr";
 
@@ -53,9 +56,42 @@
 namespace Orthanc
 {
 #if ORTHANC_ENABLE_PUGIXML == 1
+  // Appends to "target" one XML element per component of the person
+  // name "source", whose components are separated by "^". The name
+  // of each element is given by the position of its component:
+  // FamilyName, GivenName, MiddleName, NamePrefix, then NameSuffix.
+  // Tokens located after the fifth one are ignored, and no element
+  // is created for the trailing positions that are absent from
+  // "source".
+  static void DecomposeXmlPersonName(pugi::xml_node& target,
+                                     const std::string& source)
+  {
+    static const char* const COMPONENTS[] = {
+      "FamilyName",
+      "GivenName",
+      "MiddleName",
+      "NamePrefix",
+      "NameSuffix"
+    };
+
+    static const size_t COMPONENTS_COUNT =
+      sizeof(COMPONENTS) / sizeof(COMPONENTS[0]);
+
+    std::vector<std::string> parts;
+    Toolbox::TokenizeString(parts, source, '^');
+
+    // The i-th token, if any, becomes the text of the i-th element
+    for (size_t i = 0; i < COMPONENTS_COUNT && i < parts.size(); i++)
+    {
+      pugi::xml_node element = target.append_child(COMPONENTS[i]);
+      element.text() = parts[i].c_str();
+    }
+  }
+  
   static void ExploreXmlDataset(pugi::xml_node& target,
                                 const Json::Value& source)
   {
+    // http://dicom.nema.org/medical/dicom/current/output/chtml/part18/sect_F.3.html#table_F.3.1-1
     assert(source.type() == Json::objectValue);
 
     Json::Value::Members members = source.getMemberNames();
@@ -65,15 +101,15 @@
       const Json::Value& content = source[members[i]];
 
       assert(content.type() == Json::objectValue &&
-             content.isMember("vr") &&
-             content["vr"].type() == Json::stringValue);
-      const std::string vr = content["vr"].asString();
+             content.isMember(KEY_VR) &&
+             content[KEY_VR].type() == Json::stringValue);
+      const std::string vr = content[KEY_VR].asString();
 
       const std::string keyword = FromDcmtkBridge::GetTagName(tag, "");
     
       pugi::xml_node node = target.append_child("DicomAttribute");
-      node.append_attribute("tag").set_value(members[i].c_str());
-      node.append_attribute("vr").set_value(vr.c_str());
+      node.append_attribute(KEY_TAG).set_value(members[i].c_str());
+      node.append_attribute(KEY_VR).set_value(vr.c_str());
 
       if (keyword != std::string(DcmTag_ERROR_TagName))
       {
@@ -99,40 +135,38 @@
           }
           if (vr == "PN")
           {
-            if (content[KEY_VALUE][j].isMember(KEY_ALPHABETIC) &&
-                content[KEY_VALUE][j][KEY_ALPHABETIC].type() == Json::stringValue)
+            bool hasAlphabetic = (content[KEY_VALUE][j].isMember(KEY_ALPHABETIC) &&
+                                  content[KEY_VALUE][j][KEY_ALPHABETIC].type() == Json::stringValue);
+
+            bool hasIdeographic = (content[KEY_VALUE][j].isMember(KEY_IDEOGRAPHIC) &&
+                                   content[KEY_VALUE][j][KEY_IDEOGRAPHIC].type() == Json::stringValue);
+
+            bool hasPhonetic = (content[KEY_VALUE][j].isMember(KEY_PHONETIC) &&
+                                content[KEY_VALUE][j][KEY_PHONETIC].type() == Json::stringValue);
+
+            if (hasAlphabetic ||
+                hasIdeographic ||
+                hasPhonetic)
             {
-              std::vector<std::string> tokens;
-              Toolbox::TokenizeString(tokens, content[KEY_VALUE][j][KEY_ALPHABETIC].asString(), '^');
-
               pugi::xml_node child = node.append_child("PersonName");
               child.append_attribute("number").set_value(number.c_str());
-            
-              pugi::xml_node name = child.append_child(KEY_ALPHABETIC);
-            
-              if (tokens.size() >= 1)
+
+              if (hasAlphabetic)
               {
-                name.append_child("FamilyName").text() = tokens[0].c_str();
-              }
-            
-              if (tokens.size() >= 2)
-              {
-                name.append_child("GivenName").text() = tokens[1].c_str();
+                pugi::xml_node name = child.append_child(KEY_ALPHABETIC);
+                DecomposeXmlPersonName(name, content[KEY_VALUE][j][KEY_ALPHABETIC].asString());
               }
-            
-              if (tokens.size() >= 3)
+
+              if (hasIdeographic)
               {
-                name.append_child("MiddleName").text() = tokens[2].c_str();
+                pugi::xml_node name = child.append_child(KEY_IDEOGRAPHIC);
+                DecomposeXmlPersonName(name, content[KEY_VALUE][j][KEY_IDEOGRAPHIC].asString());
               }
-            
-              if (tokens.size() >= 4)
+
+              if (hasPhonetic)
               {
-                name.append_child("NamePrefix").text() = tokens[3].c_str();
-              }
-            
-              if (tokens.size() >= 5)
-              {
-                name.append_child("NameSuffix").text() = tokens[4].c_str();
+                pugi::xml_node name = child.append_child(KEY_PHONETIC);
+                DecomposeXmlPersonName(name, content[KEY_VALUE][j][KEY_PHONETIC].asString());
               }
             }
           }
@@ -517,8 +551,25 @@
                   Json::Value value = Json::objectValue;
                   if (!tokens[i].empty())
                   {
-                    value[KEY_ALPHABETIC] = tokens[i];
+                    std::vector<std::string> components;
+                    Toolbox::TokenizeString(components, tokens[i], '=');
+
+                    if (components.size() >= 1)
+                    {
+                      value[KEY_ALPHABETIC] = components[0];
+                    }
+
+                    if (components.size() >= 2)
+                    {
+                      value[KEY_IDEOGRAPHIC] = components[1];
+                    }
+
+                    if (components.size() >= 3)
+                    {
+                      value[KEY_PHONETIC] = components[2];
+                    }
                   }
+                  
                   node[KEY_VALUE].append(value);
                   break;
                 }
diff -r 1a0b4db799e8 -r 4be505c2ac56 Core/Toolbox.cpp
--- a/Core/Toolbox.cpp	Thu Feb 14 12:15:28 2019 +0100
+++ b/Core/Toolbox.cpp	Thu Feb 14 14:04:04 2019 +0100
@@ -464,63 +464,48 @@
 
       case Encoding_Latin1:
         return "ISO-8859-1";
-        break;
 
       case Encoding_Latin2:
         return "ISO-8859-2";
-        break;
 
       case Encoding_Latin3:
         return "ISO-8859-3";
-        break;
 
       case Encoding_Latin4:
         return "ISO-8859-4";
-        break;
 
       case Encoding_Latin5:
         return "ISO-8859-9";
-        break;
 
       case Encoding_Cyrillic:
         return "ISO-8859-5";
-        break;
 
       case Encoding_Windows1251:
         return "WINDOWS-1251";
-        break;
 
       case Encoding_Arabic:
         return "ISO-8859-6";
-        break;
 
       case Encoding_Greek:
         return "ISO-8859-7";
-        break;
 
       case Encoding_Hebrew:
         return "ISO-8859-8";
-        break;
         
       case Encoding_Japanese:
         return "SHIFT-JIS";
-        break;
 
       case Encoding_Chinese:
         return "GB18030";
-        break;
 
       case Encoding_Thai:
         return "TIS620.2533-0";
-        break;
 
       case Encoding_Korean:
         return "ISO-IR-149";
-        break;
 
       case Encoding_JapaneseKanji:
         return "JIS";
-        break;
 
       default:
         throw OrthancException(ErrorCode_NotImplemented);
diff -r 1a0b4db799e8 -r 4be505c2ac56 NEWS
--- a/NEWS	Thu Feb 14 12:15:28 2019 +0100
+++ b/NEWS	Thu Feb 14 14:04:04 2019 +0100
@@ -1,6 +1,7 @@
 Pending changes in the mainline
 ===============================
 
+* Separation of ideographic and phonetic characters in DICOMweb JSON and XML
 * Support of Japanese Kanji (ISO 2022 IR 87) and Korean (ISO 2022 IR 149) encodings
 * Basic support for character sets with code extensions (ISO 2022 escape sequences)
 
diff -r 1a0b4db799e8 -r 4be505c2ac56 TODO
--- a/TODO	Thu Feb 14 12:15:28 2019 +0100
+++ b/TODO	Thu Feb 14 14:04:04 2019 +0100
@@ -92,7 +92,6 @@
 * Support Supplementary Kanji set (ISO 2022 IR 159)
 * Implement the following unit tests:
   - Japanese: http://dicom.nema.org/MEDICAL/dicom/2017c/output/chtml/part05/chapter_H.html
-  - Korean: http://dicom.nema.org/MEDICAL/dicom/2017c/output/chtml/part05/chapter_I.html
   - Chinese:
     http://dicom.nema.org/MEDICAL/dicom/2017c/output/chtml/part05/chapter_J.html
     http://dicom.nema.org/MEDICAL/dicom/2017c/output/chtml/part05/chapter_K.html
diff -r 1a0b4db799e8 -r 4be505c2ac56 UnitTestsSources/FromDcmtkTests.cpp
--- a/UnitTestsSources/FromDcmtkTests.cpp	Thu Feb 14 12:15:28 2019 +0100
+++ b/UnitTestsSources/FromDcmtkTests.cpp	Thu Feb 14 14:04:04 2019 +0100
@@ -36,6 +36,7 @@
 
 #include "../Core/DicomNetworking/DicomFindAnswers.h"
 #include "../Core/DicomParsing/DicomModification.h"
+#include "../Core/DicomParsing/DicomWebJsonVisitor.h"
 #include "../Core/DicomParsing/FromDcmtkBridge.h"
 #include "../Core/DicomParsing/Internals/DicomImageDecoder.h"
 #include "../Core/DicomParsing/ToDcmtkBridge.h"
@@ -54,6 +55,10 @@
 #include <dcmtk/dcmdata/dcelem.h>
 #include <dcmtk/dcmdata/dcdeftag.h>
 
+#if ORTHANC_ENABLE_PUGIXML == 1
+#  include <pugixml.hpp>
+#endif
+
 using namespace Orthanc;
 
 TEST(DicomFormat, Tag)
@@ -1476,3 +1481,112 @@
   Toolbox::RemoveIso2022EscapeSequences(dest, iso2022_str_real_ir13);
   ASSERT_EQ(dest, iso2022_str_real_ir13_ref);
 }
+
+
+
+// Converts the "a/b" byte notation used by the character set examples
+// of the DICOM specification (e.g. "04/08 06/15") into raw bytes: each
+// space-separated "a/b" token, with a and b in [0,15], gives the byte
+// a * 16 + b. Throws OrthancException on a malformed/out-of-range token.
+static std::string DecodeFromSpecification(const std::string& s)
+{
+  std::vector<std::string> tokens;
+  Toolbox::TokenizeString(tokens, s, ' ');
+
+  std::string result;
+  result.resize(tokens.size());
+  for (size_t i = 0; i < tokens.size(); i++)
+  {
+    std::vector<std::string> components;
+    Toolbox::TokenizeString(components, tokens[i], '/');
+    if (components.size() != 2)
+    {
+      // A bare "throw;" outside of a handler would call std::terminate()
+      throw OrthancException(ErrorCode_BadFileFormat);
+    }
+
+    const int a = boost::lexical_cast<int>(components[0]);
+    const int b = boost::lexical_cast<int>(components[1]);
+    if (a < 0 || a > 15 || b < 0 || b > 15)
+    {
+      throw OrthancException(ErrorCode_ParameterOutOfRange);
+    }
+    result[i] = static_cast<uint8_t>(a * 16 + b);
+  }
+  return result;
+}
+
+
+
+TEST(Toolbox, EncodingsKorean)
+{
+  // Checks the decoding of the Korean person name given in the DICOM standard:
+  // http://dicom.nema.org/MEDICAL/dicom/2017c/output/chtml/part05/sect_I.2.html
+  std::string korean = DecodeFromSpecification(
+    "04/08 06/15 06/14 06/07 05/14 04/07 06/09 06/12 06/04 06/15 06/14 06/07 03/13 "
+    "01/11 02/04 02/09 04/03 15/11 15/03 05/14 01/11 02/04 02/09 04/03 13/01 12/14 "
+    "13/04 13/07 03/13 01/11 02/04 02/09 04/03 12/08 10/11 05/14 01/11 02/04 02/09 "
+    "04/03 11/01 14/06 11/05 11/15");
+  // Expected UTF-8 counterpart of "korean": 3 groups separated by "=" (0x3d).
+  // This array can be re-generated using command-line:
+  // echo -n "Hong^Gildong=..." | hexdump -v -e '14/1 "0x%02x, "' -e '"\n"'
+  static const uint8_t utf8raw[] = {
+    0x48, 0x6f, 0x6e, 0x67, 0x5e, 0x47, 0x69, 0x6c, 0x64, 0x6f, 0x6e, 0x67, 0x3d, 0xe6,
+    0xb4, 0xaa, 0x5e, 0xe5, 0x90, 0x89, 0xe6, 0xb4, 0x9e, 0x3d, 0xed, 0x99, 0x8d, 0x5e,
+    0xea, 0xb8, 0xb8, 0xeb, 0x8f, 0x99
+  };
+
+  std::string utf8(reinterpret_cast<const char*>(utf8raw), sizeof(utf8raw));
+  // Store the raw bytes in PatientName, declaring them as "\ISO 2022 IR 149"
+  ParsedDicomFile dicom(false);
+  dicom.ReplacePlainString(DICOM_TAG_SPECIFIC_CHARACTER_SET, "\\ISO 2022 IR 149");
+  ASSERT_TRUE(dicom.GetDcmtkObject().getDataset()->putAndInsertString
+              (DCM_PatientName, korean.c_str(), korean.size(), true).good());
+  // Reading the tag back must give the UTF-8 string
+  std::string value;
+  ASSERT_TRUE(dicom.GetTagValue(value, DICOM_TAG_PATIENT_NAME));
+  ASSERT_EQ(utf8, value);
+  // DICOMweb JSON: bytes 0-11, 13-22 and 24-33 of "utf8" are the 3 groups
+  DicomWebJsonVisitor visitor;
+  dicom.Apply(visitor);
+  ASSERT_EQ(utf8.substr(0, 12), visitor.GetResult()["00100010"]["Value"][0]["Alphabetic"].asString());
+  ASSERT_EQ(utf8.substr(13, 10), visitor.GetResult()["00100010"]["Value"][0]["Ideographic"].asString());
+  ASSERT_EQ(utf8.substr(24), visitor.GetResult()["00100010"]["Value"][0]["Phonetic"].asString());
+
+#if ORTHANC_ENABLE_PUGIXML == 1
+  // http://dicom.nema.org/medical/dicom/current/output/chtml/part18/sect_F.3.html#table_F.3.1-1
+  std::string xml;
+  visitor.FormatXml(xml);
+
+  pugi::xml_document doc;
+  doc.load_string(xml.c_str());  // NOTE(review): the parse result is not checked
+  // The XML output is expected to declare "ISO_IR 192" as its character set
+  pugi::xpath_node node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00080005\"]/Value");
+  ASSERT_STREQ("ISO_IR 192", node.node().text().as_string());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00080005\"]");
+  ASSERT_STREQ("CS", node.node().attribute("vr").value());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]");
+  ASSERT_STREQ("PN", node.node().attribute("vr").value());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Alphabetic/FamilyName");
+  ASSERT_STREQ("Hong", node.node().text().as_string());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Alphabetic/GivenName");
+  ASSERT_STREQ("Gildong", node.node().text().as_string());
+  // Within each group of "utf8", "^" (bytes 16 and 27) splits family/given names
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Ideographic/FamilyName");
+  ASSERT_EQ(utf8.substr(13, 3), node.node().text().as_string());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Ideographic/GivenName");
+  ASSERT_EQ(utf8.substr(17, 6), node.node().text().as_string());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Phonetic/FamilyName");
+  ASSERT_EQ(utf8.substr(24, 3), node.node().text().as_string());
+
+  node = doc.select_single_node("//NativeDicomModel/DicomAttribute[@tag=\"00100010\"]/PersonName/Phonetic/GivenName");
+  ASSERT_EQ(utf8.substr(28), node.node().text().as_string());
+#endif  
+}
+