# HG changeset patch # User Sebastien Jodogne # Date 1550076372 -3600 # Node ID cf8cbeb35f333d2440f503eb6a384f4860b4864a # Parent c9a71eb4edcf7030663902f649015aa28d615160 preliminary support of Korean character set diff -r c9a71eb4edcf -r cf8cbeb35f33 .hgignore --- a/.hgignore Tue Feb 12 17:27:33 2019 +0100 +++ b/.hgignore Wed Feb 13 17:46:12 2019 +0100 @@ -4,3 +4,4 @@ *.cpp.orig *.h.orig .vs/ +*~ diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/DicomParsing/DicomDirWriter.cpp --- a/Core/DicomParsing/DicomDirWriter.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/DicomParsing/DicomDirWriter.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -161,6 +161,7 @@ static bool GetUtf8TagValue(std::string& result, DcmItem& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key) { DcmElement* element = NULL; @@ -174,7 +175,7 @@ { if (s != NULL) { - result = Toolbox::ConvertToUtf8(s, encoding); + result = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); } return true; @@ -202,6 +203,7 @@ static bool CopyString(DcmDirectoryRecord& target, DcmDataset& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key, bool optional, bool copyEmpty) @@ -214,7 +216,7 @@ } std::string value; - bool found = GetUtf8TagValue(value, source, encoding, key); + bool found = GetUtf8TagValue(value, source, encoding, hasCodeExtensions, key); if (!found) { @@ -231,33 +233,37 @@ static void CopyStringType1(DcmDirectoryRecord& target, DcmDataset& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key) { - CopyString(target, source, encoding, key, false, false); + CopyString(target, source, encoding, hasCodeExtensions, key, false, false); } static void CopyStringType1C(DcmDirectoryRecord& target, DcmDataset& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key) { - CopyString(target, source, encoding, key, true, false); + CopyString(target, source, encoding, hasCodeExtensions, key, true, false); } static void CopyStringType2(DcmDirectoryRecord& target, DcmDataset& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key) { - CopyString(target, source, encoding, key, false, true); + CopyString(target, source, encoding, hasCodeExtensions, key, false, true); } static void CopyStringType3(DcmDirectoryRecord& target, DcmDataset& source, Encoding encoding, + bool hasCodeExtensions, const DcmTagKey& key) { - CopyString(target, source, encoding, key, true, true); + CopyString(target, source, encoding, hasCodeExtensions, key, true, true); } @@ -298,17 +304,19 @@ void FillPatient(DcmDirectoryRecord& record, DcmDataset& dicom, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { // cf. "DicomDirInterface::buildPatientRecord()" - CopyStringType1C(record, dicom, encoding, DCM_PatientID); - CopyStringType2(record, dicom, encoding, DCM_PatientName); + CopyStringType1C(record, dicom, encoding, hasCodeExtensions, DCM_PatientID); + CopyStringType2(record, dicom, encoding, hasCodeExtensions, DCM_PatientName); } void FillStudy(DcmDirectoryRecord& record, DcmDataset& dicom, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { // cf. "DicomDirInterface::buildStudyRecord()" @@ -316,19 +324,19 @@ SystemToolbox::GetNowDicom(nowDate, nowTime, utc_); std::string studyDate; - if (!GetUtf8TagValue(studyDate, dicom, encoding, DCM_StudyDate) && - !GetUtf8TagValue(studyDate, dicom, encoding, DCM_SeriesDate) && - !GetUtf8TagValue(studyDate, dicom, encoding, DCM_AcquisitionDate) && - !GetUtf8TagValue(studyDate, dicom, encoding, DCM_ContentDate)) + if (!GetUtf8TagValue(studyDate, dicom, encoding, hasCodeExtensions, DCM_StudyDate) && + !GetUtf8TagValue(studyDate, dicom, encoding, hasCodeExtensions, DCM_SeriesDate) && + !GetUtf8TagValue(studyDate, dicom, encoding, hasCodeExtensions, DCM_AcquisitionDate) && + !GetUtf8TagValue(studyDate, dicom, encoding, hasCodeExtensions, DCM_ContentDate)) { studyDate = nowDate; } std::string studyTime; - if (!GetUtf8TagValue(studyTime, dicom, encoding, DCM_StudyTime) && - !GetUtf8TagValue(studyTime, dicom, encoding, DCM_SeriesTime) && - !GetUtf8TagValue(studyTime, dicom, encoding, DCM_AcquisitionTime) && - !GetUtf8TagValue(studyTime, dicom, encoding, DCM_ContentTime)) + if (!GetUtf8TagValue(studyTime, dicom, encoding, hasCodeExtensions, DCM_StudyTime) && + !GetUtf8TagValue(studyTime, dicom, encoding, hasCodeExtensions, DCM_SeriesTime) && + !GetUtf8TagValue(studyTime, dicom, encoding, hasCodeExtensions, DCM_AcquisitionTime) && + !GetUtf8TagValue(studyTime, dicom, encoding, hasCodeExtensions, DCM_ContentTime)) { studyTime = nowTime; } @@ -336,52 +344,54 @@ /* copy attribute values from dataset to study record */ SetTagValue(record, DCM_StudyDate, studyDate); SetTagValue(record, DCM_StudyTime, studyTime); - CopyStringType2(record, dicom, encoding, DCM_StudyDescription); - CopyStringType1(record, dicom, encoding, DCM_StudyInstanceUID); + CopyStringType2(record, dicom, encoding, hasCodeExtensions, DCM_StudyDescription); + CopyStringType1(record, dicom, encoding, hasCodeExtensions, DCM_StudyInstanceUID); /* use type 1C instead of 1 in order to avoid unwanted overwriting */ - CopyStringType1C(record, dicom, encoding, DCM_StudyID); - CopyStringType2(record, dicom, encoding, DCM_AccessionNumber); + CopyStringType1C(record, dicom, encoding, hasCodeExtensions, DCM_StudyID); + CopyStringType2(record, dicom, encoding, hasCodeExtensions, DCM_AccessionNumber); } void FillSeries(DcmDirectoryRecord& record, DcmDataset& dicom, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { // cf. "DicomDirInterface::buildSeriesRecord()" /* copy attribute values from dataset to series record */ - CopyStringType1(record, dicom, encoding, DCM_Modality); - CopyStringType1(record, dicom, encoding, DCM_SeriesInstanceUID); + CopyStringType1(record, dicom, encoding, hasCodeExtensions, DCM_Modality); + CopyStringType1(record, dicom, encoding, hasCodeExtensions, DCM_SeriesInstanceUID); /* use type 1C instead of 1 in order to avoid unwanted overwriting */ - CopyStringType1C(record, dicom, encoding, DCM_SeriesNumber); + CopyStringType1C(record, dicom, encoding, hasCodeExtensions, DCM_SeriesNumber); // Add extended (non-standard) type 3 tags, those are not generated by DCMTK // http://dicom.nema.org/medical/Dicom/2016a/output/chtml/part02/sect_7.3.html // https://groups.google.com/d/msg/orthanc-users/Y7LOvZMDeoc/9cp3kDgxAwAJ if (extendedSopClass_) { - CopyStringType3(record, dicom, encoding, DCM_SeriesDescription); + CopyStringType3(record, dicom, encoding, hasCodeExtensions, DCM_SeriesDescription); } } void FillInstance(DcmDirectoryRecord& record, DcmDataset& dicom, Encoding encoding, + bool hasCodeExtensions, DcmMetaInfo& metaInfo, const char* path) { // cf. "DicomDirInterface::buildImageRecord()" /* copy attribute values from dataset to image record */ - CopyStringType1(record, dicom, encoding, DCM_InstanceNumber); - //CopyElementType1C(record, dicom, encoding, DCM_ImageType); + CopyStringType1(record, dicom, encoding, hasCodeExtensions, DCM_InstanceNumber); + //CopyElementType1C(record, dicom, encoding, hasCodeExtensions, DCM_ImageType); // REMOVED since 0.9.7: copyElementType1C(dicom, DCM_ReferencedImageSequence, record); std::string sopClassUid, sopInstanceUid, transferSyntaxUid; - if (!GetUtf8TagValue(sopClassUid, dicom, encoding, DCM_SOPClassUID) || - !GetUtf8TagValue(sopInstanceUid, dicom, encoding, DCM_SOPInstanceUID) || - !GetUtf8TagValue(transferSyntaxUid, metaInfo, encoding, DCM_TransferSyntaxUID)) + if (!GetUtf8TagValue(sopClassUid, dicom, encoding, hasCodeExtensions, DCM_SOPClassUID) || + !GetUtf8TagValue(sopInstanceUid, dicom, encoding, hasCodeExtensions, DCM_SOPInstanceUID) || + !GetUtf8TagValue(transferSyntaxUid, metaInfo, encoding, hasCodeExtensions, DCM_TransferSyntaxUID)) { throw OrthancException(ErrorCode_BadFileFormat); } @@ -401,7 +411,9 @@ const char* path) { DcmDataset& dataset = *dicom.GetDcmtkObject().getDataset(); - Encoding encoding = dicom.GetEncoding(); + + bool hasCodeExtensions; + Encoding encoding = dicom.DetectEncoding(hasCodeExtensions); bool found; std::string id; @@ -410,7 +422,7 @@ switch (level) { case ResourceType_Patient: - if (!GetUtf8TagValue(id, dataset, encoding, DCM_PatientID)) + if (!GetUtf8TagValue(id, dataset, encoding, hasCodeExtensions, DCM_PatientID)) { // Be tolerant about missing patient ID. Fixes issue #124 // (GET /studies/ID/media fails for certain dicom file). @@ -422,17 +434,17 @@ break; case ResourceType_Study: - found = GetUtf8TagValue(id, dataset, encoding, DCM_StudyInstanceUID); + found = GetUtf8TagValue(id, dataset, encoding, hasCodeExtensions, DCM_StudyInstanceUID); type = ERT_Study; break; case ResourceType_Series: - found = GetUtf8TagValue(id, dataset, encoding, DCM_SeriesInstanceUID); + found = GetUtf8TagValue(id, dataset, encoding, hasCodeExtensions, DCM_SeriesInstanceUID); type = ERT_Series; break; case ResourceType_Instance: - found = GetUtf8TagValue(id, dataset, encoding, DCM_SOPInstanceUID); + found = GetUtf8TagValue(id, dataset, encoding, hasCodeExtensions, DCM_SOPInstanceUID); type = ERT_Image; break; @@ -459,26 +471,26 @@ switch (level) { case ResourceType_Patient: - FillPatient(*record, dataset, encoding); + FillPatient(*record, dataset, encoding, hasCodeExtensions); break; case ResourceType_Study: - FillStudy(*record, dataset, encoding); + FillStudy(*record, dataset, encoding, hasCodeExtensions); break; case ResourceType_Series: - FillSeries(*record, dataset, encoding); + FillSeries(*record, dataset, encoding, hasCodeExtensions); break; case ResourceType_Instance: - FillInstance(*record, dataset, encoding, *dicom.GetDcmtkObject().getMetaInfo(), path); + FillInstance(*record, dataset, encoding, hasCodeExtensions, *dicom.GetDcmtkObject().getMetaInfo(), path); break; default: throw OrthancException(ErrorCode_InternalError); } - CopyStringType1C(*record, dataset, encoding, DCM_SpecificCharacterSet); + CopyStringType1C(*record, dataset, encoding, hasCodeExtensions, DCM_SpecificCharacterSet); target = record.get(); GetRoot().insertSub(record.release()); diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/DicomParsing/FromDcmtkBridge.cpp --- a/Core/DicomParsing/FromDcmtkBridge.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/DicomParsing/FromDcmtkBridge.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -414,37 +414,49 @@ } - Encoding FromDcmtkBridge::DetectEncoding(DcmItem& dataset, + Encoding FromDcmtkBridge::DetectEncoding(bool& hasCodeExtensions, + DcmItem& dataset, Encoding defaultEncoding) { - Encoding encoding = defaultEncoding; + // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2 OFString tmp; - if (dataset.findAndGetOFString(DCM_SpecificCharacterSet, tmp).good()) + if (dataset.findAndGetOFStringArray(DCM_SpecificCharacterSet, tmp).good()) { - std::string characterSet = Toolbox::StripSpaces(std::string(tmp.c_str())); - - if (characterSet.empty()) + std::vector tokens; + Toolbox::TokenizeString(tokens, std::string(tmp.c_str()), '\\'); + + hasCodeExtensions = (tokens.size() > 1); + + for (size_t i = 0; i < tokens.size(); i++) { - // Empty specific character set tag: Use the default encoding - } - else if (GetDicomEncoding(encoding, characterSet.c_str())) - { - // The specific character set is supported by the Orthanc core - } - else - { - LOG(WARNING) << "Value of Specific Character Set (0008,0005) is not supported: " << characterSet - << ", fallback to ASCII (remove all special characters)"; - encoding = Encoding_Ascii; + std::string characterSet = Toolbox::StripSpaces(tokens[i]); + + if (!characterSet.empty()) + { + Encoding encoding; + + if (GetDicomEncoding(encoding, characterSet.c_str())) + { + // The specific character set is supported by the Orthanc core + return encoding; + } + else + { + LOG(WARNING) << "Value of Specific Character Set (0008,0005) is not supported: " << characterSet + << ", fallback to ASCII (remove all special characters)"; + return Encoding_Ascii; + } + } } } else { - // No specific character set tag: Use the default encoding + hasCodeExtensions = false; } - - return encoding; + + // No specific character set tag: Use the default encoding + return defaultEncoding; } @@ -454,8 +466,9 @@ Encoding defaultEncoding) { std::set ignoreTagLength; - - Encoding encoding = DetectEncoding(dataset, defaultEncoding); + + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions, dataset, defaultEncoding); target.Clear(); for (unsigned long i = 0; i < dataset.card(); i++) @@ -466,7 +479,7 @@ target.SetValue(element->getTag().getGTag(), element->getTag().getETag(), ConvertLeafElement(*element, DicomToJsonFlags_Default, - maxStringLength, encoding, ignoreTagLength)); + maxStringLength, encoding, hasCodeExtensions, ignoreTagLength)); } } } @@ -488,6 +501,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding encoding, + bool hasCodeExtensions, const std::set& ignoreTagLength) { if (!element.isLeaf()) @@ -507,7 +521,7 @@ else { std::string s(c); - std::string utf8 = Toolbox::ConvertToUtf8(s, encoding); + std::string utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); if (maxStringLength != 0 && utf8.size() > maxStringLength && @@ -855,6 +869,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding encoding, + bool hasCodeExtensions, const std::set& ignoreTagLength) { if (parent.type() == Json::nullValue) @@ -869,7 +884,7 @@ { // The "0" below lets "LeafValueToJson()" take care of "TooLong" values std::auto_ptr v(FromDcmtkBridge::ConvertLeafElement - (element, flags, 0, encoding, ignoreTagLength)); + (element, flags, 0, encoding, hasCodeExtensions, ignoreTagLength)); if (ignoreTagLength.find(GetTag(element)) == ignoreTagLength.end()) { @@ -894,7 +909,7 @@ { DcmItem* child = sequence.getItem(i); Json::Value& v = target.append(Json::objectValue); - DatasetToJson(v, *child, format, flags, maxStringLength, encoding, ignoreTagLength); + DatasetToJson(v, *child, format, flags, maxStringLength, encoding, hasCodeExtensions, ignoreTagLength); } } } @@ -906,6 +921,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding encoding, + bool hasCodeExtensions, const std::set& ignoreTagLength) { assert(parent.type() == Json::objectValue); @@ -952,7 +968,7 @@ } FromDcmtkBridge::ElementToJson(parent, *element, format, flags, - maxStringLength, encoding, ignoreTagLength); + maxStringLength, encoding, hasCodeExtensions, ignoreTagLength); } } @@ -965,10 +981,11 @@ Encoding defaultEncoding, const std::set& ignoreTagLength) { - Encoding encoding = DetectEncoding(dataset, defaultEncoding); + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions, dataset, defaultEncoding); target = Json::objectValue; - DatasetToJson(target, dataset, format, flags, maxStringLength, encoding, ignoreTagLength); + DatasetToJson(target, dataset, format, flags, maxStringLength, encoding, hasCodeExtensions, ignoreTagLength); } @@ -980,7 +997,7 @@ { std::set ignoreTagLength; target = Json::objectValue; - DatasetToJson(target, dataset, format, flags, maxStringLength, Encoding_Ascii, ignoreTagLength); + DatasetToJson(target, dataset, format, flags, maxStringLength, Encoding_Ascii, false, ignoreTagLength); } @@ -2033,6 +2050,7 @@ void FromDcmtkBridge::ChangeStringEncoding(DcmItem& dataset, Encoding source, + bool hasSourceCodeExtensions, Encoding target) { // Recursive exploration of a dataset to change the encoding of @@ -2055,7 +2073,7 @@ element->getString(c).good() && c != NULL) { - std::string a = Toolbox::ConvertToUtf8(c, source); + std::string a = Toolbox::ConvertToUtf8(c, source, hasSourceCodeExtensions); std::string b = Toolbox::ConvertFromUtf8(a, target); element->putString(b.c_str()); } @@ -2069,7 +2087,7 @@ for (unsigned long j = 0; j < sequence.card(); j++) { - ChangeStringEncoding(*sequence.getItem(j), source, target); + ChangeStringEncoding(*sequence.getItem(j), source, hasSourceCodeExtensions, target); } } } @@ -2192,13 +2210,15 @@ ITagVisitor& visitor, const std::vector& parentTags, const std::vector& parentIndexes, - Encoding encoding); + Encoding encoding, + bool hasCodeExtensions); static void ApplyVisitorToDataset(DcmItem& dataset, ITagVisitor& visitor, const std::vector& parentTags, const std::vector& parentIndexes, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { assert(parentTags.size() == parentIndexes.size()); @@ -2211,7 +2231,7 @@ } else { - ApplyVisitorToElement(*element, visitor, parentTags, parentIndexes, encoding); + ApplyVisitorToElement(*element, visitor, parentTags, parentIndexes, encoding, hasCodeExtensions); } } } @@ -2222,7 +2242,8 @@ const std::vector& parentTags, const std::vector& parentIndexes, const DicomTag& tag, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { // TODO - Merge this function, that is more recent, with ConvertLeafElement() @@ -2299,7 +2320,7 @@ if (c != NULL) // This case corresponds to the empty string { std::string s(c); - utf8 = Toolbox::ConvertToUtf8(s, encoding); + utf8 = Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions); } std::string newValue; @@ -2380,7 +2401,7 @@ std::string s(reinterpret_cast(data), l); ITagVisitor::Action action = visitor.VisitString (ignored, parentTags, parentIndexes, tag, vr, - Toolbox::ConvertToUtf8(s, encoding)); + Toolbox::ConvertToUtf8(s, encoding, hasCodeExtensions)); if (action != ITagVisitor::Action_None) { @@ -2608,7 +2629,8 @@ ITagVisitor& visitor, const std::vector& parentTags, const std::vector& parentIndexes, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { assert(parentTags.size() == parentIndexes.size()); @@ -2616,7 +2638,7 @@ if (element.isLeaf()) { - ApplyVisitorToLeaf(element, visitor, parentTags, parentIndexes, tag, encoding); + ApplyVisitorToLeaf(element, visitor, parentTags, parentIndexes, tag, encoding, hasCodeExtensions); } else { @@ -2640,7 +2662,7 @@ { indexes.back() = static_cast(i); DcmItem* child = sequence.getItem(i); - ApplyVisitorToDataset(*child, visitor, tags, indexes, encoding); + ApplyVisitorToDataset(*child, visitor, tags, indexes, encoding, hasCodeExtensions); } } } @@ -2653,7 +2675,8 @@ { std::vector parentTags; std::vector parentIndexes; - Encoding encoding = DetectEncoding(dataset, defaultEncoding); - ApplyVisitorToDataset(dataset, visitor, parentTags, parentIndexes, encoding); + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions, dataset, defaultEncoding); + ApplyVisitorToDataset(dataset, visitor, parentTags, parentIndexes, encoding, hasCodeExtensions); } } diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/DicomParsing/FromDcmtkBridge.h --- a/Core/DicomParsing/FromDcmtkBridge.h Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/DicomParsing/FromDcmtkBridge.h Wed Feb 13 17:46:12 2019 +0100 @@ -92,6 +92,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding encoding, + bool hasCodeExtensions, const std::set& ignoreTagLength); static void ElementToJson(Json::Value& parent, @@ -100,6 +101,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding dicomEncoding, + bool hasCodeExtensions, const std::set& ignoreTagLength); static void ExtractDicomAsJson(Json::Value& target, @@ -112,6 +114,7 @@ static void ChangeStringEncoding(DcmItem& dataset, Encoding source, + bool hasSourceCodeExtensions, Encoding target); public: @@ -124,7 +127,8 @@ unsigned int maxMultiplicity, const std::string& privateCreator); - static Encoding DetectEncoding(DcmItem& dataset, + static Encoding DetectEncoding(bool& hasCodeExtensions, + DcmItem& dataset, Encoding defaultEncoding); static DicomTag Convert(const DcmTag& tag); @@ -137,6 +141,7 @@ DicomToJsonFlags flags, unsigned int maxStringLength, Encoding encoding, + bool hasCodeExtensions, const std::set& ignoreTagLength); static void ExtractHeaderAsJson(Json::Value& target, diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/DicomParsing/ParsedDicomFile.cpp --- a/Core/DicomParsing/ParsedDicomFile.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/DicomParsing/ParsedDicomFile.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -645,7 +645,10 @@ } InvalidateCache(); - std::auto_ptr element(FromDcmtkBridge::FromJson(tag, value, decodeDataUriScheme, GetEncoding())); + + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions); + std::auto_ptr element(FromDcmtkBridge::FromJson(tag, value, decodeDataUriScheme, encoding)); InsertInternal(*pimpl_->file_->getDataset(), element.release()); } @@ -706,8 +709,9 @@ } else { - Encoding encoding = GetEncoding(); - if (GetEncoding() != Encoding_Utf8) + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions); + if (encoding != Encoding_Utf8) { binary = Toolbox::ConvertFromUtf8(utf8Value, encoding); decoded = &binary; @@ -766,7 +770,10 @@ } std::auto_ptr element(FromDcmtkBridge::CreateElementForTag(tag)); - FromDcmtkBridge::FillElementWithString(*element, tag, utf8Value, decodeDataUriScheme, GetEncoding()); + + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions); + FromDcmtkBridge::FillElementWithString(*element, tag, utf8Value, decodeDataUriScheme, encoding); InsertInternal(dicom, element.release()); UpdateStorageUid(tag, utf8Value, false); @@ -805,7 +812,9 @@ } } - InsertInternal(dicom, FromDcmtkBridge::FromJson(tag, value, decodeDataUriScheme, GetEncoding())); + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions); + InsertInternal(dicom, FromDcmtkBridge::FromJson(tag, value, decodeDataUriScheme, encoding)); if (tag == DICOM_TAG_SOP_CLASS_UID || tag == DICOM_TAG_SOP_INSTANCE_UID) @@ -875,10 +884,13 @@ return false; } + bool hasCodeExtensions; + Encoding encoding = DetectEncoding(hasCodeExtensions); + std::set tmp; std::auto_ptr v(FromDcmtkBridge::ConvertLeafElement (*element, DicomToJsonFlags_Default, - 0, GetEncoding(), tmp)); + 0, encoding, hasCodeExtensions, tmp)); if (v.get() == NULL || v->IsNull()) @@ -1294,9 +1306,10 @@ } - Encoding ParsedDicomFile::GetEncoding() const + Encoding ParsedDicomFile::DetectEncoding(bool& hasCodeExtensions) const { - return FromDcmtkBridge::DetectEncoding(*pimpl_->file_->getDataset(), + return FromDcmtkBridge::DetectEncoding(hasCodeExtensions, + *pimpl_->file_->getDataset(), GetDefaultDicomEncoding()); } @@ -1532,12 +1545,13 @@ void ParsedDicomFile::ChangeEncoding(Encoding target) { - Encoding source = GetEncoding(); + bool hasCodeExtensions; + Encoding source = DetectEncoding(hasCodeExtensions); if (source != target) // Avoid unnecessary conversion { ReplacePlainString(DICOM_TAG_SPECIFIC_CHARACTER_SET, GetDicomSpecificCharacterSet(target)); - FromDcmtkBridge::ChangeStringEncoding(*pimpl_->file_->getDataset(), source, target); + FromDcmtkBridge::ChangeStringEncoding(*pimpl_->file_->getDataset(), source, hasCodeExtensions, target); } } diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/DicomParsing/ParsedDicomFile.h --- a/Core/DicomParsing/ParsedDicomFile.h Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/DicomParsing/ParsedDicomFile.h Wed Feb 13 17:46:12 2019 +0100 @@ -186,7 +186,7 @@ void EmbedImage(MimeType mime, const std::string& content); - Encoding GetEncoding() const; + Encoding DetectEncoding(bool& hasCodeExtensions) const; // WARNING: This function only sets the encoding, it will not // convert the encoding of the tags. Use "ChangeEncoding()" if need be. diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/Enumerations.cpp --- a/Core/Enumerations.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/Enumerations.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -647,6 +647,9 @@ case Encoding_Chinese: return "Chinese"; + case Encoding_Korean: + return "Korean"; + default: throw OrthancException(ErrorCode_ParameterOutOfRange); } @@ -1202,6 +1205,11 @@ return Encoding_Chinese; } + if (s == "KOREAN") + { + return Encoding_Korean; + } + throw OrthancException(ErrorCode_ParameterOutOfRange); } @@ -1836,11 +1844,13 @@ { encoding = Encoding_Hebrew; } - else if (s == "ISO_IR 166" || s == "ISO 2022 IR 166") + else if (s == "ISO_IR 166" || + s == "ISO 2022 IR 166") { encoding = Encoding_Thai; } - else if (s == "ISO_IR 13" || s == "ISO 2022 IR 13") + else if (s == "ISO_IR 13" || + s == "ISO 2022 IR 13") { encoding = Encoding_Japanese; } @@ -1855,11 +1865,11 @@ **/ encoding = Encoding_Chinese; } + else if (s == "ISO 2022 IR 149") + { + encoding = Encoding_Korean; + } /* - else if (s == "ISO 2022 IR 149") - { - TODO - } else if (s == "ISO 2022 IR 159") { TODO @@ -2013,6 +2023,9 @@ case Encoding_Thai: return "ISO_IR 166"; + case Encoding_Korean: + return "ISO 2022 IR 149"; + default: throw OrthancException(ErrorCode_ParameterOutOfRange); } diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/Enumerations.h --- a/Core/Enumerations.h Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/Enumerations.h Wed Feb 13 17:46:12 2019 +0100 @@ -442,10 +442,10 @@ Encoding_Hebrew, Encoding_Thai, // TIS 620-2533 Encoding_Japanese, // JIS X 0201 (Shift JIS): Katakana - Encoding_Chinese // GB18030 - Chinese simplified + Encoding_Chinese, // GB18030 - Chinese simplified //Encoding_JapaneseKanji, // Multibyte - JIS X 0208: Kanji //Encoding_JapaneseSupplementaryKanji, // Multibyte - JIS X 0212: Supplementary Kanji set - //Encoding_Korean, // Multibyte - KS X 1001: Hangul and Hanja + Encoding_Korean // Multibyte - KS X 1001: Hangul and Hanja }; diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/Toolbox.cpp --- a/Core/Toolbox.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/Toolbox.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -514,6 +514,10 @@ return "TIS620.2533-0"; break; + case Encoding_Korean: + return "ISO-IR-149"; + break; + default: throw OrthancException(ErrorCode_NotImplemented); } @@ -522,27 +526,46 @@ #if ORTHANC_ENABLE_LOCALE == 1 + // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2 std::string Toolbox::ConvertToUtf8(const std::string& source, - Encoding sourceEncoding) + Encoding sourceEncoding, + bool hasCodeExtensions) { // The "::skip" flag makes boost skip invalid UTF-8 // characters. This can occur in badly-encoded DICOM files. try { - if (sourceEncoding == Encoding_Utf8) - { - // Already in UTF-8: No conversion is required - return boost::locale::conv::utf_to_utf(source, boost::locale::conv::skip); - } - else if (sourceEncoding == Encoding_Ascii) + if (sourceEncoding == Encoding_Ascii) { return ConvertToAscii(source); } - else + else { - const char* encoding = GetBoostLocaleEncoding(sourceEncoding); - return boost::locale::conv::to_utf(source, encoding, boost::locale::conv::skip); + std::string s; + + if (sourceEncoding == Encoding_Utf8) + { + // Already in UTF-8: No conversion is required, but we ensure + // the output is correctly encoded + s = boost::locale::conv::utf_to_utf(source, boost::locale::conv::skip); + } + else + { + const char* encoding = GetBoostLocaleEncoding(sourceEncoding); + s = boost::locale::conv::to_utf(source, encoding, boost::locale::conv::skip); + } + + if (hasCodeExtensions) + { + std::string t; + RemoveIso2022EscapeSequences(t, s); + return t; + } + else + { + return s; + } } } catch (std::runtime_error&) @@ -1593,6 +1616,182 @@ return boost::regex_replace(source, pattern, formatter); } + + + namespace Iso2022 + { + /** + Returns whether the string s contains a single-byte control message + at index i + **/ + static inline bool IsControlMessage1(const std::string& s, size_t i) + { + if (i < s.size()) + { + char c = s[i]; + return + (c == '\x0f') || // Locking shift zero + (c == '\x0e'); // Locking shift one + } + else + { + return false; + } + } + + /** + Returns whether the string s contains a double-byte control message + at index i + **/ + static inline size_t IsControlMessage2(const std::string& s, size_t i) + { + if (i + 1 < s.size()) + { + char c1 = s[i]; + char c2 = s[i + 1]; + return (c1 == 0x1b) && ( + (c2 == '\x6e') || // Locking shift two + (c2 == '\x6f') || // Locking shift three + (c2 == '\x4e') || // Single shift two (alt) + (c2 == '\x4f') || // Single shift three (alt) + (c2 == '\x7c') || // Locking shift three right + (c2 == '\x7d') || // Locking shift two right + (c2 == '\x7e') // Locking shift one right + ); + } + else + { + return false; + } + } + + /** + Returns whether the string s contains a triple-byte control message + at index i + **/ + static inline size_t IsControlMessage3(const std::string& s, size_t i) + { + if (i + 2 < s.size()) + { + char c1 = s[i]; + char c2 = s[i + 1]; + char c3 = s[i + 2]; + return ((c1 == '\x8e' && c2 == 0x1b && c3 == '\x4e') || + (c1 == '\x8f' && c2 == 0x1b && c3 == '\x4f')); + } + else + { + return false; + } + } + + /** + This function returns true if the index i in the supplied string s: + - is valid + - contains the c character + This function returns false otherwise. + **/ + static inline bool TestCharValue( + const std::string& s, size_t i, char c) + { + if (i < s.size()) + return s[i] == c; + else + return false; + } + + /** + This function returns true if the index i in the supplied string s: + - is valid + - has a c character that is >= cMin and <= cMax (included) + This function returns false otherwise. + **/ + static inline bool TestCharRange( + const std::string& s, size_t i, char cMin, char cMax) + { + if (i < s.size()) + return (s[i] >= cMin) && (s[i] <= cMax); + else + return false; + } + + /** + This function returns the total length in bytes of the escape sequence + located in string s at index i, if there is one, or 0 otherwise. + **/ + static inline size_t GetEscapeSequenceLength(const std::string& s, size_t i) + { + if (TestCharValue(s, i, 0x1b)) + { + size_t j = i+1; + + // advance reading cursor while we are in a sequence + while (TestCharRange(s, j, '\x20', '\x2f')) + ++j; + + // check there is a valid termination byte AND we're long enough (there + // must be at least one byte between 0x20 and 0x2f + if (TestCharRange(s, j, '\x30', '\x7f') && (j - i) >= 2) + return j - i + 1; + else + return 0; + } + else + return 0; + } + } + + + + /** + This function will strip all ISO/IEC 2022 control codes and escape + sequences. + Please see https://en.wikipedia.org/wiki/ISO/IEC_2022 (as of 2019-02) + for a list of those. + + Please note that this operation is potentially destructive, because + it removes the character set information from the byte stream. + + However, in the case where the encoding is unique, then suppressing + the escape sequences allows to provide us with a clean string after + conversion to utf-8 with boost. + **/ + void Toolbox::RemoveIso2022EscapeSequences(std::string& dest, const std::string& src) + { + // we need AT MOST the same size as the source string in the output + dest.clear(); + if (dest.capacity() < src.size()) + dest.reserve(src.size()); + + size_t i = 0; + + // uint8_t view to the string + while (i < src.size()) + { + size_t j = i; + + // The i index will only be incremented if a message is detected + // in that case, the message is skipped and the index is set to the + // next position to read + if (Iso2022::IsControlMessage1(src, i)) + i += 1; + else if (Iso2022::IsControlMessage2(src, i)) + i += 2; + else if (Iso2022::IsControlMessage3(src, i)) + i += 3; + else + i += Iso2022::GetEscapeSequenceLength(src, i); + + // if the index was NOT incremented, this means there was no message at + // this location: we then may copy the character at this index and + // increment the index to point to the next read position + if (j == i) + { + dest.push_back(src[i]); + i++; + } + } + } } diff -r c9a71eb4edcf -r cf8cbeb35f33 Core/Toolbox.h --- a/Core/Toolbox.h Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/Toolbox.h Wed Feb 13 17:46:12 2019 +0100 @@ -163,7 +163,8 @@ #if ORTHANC_ENABLE_LOCALE == 1 std::string ConvertToUtf8(const std::string& source, - Encoding sourceEncoding); + Encoding sourceEncoding, + bool hasCodeExtensions); std::string ConvertFromUtf8(const std::string& source, Encoding targetEncoding); @@ -248,6 +249,9 @@ std::string SubstituteVariables(const std::string& source, const std::map& dictionary); + + void RemoveIso2022EscapeSequences(std::string& dest, + const std::string& src); } } diff -r c9a71eb4edcf -r cf8cbeb35f33 NEWS --- a/NEWS Tue Feb 12 17:27:33 2019 +0100 +++ b/NEWS Wed Feb 13 17:46:12 2019 +0100 @@ -1,6 +1,8 @@ Pending changes in the mainline =============================== +* Basic support for character sets with code extensions + Version 1.5.4 (2019-02-08) ========================== diff -r c9a71eb4edcf -r cf8cbeb35f33 OrthancServer/OrthancRestApi/OrthancRestAnonymizeModify.cpp --- a/OrthancServer/OrthancRestApi/OrthancRestAnonymizeModify.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/OrthancServer/OrthancRestApi/OrthancRestAnonymizeModify.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -519,7 +519,10 @@ else if (tag["Type"] == "String") { std::string value = tag["Value"].asString(); - dicom.ReplacePlainString(*it, Toolbox::ConvertFromUtf8(value, dicom.GetEncoding())); + + bool hasCodeExtensions; + Encoding encoding = dicom.DetectEncoding(hasCodeExtensions); + dicom.ReplacePlainString(*it, Toolbox::ConvertFromUtf8(value, encoding)); } } } diff -r c9a71eb4edcf -r cf8cbeb35f33 OrthancServer/Search/DatabaseLookup.cpp --- a/OrthancServer/Search/DatabaseLookup.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/OrthancServer/Search/DatabaseLookup.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -95,7 +95,8 @@ bool DatabaseLookup::IsMatch(DcmItem& item, - Encoding encoding) const + Encoding encoding, + bool hasCodeExtensions) const { for (size_t i = 0; i < constraints_.size(); i++) { @@ -118,7 +119,7 @@ std::set ignoreTagLength; std::auto_ptr value(FromDcmtkBridge::ConvertLeafElement (*element, DicomToJsonFlags_None, - 0, encoding, ignoreTagLength)); + 0, encoding, hasCodeExtensions, ignoreTagLength)); // WARNING: Also modify "HierarchicalMatcher::Setup()" if modifying this code if (value.get() == NULL || diff -r c9a71eb4edcf -r cf8cbeb35f33 OrthancServer/Search/DatabaseLookup.h --- a/OrthancServer/Search/DatabaseLookup.h Tue Feb 12 17:27:33 2019 +0100 +++ b/OrthancServer/Search/DatabaseLookup.h Wed Feb 13 17:46:12 2019 +0100 @@ -74,7 +74,8 @@ bool IsMatch(const DicomMap& value) const; bool IsMatch(DcmItem& item, - Encoding encoding) const; + Encoding encoding, + bool hasCodeExtensions) const; void AddDicomConstraint(const DicomTag& tag, const std::string& dicomQuery, diff -r c9a71eb4edcf -r cf8cbeb35f33 OrthancServer/Search/HierarchicalMatcher.cpp --- a/OrthancServer/Search/HierarchicalMatcher.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/OrthancServer/Search/HierarchicalMatcher.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -53,7 +53,9 @@ caseSensitivePN = lock.GetConfiguration().GetBooleanParameter("CaseSensitivePN", false); } - Setup(*query.GetDcmtkObject().getDataset(), caseSensitivePN, query.GetEncoding()); + bool hasCodeExtensions; + Encoding encoding = query.DetectEncoding(hasCodeExtensions); + Setup(*query.GetDcmtkObject().getDataset(), caseSensitivePN, encoding, hasCodeExtensions); } @@ -72,7 +74,8 @@ void HierarchicalMatcher::Setup(DcmItem& dataset, bool caseSensitivePN, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { for (unsigned long i = 0; i < dataset.card(); i++) { @@ -108,7 +111,7 @@ } else if (sequence.card() == 1) { - sequences_[tag] = new HierarchicalMatcher(*sequence.getItem(0), caseSensitivePN, encoding); + sequences_[tag] = new HierarchicalMatcher(*sequence.getItem(0), caseSensitivePN, encoding, hasCodeExtensions); } else { @@ -122,7 +125,7 @@ std::set ignoreTagLength; std::auto_ptr value(FromDcmtkBridge::ConvertLeafElement (*element, DicomToJsonFlags_None, - 0, encoding, ignoreTagLength)); + 0, encoding, hasCodeExtensions, ignoreTagLength)); // WARNING: Also modify "DatabaseLookup::IsMatch()" if modifying this code if (value.get() == NULL || @@ -197,15 +200,19 @@ bool HierarchicalMatcher::Match(ParsedDicomFile& dicom) const { + bool hasCodeExtensions; + Encoding encoding = dicom.DetectEncoding(hasCodeExtensions); + return MatchInternal(*dicom.GetDcmtkObject().getDataset(), - dicom.GetEncoding()); + encoding, hasCodeExtensions); } bool HierarchicalMatcher::MatchInternal(DcmItem& item, - Encoding encoding) const + Encoding encoding, + bool hasCodeExtensions) const { - if (!flatConstraints_.IsMatch(item, encoding)) + if (!flatConstraints_.IsMatch(item, encoding, hasCodeExtensions)) { return false; } @@ -228,7 +235,7 @@ for (unsigned long i = 0; i < sequence->card(); i++) { - if (it->second->MatchInternal(*sequence->getItem(i), encoding)) + if (it->second->MatchInternal(*sequence->getItem(i), encoding, hasCodeExtensions)) { match = true; break; @@ -247,7 +254,8 @@ DcmDataset* HierarchicalMatcher::ExtractInternal(DcmItem& source, - Encoding encoding) const + Encoding encoding, + bool hasCodeExtensions) const { std::auto_ptr target(new DcmDataset); @@ -283,13 +291,13 @@ { cloned->append(new DcmItem(*sequence->getItem(i))); } - else if (it->second->MatchInternal(*sequence->getItem(i), encoding)) // TODO Might be optimized + else if (it->second->MatchInternal(*sequence->getItem(i), encoding, hasCodeExtensions)) // TODO Might be optimized { // It is necessary to encapsulate the child dataset into a // "DcmItem" object before it can be included in a // sequence. Otherwise, "dciodvfy" reports an error "Bad // tag in sequence - Expecting Item or Sequence Delimiter." - std::auto_ptr child(it->second->ExtractInternal(*sequence->getItem(i), encoding)); + std::auto_ptr child(it->second->ExtractInternal(*sequence->getItem(i), encoding, hasCodeExtensions)); cloned->append(new DcmItem(*child)); } } @@ -304,11 +312,14 @@ ParsedDicomFile* HierarchicalMatcher::Extract(ParsedDicomFile& dicom) const { + bool hasCodeExtensions; + Encoding encoding = dicom.DetectEncoding(hasCodeExtensions); + std::auto_ptr dataset(ExtractInternal(*dicom.GetDcmtkObject().getDataset(), - dicom.GetEncoding())); + encoding, hasCodeExtensions)); std::auto_ptr result(new ParsedDicomFile(*dataset)); - result->SetEncoding(dicom.GetEncoding()); + result->SetEncoding(encoding); return result.release(); } diff -r c9a71eb4edcf -r cf8cbeb35f33 OrthancServer/Search/HierarchicalMatcher.h --- a/OrthancServer/Search/HierarchicalMatcher.h Tue Feb 12 17:27:33 2019 +0100 +++ b/OrthancServer/Search/HierarchicalMatcher.h Wed Feb 13 17:46:12 2019 +0100 @@ -51,20 +51,24 @@ void Setup(DcmItem& query, bool caseSensitivePN, - Encoding encoding); + Encoding encoding, + bool hasCodeExtensions); HierarchicalMatcher(DcmItem& query, bool caseSensitivePN, - Encoding encoding) + Encoding encoding, + bool hasCodeExtensions) { - Setup(query, caseSensitivePN, encoding); + Setup(query, caseSensitivePN, encoding, hasCodeExtensions); } bool MatchInternal(DcmItem& dicom, - Encoding encoding) const; + Encoding encoding, + bool hasCodeExtensions) const; DcmDataset* ExtractInternal(DcmItem& dicom, - Encoding encoding) const; + Encoding encoding, + bool hasCodeExtensions) const; public: HierarchicalMatcher(ParsedDicomFile& query); diff -r c9a71eb4edcf -r cf8cbeb35f33 UnitTestsSources/DicomMapTests.cpp --- a/UnitTestsSources/DicomMapTests.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/UnitTestsSources/DicomMapTests.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -425,7 +425,7 @@ const unsigned char raw[] = { 0x63, 0x72, 0xe2, 0x6e, 0x65 }; std::string latin1((char*) &raw[0], sizeof(raw) / sizeof(char)); - std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1); + std::string utf8 = Toolbox::ConvertToUtf8(latin1, Encoding_Latin1, false); ParsedDicomFile dicom(false); dicom.SetEncoding(Encoding_Latin1); diff -r c9a71eb4edcf -r cf8cbeb35f33 UnitTestsSources/FromDcmtkTests.cpp --- a/UnitTestsSources/FromDcmtkTests.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/UnitTestsSources/FromDcmtkTests.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -217,7 +217,7 @@ { std::string source(testEncodingsEncoded[i]); std::string expected(testEncodingsExpected[i]); - std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i]); + std::string s = Toolbox::ConvertToUtf8(source, testEncodings[i], false); //std::cout << EnumerationToString(testEncodings[i]) << std::endl; EXPECT_EQ(expected, s); } @@ -262,7 +262,7 @@ // http://dicom.nema.org/medical/dicom/current/output/html/part03.html#table_C.12-4 ASSERT_FALSE(GetDicomEncoding(e, "ISO 2022 IR 87")); //ASSERT_EQ(Encoding_JapaneseKanji, e); ASSERT_FALSE(GetDicomEncoding(e, "ISO 2022 IR 159")); //ASSERT_EQ(Encoding_JapaneseKanjiSupplementary, e); - ASSERT_FALSE(GetDicomEncoding(e, "ISO 2022 IR 149")); //ASSERT_EQ(Encoding_Korean, e); + ASSERT_TRUE(GetDicomEncoding(e, "ISO 2022 IR 149")); ASSERT_EQ(Encoding_Korean, e); // http://dicom.nema.org/medical/dicom/current/output/html/part03.html#table_C.12-5 ASSERT_TRUE(GetDicomEncoding(e, "ISO_IR 192")); ASSERT_EQ(Encoding_Utf8, e); @@ -282,7 +282,7 @@ ParsedDicomFile f(true); f.SetEncoding(testEncodings[i]); - std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i]); + std::string s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false); f.Insert(DICOM_TAG_PATIENT_NAME, s, false); f.SaveToMemoryBuffer(dicom); } @@ -293,7 +293,9 @@ if (testEncodings[i] != Encoding_Ascii) { - ASSERT_EQ(testEncodings[i], g.GetEncoding()); + bool hasCodeExtensions; + ASSERT_EQ(testEncodings[i], g.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); } std::string tag; @@ -405,16 +407,16 @@ ignoreTagLength.insert(DICOM_TAG_PATIENT_ID); FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Short, - DicomToJsonFlags_Default, 0, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 0, Encoding_Ascii, false, ignoreTagLength); ASSERT_TRUE(b.isMember("0010,0010")); ASSERT_EQ("Hello", b["0010,0010"].asString()); FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Short, - DicomToJsonFlags_Default, 3, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 3, Encoding_Ascii, false, ignoreTagLength); ASSERT_TRUE(b["0010,0010"].isNull()); // "Hello" has more than 3 characters FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Full, - DicomToJsonFlags_Default, 3, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 3, Encoding_Ascii, false, ignoreTagLength); ASSERT_TRUE(b["0010,0010"].isObject()); ASSERT_EQ("PatientName", b["0010,0010"]["Name"].asString()); ASSERT_EQ("TooLong", b["0010,0010"]["Type"].asString()); @@ -422,7 +424,7 @@ ignoreTagLength.insert(DICOM_TAG_PATIENT_NAME); FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Short, - DicomToJsonFlags_Default, 3, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 3, Encoding_Ascii, false, ignoreTagLength); ASSERT_EQ("Hello", b["0010,0010"].asString()); } @@ -448,7 +450,7 @@ Json::Value b; std::set ignoreTagLength; FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Short, - DicomToJsonFlags_Default, 0, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 0, Encoding_Ascii, false, ignoreTagLength); ASSERT_EQ("Hello", b["0010,0010"].asString()); } @@ -461,7 +463,7 @@ Json::Value b; std::set ignoreTagLength; FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Short, - DicomToJsonFlags_Default, 0, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 0, Encoding_Ascii, false, ignoreTagLength); ASSERT_EQ(Json::arrayValue, b["0008,1110"].type()); ASSERT_EQ(2u, b["0008,1110"].size()); @@ -480,7 +482,7 @@ Json::Value b; std::set ignoreTagLength; FromDcmtkBridge::ElementToJson(b, *element, DicomToJsonFormat_Full, - DicomToJsonFlags_Default, 0, Encoding_Ascii, ignoreTagLength); + DicomToJsonFlags_Default, 0, Encoding_Ascii, false, ignoreTagLength); Json::Value c; ServerToolbox::SimplifyTags(c, b, DicomToJsonFormat_Human); @@ -599,10 +601,12 @@ if (testEncodings[i] != Encoding_Ascii) { - ASSERT_EQ(testEncodings[i], f.GetEncoding()); + bool hasCodeExtensions; + ASSERT_EQ(testEncodings[i], f.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); } - Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i]); + Json::Value s = Toolbox::ConvertToUtf8(testEncodingsEncoded[i], testEncodings[i], false); f.Replace(DICOM_TAG_PATIENT_NAME, s, false, DicomReplaceMode_InsertIfAbsent); Json::Value v; @@ -1161,7 +1165,7 @@ // Sanity check to test the proper behavior of "EncodingTests.py" std::string encoded = Toolbox::ConvertFromUtf8(testEncodingsExpected[i], testEncodings[i]); ASSERT_STREQ(testEncodingsEncoded[i], encoded.c_str()); - std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i]); + std::string decoded = Toolbox::ConvertToUtf8(encoded, testEncodings[i], false); ASSERT_STREQ(testEncodingsExpected[i], decoded.c_str()); if (testEncodings[i] != Encoding_Chinese) @@ -1169,7 +1173,7 @@ // A specific source string is used in "EncodingTests.py" to // test against Chinese, it is normal that it does not correspond to UTF8 - std::string encoded = Toolbox::ConvertToUtf8(Toolbox::ConvertFromUtf8(utf8, testEncodings[i]), testEncodings[i]); + std::string encoded = Toolbox::ConvertToUtf8(Toolbox::ConvertFromUtf8(utf8, testEncodings[i]), testEncodings[i], false); ASSERT_STREQ(testEncodingsExpected[i], encoded.c_str()); } } @@ -1227,7 +1231,9 @@ std::string tag; ParsedDicomFile dicom(m, Encoding_Utf8); - ASSERT_EQ(Encoding_Utf8, dicom.GetEncoding()); + bool hasCodeExtensions; + ASSERT_EQ(Encoding_Utf8, dicom.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); ASSERT_TRUE(dicom.GetTagValue(tag, DICOM_TAG_PATIENT_NAME)); ASSERT_EQ(tag, testEncodingsExpected[i]); @@ -1240,7 +1246,8 @@ dicom.ChangeEncoding(testEncodings[i]); - ASSERT_EQ(testEncodings[i], dicom.GetEncoding()); + ASSERT_EQ(testEncodings[i], dicom.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); const char* c = NULL; ASSERT_TRUE(dicom.GetDcmtkObject().getDataset()->findAndGetString(DCM_PatientName, c).good()); @@ -1275,7 +1282,10 @@ m.SetValue(DICOM_TAG_PATIENT_NAME, "HELLO", false); ParsedDicomFile d(m, Encoding_Latin3 /* default encoding */); - ASSERT_EQ(Encoding_Latin3, d.GetEncoding()); + + bool hasCodeExtensions; + ASSERT_EQ(Encoding_Latin3, d.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); } { @@ -1285,7 +1295,10 @@ m.SetValue(DICOM_TAG_PATIENT_NAME, "HELLO", false); ParsedDicomFile d(m, Encoding_Latin3 /* default encoding */); - ASSERT_EQ(Encoding_Japanese, d.GetEncoding()); + + bool hasCodeExtensions; + ASSERT_EQ(Encoding_Japanese, d.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); } { @@ -1314,6 +1327,152 @@ m.SetValue(DICOM_TAG_PATIENT_NAME, "HELLO", false); ParsedDicomFile d(m, Encoding_Latin3 /* default encoding */); - ASSERT_EQ(Encoding_Latin3, d.GetEncoding()); + + bool hasCodeExtensions; + ASSERT_EQ(Encoding_Latin3, d.DetectEncoding(hasCodeExtensions)); + ASSERT_FALSE(hasCodeExtensions); } } + + + +TEST(Toolbox, RemoveIso2022EscapeSequences) +{ + // +----------------------------------+ + // | one-byte control messages | + // +----------------------------------+ + + static const uint8_t iso2022_cstr_oneByteControl[] = { + 0x0f, 0x41, + 0x0e, 0x42, + 0x8e, 0x1b, 0x4e, 0x43, + 0x8f, 0x1b, 0x4f, 0x44, + 0x8e, 0x1b, 0x4a, 0x45, + 0x8f, 0x1b, 0x4a, 0x46, + 0x50, 0x51, 0x52, 0x00 + }; + + static const uint8_t iso2022_cstr_oneByteControl_ref[] = { + 0x41, + 0x42, + 0x43, + 0x44, + 0x8e, 0x1b, 0x4a, 0x45, + 0x8f, 0x1b, 0x4a, 0x46, + 0x50, 0x51, 0x52, 0x00 + }; + + // +----------------------------------+ + // | two-byte control messages | + // +----------------------------------+ + + static const uint8_t iso2022_cstr_twoByteControl[] = { + 0x1b, 0x6e, 0x41, + 0x1b, 0x6f, 0x42, + 0x1b, 0x4e, 0x43, + 0x1b, 0x4f, 0x44, + 0x1b, 0x7e, 0x45, + 0x1b, 0x7d, 0x46, + 0x1b, 0x7c, 0x47, 0x00 + }; + + static const uint8_t iso2022_cstr_twoByteControl_ref[] = { + 0x41, + 0x42, + 0x43, + 0x44, + 0x45, + 0x46, + 0x47, 0x00 + }; + + // +----------------------------------+ + // | various-length escape sequences | + // +----------------------------------+ + + static const uint8_t iso2022_cstr_escapeSequence[] = { + 0x1b, 0x40, 0x41, // 1b and 40 should not be removed (invalid esc seq) + 0x1b, 0x50, 0x42, // ditto + 0x1b, 0x7f, 0x43, // ditto + 0x1b, 0x21, 0x4a, 0x44, // this will match + 0x1b, 0x20, 0x21, 0x2f, 0x40, 0x45, // this will match + 0x1b, 0x20, 0x21, 0x2f, 0x2f, 0x40, 0x46, // this will match too + 0x1b, 0x20, 0x21, 0x2f, 0x1f, 0x47, 0x48, 0x00 // this will NOT match! + }; + + static const uint8_t iso2022_cstr_escapeSequence_ref[] = { + 0x1b, 0x40, 0x41, // 1b and 40 should not be removed (invalid esc seq) + 0x1b, 0x50, 0x42, // ditto + 0x1b, 0x7f, 0x43, // ditto + 0x44, // this will match + 0x45, // this will match + 0x46, // this will match too + 0x1b, 0x20, 0x21, 0x2f, 0x1f, 0x47, 0x48, 0x00 // this will NOT match! + }; + + + // +----------------------------------+ + // | a real-world japanese sample | + // +----------------------------------+ + + static const uint8_t iso2022_cstr_real_ir13[] = { + 0xd4, 0xcf, 0xc0, 0xde, 0x5e, 0xc0, 0xdb, 0xb3, + 0x3d, 0x1b, 0x24, 0x42, 0x3b, 0x33, 0x45, 0x44, + 0x1b, 0x28, 0x4a, 0x5e, 0x1b, 0x24, 0x42, 0x42, + 0x40, 0x4f, 0x3a, 0x1b, 0x28, 0x4a, 0x3d, 0x1b, + 0x24, 0x42, 0x24, 0x64, 0x24, 0x5e, 0x24, 0x40, + 0x1b, 0x28, 0x4a, 0x5e, 0x1b, 0x24, 0x42, 0x24, + 0x3f, 0x24, 0x6d, 0x24, 0x26, 0x1b, 0x28, 0x4a, 0x00 + }; + + static const uint8_t iso2022_cstr_real_ir13_ref[] = { + 0xd4, 0xcf, 0xc0, 0xde, 0x5e, 0xc0, 0xdb, 0xb3, + 0x3d, + 0x3b, 0x33, 0x45, 0x44, + 0x5e, + 0x42, + 0x40, 0x4f, 0x3a, + 0x3d, + 0x24, 0x64, 0x24, 0x5e, 0x24, 0x40, + 0x5e, + 0x24, + 0x3f, 0x24, 0x6d, 0x24, 0x26, 0x00 + }; + + + + // +----------------------------------+ + // | the actual test | + // +----------------------------------+ + + std::string iso2022_str_oneByteControl( + reinterpret_cast(iso2022_cstr_oneByteControl)); + std::string iso2022_str_oneByteControl_ref( + reinterpret_cast(iso2022_cstr_oneByteControl_ref)); + std::string iso2022_str_twoByteControl( + reinterpret_cast(iso2022_cstr_twoByteControl)); + std::string iso2022_str_twoByteControl_ref( + reinterpret_cast(iso2022_cstr_twoByteControl_ref)); + std::string iso2022_str_escapeSequence( + reinterpret_cast(iso2022_cstr_escapeSequence)); + std::string iso2022_str_escapeSequence_ref( + reinterpret_cast(iso2022_cstr_escapeSequence_ref)); + std::string iso2022_str_real_ir13( + reinterpret_cast(iso2022_cstr_real_ir13)); + std::string iso2022_str_real_ir13_ref( + reinterpret_cast(iso2022_cstr_real_ir13_ref)); + + std::string dest; + + Toolbox::RemoveIso2022EscapeSequences(dest, iso2022_str_oneByteControl); + ASSERT_EQ(dest, iso2022_str_oneByteControl_ref); + + Toolbox::RemoveIso2022EscapeSequences(dest, iso2022_str_twoByteControl); + ASSERT_EQ(dest, iso2022_str_twoByteControl_ref); + + Toolbox::RemoveIso2022EscapeSequences(dest, iso2022_str_escapeSequence); + ASSERT_EQ(dest, iso2022_str_escapeSequence_ref); + + Toolbox::RemoveIso2022EscapeSequences(dest, iso2022_str_real_ir13); + ASSERT_EQ(dest, iso2022_str_real_ir13_ref); +} diff -r c9a71eb4edcf -r cf8cbeb35f33 UnitTestsSources/UnitTestsMain.cpp --- a/UnitTestsSources/UnitTestsMain.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/UnitTestsSources/UnitTestsMain.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -450,7 +450,7 @@ ASSERT_EQ("&abc", Toolbox::ConvertToAscii(s)); // Open in Emacs, then save with UTF-8 encoding, then "hexdump -C" - std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1); + std::string utf8 = Toolbox::ConvertToUtf8(s, Encoding_Latin1, false); ASSERT_EQ(15u, utf8.size()); ASSERT_EQ(0xc3, static_cast(utf8[0])); ASSERT_EQ(0xa0, static_cast(utf8[1])); @@ -477,8 +477,8 @@ std::string s((char*) &latin1[0], sizeof(latin1) / sizeof(char)); - ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1), Encoding_Latin1)); - ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8)); + ASSERT_EQ(s, Toolbox::ConvertFromUtf8(Toolbox::ConvertToUtf8(s, Encoding_Latin1, false), Encoding_Latin1)); + ASSERT_EQ("cre", Toolbox::ConvertToUtf8(s, Encoding_Utf8, false)); } @@ -690,6 +690,7 @@ ASSERT_EQ(Encoding_Japanese, StringToEncoding(EnumerationToString(Encoding_Japanese))); ASSERT_EQ(Encoding_Chinese, StringToEncoding(EnumerationToString(Encoding_Chinese))); ASSERT_EQ(Encoding_Thai, StringToEncoding(EnumerationToString(Encoding_Thai))); + ASSERT_EQ(Encoding_Korean, StringToEncoding(EnumerationToString(Encoding_Korean))); ASSERT_EQ(ResourceType_Patient, StringToResourceType(EnumerationToString(ResourceType_Patient))); ASSERT_EQ(ResourceType_Study, StringToResourceType(EnumerationToString(ResourceType_Study)));