Mercurial > hg > orthanc
diff Core/Toolbox.cpp @ 3217:cf8cbeb35f33
preliminary support of Korean character set
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Wed, 13 Feb 2019 17:46:12 +0100 |
parents | 810772486249 |
children | 9a83d94b2a1e |
line wrap: on
line diff
--- a/Core/Toolbox.cpp Tue Feb 12 17:27:33 2019 +0100 +++ b/Core/Toolbox.cpp Wed Feb 13 17:46:12 2019 +0100 @@ -514,6 +514,10 @@ return "TIS620.2533-0"; break; + case Encoding_Korean: + return "ISO-IR-149"; + break; + default: throw OrthancException(ErrorCode_NotImplemented); } @@ -522,27 +526,46 @@ #if ORTHANC_ENABLE_LOCALE == 1 + // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2 std::string Toolbox::ConvertToUtf8(const std::string& source, - Encoding sourceEncoding) + Encoding sourceEncoding, + bool hasCodeExtensions) { // The "::skip" flag makes boost skip invalid UTF-8 // characters. This can occur in badly-encoded DICOM files. try { - if (sourceEncoding == Encoding_Utf8) - { - // Already in UTF-8: No conversion is required - return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip); - } - else if (sourceEncoding == Encoding_Ascii) + if (sourceEncoding == Encoding_Ascii) { return ConvertToAscii(source); } - else + else { - const char* encoding = GetBoostLocaleEncoding(sourceEncoding); - return boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); + std::string s; + + if (sourceEncoding == Encoding_Utf8) + { + // Already in UTF-8: No conversion is required, but we ensure + // the output is correctly encoded + s = boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip); + } + else + { + const char* encoding = GetBoostLocaleEncoding(sourceEncoding); + s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); + } + + if (hasCodeExtensions) + { + std::string t; + RemoveIso2022EscapeSequences(t, s); + return t; + } + else + { + return s; + } } } catch (std::runtime_error&) @@ -1593,6 +1616,182 @@ return boost::regex_replace(source, pattern, formatter); } + + + namespace Iso2022 + { + /** + Returns whether the string s contains a single-byte control message + at index i + **/ + static inline bool IsControlMessage1(const std::string& s, size_t i) + { + if (i < s.size()) + { + char c = s[i]; + return + (c == '\x0f') || // Locking shift zero + (c == '\x0e'); // Locking shift one + } + else + { + return false; + } + } + + /** + Returns whether the string s contains a double-byte control message + at index i + **/ + static inline size_t IsControlMessage2(const std::string& s, size_t i) + { + if (i + 1 < s.size()) + { + char c1 = s[i]; + char c2 = s[i + 1]; + return (c1 == 0x1b) && ( + (c2 == '\x6e') || // Locking shift two + (c2 == '\x6f') || // Locking shift three + (c2 == '\x4e') || // Single shift two (alt) + (c2 == '\x4f') || // Single shift three (alt) + (c2 == '\x7c') || // Locking shift three right + (c2 == '\x7d') || // Locking shift two right + (c2 == '\x7e') // Locking shift one right + ); + } + else + { + return false; + } + } + + /** + Returns whether the string s contains a triple-byte control message + at index i + **/ + static inline size_t IsControlMessage3(const std::string& s, size_t i) + { + if (i + 2 < s.size()) + { + char c1 = s[i]; + char c2 = s[i + 1]; + char c3 = s[i + 2]; + return ((c1 == '\x8e' && c2 == 0x1b && c3 == '\x4e') || + (c1 == '\x8f' && c2 == 0x1b && c3 == '\x4f')); + } + else + { + return false; + } + } + + /** + This function returns true if the index i in the supplied string s: + - is valid + - contains the c character + This function returns false otherwise. + **/ + static inline bool TestCharValue( + const std::string& s, size_t i, char c) + { + if (i < s.size()) + return s[i] == c; + else + return false; + } + + /** + This function returns true if the index i in the supplied string s: + - is valid + - has a c character that is >= cMin and <= cMax (included) + This function returns false otherwise. + **/ + static inline bool TestCharRange( + const std::string& s, size_t i, char cMin, char cMax) + { + if (i < s.size()) + return (s[i] >= cMin) && (s[i] <= cMax); + else + return false; + } + + /** + This function returns the total length in bytes of the escape sequence + located in string s at index i, if there is one, or 0 otherwise. + **/ + static inline size_t GetEscapeSequenceLength(const std::string& s, size_t i) + { + if (TestCharValue(s, i, 0x1b)) + { + size_t j = i+1; + + // advance reading cursor while we are in a sequence + while (TestCharRange(s, j, '\x20', '\x2f')) + ++j; + + // check there is a valid termination byte AND we're long enough (there + // must be at least one byte between 0x20 and 0x2f + if (TestCharRange(s, j, '\x30', '\x7f') && (j - i) >= 2) + return j - i + 1; + else + return 0; + } + else + return 0; + } + } + + + + /** + This function will strip all ISO/IEC 2022 control codes and escape + sequences. + Please see https://en.wikipedia.org/wiki/ISO/IEC_2022 (as of 2019-02) + for a list of those. + + Please note that this operation is potentially destructive, because + it removes the character set information from the byte stream. + + However, in the case where the encoding is unique, then suppressing + the escape sequences allows to provide us with a clean string after + conversion to utf-8 with boost. + **/ + void Toolbox::RemoveIso2022EscapeSequences(std::string& dest, const std::string& src) + { + // we need AT MOST the same size as the source string in the output + dest.clear(); + if (dest.capacity() < src.size()) + dest.reserve(src.size()); + + size_t i = 0; + + // uint8_t view to the string + while (i < src.size()) + { + size_t j = i; + + // The i index will only be incremented if a message is detected + // in that case, the message is skipped and the index is set to the + // next position to read + if (Iso2022::IsControlMessage1(src, i)) + i += 1; + else if (Iso2022::IsControlMessage2(src, i)) + i += 2; + else if (Iso2022::IsControlMessage3(src, i)) + i += 3; + else + i += Iso2022::GetEscapeSequenceLength(src, i); + + // if the index was NOT incremented, this means there was no message at + // this location: we then may copy the character at this index and + // increment the index to point to the next read position + if (j == i) + { + dest.push_back(src[i]); + i++; + } + } + } }