Mercurial > hg > orthanc
comparison Core/Toolbox.cpp @ 3217:cf8cbeb35f33
preliminary support of Korean character set
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Wed, 13 Feb 2019 17:46:12 +0100 |
parents | 810772486249 |
children | 9a83d94b2a1e |
comparison
equal
deleted
inserted
replaced
3216:c9a71eb4edcf | 3217:cf8cbeb35f33 |
---|---|
512 | 512 |
513 case Encoding_Thai: | 513 case Encoding_Thai: |
514 return "TIS620.2533-0"; | 514 return "TIS620.2533-0"; |
515 break; | 515 break; |
516 | 516 |
517 case Encoding_Korean: | |
518 return "ISO-IR-149"; | |
519 break; | |
520 | |
517 default: | 521 default: |
518 throw OrthancException(ErrorCode_NotImplemented); | 522 throw OrthancException(ErrorCode_NotImplemented); |
519 } | 523 } |
520 } | 524 } |
521 #endif | 525 #endif |
522 | 526 |
523 | 527 |
524 #if ORTHANC_ENABLE_LOCALE == 1 | 528 #if ORTHANC_ENABLE_LOCALE == 1 |
529 // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2 | |
525 std::string Toolbox::ConvertToUtf8(const std::string& source, | 530 std::string Toolbox::ConvertToUtf8(const std::string& source, |
526 Encoding sourceEncoding) | 531 Encoding sourceEncoding, |
532 bool hasCodeExtensions) | |
527 { | 533 { |
528 // The "::skip" flag makes boost skip invalid UTF-8 | 534 // The "::skip" flag makes boost skip invalid UTF-8 |
529 // characters. This can occur in badly-encoded DICOM files. | 535 // characters. This can occur in badly-encoded DICOM files. |
530 | 536 |
531 try | 537 try |
532 { | 538 { |
533 if (sourceEncoding == Encoding_Utf8) | 539 if (sourceEncoding == Encoding_Ascii) |
534 { | |
535 // Already in UTF-8: No conversion is required | |
536 return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip); | |
537 } | |
538 else if (sourceEncoding == Encoding_Ascii) | |
539 { | 540 { |
540 return ConvertToAscii(source); | 541 return ConvertToAscii(source); |
541 } | 542 } |
542 else | 543 else |
543 { | 544 { |
544 const char* encoding = GetBoostLocaleEncoding(sourceEncoding); | 545 std::string s; |
545 return boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); | 546 |
547 if (sourceEncoding == Encoding_Utf8) | |
548 { | |
549 // Already in UTF-8: No conversion is required, but we ensure | |
550 // the output is correctly encoded | |
551 s = boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip); | |
552 } | |
553 else | |
554 { | |
555 const char* encoding = GetBoostLocaleEncoding(sourceEncoding); | |
556 s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); | |
557 } | |
558 | |
559 if (hasCodeExtensions) | |
560 { | |
561 std::string t; | |
562 RemoveIso2022EscapeSequences(t, s); | |
563 return t; | |
564 } | |
565 else | |
566 { | |
567 return s; | |
568 } | |
546 } | 569 } |
547 } | 570 } |
548 catch (std::runtime_error&) | 571 catch (std::runtime_error&) |
549 { | 572 { |
550 // Bad input string or bad encoding | 573 // Bad input string or bad encoding |
1591 | 1614 |
1592 VariableFormatter formatter(dictionary); | 1615 VariableFormatter formatter(dictionary); |
1593 | 1616 |
1594 return boost::regex_replace(source, pattern, formatter); | 1617 return boost::regex_replace(source, pattern, formatter); |
1595 } | 1618 } |
1619 | |
1620 | |
namespace Iso2022
{
  /**
     Returns whether the string s contains a single-byte control message
     at index i, i.e. one of the bytes 0x0f (locking shift zero) or
     0x0e (locking shift one). Returns false if i is out of range.
  **/
  static inline bool IsControlMessage1(const std::string& s, size_t i)
  {
    if (i >= s.size())
    {
      return false;
    }

    const char c = s[i];
    return (c == '\x0f' ||   // Locking shift zero
            c == '\x0e');    // Locking shift one
  }

  /**
     Returns whether the string s contains a double-byte control message
     at index i, i.e. the ESC byte (0x1b) followed by one of the shift
     codes listed below. Returns false if fewer than two bytes are
     available starting at index i.
  **/
  static inline bool IsControlMessage2(const std::string& s, size_t i)
  {
    if (i + 1 >= s.size())
    {
      return false;
    }

    if (s[i] != '\x1b')
    {
      return false;
    }

    switch (s[i + 1])
    {
      case '\x6e':   // Locking shift two
      case '\x6f':   // Locking shift three
      case '\x4e':   // Single shift two (alt)
      case '\x4f':   // Single shift three (alt)
      case '\x7c':   // Locking shift three right
      case '\x7d':   // Locking shift two right
      case '\x7e':   // Locking shift one right
        return true;

      default:
        return false;
    }
  }

  /**
     Returns whether the string s contains a triple-byte control message
     at index i, i.e. one of the byte sequences "0x8e 0x1b 0x4e" or
     "0x8f 0x1b 0x4f". Returns false if fewer than three bytes are
     available starting at index i.
  **/
  static inline bool IsControlMessage3(const std::string& s, size_t i)
  {
    if (i + 2 >= s.size())
    {
      return false;
    }

    const char c1 = s[i];
    const char c2 = s[i + 1];
    const char c3 = s[i + 2];

    // The literals are of type "char", exactly like the bytes of the
    // string, so the comparisons do not depend on the signedness of "char"
    return ((c1 == '\x8e' && c2 == '\x1b' && c3 == '\x4e') ||
            (c1 == '\x8f' && c2 == '\x1b' && c3 == '\x4f'));
  }

  /**
     This function returns true if the index i in the supplied string s:
     - is valid
     - contains the c character
     This function returns false otherwise.
  **/
  static inline bool TestCharValue(const std::string& s, size_t i, char c)
  {
    return (i < s.size() && s[i] == c);
  }

  /**
     This function returns true if the index i in the supplied string s:
     - is valid
     - has a c character that is >= cMin and <= cMax (included)
     This function returns false otherwise.
  **/
  static inline bool TestCharRange(const std::string& s, size_t i, char cMin, char cMax)
  {
    return (i < s.size() && s[i] >= cMin && s[i] <= cMax);
  }

  /**
     This function returns the total length in bytes of the escape sequence
     located in string s at index i, if there is one, or 0 otherwise.

     The sequence that is recognized is: the ESC byte (0x1b), followed by
     one or more bytes in the range 0x20..0x2f, followed by one
     terminating byte in the range 0x30..0x7f.
  **/
  static inline size_t GetEscapeSequenceLength(const std::string& s, size_t i)
  {
    if (!TestCharValue(s, i, '\x1b'))
    {
      return 0;
    }

    // advance the reading cursor while we are inside the sequence
    size_t j = i + 1;
    while (TestCharRange(s, j, '\x20', '\x2f'))
    {
      ++j;
    }

    // check there is a valid termination byte AND we're long enough (there
    // must be at least one byte between 0x20 and 0x2f)
    if (TestCharRange(s, j, '\x30', '\x7f') && (j - i) >= 2)
    {
      return j - i + 1;
    }
    else
    {
      return 0;
    }
  }
}
1743 | |
1744 | |
1745 | |
1746 /** | |
1747 This function will strip all ISO/IEC 2022 control codes and escape | |
1748 sequences. | |
1749 Please see https://en.wikipedia.org/wiki/ISO/IEC_2022 (as of 2019-02) | |
1750 for a list of those. | |
1751 | |
1752 Please note that this operation is potentially destructive, because | |
1753 it removes the character set information from the byte stream. | |
1754 | |
1755 However, in the case where the encoding is unique, then suppressing | |
1756 the escape sequences allows to provide us with a clean string after | |
1757 conversion to utf-8 with boost. | |
1758 **/ | |
1759 void Toolbox::RemoveIso2022EscapeSequences(std::string& dest, const std::string& src) | |
1760 { | |
1761 // we need AT MOST the same size as the source string in the output | |
1762 dest.clear(); | |
1763 if (dest.capacity() < src.size()) | |
1764 dest.reserve(src.size()); | |
1765 | |
1766 size_t i = 0; | |
1767 | |
1768 // uint8_t view to the string | |
1769 while (i < src.size()) | |
1770 { | |
1771 size_t j = i; | |
1772 | |
1773 // The i index will only be incremented if a message is detected | |
1774 // in that case, the message is skipped and the index is set to the | |
1775 // next position to read | |
1776 if (Iso2022::IsControlMessage1(src, i)) | |
1777 i += 1; | |
1778 else if (Iso2022::IsControlMessage2(src, i)) | |
1779 i += 2; | |
1780 else if (Iso2022::IsControlMessage3(src, i)) | |
1781 i += 3; | |
1782 else | |
1783 i += Iso2022::GetEscapeSequenceLength(src, i); | |
1784 | |
1785 // if the index was NOT incremented, this means there was no message at | |
1786 // this location: we then may copy the character at this index and | |
1787 // increment the index to point to the next read position | |
1788 if (j == i) | |
1789 { | |
1790 dest.push_back(src[i]); | |
1791 i++; | |
1792 } | |
1793 } | |
1794 } | |
1596 } | 1795 } |
1597 | 1796 |
1598 | 1797 |
1599 | 1798 |
1600 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) | 1799 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) |