comparison Core/Toolbox.cpp @ 3217:cf8cbeb35f33

preliminary support of Korean character set
author Sebastien Jodogne <s.jodogne@gmail.com>
date Wed, 13 Feb 2019 17:46:12 +0100
parents 810772486249
children 9a83d94b2a1e
comparison
equal deleted inserted replaced
3216:c9a71eb4edcf 3217:cf8cbeb35f33
512 512
513 case Encoding_Thai: 513 case Encoding_Thai:
514 return "TIS620.2533-0"; 514 return "TIS620.2533-0";
515 break; 515 break;
516 516
517 case Encoding_Korean:
518 return "ISO-IR-149";
519 break;
520
517 default: 521 default:
518 throw OrthancException(ErrorCode_NotImplemented); 522 throw OrthancException(ErrorCode_NotImplemented);
519 } 523 }
520 } 524 }
521 #endif 525 #endif
522 526
523 527
524 #if ORTHANC_ENABLE_LOCALE == 1 528 #if ORTHANC_ENABLE_LOCALE == 1
529 // http://dicom.nema.org/medical/dicom/current/output/chtml/part03/sect_C.12.html#sect_C.12.1.1.2
525 std::string Toolbox::ConvertToUtf8(const std::string& source, 530 std::string Toolbox::ConvertToUtf8(const std::string& source,
526 Encoding sourceEncoding) 531 Encoding sourceEncoding,
532 bool hasCodeExtensions)
527 { 533 {
528 // The "::skip" flag makes boost skip invalid UTF-8 534 // The "::skip" flag makes boost skip invalid UTF-8
529 // characters. This can occur in badly-encoded DICOM files. 535 // characters. This can occur in badly-encoded DICOM files.
530 536
531 try 537 try
532 { 538 {
533 if (sourceEncoding == Encoding_Utf8) 539 if (sourceEncoding == Encoding_Ascii)
534 {
535 // Already in UTF-8: No conversion is required
536 return boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
537 }
538 else if (sourceEncoding == Encoding_Ascii)
539 { 540 {
540 return ConvertToAscii(source); 541 return ConvertToAscii(source);
541 } 542 }
542 else 543 else
543 { 544 {
544 const char* encoding = GetBoostLocaleEncoding(sourceEncoding); 545 std::string s;
545 return boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip); 546
547 if (sourceEncoding == Encoding_Utf8)
548 {
549 // Already in UTF-8: No conversion is required, but we ensure
550 // the output is correctly encoded
551 s = boost::locale::conv::utf_to_utf<char>(source, boost::locale::conv::skip);
552 }
553 else
554 {
555 const char* encoding = GetBoostLocaleEncoding(sourceEncoding);
556 s = boost::locale::conv::to_utf<char>(source, encoding, boost::locale::conv::skip);
557 }
558
559 if (hasCodeExtensions)
560 {
561 std::string t;
562 RemoveIso2022EscapeSequences(t, s);
563 return t;
564 }
565 else
566 {
567 return s;
568 }
546 } 569 }
547 } 570 }
548 catch (std::runtime_error&) 571 catch (std::runtime_error&)
549 { 572 {
550 // Bad input string or bad encoding 573 // Bad input string or bad encoding
1591 1614
1592 VariableFormatter formatter(dictionary); 1615 VariableFormatter formatter(dictionary);
1593 1616
1594 return boost::regex_replace(source, pattern, formatter); 1617 return boost::regex_replace(source, pattern, formatter);
1595 } 1618 }
1619
1620
/**
   File-local helpers that recognize ISO/IEC 2022 control codes and
   escape sequences at a given byte offset of a string. They only
   inspect the string: none of them modifies it.
**/
namespace Iso2022
{
  /**
     Returns whether the string s contains a single-byte control message
     at index i. An index at or past the end of the string yields false.
  **/
  static inline bool IsControlMessage1(const std::string& s, size_t i)
  {
    if (i >= s.size())
    {
      return false;
    }

    const char c = s[i];
    return (c == '\x0f' ||  // Locking shift zero
            c == '\x0e');   // Locking shift one
  }

  /**
     Returns whether the string s contains a double-byte control message
     at index i, i.e. the ESC byte (0x1b) followed by one of the
     recognized shift bytes. Returns false if fewer than two bytes are
     available starting at index i.
  **/
  static inline bool IsControlMessage2(const std::string& s, size_t i)
  {
    // Written as a subtraction so that the bounds check cannot wrap
    // around for very large values of i
    if (i >= s.size() || s.size() - i < 2)
    {
      return false;
    }

    if (s[i] != 0x1b)
    {
      return false;
    }

    const char c2 = s[i + 1];
    return (c2 == '\x6e' ||  // Locking shift two
            c2 == '\x6f' ||  // Locking shift three
            c2 == '\x4e' ||  // Single shift two (alt)
            c2 == '\x4f' ||  // Single shift three (alt)
            c2 == '\x7c' ||  // Locking shift three right
            c2 == '\x7d' ||  // Locking shift two right
            c2 == '\x7e');   // Locking shift one right
  }

  /**
     Returns whether the string s contains a triple-byte control message
     at index i: either 0x8e followed by ESC 0x4e, or 0x8f followed by
     ESC 0x4f. Returns false if fewer than three bytes are available
     starting at index i.
  **/
  static inline bool IsControlMessage3(const std::string& s, size_t i)
  {
    // Same overflow-free bounds check as in IsControlMessage2()
    if (i >= s.size() || s.size() - i < 3)
    {
      return false;
    }

    const char c1 = s[i];
    const char c2 = s[i + 1];
    const char c3 = s[i + 2];

    if (c2 != 0x1b)
    {
      return false;
    }

    return ((c1 == '\x8e' && c3 == '\x4e') ||
            (c1 == '\x8f' && c3 == '\x4f'));
  }

  /**
     Returns true if and only if index i is a valid position in the
     string s, and the character at that position equals c.
  **/
  static inline bool TestCharValue(
    const std::string& s, size_t i, char c)
  {
    return (i < s.size()) && (s[i] == c);
  }

  /**
     Returns true if and only if index i is a valid position in the
     string s, and the character at that position lies in the closed
     range [cMin, cMax].
  **/
  static inline bool TestCharRange(
    const std::string& s, size_t i, char cMin, char cMax)
  {
    return (i < s.size()) && (s[i] >= cMin) && (s[i] <= cMax);
  }

  /**
     Returns the total length in bytes of the escape sequence located in
     string s at index i, if there is one, or 0 otherwise.

     The recognized layout is: ESC (0x1b), then one or more bytes in the
     range 0x20-0x2f, then one terminating byte in the range 0x30-0x7f.
  **/
  static inline size_t GetEscapeSequenceLength(const std::string& s, size_t i)
  {
    if (!TestCharValue(s, i, 0x1b))
    {
      return 0;
    }

    // Move the cursor past the run of bytes in 0x20-0x2f that follows ESC
    size_t j = i + 1;
    while (TestCharRange(s, j, '\x20', '\x2f'))
    {
      ++j;
    }

    // The cursor must have moved at least once: ESC directly followed by
    // a terminating byte is not handled by this function
    const bool hasIntermediate = ((j - i) >= 2);

    // NOTE(review): the terminating range accepts 0x7f, whereas ISO/IEC
    // 2022 final bytes seem to stop at 0x7e - confirm before tightening
    if (hasIntermediate && TestCharRange(s, j, '\x30', '\x7f'))
    {
      return j - i + 1;  // ESC + intermediate bytes + terminating byte
    }

    return 0;
  }
}
1743
1744
1745
1746 /**
1747 This function will strip all ISO/IEC 2022 control codes and escape
1748 sequences.
1749 Please see https://en.wikipedia.org/wiki/ISO/IEC_2022 (as of 2019-02)
1750 for a list of those.
1751
1752 Please note that this operation is potentially destructive, because
1753 it removes the character set information from the byte stream.
1754
1755 However, in the case where the encoding is unique, then suppressing
1756 the escape sequences allows to provide us with a clean string after
1757 conversion to utf-8 with boost.
1758 **/
1759 void Toolbox::RemoveIso2022EscapeSequences(std::string& dest, const std::string& src)
1760 {
1761 // we need AT MOST the same size as the source string in the output
1762 dest.clear();
1763 if (dest.capacity() < src.size())
1764 dest.reserve(src.size());
1765
1766 size_t i = 0;
1767
1768 // uint8_t view to the string
1769 while (i < src.size())
1770 {
1771 size_t j = i;
1772
1773 // The i index will only be incremented if a message is detected
1774 // in that case, the message is skipped and the index is set to the
1775 // next position to read
1776 if (Iso2022::IsControlMessage1(src, i))
1777 i += 1;
1778 else if (Iso2022::IsControlMessage2(src, i))
1779 i += 2;
1780 else if (Iso2022::IsControlMessage3(src, i))
1781 i += 3;
1782 else
1783 i += Iso2022::GetEscapeSequenceLength(src, i);
1784
1785 // if the index was NOT incremented, this means there was no message at
1786 // this location: we then may copy the character at this index and
1787 // increment the index to point to the next read position
1788 if (j == i)
1789 {
1790 dest.push_back(src[i]);
1791 i++;
1792 }
1793 }
1794 }
1596 } 1795 }
1597 1796
1598 1797
1599 1798
1600 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) 1799 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content)