comparison Core/Toolbox.cpp @ 3322:b32b7c44a223

Toolbox::Utf8ToUnicodeCharacter()
author Sebastien Jodogne <s.jodogne@gmail.com>
date Fri, 15 Mar 2019 17:37:51 +0100
parents 4b042ec734c1
children 87396c571109 2e7c5c15ba25
comparison
equal deleted inserted replaced
3321:e54ca78059bd 3322:b32b7c44a223
1787 dest.push_back(src[i]); 1787 dest.push_back(src[i]);
1788 i++; 1788 i++;
1789 } 1789 }
1790 } 1790 }
1791 } 1791 }
1792
1793
1794 void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode,
1795 size_t& length,
1796 const std::string& utf8,
1797 size_t position)
1798 {
1799 // https://en.wikipedia.org/wiki/UTF-8
1800
1801 static const uint8_t MASK_IS_1_BYTE = 0x80; // printf '0x%x\n' "$((2#10000000))"
1802 static const uint8_t TEST_IS_1_BYTE = 0x00;
1803
1804 static const uint8_t MASK_IS_2_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))"
1805 static const uint8_t TEST_IS_2_BYTES = 0xc0; // printf '0x%x\n' "$((2#11000000))"
1806
1807 static const uint8_t MASK_IS_3_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))"
1808 static const uint8_t TEST_IS_3_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))"
1809
1810 static const uint8_t MASK_IS_4_BYTES = 0xf8; // printf '0x%x\n' "$((2#11111000))"
1811 static const uint8_t TEST_IS_4_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))"
1812
1813 static const uint8_t MASK_CONTINUATION = 0xc0; // printf '0x%x\n' "$((2#11000000))"
1814 static const uint8_t TEST_CONTINUATION = 0x80; // printf '0x%x\n' "$((2#10000000))"
1815
1816 if (position >= utf8.size())
1817 {
1818 throw OrthancException(ErrorCode_ParameterOutOfRange);
1819 }
1820
1821 const uint8_t* buffer = reinterpret_cast<const uint8_t*>(utf8.c_str());
1822
1823 if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE)
1824 {
1825 length = 1;
1826 unicode = buffer[0] & ~MASK_IS_1_BYTE;
1827 }
1828 else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES &&
1829 position + 1 < utf8.size() &&
1830 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION)
1831 {
1832 length = 2;
1833 uint32_t a = buffer[0] & ~MASK_IS_2_BYTES;
1834 uint32_t b = buffer[1] & ~MASK_CONTINUATION;
1835 unicode = (a << 6) | b;
1836 }
1837 else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES &&
1838 position + 2 < utf8.size() &&
1839 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
1840 (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION)
1841 {
1842 length = 3;
1843 uint32_t a = buffer[0] & ~MASK_IS_3_BYTES;
1844 uint32_t b = buffer[1] & ~MASK_CONTINUATION;
1845 uint32_t c = buffer[2] & ~MASK_CONTINUATION;
1846 unicode = (a << 12) | (b << 6) | c;
1847 }
1848 else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES &&
1849 position + 3 < utf8.size() &&
1850 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION &&
1851 (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION &&
1852 (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION)
1853 {
1854 length = 4;
1855 uint32_t a = buffer[0] & ~MASK_IS_4_BYTES;
1856 uint32_t b = buffer[1] & ~MASK_CONTINUATION;
1857 uint32_t c = buffer[2] & ~MASK_CONTINUATION;
1858 uint32_t d = buffer[3] & ~MASK_CONTINUATION;
1859 unicode = (a << 18) | (b << 12) | (c << 6) | d;
1860 }
1861 else
1862 {
1863 // This is not a valid UTF-8 encoding
1864 throw OrthancException(ErrorCode_BadFileFormat);
1865 }
1866 }
1792 } 1867 }
1793 1868
1794 1869
1795 1870
1796 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) 1871 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content)