Mercurial > hg > orthanc
comparison Core/Toolbox.cpp @ 3322:b32b7c44a223
Toolbox::Utf8ToUnicodeCharacter()
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Fri, 15 Mar 2019 17:37:51 +0100 |
parents | 4b042ec734c1 |
children | 87396c571109 2e7c5c15ba25 |
comparison
equal
deleted
inserted
replaced
3321:e54ca78059bd | 3322:b32b7c44a223 |
---|---|
1787 dest.push_back(src[i]); | 1787 dest.push_back(src[i]); |
1788 i++; | 1788 i++; |
1789 } | 1789 } |
1790 } | 1790 } |
1791 } | 1791 } |
1792 | |
1793 | |
1794 void Toolbox::Utf8ToUnicodeCharacter(uint32_t& unicode, | |
1795 size_t& length, | |
1796 const std::string& utf8, | |
1797 size_t position) | |
1798 { | |
1799 // https://en.wikipedia.org/wiki/UTF-8 | |
1800 | |
1801 static const uint8_t MASK_IS_1_BYTE = 0x80; // printf '0x%x\n' "$((2#10000000))" | |
1802 static const uint8_t TEST_IS_1_BYTE = 0x00; | |
1803 | |
1804 static const uint8_t MASK_IS_2_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" | |
1805 static const uint8_t TEST_IS_2_BYTES = 0xc0; // printf '0x%x\n' "$((2#11000000))" | |
1806 | |
1807 static const uint8_t MASK_IS_3_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" | |
1808 static const uint8_t TEST_IS_3_BYTES = 0xe0; // printf '0x%x\n' "$((2#11100000))" | |
1809 | |
1810 static const uint8_t MASK_IS_4_BYTES = 0xf8; // printf '0x%x\n' "$((2#11111000))" | |
1811 static const uint8_t TEST_IS_4_BYTES = 0xf0; // printf '0x%x\n' "$((2#11110000))" | |
1812 | |
1813 static const uint8_t MASK_CONTINUATION = 0xc0; // printf '0x%x\n' "$((2#11000000))" | |
1814 static const uint8_t TEST_CONTINUATION = 0x80; // printf '0x%x\n' "$((2#10000000))" | |
1815 | |
1816 if (position >= utf8.size()) | |
1817 { | |
1818 throw OrthancException(ErrorCode_ParameterOutOfRange); | |
1819 } | |
1820 | |
1821 const uint8_t* buffer = reinterpret_cast<const uint8_t*>(utf8.c_str()); | |
1822 | |
1823 if ((buffer[0] & MASK_IS_1_BYTE) == TEST_IS_1_BYTE) | |
1824 { | |
1825 length = 1; | |
1826 unicode = buffer[0] & ~MASK_IS_1_BYTE; | |
1827 } | |
1828 else if ((buffer[0] & MASK_IS_2_BYTES) == TEST_IS_2_BYTES && | |
1829 position + 1 < utf8.size() && | |
1830 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION) | |
1831 { | |
1832 length = 2; | |
1833 uint32_t a = buffer[0] & ~MASK_IS_2_BYTES; | |
1834 uint32_t b = buffer[1] & ~MASK_CONTINUATION; | |
1835 unicode = (a << 6) | b; | |
1836 } | |
1837 else if ((buffer[0] & MASK_IS_3_BYTES) == TEST_IS_3_BYTES && | |
1838 position + 2 < utf8.size() && | |
1839 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && | |
1840 (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION) | |
1841 { | |
1842 length = 3; | |
1843 uint32_t a = buffer[0] & ~MASK_IS_3_BYTES; | |
1844 uint32_t b = buffer[1] & ~MASK_CONTINUATION; | |
1845 uint32_t c = buffer[2] & ~MASK_CONTINUATION; | |
1846 unicode = (a << 12) | (b << 6) | c; | |
1847 } | |
1848 else if ((buffer[0] & MASK_IS_4_BYTES) == TEST_IS_4_BYTES && | |
1849 position + 3 < utf8.size() && | |
1850 (buffer[1] & MASK_CONTINUATION) == TEST_CONTINUATION && | |
1851 (buffer[2] & MASK_CONTINUATION) == TEST_CONTINUATION && | |
1852 (buffer[3] & MASK_CONTINUATION) == TEST_CONTINUATION) | |
1853 { | |
1854 length = 4; | |
1855 uint32_t a = buffer[0] & ~MASK_IS_4_BYTES; | |
1856 uint32_t b = buffer[1] & ~MASK_CONTINUATION; | |
1857 uint32_t c = buffer[2] & ~MASK_CONTINUATION; | |
1858 uint32_t d = buffer[3] & ~MASK_CONTINUATION; | |
1859 unicode = (a << 18) | (b << 12) | (c << 6) | d; | |
1860 } | |
1861 else | |
1862 { | |
1863 // This is not a valid UTF-8 encoding | |
1864 throw OrthancException(ErrorCode_BadFileFormat); | |
1865 } | |
1866 } | |
1792 } | 1867 } |
1793 | 1868 |
1794 | 1869 |
1795 | 1870 |
1796 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) | 1871 OrthancLinesIterator* OrthancLinesIterator_Create(const std::string& content) |