# HG changeset patch # User Benjamin Golinvaux # Date 1614002113 -3600 # Node ID b6a6ad64192af64d145ea30f4edceb7dcb862d89 # Parent 730604db88b83df355c142931e1006c42b8a0cca FastParseVector : manually written code to parse strings like 3.1315\-1.2e12\2344.5\123 into boost::numeric::ublas::vector + tests diff -r 730604db88b8 -r b6a6ad64192a OrthancStone/Sources/Toolbox/GenericToolbox.h --- a/OrthancStone/Sources/Toolbox/GenericToolbox.h Fri Feb 12 11:09:07 2021 +0100 +++ b/OrthancStone/Sources/Toolbox/GenericToolbox.h Mon Feb 22 14:55:13 2021 +0100 @@ -23,6 +23,9 @@ #include #include +#include + +#include "LinearAlgebra.h" #include @@ -31,6 +34,7 @@ #include #include +#include namespace OrthancStone { @@ -38,27 +42,30 @@ { /** Fast floating point string validation. - No trimming applied, so the input must match regex + No trimming applied, so the input must match regex /^[-]?[0-9]*\.?[0-9]*([eE][-+]?[0-9]+)?$/ The following are allowed as edge cases: "" and "-" + + The parsing always stops if encountering either 0 or the stopChar + */ - inline bool LegitDoubleString(const char* text) + inline bool LegitDoubleString(const char* text, char stopChar = 0) { const char* p = text; - if(*p == '-') + if (*p == '-') p++; size_t period = 0; - while(*p != 0) + while ((*p != 0) && (*p != stopChar) && (*p != ' ') && (*p != '\t')) { if (*p >= '0' && *p <= '9') ++p; - else if(*p == '.') + else if (*p == '.') { - if(period > 0) + if (period > 0) return false; else period++; - ++p; + ++p; } else if (*p == 'e' || *p == 'E') { @@ -70,30 +77,51 @@ return false; // these must be the last in the string - while(*p >= '0' && *p <= '9') + while (*p >= '0' && *p <= '9') ++p; - return (*p == 0); + // after that, there can only be spaces + while ((*p != 0) && (*p != stopChar)) + { + if ((*p != ' ') && (*p != '\t')) + return false; + ++p; + } + + return ((*p == 0) || (*p == stopChar)); } else { return false; } } + + // we only accept trailing whitespace + while ((*p != 0) && (*p != stopChar)) + { + if( (*p != ' ') && (*p != '\t')) + return false; + ++p; + } return true; } + + /** Fast integer string validation. No trimming applied, so the input must match regex /^-?[0-9]*$/ The following are allowed as edge cases: "" and "-" + + The parsing always stops if encountering either 0 or the stopChar + */ - inline bool LegitIntegerString(const char* text) + inline bool LegitIntegerString(const char* text, char stopChar = 0) { const char* p = text; if (*p == '-') p++; - while (*p != 0) + while ((*p != 0) && (*p != stopChar)) { if (*p >= '0' && *p <= '9') ++p; @@ -103,41 +131,106 @@ return true; } - /* - Fast string --> double conversion. - Must pass the LegitDoubleString test - String to doubles with at most 18 digits - */ - inline bool StringToDouble(double& r, const char* text) + static const double FRAC_FACTORS[] = { - if(!LegitDoubleString(text)) - return false; + 1.0, + 0.1, + 0.01, + 0.001, + 0.0001, + 0.00001, + 0.000001, + 0.0000001, + 0.00000001, + 0.000000001, + 0.0000000001, + 0.00000000001, + 0.000000000001, + 0.0000000000001, + 0.00000000000001, + 0.000000000000001, + 0.0000000000000001, + 0.00000000000000001, + 0.000000000000000001, + 0.0000000000000000001 + }; + static const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS) / sizeof(double); + + /** + Technical version of StringToDouble, meant for parsing bigger strings in-place. + + Only works for dot decimal numbers without digit separation + + The parsing stops when encountering EITHER \x00 or stopChar. + + Instead of filling r and returning true if number is legit, it fills r then + returns the number of parsed characters (NOT including the end character (which + can be zero, since an empty string is an allowed edge case) BUT including the trailing + spaces), or -1 if a parsing error occurred. + + Please note that if stopChar is a number, a minus sign, the decimal separator + or the letters e and E, the behavior is UNDEFINED!!! + + In order to allow the containing string not to be space-stripped: + - Spaces and tabs are ignored if they occur before the scientific notation e or E letter + - Spaces and tabs are ignored between the end of the number and the \x00 or stopChar + - Spaces and tabs cause errors anywhere else + + It is up to the caller to detect whether a successful parsing has reached the + terminator (\x00) or stopChar. - static const double FRAC_FACTORS[] = - { - 1.0, - 0.1, - 0.01, - 0.001, - 0.0001, - 0.00001, - 0.000001, - 0.0000001, - 0.00000001, - 0.000000001, - 0.0000000001, - 0.00000000001, - 0.000000000001, - 0.0000000000001, - 0.00000000000001, - 0.000000000000001, - 0.0000000000000001, - 0.00000000000000001, - 0.000000000000000001, - 0.0000000000000000001 - }; - const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS)/sizeof(double); + In case of an error returned, in a parsing scenario where multiple numbers + are to be read in a bigger surrounding string, it is up to the caller to + recover from the error by advancing the read pointer to the next character, + if desirable. + + Example: + ------ + const char* s = "0.0/.123/3/12.5//-43.1"; + + int size; + double r; + const char* p = s; + + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 3 + + p += size + 1; // gobble the separator + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0.123 and size = 4 + + p += size + 1; // gobble the separator + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 3.0 and size = 1 + + p += size + 1; // gobble the separator + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 12.5 and size = 3 + + p += size + 1; // gobble the separator + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 0 + + p += size + 1; // gobble the separator + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 0 + + p += size + if(p == 0) + ...stop parsing! + */ + + inline int32_t StringToDoubleEx(double& r, const char* text, char stopChar = 0) + { + if (!LegitDoubleString(text,stopChar)) + return -1; r = 0.0; double neg = 1.0; @@ -151,7 +244,7 @@ // 12345.67890 while (*p >= '0' && *p <= '9') { - r = (r*10.0) + (*p - '0'); // 1 12 123 123 12345 + r = (r * 10.0) + (*p - '0'); // 1 12 123 123 12345 ++p; } if (*p == '.') @@ -171,12 +264,13 @@ // skip the remaining numbers until we reach not-a-digit (either the // end of the string OR the scientific notation symbol) - while ((*p >= '0' && *p <= '9')) + // spaces are skipped in this phase here + while ((*p >= '0' && *p <= '9') || *p == ' ' || *p == '\t') ++p; - if (*p == 0 ) + if ( (*p == 0) || (*p == stopChar)) { - return true; + return static_cast(p - text); } else if ((*p == 'e') || (*p == 'E')) { @@ -202,7 +296,7 @@ else { // only a sign char or a number is allowed - return false; + return -1; } // now p points to the absolute value of the exponent double exp = 0; @@ -216,16 +310,42 @@ double scFac = ::pow(10.0, exp); r *= scFac; - // only allowed symbol here is EOS - return (*p == 0); + // skip the trailing spaces + while (*p == ' ' || *p == '\t') + ++p; + + // only allowed symbol here is EOS or stopChar + if ((*p == 0) || (*p == stopChar)) + return static_cast(p - text); + else + return -1; } else { // not allowed - return false; + return -1; } } + /** + Fast string --> double conversion. + Must pass the LegitDoubleString test + + String to doubles with at most 18 digits + + Returns true if okay and false if failed. + + The end-of-substring is character \x00 + */ + inline bool StringToDouble(double& r, const char* text) + { + int32_t size = StringToDoubleEx(r, text, 0); + return (size != -1); + } + + /** + See main overload + */ inline bool StringToDouble(double& r, const std::string& text) { return StringToDouble(r, text.c_str()); @@ -289,28 +409,166 @@ Same as GetRgbValuesFromString */ bool GetRgbaValuesFromString(uint8_t& red, - uint8_t& green, - uint8_t& blue, - uint8_t& alpha, - const char* text); + uint8_t& green, + uint8_t& blue, + uint8_t& alpha, + const char* text); /** Same as GetRgbValuesFromString */ - inline bool GetRgbaValuesFromString(uint8_t& red, - uint8_t& green, - uint8_t& blue, - uint8_t& alpha, - const std::string& text) + inline bool GetRgbaValuesFromString(uint8_t& red, + uint8_t& green, + uint8_t& blue, + uint8_t& alpha, + const std::string& text) { return GetRgbaValuesFromString(red, green, blue, alpha, text.c_str()); } - + /** - This method could have been called StripSpacesAndChangeToLower but we might want to + This method could have been called StripSpacesAndChangeToLower but we might want to add some UUID validation to the argument */ void NormalizeUuid(std::string& uuid); + + + inline void FastTokenizeString(std::vector& result, + const std::string& value, + char separator) + { + size_t countSeparators = 0; + + for (size_t i = 0; i < value.size(); i++) + { + if (value[i] == separator) + { + countSeparators++; + } + } + + result.clear(); + result.reserve(countSeparators + 1); + + std::string currentItem; + + for (size_t i = 0; i < value.size(); i++) + { + if (value[i] == separator) + { + result.push_back(currentItem); + currentItem.clear(); + } + else + { + currentItem.push_back(value[i]); + } + } + + result.push_back(currentItem); + } + + + inline std::string FastStripSpaces(const std::string& source) + { + size_t first = 0; + + while (first < source.length() && + isspace(source[first])) + { + first++; + } + + if (first == source.length()) + { + // String containing only spaces + return ""; + } + + size_t last = source.length(); + while (last > first && + isspace(source[last - 1])) + { + last--; + } + + assert(first <= last); + return source.substr(first, last - first); + } + + /** + Return the raw numbers of occurrences of `separator` in s (starting at s up to \x00) + */ + inline size_t GetCharCount(const char* s, const char separator) + { + const char* p = s; + size_t sepCount = 0; + + while (*p != 0) + { + if(*p == separator) + sepCount++; + ++p; + } + return sepCount; + } + + inline bool FastParseVector(Vector& target, const std::string& value) + { + const char* s = value.c_str(); + const char SEP = '\\'; + + size_t sepCount = GetCharCount(s, SEP); + + size_t itemCount = sepCount + 1; + target.resize(itemCount); + + while (*s == ' ' || *s == '\t') + ++s; + + const char* p = s; + + double r; + for (size_t i = 0; i < itemCount; i++) + { + int32_t numberCharCount = StringToDoubleEx(r, p, SEP); + if (numberCharCount == -1) + { + LOG(ERROR) << "Parsing error for vector \"" << value << "\". Current position (0-based) = " << i; + return false; + } + p += numberCharCount; + if (*p == 0) + { + // if we are at the end of the string, it means we have processed the last character + // let's check this. this is a small price to pay for a useful check + if (i != (itemCount - 1)) + { + LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the string without consuming the right # of items! Current position (0-based) = " << i; + return false; + } + } + else + { + if (*p != SEP) + { + LOG(ERROR) << "Parsing error for vector \"" << value << "\". Character past end of number Reached end of the string without consuming the right # of items! Current position (0-based) = " << i << " and r = " << r; + return false; + } + if (i == (itemCount - 1)) + { + LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the vector too soon. Current position (0-based) = " << i << " and r = " << r; + return false; + } + // advance to next number + p += 1; + } + target[i] = r; + } + return true; + } + + } } diff -r 730604db88b8 -r b6a6ad64192a UnitTestsSources/GenericToolboxTests.cpp --- a/UnitTestsSources/GenericToolboxTests.cpp Fri Feb 12 11:09:07 2021 +0100 +++ b/UnitTestsSources/GenericToolboxTests.cpp Mon Feb 22 14:55:13 2021 +0100 @@ -4287,9 +4287,212 @@ EXPECT_EQ(0, blue); } - - - - - - +TEST(GenericToolbox, FastParseTest_StringToDoubleEx01) +{ + using OrthancStone::GenericToolbox::StringToDoubleEx; + + const char* s = "0.0/.123/3/12.5//-43.1"; + + int32_t size; + double r; + const char* p = s; + + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 3 + ASSERT_EQ(3, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(4, size); + ASSERT_DOUBLE_EQ(0.123, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(1, size); + ASSERT_DOUBLE_EQ(3, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(4, size); + ASSERT_DOUBLE_EQ(12.5, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(0, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(5, size); + ASSERT_DOUBLE_EQ(-43.1, r); + + p += size; + ASSERT_EQ(0, *p); +} + +TEST(GenericToolbox, FastParseTest_StringToDoubleEx02) +{ + using OrthancStone::GenericToolbox::StringToDoubleEx; + + const char* s = " \t 0.0/.123/3 \t/12.5e-3//-43.1 \t "; + + int32_t size; + double r; + const char* p = s; + + while (*p == ' ' || *p == '\t') + ++p; + + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 3 + ASSERT_EQ(3, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(4, size); + ASSERT_DOUBLE_EQ(0.123, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(4, size); + ASSERT_DOUBLE_EQ(3, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(7, size); + ASSERT_DOUBLE_EQ(12.5e-3, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(0, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(14, size); + ASSERT_DOUBLE_EQ(-43.1, r); + + p += size; + ASSERT_EQ(0, *p); +} + +TEST(GenericToolbox, FastParseTest_StringToDoubleEx03) +{ + using OrthancStone::GenericToolbox::StringToDoubleEx; + + const char* s = " \t 0.0/.123/3/12.5e-3//-43.1e-2 \t "; + + int32_t size; + double r; + const char* p = s; + + while (*p == ' ' || *p == '\t') + ++p; + + size = StringToDoubleEx(r, p, '/'); + // --> + // r = 0 and size = 3 + ASSERT_EQ(3, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(4, size); + ASSERT_DOUBLE_EQ(0.123, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(1, size); + ASSERT_DOUBLE_EQ(3, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(7, size); + ASSERT_DOUBLE_EQ(12.5e-3, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(0, size); + ASSERT_DOUBLE_EQ(0, r); + + p += (size + 1); + size = StringToDoubleEx(r, p, '/'); + ASSERT_EQ(17, size); + ASSERT_DOUBLE_EQ(-43.1e-2, r); + + p += size; + ASSERT_EQ(0, *p); +} + + +TEST(GenericToolbox, FastParseTest_GetCharCount) +{ + using OrthancStone::GenericToolbox::GetCharCount; + + ASSERT_EQ(0u, GetCharCount("-1e-22", '\\')); + ASSERT_EQ(0u, GetCharCount(" -1e-22", '\\')); + ASSERT_EQ(0u, GetCharCount(" -1e-22 ", '\\')); + ASSERT_EQ(0u, GetCharCount("-1e-22 ", '\\')); + + ASSERT_EQ(1u, GetCharCount("-1e-2\\2", '\\')); + ASSERT_EQ(1u, GetCharCount(" -1e-2\\2", '\\')); + ASSERT_EQ(1u, GetCharCount("-1e-2\\2 ", '\\')); + ASSERT_EQ(1u, GetCharCount(" -1e-2\\2 ", '\\')); + + + ASSERT_EQ(11u, GetCharCount(" -1e-2\\\\3\\12.473\\-2.34e4\\-284\\423.23\\\\0.234423\\.786 \\ 9093\\ ", '\\')); +} + + +TEST(GenericToolbox, FastParseTest_FastParseVector01) +{ + using OrthancStone::GenericToolbox::FastParseVector; + + OrthancStone::Vector v; + + ASSERT_TRUE(FastParseVector(v, "1.2")); + ASSERT_EQ(1u, v.size()); + ASSERT_DOUBLE_EQ(1.2, v[0]); + + ASSERT_TRUE(FastParseVector(v, "-1.2e+2")); + ASSERT_EQ(1u, v.size()); + ASSERT_DOUBLE_EQ(-120.0, v[0]); + + ASSERT_TRUE(FastParseVector(v, "-1e-2\\2")); + ASSERT_EQ(2u, v.size()); + ASSERT_DOUBLE_EQ(-0.01, v[0]); + ASSERT_DOUBLE_EQ(2.0, v[1]); + + ASSERT_TRUE(FastParseVector(v, "1.3671875\\1.3671875")); + ASSERT_EQ(2u, v.size()); + ASSERT_DOUBLE_EQ(1.3671875, v[0]); + ASSERT_DOUBLE_EQ(1.3671875, v[1]); +} + +TEST(GenericToolbox, FastParseTest_FastParseVector02) +{ + using OrthancStone::GenericToolbox::FastParseVector; + + const char* vectorString = " -1e-2\\\\3\\12.473\\-2.34e4\\-284\\423.23\\\\0.234423\\.786 \\9093\\ "; + + OrthancStone::Vector v; + + ASSERT_TRUE(FastParseVector(v, vectorString)); + ASSERT_EQ(12u, v.size()); + ASSERT_DOUBLE_EQ(-1e-2 , v[ 0]); + ASSERT_DOUBLE_EQ(0 , v[ 1]); + ASSERT_DOUBLE_EQ(3 , v[ 2]); + ASSERT_DOUBLE_EQ(12.473 , v[ 3]); + ASSERT_DOUBLE_EQ(-2.34e4 , v[ 4]); + ASSERT_DOUBLE_EQ(-284 , v[ 5]); + ASSERT_DOUBLE_EQ(423.23 , v[ 6]); + ASSERT_DOUBLE_EQ(0 , v[ 7]); + ASSERT_DOUBLE_EQ(0.234423 , v[ 8]); + ASSERT_DOUBLE_EQ(.786 , v[ 9]); + ASSERT_DOUBLE_EQ(9093 , v[10]); + ASSERT_DOUBLE_EQ(0 , v[11]); +}