changeset 1748:b6a6ad64192a

FastParseVector : manually written code to parse strings like 3.1315\-1.2e12\2344.5\123 into boost::numeric::ublas::vector<double> + tests
author Benjamin Golinvaux <bgo@osimis.io>
date Mon, 22 Feb 2021 14:55:13 +0100
parents 730604db88b8
children 076aeb019cf1
files OrthancStone/Sources/Toolbox/GenericToolbox.h UnitTestsSources/GenericToolboxTests.cpp
diffstat 2 files changed, 529 insertions(+), 68 deletions(-) [+]
line wrap: on
line diff
--- a/OrthancStone/Sources/Toolbox/GenericToolbox.h	Fri Feb 12 11:09:07 2021 +0100
+++ b/OrthancStone/Sources/Toolbox/GenericToolbox.h	Mon Feb 22 14:55:13 2021 +0100
@@ -23,6 +23,9 @@
 
 #include <Compatibility.h>
 #include <OrthancException.h>
+#include <Logging.h>
+
+#include "LinearAlgebra.h"
 
 #include <boost/shared_ptr.hpp>
 
@@ -31,6 +34,7 @@
 #include <math.h>
 
 #include <memory>
+#include <vector>
 
 namespace OrthancStone
 {
@@ -38,27 +42,30 @@
   {
     /**
     Fast floating point string validation.
-    No trimming applied, so the input must match regex 
+    No trimming applied, so the input must match regex
     /^[-]?[0-9]*\.?[0-9]*([eE][-+]?[0-9]+)?$/
     The following are allowed as edge cases: "" and "-"
+
+    Validation stops at the first \x00 or stopChar; spaces and tabs are accepted only between the end of the number and that terminator
+
     */
-    inline bool LegitDoubleString(const char* text)
+    inline bool LegitDoubleString(const char* text, char stopChar = 0)
     {
       const char* p = text;
-      if(*p == '-')
+      if (*p == '-')
         p++;
       size_t period = 0;
-      while(*p != 0)
+      while ((*p != 0) && (*p != stopChar) && (*p != ' ') && (*p != '\t'))
       {
         if (*p >= '0' && *p <= '9')
           ++p;
-        else if(*p == '.')
+        else if (*p == '.')
         {
-          if(period > 0)
+          if (period > 0)
             return false;
           else
             period++;
-        ++p;
+          ++p;
         }
         else if (*p == 'e' || *p == 'E')
         {
@@ -70,30 +77,51 @@
             return false;
 
           // these must be the last in the string
-          while(*p >= '0' && *p <= '9')
+          while (*p >= '0' && *p <= '9')
             ++p;
 
-          return (*p == 0);
+          // after that, there can only be spaces
+          while ((*p != 0) && (*p != stopChar))
+          {
+            if ((*p != ' ') && (*p != '\t'))
+              return false;
+            ++p;
+          }
+
+          return ((*p == 0) || (*p == stopChar));
         }
         else
         {
           return false;
         }
       }
+
+      // we only accept trailing whitespace
+      while ((*p != 0) && (*p != stopChar))
+      {
+        if( (*p != ' ') && (*p != '\t'))
+          return false;
+        ++p;
+      }
       return true;
     }
 
+
+
     /**
     Fast integer string validation.
     No trimming applied, so the input must match regex /^-?[0-9]*$/
     The following are allowed as edge cases: "" and "-"
+
+    The parsing always stops if encountering either 0 or the stopChar
+
     */
-    inline bool LegitIntegerString(const char* text)
+    inline bool LegitIntegerString(const char* text, char stopChar = 0)
     {
       const char* p = text;
       if (*p == '-')
         p++;
-      while (*p != 0)
+      while ((*p != 0) && (*p != stopChar))
       {
         if (*p >= '0' && *p <= '9')
           ++p;
@@ -103,41 +131,106 @@
       return true;
     }
 
-    /*
-      Fast string --> double conversion.
-      Must pass the LegitDoubleString test
 
-      String to doubles with at most 18 digits
-    */
-    inline bool StringToDouble(double& r, const char* text)
+    static const double FRAC_FACTORS[] =
     {
-      if(!LegitDoubleString(text))
-        return false;
+      1.0,
+      0.1,
+      0.01,
+      0.001,
+      0.0001,
+      0.00001,
+      0.000001,
+      0.0000001,
+      0.00000001,
+      0.000000001,
+      0.0000000001,
+      0.00000000001,
+      0.000000000001,
+      0.0000000000001,
+      0.00000000000001,
+      0.000000000000001,
+      0.0000000000000001,
+      0.00000000000000001,
+      0.000000000000000001,
+      0.0000000000000000001
+    };
+    static const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS) / sizeof(double);
+
+    /**
+      Technical version of StringToDouble, meant for parsing bigger strings in-place.
+
+      Only works for numbers that use a dot as the decimal separator and contain no digit-grouping separators
+
+      The parsing stops when encountering EITHER \x00 or stopChar.
+
+      Instead of filling r and returning true if the number is legit, it fills r and
+      returns the number of parsed characters, or -1 if a parsing error occurred. The
+      count includes the trailing spaces but NOT the end character (\x00 or stopChar);
+      it can be zero, since an empty string is an allowed edge case.
+
+      Please note that if stopChar is a digit, a minus sign, the decimal separator
+      or one of the letters e and E, the behavior is UNDEFINED!!!
+
+      In order to allow the containing string not to be space-stripped:
+      - Spaces and tabs are ignored if they occur before the scientific notation e or E letter
+      - Spaces and tabs are ignored between the end of the number and the \x00 or stopChar
+      - Spaces and tabs cause errors anywhere else
+
+      It is up to the caller to detect whether a successful parsing has reached the
+      terminator (\x00) or stopChar.
 
-      static const double FRAC_FACTORS[] = 
-      {
-        1.0,
-        0.1,
-        0.01,
-        0.001,
-        0.0001,
-        0.00001,
-        0.000001,
-        0.0000001,
-        0.00000001,
-        0.000000001,
-        0.0000000001,
-        0.00000000001,
-        0.000000000001,
-        0.0000000000001,
-        0.00000000000001,
-        0.000000000000001,
-        0.0000000000000001,
-        0.00000000000000001,
-        0.000000000000000001,
-        0.0000000000000000001
-      };
-      const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS)/sizeof(double);
+      In case of an error returned, in a parsing scenario where multiple numbers 
+      are to be read in a bigger surrounding string, it is up to the caller to 
+      recover from the error by advancing the read pointer to the next character,
+      if desirable.
+           
+      Example: 
+      ------
+      const char* s = "0.0/.123/3/12.5//-43.1"; 
+
+      int size;
+      double r;
+      const char* p = s;
+      
+      size = StringToDoubleEx(r, p, '/');
+      // -->
+      // r = 0 and size = 3
+      
+      p += size + 1; // gobble the separator
+      size = StringToDoubleEx(r, p, '/');
+      // --> 
+      // r = 0.123 and size = 4
+
+      p += size + 1; // gobble the separator
+      size = StringToDoubleEx(r, p, '/');
+      // -->
+      // r = 3.0 and size = 1
+
+      p += size + 1; // gobble the separator
+      size = StringToDoubleEx(r, p, '/');
+      // -->
+      // r = 12.5 and size = 4
+
+      p += size + 1; // gobble the separator
+      size = StringToDoubleEx(r, p, '/');
+      // -->
+      // r = 0 and size = 0
+
+      p += size + 1; // gobble the separator
+      size = StringToDoubleEx(r, p, '/');
+      // -->
+      // r = -43.1 and size = 5
+
+      p += size;
+      if(*p == 0)
+        ...stop parsing!
+    */
+
+    inline int32_t StringToDoubleEx(double& r, const char* text, char stopChar = 0)
+    {
+      if (!LegitDoubleString(text,stopChar))
+        return -1;
 
       r = 0.0;
       double neg = 1.0;
@@ -151,7 +244,7 @@
       // 12345.67890
       while (*p >= '0' && *p <= '9')
       {
-          r = (r*10.0) + (*p - '0'); // 1 12 123 123 12345
+        r = (r * 10.0) + (*p - '0'); // 1 12 123 123 12345
         ++p;
       }
       if (*p == '.')
@@ -171,12 +264,13 @@
 
       // skip the remaining numbers until we reach not-a-digit (either the 
       // end of the string OR the scientific notation symbol)
-      while ((*p >= '0' && *p <= '9'))
+      // spaces are skipped in this phase here
+      while ((*p >= '0' && *p <= '9') || *p == ' ' || *p == '\t')
         ++p;
 
-      if (*p == 0 )
+      if ( (*p == 0) || (*p == stopChar))
       {
-        return true;
+        return static_cast<int32_t>(p - text);
       }
       else if ((*p == 'e') || (*p == 'E'))
       {
@@ -202,7 +296,7 @@
         else
         {
           // only a sign char or a number is allowed
-          return false;
+          return -1;
         }
         // now p points to the absolute value of the exponent
         double exp = 0;
@@ -216,16 +310,42 @@
         double scFac = ::pow(10.0, exp);
         r *= scFac;
 
-        // only allowed symbol here is EOS
-        return (*p == 0);
+        // skip the trailing spaces
+        while (*p == ' ' || *p == '\t')
+          ++p;
+
+        // only allowed symbol here is EOS or stopChar
+        if ((*p == 0) || (*p == stopChar))
+          return static_cast<int32_t>(p - text);
+        else
+          return -1;
       }
       else
       {
         // not allowed
-        return false;
+        return -1;
       }
     }
 
+    /**
+      Fast string --> double conversion.
+      Must pass the LegitDoubleString test
+
+      String to doubles with at most 18 digits
+
+      Returns true if okay and false if failed.
+
+      The end-of-substring is character \x00
+    */
+    inline bool StringToDouble(double& r, const char* text)
+    {
+      int32_t size = StringToDoubleEx(r, text, 0);
+      return (size != -1);
+    }
+
+    /**
+      See main overload
+    */
     inline bool StringToDouble(double& r, const std::string& text)
     {
       return StringToDouble(r, text.c_str());
@@ -289,28 +409,166 @@
     Same as GetRgbValuesFromString
     */
     bool GetRgbaValuesFromString(uint8_t& red,
-                                 uint8_t& green,
-                                 uint8_t& blue,
-                                 uint8_t& alpha,
-                                 const char* text);
+      uint8_t& green,
+      uint8_t& blue,
+      uint8_t& alpha,
+      const char* text);
 
     /**
     Same as GetRgbValuesFromString
     */
-    inline bool GetRgbaValuesFromString(uint8_t& red, 
-                                        uint8_t& green, 
-                                        uint8_t& blue, 
-                                        uint8_t& alpha, 
-                                        const std::string& text)
+    inline bool GetRgbaValuesFromString(uint8_t& red,
+      uint8_t& green,
+      uint8_t& blue,
+      uint8_t& alpha,
+      const std::string& text)
     {
       return GetRgbaValuesFromString(red, green, blue, alpha, text.c_str());
     }
 
-    
+
     /**
-    This method could have been called StripSpacesAndChangeToLower but we might want to 
+    This method could have been called StripSpacesAndChangeToLower but we might want to
     add some UUID validation to the argument
     */
     void NormalizeUuid(std::string& uuid);
+
+
+    inline void FastTokenizeString(std::vector<std::string>& result,
+      const std::string& value,
+      char separator)
+    {
+      size_t countSeparators = 0;
+
+      for (size_t i = 0; i < value.size(); i++)
+      {
+        if (value[i] == separator)
+        {
+          countSeparators++;
+        }
+      }
+
+      result.clear();
+      result.reserve(countSeparators + 1);
+
+      std::string currentItem;
+
+      for (size_t i = 0; i < value.size(); i++)
+      {
+        if (value[i] == separator)
+        {
+          result.push_back(currentItem);
+          currentItem.clear();
+        }
+        else
+        {
+          currentItem.push_back(value[i]);
+        }
+      }
+
+      result.push_back(currentItem);
+    }
+
+
+    inline std::string FastStripSpaces(const std::string& source)
+    {
+      size_t first = 0;
+
+      while (first < source.length() &&
+        isspace(source[first]))
+      {
+        first++;
+      }
+
+      if (first == source.length())
+      {
+        // String containing only spaces
+        return "";
+      }
+
+      size_t last = source.length();
+      while (last > first &&
+        isspace(source[last - 1]))
+      {
+        last--;
+      }
+
+      assert(first <= last);
+      return source.substr(first, last - first);
+    }
+
+    /**
+    Returns the raw number of occurrences of `separator` in s (counting from s up to the terminating \x00)
+    */
+    inline size_t GetCharCount(const char* s, const char separator)
+    {
+      const char* p = s;
+      size_t sepCount = 0;
+
+      while (*p != 0)
+      {
+        if(*p == separator)
+          sepCount++;
+        ++p;
+      }
+      return sepCount;
+    }
+
+    inline bool FastParseVector(Vector& target, const std::string& value)
+    {
+      const char* s = value.c_str();
+      const char SEP = '\\';
+
+      size_t sepCount = GetCharCount(s, SEP);
+
+      size_t itemCount = sepCount + 1;
+      target.resize(itemCount);
+
+      while (*s == ' ' || *s == '\t')
+        ++s;
+
+      const char* p = s;
+
+      double r;
+      for (size_t i = 0; i < itemCount; i++)
+      {
+        int32_t numberCharCount = StringToDoubleEx(r, p, SEP);
+        if (numberCharCount == -1)
+        {
+          LOG(ERROR) << "Parsing error for vector \"" << value << "\". Current position (0-based) = " << i;
+          return false;
+        }
+        p += numberCharCount;
+        if (*p == 0)
+        {
+          // if we are at the end of the string, it means we have processed the last character
+          // let's check this. this is a small price to pay for a useful check
+          if (i != (itemCount - 1))
+          {
+            LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the string without consuming the right # of items! Current position (0-based) = " << i;
+            return false;
+          }
+        }
+        else
+        {
+          if (*p != SEP)
+          {
+            LOG(ERROR) << "Parsing error for vector \"" << value << "\". Character past end of number Reached end of the string without consuming the right # of items! Current position (0-based) = " << i << " and r = " << r;
+            return false;
+          }
+          if (i == (itemCount - 1))
+          {
+            LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the vector too soon. Current position (0-based) = " << i << " and r = " << r;
+            return false;
+          }
+          // advance to next number 
+          p += 1;
+        }
+        target[i] = r;
+      }
+      return true;
+    }
+
+
   }
 }
--- a/UnitTestsSources/GenericToolboxTests.cpp	Fri Feb 12 11:09:07 2021 +0100
+++ b/UnitTestsSources/GenericToolboxTests.cpp	Mon Feb 22 14:55:13 2021 +0100
@@ -4287,9 +4287,212 @@
   EXPECT_EQ(0, blue);
 }
 
-
-
-
-
-
-
+TEST(GenericToolbox, FastParseTest_StringToDoubleEx01)
+{
+  using OrthancStone::GenericToolbox::StringToDoubleEx;
+
+  const char* s = "0.0/.123/3/12.5//-43.1";
+
+  int32_t size;
+  double r;
+  const char* p = s;
+
+  size = StringToDoubleEx(r, p, '/');
+  // -->
+  // r = 0 and size = 3
+  ASSERT_EQ(3, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(4, size);
+  ASSERT_DOUBLE_EQ(0.123, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(1, size);
+  ASSERT_DOUBLE_EQ(3, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(4, size);
+  ASSERT_DOUBLE_EQ(12.5, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(0, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(5, size);
+  ASSERT_DOUBLE_EQ(-43.1, r);
+
+  p += size;
+  ASSERT_EQ(0, *p);
+}
+
+TEST(GenericToolbox, FastParseTest_StringToDoubleEx02)
+{
+  using OrthancStone::GenericToolbox::StringToDoubleEx;
+
+  const char* s = "  \t   0.0/.123/3  \t/12.5e-3//-43.1   \t     ";
+
+  int32_t size;
+  double r;
+  const char* p = s;
+
+  while (*p == ' ' || *p == '\t')
+    ++p;
+
+  size = StringToDoubleEx(r, p, '/');
+  // -->
+  // r = 0 and size = 3
+  ASSERT_EQ(3, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(4, size);
+  ASSERT_DOUBLE_EQ(0.123, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(4, size);
+  ASSERT_DOUBLE_EQ(3, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(7, size);
+  ASSERT_DOUBLE_EQ(12.5e-3, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(0, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(14, size);
+  ASSERT_DOUBLE_EQ(-43.1, r);
+
+  p += size;
+  ASSERT_EQ(0, *p);
+}
+
+TEST(GenericToolbox, FastParseTest_StringToDoubleEx03)
+{
+  using OrthancStone::GenericToolbox::StringToDoubleEx;
+
+  const char* s = "  \t   0.0/.123/3/12.5e-3//-43.1e-2   \t     ";
+
+  int32_t size;
+  double r;
+  const char* p = s;
+
+  while (*p == ' ' || *p == '\t')
+    ++p;
+
+  size = StringToDoubleEx(r, p, '/');
+  // -->
+  // r = 0 and size = 3
+  ASSERT_EQ(3, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(4, size);
+  ASSERT_DOUBLE_EQ(0.123, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(1, size);
+  ASSERT_DOUBLE_EQ(3, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(7, size);
+  ASSERT_DOUBLE_EQ(12.5e-3, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(0, size);
+  ASSERT_DOUBLE_EQ(0, r);
+
+  p += (size + 1);
+  size = StringToDoubleEx(r, p, '/');
+  ASSERT_EQ(17, size);
+  ASSERT_DOUBLE_EQ(-43.1e-2, r);
+
+  p += size;
+  ASSERT_EQ(0, *p);
+}
+
+
+TEST(GenericToolbox, FastParseTest_GetCharCount)
+{
+  using OrthancStone::GenericToolbox::GetCharCount;
+
+  ASSERT_EQ(0u, GetCharCount("-1e-22", '\\'));
+  ASSERT_EQ(0u, GetCharCount("   -1e-22", '\\'));
+  ASSERT_EQ(0u, GetCharCount("   -1e-22   ", '\\'));
+  ASSERT_EQ(0u, GetCharCount("-1e-22   ", '\\'));
+
+  ASSERT_EQ(1u, GetCharCount("-1e-2\\2", '\\'));
+  ASSERT_EQ(1u, GetCharCount("     -1e-2\\2", '\\'));
+  ASSERT_EQ(1u, GetCharCount("-1e-2\\2       ", '\\'));
+  ASSERT_EQ(1u, GetCharCount("    -1e-2\\2   ", '\\'));
+
+
+  ASSERT_EQ(11u, GetCharCount("    -1e-2\\\\3\\12.473\\-2.34e4\\-284\\423.23\\\\0.234423\\.786 \\ 9093\\   ", '\\'));
+}
+
+
+TEST(GenericToolbox, FastParseTest_FastParseVector01)
+{
+  using OrthancStone::GenericToolbox::FastParseVector;
+
+  OrthancStone::Vector v;
+
+  ASSERT_TRUE(FastParseVector(v, "1.2"));
+  ASSERT_EQ(1u, v.size());
+  ASSERT_DOUBLE_EQ(1.2, v[0]);
+
+  ASSERT_TRUE(FastParseVector(v, "-1.2e+2"));
+  ASSERT_EQ(1u, v.size());
+  ASSERT_DOUBLE_EQ(-120.0, v[0]);
+
+  ASSERT_TRUE(FastParseVector(v, "-1e-2\\2"));
+  ASSERT_EQ(2u, v.size());
+  ASSERT_DOUBLE_EQ(-0.01, v[0]);
+  ASSERT_DOUBLE_EQ(2.0, v[1]);
+
+  ASSERT_TRUE(FastParseVector(v, "1.3671875\\1.3671875"));
+  ASSERT_EQ(2u, v.size());
+  ASSERT_DOUBLE_EQ(1.3671875, v[0]);
+  ASSERT_DOUBLE_EQ(1.3671875, v[1]);
+}
+
+TEST(GenericToolbox, FastParseTest_FastParseVector02)
+{
+  using OrthancStone::GenericToolbox::FastParseVector;
+
+  const char* vectorString = "    -1e-2\\\\3\\12.473\\-2.34e4\\-284\\423.23\\\\0.234423\\.786 \\9093\\   ";
+
+  OrthancStone::Vector v;
+
+  ASSERT_TRUE(FastParseVector(v, vectorString));
+  ASSERT_EQ(12u, v.size());
+  ASSERT_DOUBLE_EQ(-1e-2    , v[ 0]);
+  ASSERT_DOUBLE_EQ(0        , v[ 1]);
+  ASSERT_DOUBLE_EQ(3        , v[ 2]);
+  ASSERT_DOUBLE_EQ(12.473   , v[ 3]);
+  ASSERT_DOUBLE_EQ(-2.34e4  , v[ 4]);
+  ASSERT_DOUBLE_EQ(-284     , v[ 5]);
+  ASSERT_DOUBLE_EQ(423.23   , v[ 6]);
+  ASSERT_DOUBLE_EQ(0        , v[ 7]);
+  ASSERT_DOUBLE_EQ(0.234423 , v[ 8]);
+  ASSERT_DOUBLE_EQ(.786     , v[ 9]);
+  ASSERT_DOUBLE_EQ(9093     , v[10]);
+  ASSERT_DOUBLE_EQ(0        , v[11]);
+}