comparison OrthancStone/Sources/Toolbox/GenericToolbox.h @ 1748:b6a6ad64192a

FastParseVector : manually written code to parse strings like 3.1315\-1.2e12\2344.5\123 into boost::numeric::ublas::vector<double> + tests
author Benjamin Golinvaux <bgo@osimis.io>
date Mon, 22 Feb 2021 14:55:13 +0100
parents 9ac2a65d4172
children 3889ae96d2e9
comparison
equal deleted inserted replaced
1747:730604db88b8 1748:b6a6ad64192a
21 21
22 #pragma once 22 #pragma once
23 23
24 #include <Compatibility.h> 24 #include <Compatibility.h>
25 #include <OrthancException.h> 25 #include <OrthancException.h>
26 #include <Logging.h>
27
28 #include "LinearAlgebra.h"
26 29
27 #include <boost/shared_ptr.hpp> 30 #include <boost/shared_ptr.hpp>
28 31
29 #include <string> 32 #include <string>
30 #include <stdint.h> 33 #include <stdint.h>
31 #include <math.h> 34 #include <math.h>
32 35
33 #include <memory> 36 #include <memory>
37 #include <vector>
34 38
35 namespace OrthancStone 39 namespace OrthancStone
36 { 40 {
37 namespace GenericToolbox 41 namespace GenericToolbox
38 { 42 {
39 /** 43 /**
40 Fast floating point string validation. 44 Fast floating point string validation.
41 No trimming applied, so the input must match regex 45 No trimming applied, so the input must match regex
42 /^[-]?[0-9]*\.?[0-9]*([eE][-+]?[0-9]+)?$/ 46 /^[-]?[0-9]*\.?[0-9]*([eE][-+]?[0-9]+)?$/
43 The following are allowed as edge cases: "" and "-" 47 The following are allowed as edge cases: "" and "-"
44 */ 48
45 inline bool LegitDoubleString(const char* text) 49 The parsing always stops if encountering either 0 or the stopChar
50
51 */
52 inline bool LegitDoubleString(const char* text, char stopChar = 0)
46 { 53 {
47 const char* p = text; 54 const char* p = text;
48 if(*p == '-') 55 if (*p == '-')
49 p++; 56 p++;
50 size_t period = 0; 57 size_t period = 0;
51 while(*p != 0) 58 while ((*p != 0) && (*p != stopChar) && (*p != ' ') && (*p != '\t'))
52 { 59 {
53 if (*p >= '0' && *p <= '9') 60 if (*p >= '0' && *p <= '9')
54 ++p; 61 ++p;
55 else if(*p == '.') 62 else if (*p == '.')
56 { 63 {
57 if(period > 0) 64 if (period > 0)
58 return false; 65 return false;
59 else 66 else
60 period++; 67 period++;
61 ++p; 68 ++p;
62 } 69 }
63 else if (*p == 'e' || *p == 'E') 70 else if (*p == 'e' || *p == 'E')
64 { 71 {
65 ++p; 72 ++p;
66 if (*p == '-' || *p == '+') 73 if (*p == '-' || *p == '+')
68 // "e+"/"E+" "e-"/"E-" or "e"/"E" must be followed by a number 75 // "e+"/"E+" "e-"/"E-" or "e"/"E" must be followed by a number
69 if (!(*p >= '0' && *p <= '9')) 76 if (!(*p >= '0' && *p <= '9'))
70 return false; 77 return false;
71 78
72 // these must be the last in the string 79 // these must be the last in the string
73 while(*p >= '0' && *p <= '9') 80 while (*p >= '0' && *p <= '9')
74 ++p; 81 ++p;
75 82
76 return (*p == 0); 83 // after that, there can only be spaces
84 while ((*p != 0) && (*p != stopChar))
85 {
86 if ((*p != ' ') && (*p != '\t'))
87 return false;
88 ++p;
89 }
90
91 return ((*p == 0) || (*p == stopChar));
77 } 92 }
78 else 93 else
79 { 94 {
80 return false; 95 return false;
81 } 96 }
82 } 97 }
98
99 // we only accept trailing whitespace
100 while ((*p != 0) && (*p != stopChar))
101 {
102 if( (*p != ' ') && (*p != '\t'))
103 return false;
104 ++p;
105 }
83 return true; 106 return true;
84 } 107 }
108
109
85 110
86 /** 111 /**
87 Fast integer string validation. 112 Fast integer string validation.
88 No trimming applied, so the input must match regex /^-?[0-9]*$/ 113 No trimming applied, so the input must match regex /^-?[0-9]*$/
89 The following are allowed as edge cases: "" and "-" 114 The following are allowed as edge cases: "" and "-"
90 */ 115
91 inline bool LegitIntegerString(const char* text) 116 The parsing always stops if encountering either 0 or the stopChar
117
118 */
119 inline bool LegitIntegerString(const char* text, char stopChar = 0)
92 { 120 {
93 const char* p = text; 121 const char* p = text;
94 if (*p == '-') 122 if (*p == '-')
95 p++; 123 p++;
96 while (*p != 0) 124 while ((*p != 0) && (*p != stopChar))
97 { 125 {
98 if (*p >= '0' && *p <= '9') 126 if (*p >= '0' && *p <= '9')
99 ++p; 127 ++p;
100 else 128 else
101 return false; 129 return false;
102 } 130 }
103 return true; 131 return true;
104 } 132 }
105 133
106 /* 134
107 Fast string --> double conversion. 135 static const double FRAC_FACTORS[] =
108 Must pass the LegitDoubleString test 136 {
109 137 1.0,
110 String to doubles with at most 18 digits 138 0.1,
111 */ 139 0.01,
112 inline bool StringToDouble(double& r, const char* text) 140 0.001,
113 { 141 0.0001,
114 if(!LegitDoubleString(text)) 142 0.00001,
115 return false; 143 0.000001,
116 144 0.0000001,
117 static const double FRAC_FACTORS[] = 145 0.00000001,
118 { 146 0.000000001,
119 1.0, 147 0.0000000001,
120 0.1, 148 0.00000000001,
121 0.01, 149 0.000000000001,
122 0.001, 150 0.0000000000001,
123 0.0001, 151 0.00000000000001,
124 0.00001, 152 0.000000000000001,
125 0.000001, 153 0.0000000000000001,
126 0.0000001, 154 0.00000000000000001,
127 0.00000001, 155 0.000000000000000001,
128 0.000000001, 156 0.0000000000000000001
129 0.0000000001, 157 };
130 0.00000000001, 158 static const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS) / sizeof(double);
131 0.000000000001, 159
132 0.0000000000001, 160 /**
133 0.00000000000001, 161 Technical version of StringToDouble, meant for parsing bigger strings in-place.
134 0.000000000000001, 162
135 0.0000000000000001, 163 Only works for dot decimal numbers without digit separation
136 0.00000000000000001, 164
137 0.000000000000000001, 165 The parsing stops when encountering EITHER \x00 or stopChar.
138 0.0000000000000000001 166
139 }; 167 Instead of filling r and returning true if number is legit, it fills r then
140 const size_t FRAC_FACTORS_LEN = sizeof(FRAC_FACTORS)/sizeof(double); 168 returns the number of parsed characters (NOT including the end character (which
169 can be zero, since an empty string is an allowed edge case) BUT including the trailing
170 spaces), or -1 if a parsing error occurred.
171
172 Please note that if stopChar is a number, a minus sign, the decimal separator
173 or the letters e and E, the behavior is UNDEFINED!!!
174
175 In order to allow the containing string not to be space-stripped:
176 - Spaces and tabs are ignored if they occur before the scientific notation e or E letter
177 - Spaces and tabs are ignored between the end of the number and the \x00 or stopChar
178 - Spaces and tabs cause errors anywhere else
179
180 It is up to the caller to detect whether a successful parsing has reached the
181 terminator (\x00) or stopChar.
182
183 In case of an error returned, in a parsing scenario where multiple numbers
184 are to be read in a bigger surrounding string, it is up to the caller to
185 recover from the error by advancing the read pointer to the next character,
186 if desirable.
187
188 Example:
189 ------
190 const char* s = "0.0/.123/3/12.5//-43.1";
191
192 int size;
193 double r;
194 const char* p = s;
195
196 size = StringToDoubleEx(r, p, '/');
197 // -->
198 // r = 0 and size = 3
199
200 p += size + 1; // gobble the separator
201 size = StringToDoubleEx(r, p, '/');
202 // -->
203 // r = 0.123 and size = 4
204
205 p += size + 1; // gobble the separator
206 size = StringToDoubleEx(r, p, '/');
207 // -->
208 // r = 3.0 and size = 1
209
210 p += size + 1; // gobble the separator
211 size = StringToDoubleEx(r, p, '/');
212 // -->
213 // r = 12.5 and size = 4
214
215 p += size + 1; // gobble the separator
216 size = StringToDoubleEx(r, p, '/');
217 // -->
218 // r = 0 and size = 0
219
220 p += size + 1; // gobble the separator
221 size = StringToDoubleEx(r, p, '/');
222 // -->
223 // r = -43.1 and size = 5
224
225 p += size;
226 if (*p == 0)
227 ...stop parsing!
228 */
229
230 inline int32_t StringToDoubleEx(double& r, const char* text, char stopChar = 0)
231 {
232 if (!LegitDoubleString(text,stopChar))
233 return -1;
141 234
142 r = 0.0; 235 r = 0.0;
143 double neg = 1.0; 236 double neg = 1.0;
144 const char* p = text; 237 const char* p = text;
145 238
149 ++p; 242 ++p;
150 } 243 }
151 // 12345.67890 244 // 12345.67890
152 while (*p >= '0' && *p <= '9') 245 while (*p >= '0' && *p <= '9')
153 { 246 {
154 r = (r*10.0) + (*p - '0'); // 1 12 123 123 12345 247 r = (r * 10.0) + (*p - '0'); // 1 12 123 123 12345
155 ++p; 248 ++p;
156 } 249 }
157 if (*p == '.') 250 if (*p == '.')
158 { 251 {
159 double f = 0.0; 252 double f = 0.0;
169 } 262 }
170 r *= neg; 263 r *= neg;
171 264
172 // skip the remaining numbers until we reach not-a-digit (either the 265 // skip the remaining numbers until we reach not-a-digit (either the
173 // end of the string OR the scientific notation symbol) 266 // end of the string OR the scientific notation symbol)
174 while ((*p >= '0' && *p <= '9')) 267 // spaces are skipped in this phase here
175 ++p; 268 while ((*p >= '0' && *p <= '9') || *p == ' ' || *p == '\t')
176 269 ++p;
177 if (*p == 0 ) 270
178 { 271 if ( (*p == 0) || (*p == stopChar))
179 return true; 272 {
273 return static_cast<int32_t>(p - text);
180 } 274 }
181 else if ((*p == 'e') || (*p == 'E')) 275 else if ((*p == 'e') || (*p == 'E'))
182 { 276 {
183 // process the scientific notation 277 // process the scientific notation
184 double sign; // no init is safe (read below) 278 double sign; // no init is safe (read below)
200 sign = 1.0; 294 sign = 1.0;
201 } 295 }
202 else 296 else
203 { 297 {
204 // only a sign char or a number is allowed 298 // only a sign char or a number is allowed
205 return false; 299 return -1;
206 } 300 }
207 // now p points to the absolute value of the exponent 301 // now p points to the absolute value of the exponent
208 double exp = 0; 302 double exp = 0;
209 while (*p >= '0' && *p <= '9') 303 while (*p >= '0' && *p <= '9')
210 { 304 {
214 // now we have our exponent. put a sign on it. 308 // now we have our exponent. put a sign on it.
215 exp *= sign; 309 exp *= sign;
216 double scFac = ::pow(10.0, exp); 310 double scFac = ::pow(10.0, exp);
217 r *= scFac; 311 r *= scFac;
218 312
219 // only allowed symbol here is EOS 313 // skip the trailing spaces
220 return (*p == 0); 314 while (*p == ' ' || *p == '\t')
315 ++p;
316
317 // only allowed symbol here is EOS or stopChar
318 if ((*p == 0) || (*p == stopChar))
319 return static_cast<int32_t>(p - text);
320 else
321 return -1;
221 } 322 }
222 else 323 else
223 { 324 {
224 // not allowed 325 // not allowed
225 return false; 326 return -1;
226 } 327 }
227 } 328 }
228 329
330 /**
331 Fast string --> double conversion.
332 Must pass the LegitDoubleString test
333
334 String to doubles with at most 18 digits
335
336 Returns true if okay and false if failed.
337
338 The end-of-substring is character \x00
339 */
340 inline bool StringToDouble(double& r, const char* text)
341 {
342 int32_t size = StringToDoubleEx(r, text, 0);
343 return (size != -1);
344 }
345
346 /**
347 See main overload
348 */
229 inline bool StringToDouble(double& r, const std::string& text) 349 inline bool StringToDouble(double& r, const std::string& text)
230 { 350 {
231 return StringToDouble(r, text.c_str()); 351 return StringToDouble(r, text.c_str());
232 } 352 }
233 353
287 407
288 /** 408 /**
289 Same as GetRgbValuesFromString 409 Same as GetRgbValuesFromString
290 */ 410 */
291 bool GetRgbaValuesFromString(uint8_t& red, 411 bool GetRgbaValuesFromString(uint8_t& red,
292 uint8_t& green, 412 uint8_t& green,
293 uint8_t& blue, 413 uint8_t& blue,
294 uint8_t& alpha, 414 uint8_t& alpha,
295 const char* text); 415 const char* text);
296 416
297 /** 417 /**
298 Same as GetRgbValuesFromString 418 Same as GetRgbValuesFromString
299 */ 419 */
300 inline bool GetRgbaValuesFromString(uint8_t& red, 420 inline bool GetRgbaValuesFromString(uint8_t& red,
301 uint8_t& green, 421 uint8_t& green,
302 uint8_t& blue, 422 uint8_t& blue,
303 uint8_t& alpha, 423 uint8_t& alpha,
304 const std::string& text) 424 const std::string& text)
305 { 425 {
306 return GetRgbaValuesFromString(red, green, blue, alpha, text.c_str()); 426 return GetRgbaValuesFromString(red, green, blue, alpha, text.c_str());
307 } 427 }
308 428
309 429
310 /** 430 /**
311 This method could have been called StripSpacesAndChangeToLower but we might want to 431 This method could have been called StripSpacesAndChangeToLower but we might want to
312 add some UUID validation to the argument 432 add some UUID validation to the argument
313 */ 433 */
314 void NormalizeUuid(std::string& uuid); 434 void NormalizeUuid(std::string& uuid);
435
436
/**
   Splits "value" at every occurrence of "separator" and stores the
   resulting tokens, in order, into "result" (whose previous content is
   discarded).

   The separators themselves are not part of the tokens. Empty tokens are
   preserved: N separators always produce N + 1 tokens, so an empty
   "value" yields exactly one empty token.
*/
inline void FastTokenizeString(std::vector<std::string>& result,
                               const std::string& value,
                               char separator)
{
  // First pass: count the separators, so that the target vector is
  // allocated only once
  size_t countSeparators = 0;

  for (size_t pos = value.find(separator);
       pos != std::string::npos;
       pos = value.find(separator, pos + 1))
  {
    countSeparators++;
  }

  result.clear();
  result.reserve(countSeparators + 1);

  // Second pass: cut the tokens directly out of "value", instead of
  // rebuilding them one character at a time
  size_t start = 0;

  for (;;)
  {
    const size_t end = value.find(separator, start);

    if (end == std::string::npos)
    {
      // Last token: everything after the last separator (possibly empty)
      result.push_back(value.substr(start));
      break;
    }

    result.push_back(value.substr(start, end - start));
    start = end + 1;  // resume right after the separator
  }
}
471
472
/**
   Returns a copy of "source" without its leading and trailing whitespace
   characters (as classified by isspace()). Whitespace located inside the
   string is left untouched. A string that is empty or that only contains
   whitespace yields an empty string.
*/
inline std::string FastStripSpaces(const std::string& source)
{
  size_t first = 0;

  // The cast to "unsigned char" is mandatory: calling isspace() with a
  // negative value (i.e. any non-ASCII byte on platforms where "char" is
  // signed) is undefined behavior
  while (first < source.length() &&
         isspace(static_cast<unsigned char>(source[first])))
  {
    first++;
  }

  if (first == source.length())
  {
    // String containing only spaces
    return "";
  }

  size_t last = source.length();
  while (last > first &&
         isspace(static_cast<unsigned char>(source[last - 1])))
  {
    last--;
  }

  assert(first <= last);
  return source.substr(first, last - first);
}
499
/**
   Returns the number of occurrences of "separator" in the zero-terminated
   string "s". The scan stops at the first \x00, which is never counted.
*/
inline size_t GetCharCount(const char* s, const char separator)
{
  size_t occurrences = 0;

  for (const char* cursor = s; *cursor != 0; ++cursor)
  {
    if (*cursor == separator)
    {
      occurrences++;
    }
  }

  return occurrences;
}
516
517 inline bool FastParseVector(Vector& target, const std::string& value)
518 {
519 const char* s = value.c_str();
520 const char SEP = '\\';
521
522 size_t sepCount = GetCharCount(s, SEP);
523
524 size_t itemCount = sepCount + 1;
525 target.resize(itemCount);
526
527 while (*s == ' ' || *s == '\t')
528 ++s;
529
530 const char* p = s;
531
532 double r;
533 for (size_t i = 0; i < itemCount; i++)
534 {
535 int32_t numberCharCount = StringToDoubleEx(r, p, SEP);
536 if (numberCharCount == -1)
537 {
538 LOG(ERROR) << "Parsing error for vector \"" << value << "\". Current position (0-based) = " << i;
539 return false;
540 }
541 p += numberCharCount;
542 if (*p == 0)
543 {
544 // if we are at the end of the string, it means we have processed the last character
545 // let's check this. this is a small price to pay for a useful check
546 if (i != (itemCount - 1))
547 {
548 LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the string without consuming the right # of items! Current position (0-based) = " << i;
549 return false;
550 }
551 }
552 else
553 {
554 if (*p != SEP)
555 {
556 LOG(ERROR) << "Parsing error for vector \"" << value << "\". Character past end of number Reached end of the string without consuming the right # of items! Current position (0-based) = " << i << " and r = " << r;
557 return false;
558 }
559 if (i == (itemCount - 1))
560 {
561 LOG(ERROR) << "Parsing error for vector \"" << value << "\". Reached end of the vector too soon. Current position (0-based) = " << i << " and r = " << r;
562 return false;
563 }
564 // advance to next number
565 p += 1;
566 }
567 target[i] = r;
568 }
569 return true;
570 }
571
572
315 } 573 }
316 } 574 }