Mercurial > hg > orthanc-stone
comparison OrthancStone/Sources/Toolbox/AlignedMatrix.cpp @ 2068:22a83fb9dd23 deep-learning
added AlignedMatrix and TimerLogger
author | Sebastien Jodogne <s.jodogne@gmail.com> |
---|---|
date | Wed, 17 May 2023 17:30:52 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2067:20222330cdf6 | 2068:22a83fb9dd23 |
---|---|
1 /** | |
2 * Stone of Orthanc | |
3 * Copyright (C) 2012-2016 Sebastien Jodogne, Medical Physics | |
4 * Department, University Hospital of Liege, Belgium | |
5 * Copyright (C) 2017-2022 Osimis S.A., Belgium | |
6 * Copyright (C) 2021-2022 Sebastien Jodogne, ICTEAM UCLouvain, Belgium | |
7 * | |
8 * This program is free software: you can redistribute it and/or | |
9 * modify it under the terms of the GNU Lesser General Public License | |
10 * as published by the Free Software Foundation, either version 3 of | |
11 * the License, or (at your option) any later version. | |
12 * | |
13 * This program is distributed in the hope that it will be useful, but | |
14 * WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 * Lesser General Public License for more details. | |
17 * | |
18 * You should have received a copy of the GNU Lesser General Public | |
19 * License along with this program. If not, see | |
20 * <http://www.gnu.org/licenses/>. | |
21 **/ | |
22 | |
23 | |
24 #include "AlignedMatrix.h" | |
25 | |
26 #include <OrthancException.h> | |
27 | |
28 #include <string.h> | |
29 | |
30 namespace OrthancStone | |
31 { | |
32 static unsigned int Ceiling(unsigned int a, | |
33 unsigned int b) | |
34 { | |
35 if (a % b == 0) | |
36 { | |
37 return a / b; | |
38 } | |
39 else | |
40 { | |
41 return a / b + 1; | |
42 } | |
43 } | |
44 | |
45 | |
46 void AlignedMatrix::Setup(unsigned int rows, | |
47 unsigned int cols) | |
48 { | |
49 assert(sizeof(float) == 4); | |
50 | |
51 if (rows == 0 || | |
52 cols == 0) | |
53 { | |
54 rows_ = 0; | |
55 cols_ = 0; | |
56 pitch_ = 0; | |
57 pitchFloatPointer_ = 0; | |
58 content_ = NULL; | |
59 } | |
60 else | |
61 { | |
62 rows_ = rows; | |
63 cols_ = cols; | |
64 pitch_ = Ceiling(cols * sizeof(float), ORTHANC_MEMORY_ALIGNMENT) * ORTHANC_MEMORY_ALIGNMENT; | |
65 pitchFloatPointer_ = pitch_ / sizeof(float); | |
66 | |
67 void* tmp = NULL; | |
68 if (posix_memalign(&tmp, ORTHANC_MEMORY_ALIGNMENT, rows_ * pitch_) != 0) | |
69 { | |
70 throw Orthanc::OrthancException(Orthanc::ErrorCode_NotEnoughMemory); | |
71 } | |
72 | |
73 assert(reinterpret_cast<intptr_t>(tmp) % ORTHANC_MEMORY_ALIGNMENT == 0); | |
74 assert(pitch_ % ORTHANC_MEMORY_ALIGNMENT == 0); | |
75 assert(pitch_ % sizeof(float) == 0); | |
76 assert((rows_ * pitch_) % ORTHANC_MEMORY_ALIGNMENT == 0); | |
77 | |
78 content_ = static_cast<float*>(tmp); | |
79 } | |
80 } | |
81 | |
82 | |
83 AlignedMatrix::~AlignedMatrix() | |
84 { | |
85 if (content_ != NULL) | |
86 { | |
87 free(content_); | |
88 } | |
89 } | |
90 | |
91 | |
92 void AlignedMatrix::FillZeros() | |
93 { | |
94 memset(content_, 0, rows_ * pitch_); | |
95 } | |
96 | |
97 | |
98 void AlignedMatrix::ProductPlain(AlignedMatrix& c, | |
99 const AlignedMatrix& a, | |
100 const AlignedMatrix& b) | |
101 { | |
102 if (c.GetRows() != a.GetRows() || | |
103 c.GetColumns() != b.GetColumns() || | |
104 a.GetColumns() != b.GetRows()) | |
105 { | |
106 throw Orthanc::OrthancException(Orthanc::ErrorCode_IncompatibleImageSize); | |
107 } | |
108 | |
109 const unsigned int M = c.GetRows(); | |
110 const unsigned int N = c.GetColumns(); | |
111 const unsigned int K = a.GetColumns(); | |
112 | |
113 c.FillZeros(); | |
114 | |
115 for (unsigned int i = 0; i < M; i++) | |
116 { | |
117 // Loop over "k" to be more cache-friendly | |
118 // https://sahnimanas.github.io/post/anatomy-of-a-high-performance-convolution/ | |
119 for (unsigned int k = 0; k < K; k++) | |
120 { | |
121 for (unsigned int j = 0; j < N; j++) | |
122 { | |
123 c.AddValue(i, j, a.GetValue(i, k) * b.GetValue(k, j)); | |
124 } | |
125 } | |
126 } | |
127 } | |
128 | |
129 | |
130 #if ORTHANC_HAS_MATRIX_PRODUCT_TRANSPOSED_VECTORIZED == 1 | |
131 // Computes "C = A*B^T" | |
132 class AlignedMatrix::ProductTransposedVectorizedContext : public boost::noncopyable | |
133 { | |
134 private: | |
135 unsigned int vectorizedSteps_; | |
136 uint8_t finalSteps_; | |
137 | |
138 public: | |
139 ORTHANC_FORCE_INLINE | |
140 ProductTransposedVectorizedContext(const AlignedMatrix& a) | |
141 { | |
142 #if ORTHANC_HAS_AVX2 == 1 | |
143 const unsigned int blockSize = 8; | |
144 #elif ORTHANC_HAS_SSE2 == 1 || ORTHANC_HAS_WASM_SIMD == 1 | |
145 const unsigned int blockSize = 4; | |
146 #else | |
147 # error No supported SIMD instruction set | |
148 #endif | |
149 | |
150 vectorizedSteps_ = a.GetColumns() / blockSize; | |
151 finalSteps_ = a.GetColumns() - vectorizedSteps_ * blockSize; | |
152 } | |
153 | |
154 ORTHANC_FORCE_INLINE | |
155 float Apply(const float* ap, | |
156 const float* btp) const noexcept | |
157 { | |
158 float result; | |
159 | |
160 #if ORTHANC_HAS_AVX2 == 1 | |
161 __m256 accumulator = _mm256_set1_ps(0); | |
162 | |
163 for (unsigned int k = 0; k < vectorizedSteps_; k++) | |
164 { | |
165 __m256 a = _mm256_load_ps(ap); | |
166 __m256 b = _mm256_load_ps(btp); | |
167 //accumulator = _mm256_add_ps(accumulator, _mm256_mul_ps(a, b)); | |
168 accumulator = _mm256_fmadd_ps(a, b, accumulator); // Requires the "-mfma" compiler flag | |
169 | |
170 ap += 8; | |
171 btp += 8; | |
172 } | |
173 | |
174 float tmp[8] __attribute__ ((aligned (ORTHANC_MEMORY_ALIGNMENT))); | |
175 _mm256_store_ps(tmp, accumulator); | |
176 result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7]; | |
177 | |
178 #elif ORTHANC_HAS_SSE2 == 1 | |
179 __m128 accumulator = _mm_set1_ps(0); | |
180 | |
181 for (unsigned int k = 0; k < vectorizedSteps_; k++) | |
182 { | |
183 __m128 a = _mm_load_ps(ap); | |
184 __m128 b = _mm_load_ps(btp); | |
185 accumulator = _mm_add_ps(accumulator, _mm_mul_ps(a, b)); | |
186 ap += 4; | |
187 btp += 4; | |
188 } | |
189 | |
190 #if 1 | |
191 float tmp[4] __attribute__ ((aligned (ORTHANC_MEMORY_ALIGNMENT))); | |
192 _mm_storeu_ps(tmp, accumulator); | |
193 result = tmp[0] + tmp[1] + tmp[2] + tmp[3]; | |
194 #else | |
195 // This trickier version is theoretically faster, but no much difference in practice | |
196 const __m128 sum2 = _mm_add_ps(accumulator, _mm_shuffle_ps(accumulator, accumulator, _MM_SHUFFLE(2, 3, 0, 1))); | |
197 const __m128 sum1 = _mm_add_ps(sum2, _mm_shuffle_ps(sum2, sum2, _MM_SHUFFLE(0, 1, 2, 3))); | |
198 result = _mm_cvtss_f32(sum1); | |
199 #endif | |
200 | |
201 #elif ORTHANC_HAS_WASM_SIMD == 1 | |
202 v128_t accumulator = wasm_f32x4_splat(0); | |
203 | |
204 for (unsigned int k = 0; k < vectorizedSteps_; k++) | |
205 { | |
206 v128_t a = wasm_v128_load(ap); | |
207 v128_t b = wasm_v128_load(btp); | |
208 accumulator = wasm_f32x4_add(accumulator, wasm_f32x4_mul(a, b)); | |
209 ap += 4; | |
210 btp += 4; | |
211 } | |
212 | |
213 #if 1 | |
214 float tmp[4]; | |
215 wasm_v128_store(tmp, accumulator); | |
216 result = tmp[0] + tmp[1] + tmp[2] + tmp[3]; | |
217 #else | |
218 const v128_t sum2 = wasm_f32x4_add(accumulator, wasm_i32x4_shuffle(accumulator, accumulator, 2, 3, 0, 0)); | |
219 const v128_t sum1 = wasm_f32x4_add(sum2, wasm_i32x4_shuffle(sum2, sum2, 1, 0, 0, 0)); | |
220 result = wasm_f32x4_extract_lane(sum1, 0); | |
221 #endif | |
222 | |
223 #else | |
224 # error No supported SIMD instruction set | |
225 #endif | |
226 | |
227 for (uint8_t k = 0; k < finalSteps_; k++) | |
228 { | |
229 result += (*ap) * (*btp); | |
230 ap++; | |
231 btp++; | |
232 } | |
233 | |
234 return result; | |
235 } | |
236 }; | |
237 #endif | |
238 | |
239 | |
240 #if ORTHANC_HAS_MATRIX_PRODUCT_TRANSPOSED_VECTORIZED == 1 | |
241 void AlignedMatrix::ProductTransposedVectorized(AlignedMatrix& c, | |
242 const AlignedMatrix& a, | |
243 const AlignedMatrix& bt) | |
244 { | |
245 if (c.GetRows() != a.GetRows() || | |
246 c.GetColumns() != bt.GetRows() || | |
247 a.GetColumns() != bt.GetColumns()) | |
248 { | |
249 throw Orthanc::OrthancException(Orthanc::ErrorCode_IncompatibleImageSize); | |
250 } | |
251 | |
252 AlignedMatrix::ProductTransposedVectorizedContext context(a); | |
253 | |
254 const unsigned int M = a.GetRows(); | |
255 const unsigned int N = bt.GetRows(); | |
256 | |
257 const size_t rowSizeA = a.GetPitch() / sizeof(float); | |
258 const size_t rowSizeB = bt.GetPitch() / sizeof(float); | |
259 | |
260 const float* ap = a.GetRowPointer(0); | |
261 for (unsigned int i = 0; i < M; i++) | |
262 { | |
263 float* cp = c.GetRowPointer(i); | |
264 | |
265 const float* btp = bt.GetRowPointer(0); | |
266 for (unsigned int j = 0; j < N; j++, cp++) | |
267 { | |
268 *cp = context.Apply(ap, btp); | |
269 btp += rowSizeB; | |
270 } | |
271 | |
272 ap += rowSizeA; | |
273 } | |
274 } | |
275 #endif | |
276 } |