comparison OrthancStone/Sources/Toolbox/AlignedMatrix.cpp @ 2068:22a83fb9dd23 deep-learning

added AlignedMatrix and TimerLogger
author Sebastien Jodogne <s.jodogne@gmail.com>
date Wed, 17 May 2023 17:30:52 +0200
/**
 * Stone of Orthanc
 * Copyright (C) 2012-2016 Sebastien Jodogne, Medical Physics
 * Department, University Hospital of Liege, Belgium
 * Copyright (C) 2017-2022 Osimis S.A., Belgium
 * Copyright (C) 2021-2022 Sebastien Jodogne, ICTEAM UCLouvain, Belgium
 *
 * This program is free software: you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/>.
 **/


#include "AlignedMatrix.h"

#include <OrthancException.h>

#include <assert.h>   // For assert()
#include <stdlib.h>   // For posix_memalign() and free()
#include <string.h>   // For memset()

namespace OrthancStone
{
  // Integer division of "a" by "b", rounding up
  static unsigned int Ceiling(unsigned int a,
                              unsigned int b)
  {
    if (a % b == 0)
    {
      return a / b;
    }
    else
    {
      return a / b + 1;
    }
  }


  void AlignedMatrix::Setup(unsigned int rows,
                            unsigned int cols)
  {
    assert(sizeof(float) == 4);

    if (rows == 0 ||
        cols == 0)
    {
      rows_ = 0;
      cols_ = 0;
      pitch_ = 0;
      pitchFloatPointer_ = 0;
      content_ = NULL;
    }
    else
    {
      rows_ = rows;
      cols_ = cols;

      // Round the size of one row up to the next multiple of the memory
      // alignment, so that every row starts on an aligned address
      pitch_ = Ceiling(cols * sizeof(float), ORTHANC_MEMORY_ALIGNMENT) * ORTHANC_MEMORY_ALIGNMENT;
      pitchFloatPointer_ = pitch_ / sizeof(float);

      void* tmp = NULL;
      if (posix_memalign(&tmp, ORTHANC_MEMORY_ALIGNMENT, rows_ * pitch_) != 0)
      {
        throw Orthanc::OrthancException(Orthanc::ErrorCode_NotEnoughMemory);
      }

      assert(reinterpret_cast<intptr_t>(tmp) % ORTHANC_MEMORY_ALIGNMENT == 0);
      assert(pitch_ % ORTHANC_MEMORY_ALIGNMENT == 0);
      assert(pitch_ % sizeof(float) == 0);
      assert((rows_ * pitch_) % ORTHANC_MEMORY_ALIGNMENT == 0);

      content_ = static_cast<float*>(tmp);
    }
  }
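
  // Worked example of the pitch computation above, with illustrative values
  // only (the actual ORTHANC_MEMORY_ALIGNMENT is defined in AlignedMatrix.h,
  // which is not part of this hunk). Assuming ORTHANC_MEMORY_ALIGNMENT == 64
  // and cols == 100: one row needs 100 * 4 = 400 bytes, Ceiling(400, 64) == 7,
  // hence pitch_ == 7 * 64 == 448 bytes and pitchFloatPointer_ == 112 floats.
  // Row "i" therefore starts at "content_ + i * 112", and every row pointer
  // stays 64-byte aligned, at the cost of 48 padding bytes per row.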


  AlignedMatrix::~AlignedMatrix()
  {
    if (content_ != NULL)
    {
      free(content_);
    }
  }


  void AlignedMatrix::FillZeros()
  {
    if (content_ != NULL)   // An empty matrix has no buffer to clear
    {
      memset(content_, 0, rows_ * pitch_);
    }
  }


  void AlignedMatrix::ProductPlain(AlignedMatrix& c,
                                   const AlignedMatrix& a,
                                   const AlignedMatrix& b)
  {
    if (c.GetRows() != a.GetRows() ||
        c.GetColumns() != b.GetColumns() ||
        a.GetColumns() != b.GetRows())
    {
      throw Orthanc::OrthancException(Orthanc::ErrorCode_IncompatibleImageSize);
    }

    const unsigned int M = c.GetRows();
    const unsigned int N = c.GetColumns();
    const unsigned int K = a.GetColumns();

    c.FillZeros();

    for (unsigned int i = 0; i < M; i++)
    {
      // The loop over "k" is put in the middle to be more cache-friendly: for
      // a fixed (i, k), the innermost loop over "j" reads row "k" of "b" and
      // updates row "i" of "c" sequentially, instead of striding down a
      // column of "b" as in the classical "i, j, k" ordering
      // https://sahnimanas.github.io/post/anatomy-of-a-high-performance-convolution/
      for (unsigned int k = 0; k < K; k++)
      {
        for (unsigned int j = 0; j < N; j++)
        {
          c.AddValue(i, j, a.GetValue(i, k) * b.GetValue(k, j));
        }
      }
    }
  }


#if ORTHANC_HAS_MATRIX_PRODUCT_TRANSPOSED_VECTORIZED == 1
  // Computes "C = A*B^T"
  class AlignedMatrix::ProductTransposedVectorizedContext : public boost::noncopyable
  {
  private:
    unsigned int vectorizedSteps_;
    uint8_t finalSteps_;

  public:
    ORTHANC_FORCE_INLINE
    ProductTransposedVectorizedContext(const AlignedMatrix& a)
    {
#if ORTHANC_HAS_AVX2 == 1
      // An AVX2 register holds 8 single-precision floats
      const unsigned int blockSize = 8;
#elif ORTHANC_HAS_SSE2 == 1 || ORTHANC_HAS_WASM_SIMD == 1
      // An SSE2 or WebAssembly SIMD register holds 4 single-precision floats
      const unsigned int blockSize = 4;
#else
#  error No supported SIMD instruction set
#endif

      vectorizedSteps_ = a.GetColumns() / blockSize;
      finalSteps_ = a.GetColumns() - vectorizedSteps_ * blockSize;
    }
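
    // Illustration of the split computed above (the column count is chosen
    // arbitrarily for this example): if "a" has 21 columns, then with AVX2
    // (blockSize == 8) we get vectorizedSteps_ == 2 and finalSteps_ == 5,
    // whereas with SSE2 or WebAssembly SIMD (blockSize == 4) we get
    // vectorizedSteps_ == 5 and finalSteps_ == 1. Apply() below runs the
    // vectorized steps first, then handles the remaining columns one by one.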

    ORTHANC_FORCE_INLINE
    float Apply(const float* ap,
                const float* btp) const noexcept
    {
      float result;

#if ORTHANC_HAS_AVX2 == 1
      __m256 accumulator = _mm256_set1_ps(0);

      for (unsigned int k = 0; k < vectorizedSteps_; k++)
      {
        __m256 a = _mm256_load_ps(ap);
        __m256 b = _mm256_load_ps(btp);
        //accumulator = _mm256_add_ps(accumulator, _mm256_mul_ps(a, b));
        accumulator = _mm256_fmadd_ps(a, b, accumulator);  // Requires the "-mfma" compiler flag

        ap += 8;
        btp += 8;
      }

      // Horizontal sum of the 8 partial sums held by the accumulator
      float tmp[8] __attribute__ ((aligned (ORTHANC_MEMORY_ALIGNMENT)));
      _mm256_store_ps(tmp, accumulator);
      result = tmp[0] + tmp[1] + tmp[2] + tmp[3] + tmp[4] + tmp[5] + tmp[6] + tmp[7];

#elif ORTHANC_HAS_SSE2 == 1
      __m128 accumulator = _mm_set1_ps(0);

      for (unsigned int k = 0; k < vectorizedSteps_; k++)
      {
        __m128 a = _mm_load_ps(ap);
        __m128 b = _mm_load_ps(btp);
        accumulator = _mm_add_ps(accumulator, _mm_mul_ps(a, b));
        ap += 4;
        btp += 4;
      }

#if 1
      // Horizontal sum of the 4 partial sums held by the accumulator
      float tmp[4] __attribute__ ((aligned (ORTHANC_MEMORY_ALIGNMENT)));
      _mm_storeu_ps(tmp, accumulator);
      result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#else
      // This trickier version is theoretically faster, but there is not much difference in practice
      const __m128 sum2 = _mm_add_ps(accumulator, _mm_shuffle_ps(accumulator, accumulator, _MM_SHUFFLE(2, 3, 0, 1)));
      const __m128 sum1 = _mm_add_ps(sum2, _mm_shuffle_ps(sum2, sum2, _MM_SHUFFLE(0, 1, 2, 3)));
      result = _mm_cvtss_f32(sum1);
#endif

#elif ORTHANC_HAS_WASM_SIMD == 1
      v128_t accumulator = wasm_f32x4_splat(0);

      for (unsigned int k = 0; k < vectorizedSteps_; k++)
      {
        v128_t a = wasm_v128_load(ap);
        v128_t b = wasm_v128_load(btp);
        accumulator = wasm_f32x4_add(accumulator, wasm_f32x4_mul(a, b));
        ap += 4;
        btp += 4;
      }

#if 1
      // Horizontal sum of the 4 partial sums held by the accumulator
      float tmp[4];
      wasm_v128_store(tmp, accumulator);
      result = tmp[0] + tmp[1] + tmp[2] + tmp[3];
#else
      const v128_t sum2 = wasm_f32x4_add(accumulator, wasm_i32x4_shuffle(accumulator, accumulator, 2, 3, 0, 0));
      const v128_t sum1 = wasm_f32x4_add(sum2, wasm_i32x4_shuffle(sum2, sum2, 1, 0, 0, 0));
      result = wasm_f32x4_extract_lane(sum1, 0);
#endif

#else
#  error No supported SIMD instruction set
#endif

      // Scalar tail: handle the last columns that do not fill a full SIMD register
      for (uint8_t k = 0; k < finalSteps_; k++)
      {
        result += (*ap) * (*btp);
        ap++;
        btp++;
      }

      return result;
    }
  };
#endif


#if ORTHANC_HAS_MATRIX_PRODUCT_TRANSPOSED_VECTORIZED == 1
  // Computes "C = A*B^T": "bt" must contain the transpose of the right-hand operand
  void AlignedMatrix::ProductTransposedVectorized(AlignedMatrix& c,
                                                  const AlignedMatrix& a,
                                                  const AlignedMatrix& bt)
  {
    if (c.GetRows() != a.GetRows() ||
        c.GetColumns() != bt.GetRows() ||
        a.GetColumns() != bt.GetColumns())
    {
      throw Orthanc::OrthancException(Orthanc::ErrorCode_IncompatibleImageSize);
    }

    AlignedMatrix::ProductTransposedVectorizedContext context(a);

    const unsigned int M = a.GetRows();
    const unsigned int N = bt.GetRows();

    // Number of floats between the starts of two successive rows (the pitch includes the padding)
    const size_t rowSizeA = a.GetPitch() / sizeof(float);
    const size_t rowSizeB = bt.GetPitch() / sizeof(float);

    const float* ap = a.GetRowPointer(0);
    for (unsigned int i = 0; i < M; i++)
    {
      float* cp = c.GetRowPointer(i);

      const float* btp = bt.GetRowPointer(0);
      for (unsigned int j = 0; j < N; j++, cp++)
      {
        // c(i, j) is the dot product between row "i" of A and row "j" of B^T
        *cp = context.Apply(ap, btp);
        btp += rowSizeB;
      }

      ap += rowSizeA;
    }
  }
#endif
}
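
This changeset only shows the implementation file; AlignedMatrix.h, which declares the constructor and the accessors, is not part of this hunk. The sketch below illustrates how the two product routines might be called, assuming a constructor of the form AlignedMatrix(rows, cols), public accessors matching those used above (FillZeros, AddValue, GetValue), and that ProductPlain/ProductTransposedVectorized are static members, as their signatures suggest. It is an illustration only, not part of the commit.

#include "AlignedMatrix.h"

#include <cstdio>

int main()
{
  // Assumed constructor signature: AlignedMatrix(rows, cols)
  OrthancStone::AlignedMatrix a(2, 3);   // A is 2x3
  OrthancStone::AlignedMatrix b(3, 4);   // B is 3x4
  OrthancStone::AlignedMatrix c(2, 4);   // C = A*B is 2x4

  a.FillZeros();
  b.FillZeros();
  a.AddValue(0, 0, 1.0f);   // A was just zeroed, so a(0,0) becomes 1
  b.AddValue(0, 1, 2.0f);   // Likewise, b(0,1) becomes 2

  // Plain (non-vectorized) product: C = A*B
  OrthancStone::AlignedMatrix::ProductPlain(c, a, b);
  printf("%f\n", c.GetValue(0, 1));   // Expected: 1*2 = 2

#if ORTHANC_HAS_MATRIX_PRODUCT_TRANSPOSED_VECTORIZED == 1
  // The vectorized variant expects the second operand to be transposed:
  // "bt" must store B^T, which is 4x3
  OrthancStone::AlignedMatrix bt(4, 3);
  bt.FillZeros();
  bt.AddValue(1, 0, 2.0f);   // bt(1,0) == b(0,1)
  OrthancStone::AlignedMatrix::ProductTransposedVectorized(c, a, bt);
  printf("%f\n", c.GetValue(0, 1));   // Same result as above
#endif

  return 0;
}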