// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_gemm.h: Multi-threaded GEMM entry point.
// Readers note: To understand this file, it is useful to first
// read and understand the much simpler single_thread_gemm.h.

#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_

#include <pthread.h>
#include <unistd.h>
#include <vector>

#include "single_thread_gemm.h"

namespace gemmlowp {

#ifdef GEMMLOWP_ALLOW_INLINE_ASM
// Where inline asm is allowed, we use some busy-waiting,
// preferably implemented using NOP instructions.
const int kMaxBusyWaitNOPs = 32 * 1000 * 1000;

#define GEMMLOWP_NOP "nop\n"

#define GEMMLOWP_STRING_CONCAT_4(X) X X X X
#define GEMMLOWP_NOP4 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP)
#define GEMMLOWP_NOP16 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP4)
#define GEMMLOWP_NOP64 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP16)
#define GEMMLOWP_NOP256 GEMMLOWP_STRING_CONCAT_4(GEMMLOWP_NOP64)

inline int Do256NOPs() {
  asm volatile(GEMMLOWP_NOP256);
  return 256;
}

#undef GEMMLOWP_STRING_CONCAT_4
#undef GEMMLOWP_NOP256
#undef GEMMLOWP_NOP64
#undef GEMMLOWP_NOP16
#undef GEMMLOWP_NOP4
#undef GEMMLOWP_NOP

#else  // not GEMMLOWP_ALLOW_INLINE_ASM

// It is nontrivial to implement good busy-waiting without
// using asm; NOP instructions have the least side effects
// and the lowest power usage. Since the whole busy-waiting
// story is only an optimization, it is not very interesting
// in places where we are already slow due to not being able to
// use our inline asm kernels.

const int kMaxBusyWaitNOPs = 0;
inline int Do256NOPs() { return 0; }

#endif  // not GEMMLOWP_ALLOW_INLINE_ASM

// Waits until *var != initial_value.
//
// Returns the new value of *var. The guarantee here is that
// the return value is different from initial_value, and that this
// new value has been taken by *var at some point during the
// execution of this function. There is no guarantee that this is
// still the value of *var when this function returns, since *var is
// not assumed to be guarded by any lock.
//
// First does some busy-waiting for a fixed number of no-op cycles,
// then falls back to passive waiting for the given condvar, guarded
// by the given mutex.
//
// The idea of doing some initial busy-waiting is to help get
// better and more consistent multithreading benefits for small GEMM sizes.
// Busy-waiting helps ensure that if we need to wake up soon after having
// started waiting, then we can wake up quickly (as opposed to, say,
// having to wait to be scheduled again by the OS). On the other hand,
// we must still eventually revert to passive waiting for longer waits
// (e.g. worker threads having finished a GEMM and waiting until the next GEMM)
// so as to avoid permanently spinning.
//
template <typename T>
T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond,
                        pthread_mutex_t* mutex) {
  int nops = 0;
  // First, trivial case where the variable already changed value.
  T new_value = *var;
  if (new_value != initial_value) {
    return new_value;
  }
  // Then try busy-waiting.
  while (nops < kMaxBusyWaitNOPs) {
    nops += Do256NOPs();
    new_value = *var;
    if (new_value != initial_value) {
      return new_value;
    }
  }
  // Finally, do real passive waiting.
  pthread_mutex_lock(mutex);
  new_value = *var;
  if (new_value == initial_value) {
    pthread_cond_wait(cond, mutex);
    new_value = *var;
    assert(new_value != initial_value);
  }
  pthread_mutex_unlock(mutex);
  return new_value;
}
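
// Illustrative usage sketch (not part of the library; the names 'flag',
// 'cond' and 'mutex' below are hypothetical): a consumer thread can combine
// this hybrid busy/passive wait with a plain pthread signal on the producer
// side, which is exactly the pattern Worker::ChangeState() uses further down.
//
//   // Consumer: block until 'flag' leaves the value 0.
//   int observed = WaitForVariableChange(&flag, 0, &cond, &mutex);
//
//   // Producer: change the variable, then signal under the mutex so that
//   // a passively-waiting consumer is woken up.
//   pthread_mutex_lock(&mutex);
//   flag = 1;
//   pthread_cond_signal(&cond);
//   pthread_mutex_unlock(&mutex);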

// A BlockingCounter lets one thread wait for N events to occur.
// This is how the master thread waits for all the worker threads
// to have finished working.
class BlockingCounter {
 public:
  BlockingCounter()
      : cond_(PTHREAD_COND_INITIALIZER),
        mutex_(PTHREAD_MUTEX_INITIALIZER),
        count_(0),
        initial_count_(0) {}

  // Sets/resets the counter; initial_count is the number of
  // decrementing events that the Wait() call will be waiting for.
  void Reset(std::size_t initial_count) {
    pthread_mutex_lock(&mutex_);
    assert(count_ == 0);
    initial_count_ = initial_count;
    count_ = initial_count_;
    pthread_mutex_unlock(&mutex_);
  }

  // Decrements the counter; if the counter hits zero, signals
  // the thread that was waiting for that, and returns true.
  // Otherwise (if the decremented count is still nonzero),
  // returns false.
  bool DecrementCount() {
    pthread_mutex_lock(&mutex_);
    assert(count_ > 0);
    count_--;
    if (count_ == 0) {
      pthread_cond_signal(&cond_);
    }
    bool retval = count_ == 0;
    pthread_mutex_unlock(&mutex_);
    return retval;
  }

  // Waits for the N other threads (N having been set by Reset())
  // to hit the BlockingCounter.
  void Wait() {
    ScopedProfilingLabel label("BlockingCounter::Wait");
    while (count_) {
      MemoryBarrier();
      const std::size_t count_value = count_;
      if (count_value) {
        WaitForVariableChange(&count_, count_value, &cond_, &mutex_);
      }
    }
  }

 private:
  pthread_cond_t cond_;
  pthread_mutex_t mutex_;
  std::size_t count_;
  std::size_t initial_count_;
};
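
// Illustrative usage sketch (not part of the library): the master thread
// arms the counter for N expected completion events and then blocks, while
// each of the N workers reports completion exactly once.
//
//   BlockingCounter counter;
//   counter.Reset(3);            // expect 3 DecrementCount() calls
//   // ... hand work to 3 worker threads; each one eventually calls:
//   //       counter.DecrementCount();
//   counter.Wait();              // returns once the count reaches 0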

// A workload for a worker.
struct Task {
  Task() : local_allocator(nullptr) {}
  virtual ~Task() {}
  virtual void Run() const = 0;
  Allocator* local_allocator;
};

// A worker thread.
class Worker {
 public:
  enum class State {
    ThreadStartup,        // The initial state before the thread main loop runs.
    Ready,                // Is not working, has not yet received new work to do.
    HasWork,              // Has work to do.
    ExitAsSoonAsPossible  // Should exit at earliest convenience.
  };

  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
      : task_(nullptr),
        state_cond_(PTHREAD_COND_INITIALIZER),
        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
        state_(State::ThreadStartup),
        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
    pthread_create(&thread_, nullptr, ThreadFunc, this);
  }

  ~Worker() {
    ChangeState(State::ExitAsSoonAsPossible);
    pthread_join(thread_, nullptr);
  }

  // Changes State; may be called from either the worker thread
  // or the master thread; however, not all state transitions are legal,
  // which is guarded by assertions.
  void ChangeState(State new_state) {
    ScopedProfilingLabel label("Worker::ChangeState");
    pthread_mutex_lock(&state_mutex_);
    assert(new_state != state_);
    switch (state_) {
      case State::ThreadStartup:
        assert(new_state == State::Ready);
        break;
      case State::Ready:
        assert(new_state == State::HasWork ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      case State::HasWork:
        assert(new_state == State::Ready ||
               new_state == State::ExitAsSoonAsPossible);
        break;
      default:
        abort();
    }
    state_ = new_state;
    pthread_cond_signal(&state_cond_);
    if (state_ == State::Ready) {
      counter_to_decrement_when_ready_->DecrementCount();
    }
    pthread_mutex_unlock(&state_mutex_);
  }

  // Thread entry point.
  void ThreadFunc() {
    ScopedProfilingLabel label("Worker::ThreadFunc");
    RegisterCurrentThreadForProfiling();

    ChangeState(State::Ready);

    // Thread main loop
    while (true) {
      // Get a state to act on.
      // In the 'Ready' state, we have nothing to do but to wait until
      // we switch to another state.
      State state_to_act_upon = WaitForVariableChange(
          &state_, State::Ready, &state_cond_, &state_mutex_);

      // We now have a state to act on, so act.
      switch (state_to_act_upon) {
        case State::HasWork:
          // Got work to do! So do it, and then revert to 'Ready' state.
          assert(task_);
          task_->Run();
          delete task_;
          task_ = nullptr;
          ChangeState(State::Ready);
          break;
        case State::ExitAsSoonAsPossible:
          return;
        default:
          abort();
      }
    }
  }

  static void* ThreadFunc(void* arg) {
    static_cast<Worker*>(arg)->ThreadFunc();
    return nullptr;
  }

  // Called by the master thread to give this worker work to do.
  // It is only legal to call this if the worker is currently
  // in the 'Ready' state.
  void StartWork(Task* task) {
    assert(!task_);
    task->local_allocator = &local_allocator_;
    task_ = task;
    assert(state_ == State::Ready);
    ChangeState(State::HasWork);
  }

 private:
  // The underlying thread.
  pthread_t thread_;

  // The task to be worked on.
  const Task* task_;

  // The condition variable and mutex guarding state changes.
  pthread_cond_t state_cond_;
  pthread_mutex_t state_mutex_;

  // The state enum tells if we're currently working, waiting for work, etc.
  State state_;

  // Each thread has a local allocator so it can allocate temporary
  // buffers without blocking the other threads.
  Allocator local_allocator_;

  // Pointer to the master thread's BlockingCounter object, used to notify
  // the master thread when this worker switches to the 'Ready' state.
  BlockingCounter* const counter_to_decrement_when_ready_;
};
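
// Illustrative lifecycle sketch (not part of the library): a Worker is
// created against a BlockingCounter, handed Tasks one at a time, and torn
// down by its destructor. 'MyTask' stands in for a hypothetical Task
// subclass.
//
//   BlockingCounter ready_counter;
//   ready_counter.Reset(1);
//   Worker worker(&ready_counter);   // spawns the underlying thread
//   ready_counter.Wait();            // wait until it reaches 'Ready'
//
//   ready_counter.Reset(1);
//   worker.StartWork(new MyTask);    // the worker takes ownership of the task
//   ready_counter.Wait();            // wait until the task has run
//   // ~Worker() requests ExitAsSoonAsPossible and joins the thread.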

// A very simple pool of workers that only allows the very
// specific parallelization pattern that we use here:
// a fixed number of workers can be given work, and one then
// waits for all of them to finish.
class WorkersPool {
 public:
  WorkersPool() {}

  ~WorkersPool() {
    for (auto w : workers_) {
      delete w;
    }
  }

  BlockingCounter& counter_to_decrement_when_ready() {
    return counter_to_decrement_when_ready_;
  }

  // Give work to a specific worker.
  void StartWorker(int index, Task* task_) {
    assert(static_cast<std::size_t>(index) < workers_.size());
    workers_[index]->StartWork(task_);
  }

  // Ensures that the pool has at least the given count of workers.
  // If any new worker has to be created, this function waits for it to
  // be ready.
  void CreateWorkers(std::size_t workers_count) {
    if (workers_.size() >= workers_count) {
      return;
    }
    counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
    while (workers_.size() < workers_count) {
      workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
    }
    counter_to_decrement_when_ready_.Wait();
  }

 private:
  // Copy construction disallowed.
  WorkersPool(const WorkersPool&) = delete;

  // The workers in this pool. They are owned by the pool:
  // the pool creates workers and destroys them in its destructor.
  std::vector<Worker*> workers_;

  // The BlockingCounter used to wait for the workers.
  BlockingCounter counter_to_decrement_when_ready_;
};
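
// Illustrative usage sketch (not part of the library): the pattern used by
// MultiThreadGemm below is to size the pool once, then, for each batch of
// tasks, arm the counter, dispatch, and wait. 'SomeTask' is hypothetical.
//
//   WorkersPool pool;
//   pool.CreateWorkers(workers_count);
//   pool.counter_to_decrement_when_ready().Reset(workers_count);
//   for (int i = 0; i < workers_count; i++) {
//     pool.StartWorker(i, new SomeTask(/* ... */));
//   }
//   pool.counter_to_decrement_when_ready().Wait();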

// The task we use to implement a multi-threaded Gemm: a block of the
// RHS has been packed by the master thread; each worker thread
// then has to pack a block of the LHS and accumulate the Gemm of these
// packed LHS and RHS blocks.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct GemmWithPackedRhsTask : Task {
  typedef PackedSideBlock<typename KernelFormat::Lhs> PackedLhs;
  typedef PackedSideBlock<typename KernelFormat::Rhs> PackedRhs;
  GemmWithPackedRhsTask(const KernelBase& _kernel,
                        const MatrixMap<const InputScalar, LhsOrder>& _lhs,
                        const PackedRhs& _packed_rhs,
                        MatrixMap<OutputScalar, ResultOrder>* _result,
                        const LhsOffset& _lhs_offset,
                        const RhsOffset& _rhs_offset,
                        const OutputPipelineType& _output_pipeline)
      : kernel(_kernel),
        lhs(_lhs),
        packed_rhs(_packed_rhs),
        result(*_result),
        lhs_offset(_lhs_offset),
        rhs_offset(_rhs_offset),
        output_pipeline(_output_pipeline) {}

  void Run() const override {
    ScopedProfilingLabel label("GemmWithPackedRhsTask");

    const int rows = result.rows();
    const int cols = result.cols();
    const int depth = lhs.cols();

    BlockParams block_params;
    block_params.Init<KernelFormat>(rows, cols, depth, 1);

    PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params);

    PackedResult packed_result(local_allocator, block_params);

    local_allocator->Commit();

    for (int c = 0; c < cols; c += block_params.l2_cols) {
      int cs = std::min(block_params.l2_cols, cols - c);

      for (int r = 0; r < rows; r += block_params.l2_rows) {
        int rs = std::min(block_params.l2_rows, rows - r);

        PackLhs<BitDepthParams>(&packed_lhs, lhs.block(r, 0, rs, depth));

        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);

        auto result_block = result.block(r, c, rs, cs);
        UnpackResult<BitDepthParams>(&result_block, packed_result, depth,
                                     packed_lhs.sums_of_each_slice(),
                                     packed_rhs.sums_of_each_slice(),
                                     lhs_offset, rhs_offset, output_pipeline);
      }
    }

    local_allocator->Decommit();
  }

  const KernelBase& kernel;
  const MatrixMap<const InputScalar, LhsOrder> lhs;
  const PackedRhs packed_rhs;
  MatrixMap<OutputScalar, ResultOrder> result;
  const LhsOffset& lhs_offset;
  const RhsOffset& rhs_offset;
  const OutputPipelineType& output_pipeline;
};

class MultiThreadGemmContext : public SingleThreadGemmContext {
 public:
  MultiThreadGemmContext() : max_num_threads_(0) {}

  void set_max_num_threads(int n) { max_num_threads_ = n; }

  int max_num_threads() const { return max_num_threads_; }

  WorkersPool* workers_pool() { return &workers_pool_; }

  Allocator* main_thread_task_allocator() {
    return &main_thread_task_allocator_;
  }

 protected:
  // The workers pool used by MultiThreadGemm. Making
  // this part of the context allows it to be persistent,
  // avoiding recreating threads on every Gemm.
  WorkersPool workers_pool_;

  // The maximum number of worker threads to use (in addition
  // to the master thread).
  // The default value 0 means the default behavior of
  // detecting the number of hardware threads. Nonzero values mean
  // skipping and overriding hardware detection.
  int max_num_threads_;

  // For N-threaded operations, we will use only N-1 worker threads
  // while the last task will be run directly on the main thread.
  // That task will then use this main_thread_task_allocator_; having a
  // dedicated allocator for that (separate from the base allocator_)
  // makes it possible to use the same code for all tasks regardless of
  // which thread they run on.
  Allocator main_thread_task_allocator_;
};
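
// Illustrative usage sketch (not part of the library): callers typically
// keep one context alive across many Gemm calls so that the worker threads
// persist, and may cap the thread count explicitly.
//
//   MultiThreadGemmContext context;
//   context.set_max_num_threads(4);  // 0 (the default) means auto-detect
//   // ... pass &context to MultiThreadGemm() below, possibly many times.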

// Determines how many threads should be used for a given Gemm
// operation.
template <int KernelRows>
inline int HowManyThreads(MultiThreadGemmContext* context, int rows, int cols,
                          int depth) {
  // First check if the user set an explicit maximum number of threads.
  int max_count = context->max_num_threads();
  if (!max_count) {
    // No user-set maximum number of threads, so we need to
    // do some hardware detection.
    // This is expensive to query, so we do it only once and lose some
    // dynamism. Also, we don't use the C++11 standard getter because
    // Google's coding style currently bans #include <thread>.
    static const int hardware_threads_count =
        static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));

    max_count = hardware_threads_count;
  }

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin thread workloads
  // currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  static const int AbsoluteMinRowsPerThread = 16;
  static const int MinRowsPerThread = KernelRows > AbsoluteMinRowsPerThread
                                          ? KernelRows
                                          : AbsoluteMinRowsPerThread;
  int thread_count = std::min(max_count, CeilQuotient(rows, MinRowsPerThread));

  // At this point for small products we already have thread_count==1 so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // workers_ busy.
  if (thread_count > 1) {
    // Empirically determined value.
    static const std::uint64_t min_cubic_size_per_thread = 64 * 1024;

    // We could only multiply two out of the three sizes in int without
    // risking overflow, so compute the product in 64 bits.
    const std::uint64_t cubic_size =
        std::uint64_t(rows) * std::uint64_t(cols) * std::uint64_t(depth);

    thread_count =
        std::min(thread_count, int(cubic_size / min_cubic_size_per_thread));

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_count);
  return thread_count;
}
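
// Worked example (illustrative only, with hypothetical sizes): with
// KernelRows = 4, MinRowsPerThread is max(4, 16) = 16. For a 32x32x32 Gemm
// on an 8-core device, CeilQuotient(32, 16) = 2 caps the count at 2, and the
// cubic-size check (32*32*32 = 32768 < 64*1024) then drops it to 1, so such
// a small product stays single-threaded. A 256x256x256 Gemm instead yields
// min(8, CeilQuotient(256, 16) = 16) = 8, and 256^3 / (64*1024) = 256 leaves
// that unchanged, so all 8 cores are used.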

// The main multi-threaded Gemm function.
// To understand it, first read the code of SingleThreadGemm().
// The parallelization scheme used here is to have this master function
// pack a block of RHS and then start worker threads to pack a block of LHS
// each, and accumulate the corresponding products.
template <typename KernelFormat, typename InputScalar, typename OutputScalar,
          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
void MultiThreadGemm(MultiThreadGemmContext* context, const KernelBase& kernel,
                     const MatrixMap<const InputScalar, LhsOrder>& lhs,
                     const MatrixMap<const InputScalar, RhsOrder>& rhs,
                     MatrixMap<OutputScalar, ResultOrder>* result,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
  ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");

  assert(lhs.cols() == rhs.rows());

  int rows = result->rows();
  int cols = result->cols();
  int depth = lhs.cols();

  assert(rows > 0);
  assert(cols > 0);
  assert(depth > 0);

  const int thread_count =
      HowManyThreads<KernelFormat::kRows>(context, rows, cols, depth);
  if (thread_count == 1) {
    return SingleThreadGemm<KernelFormat, InputScalar, OutputScalar,
                            BitDepthParams>(context, kernel, lhs, rhs, result,
                                            lhs_offset, rhs_offset,
                                            output_pipeline);
  }
  assert(thread_count > 1);

  // We choose to use a worker thread for all but one
  // of the thread workloads. The remaining thread workload will be
  // executed immediately on the current thread.
  // In this way, the total number of threads (1 master, N-1 workers)
  // equals the value returned by HowManyThreads. This simple
  // 1:1 mapping of threads to physical cores is very important
  // to getting good multithreaded performance, especially for
  // not-very-large GEMMs, and especially on Android.
  const int workers_count = thread_count - 1;

  Allocator* allocator = context->allocator();
  WorkersPool* workers_pool = context->workers_pool();

  workers_pool->CreateWorkers(workers_count);

  BlockParams block_params;
  block_params.Init<KernelFormat>(rows, cols, depth, workers_count);

  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
      Side::Rhs, allocator, block_params);
  allocator->Commit();

  // We loop over large blocks of the RHS.
  for (int c = 0; c < cols; c += block_params.l2_cols) {
    int cs = std::min(block_params.l2_cols, cols - c);

    // Pack a large block of the RHS.
    PackRhs<BitDepthParams>(&packed_rhs, rhs.block(0, c, depth, cs));

    // Give work to each worker.
    int next_start_row = 0;
    workers_pool->counter_to_decrement_when_ready().Reset(workers_count);
    for (int thread = 0; thread < thread_count; thread++) {
      int start_row = next_start_row;
      next_start_row = std::min(rows, RoundUp<KernelFormat::kRows>(
                                          rows * (thread + 1) / thread_count));
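      // Illustrative arithmetic (hypothetical sizes): with rows = 100,
      // thread_count = 3 and KernelFormat::kRows = 4, successive values of
      // next_start_row are RoundUp<4>(33) = 36, RoundUp<4>(66) = 68 and
      // min(100, RoundUp<4>(100)) = 100, i.e. row blocks of 36, 32 and 32
      // rows, each aligned to the kernel's row granularity.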

      int block_rows = next_start_row - start_row;
      auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
      auto result_block = result->block(start_row, c, block_rows, cs);
      typedef GemmWithPackedRhsTask<KernelFormat, InputScalar, OutputScalar,
                                    BitDepthParams, LhsOrder, RhsOrder,
                                    ResultOrder, LhsOffset, RhsOffset,
                                    OutputPipelineType>
          TaskType;
      auto task = new TaskType(kernel, lhs_block, packed_rhs, &result_block,
                               lhs_offset, rhs_offset, output_pipeline);
      if (thread < workers_count) {
        workers_pool->StartWorker(thread, task);
      } else {
        // Execute the remaining workload immediately on the current thread.
        task->local_allocator = context->main_thread_task_allocator();
        task->Run();
        delete task;
      }
    }
    // Wait for the workers.
    workers_pool->counter_to_decrement_when_ready().Wait();
  }

  allocator->Decommit();
}

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_