/*
 * kmp_affinity.cpp -- affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
  kmp_uint32 depth;
  // The test below is true if affinity is available, but set to "none". Need to
  // init on first use of hierarchical barrier.
  if (TCR_1(machine_hierarchy.uninitialized))
    machine_hierarchy.init(NULL, nproc);

  // Adjust the hierarchy in case num threads exceeds original
  if (nproc > machine_hierarchy.base_num_threads)
    machine_hierarchy.resize(nproc);

  depth = machine_hierarchy.depth;
  KMP_DEBUG_ASSERT(depth > 0);

  thr_bar->depth = depth;
  thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
  thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

bool KMPAffinity::picked_api = false;

void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }

void KMPAffinity::pick_api() {
  KMPAffinity *affinity_dispatch;
  if (picked_api)
    return;
#if KMP_USE_HWLOC
  // Only use Hwloc if affinity isn't explicitly disabled and
  // user requests Hwloc topology method
  if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
      __kmp_affinity_type != affinity_disabled) {
    affinity_dispatch = new KMPHwlocAffinity();
  } else
#endif
  {
    affinity_dispatch = new KMPNativeAffinity();
  }
  __kmp_affinity_dispatch = affinity_dispatch;
  picked_api = true;
}

void KMPAffinity::destroy_api() {
  if (__kmp_affinity_dispatch != NULL) {
    delete __kmp_affinity_dispatch;
    __kmp_affinity_dispatch = NULL;
    picked_api = false;
  }
}

#define KMP_ADVANCE_SCAN(scan)                                                 \
  while (*scan != '\0') {                                                      \
    scan++;                                                                    \
  }

// Print the affinity mask to the character array in a pretty format.
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(buf_len >= 40);
  KMP_ASSERT(mask);
  char *scan = buf;
  char *end = buf + buf_len - 1;

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
    KMP_ADVANCE_SCAN(scan);
    KMP_ASSERT(scan <= end);
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
      KMP_ADVANCE_SCAN(scan);
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start),
                   static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start));
      KMP_ADVANCE_SCAN(scan);
      if (previous - start > 0) {
        KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous));
      }
    }
    KMP_ADVANCE_SCAN(scan);
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
    // Check for overflow
    if (end - scan < 2)
      break;
  }

  // Check for overflow
  KMP_ASSERT(scan <= end);
  return buf;
}
#undef KMP_ADVANCE_SCAN

// Print the affinity mask to the string buffer object in a pretty format
// The format is a comma separated list of non-negative integers or integer
// ranges: e.g., 1,2,3-5,7,9-15
// The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
  int start = 0, finish = 0, previous = 0;
  bool first_range;
  KMP_ASSERT(buf);
  KMP_ASSERT(mask);

  __kmp_str_buf_clear(buf);

  // Check for empty set.
  if (mask->begin() == mask->end()) {
    __kmp_str_buf_print(buf, "%s", "{<empty>}");
    return buf;
  }

  first_range = true;
  start = mask->begin();
  while (1) {
    // Find next range
    // [start, previous] is inclusive range of contiguous bits in mask
    for (finish = mask->next(start), previous = start;
         finish == previous + 1 && finish != mask->end();
         finish = mask->next(finish)) {
      previous = finish;
    }

    // The first range does not need a comma printed before it, but the rest
    // of the ranges do need a comma beforehand
    if (!first_range) {
      __kmp_str_buf_print(buf, "%s", ",");
    } else {
      first_range = false;
    }
    // Range with three or more contiguous bits in the affinity mask
    if (previous - start > 1) {
      __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start),
                          static_cast<int>(previous));
    } else {
      // Range with one or two contiguous bits in the affinity mask
      __kmp_str_buf_print(buf, "%d", static_cast<int>(start));
      if (previous - start > 0) {
        __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous));
      }
    }
    // Start over with new start point
    start = finish;
    if (start == mask->end())
      break;
  }
  return buf;
}

void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
  KMP_CPU_ZERO(mask);

#if KMP_GROUP_AFFINITY

  if (__kmp_num_proc_groups > 1) {
    int group;
    KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
    for (group = 0; group < __kmp_num_proc_groups; group++) {
      int i;
      int num = __kmp_GetActiveProcessorCount(group);
      for (i = 0; i < num; i++) {
        KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
      }
    }
  } else

#endif /* KMP_GROUP_AFFINITY */

  {
    int proc;
    for (proc = 0; proc < __kmp_xproc; proc++) {
      KMP_CPU_SET(proc, mask);
    }
  }
}

// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example:  suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the table
// for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
// because we are paying attention to the labels themselves, not the ordinal
// child numbers.  By using the child numbers in the sort, the result is
// {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
  KMP_DEBUG_ASSERT(numAddrs > 0);
  int depth = address2os->first.depth;
  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
  int labCt;
  for (labCt = 0; labCt < depth; labCt++) {
    address2os[0].first.childNums[labCt] = counts[labCt] = 0;
    lastLabel[labCt] = address2os[0].first.labels[labCt];
  }
  int i;
  for (i = 1; i < numAddrs; i++) {
    for (labCt = 0; labCt < depth; labCt++) {
      if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
        int labCt2;
        for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
          counts[labCt2] = 0;
          lastLabel[labCt2] = address2os[i].first.labels[labCt2];
        }
        counts[labCt]++;
        lastLabel[labCt] = address2os[i].first.labels[labCt];
        break;
      }
    }
    for (labCt = 0; labCt < depth; labCt++) {
      address2os[i].first.childNums[labCt] = counts[labCt];
    }
    for (; labCt < (int)Address::maxDepth; labCt++) {
      address2os[i].first.childNums[labCt] = 0;
    }
  }
  __kmp_free(lastLabel);
  __kmp_free(counts);
}

// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
// the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set
// *__kmp_affin_fullMask to the affinity mask for the initialization thread.
// They need to save and restore the mask, and it could be needed later, so
// saving it is just an optimization to avoid calling kmp_get_system_affinity()
// again.
kmp_affin_mask_t *__kmp_affin_fullMask = NULL;

static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif
static int *__kmp_pu_os_idx = NULL;

// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// or __kmp_affinity_create_x2apicid_map().
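// For example (illustrative numbers, not from any particular machine): a
// topology detected as 2 packages x 8 cores/package x 2 threads/core is
// considered uniform only when all 2 * 8 * 2 == 32 OS procs are available,
// i.e. __kmp_avail_proc == 32.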
inline static bool __kmp_affinity_uniform_topology() {
  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}

// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
  int proc;

  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    int level;
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    for (level = 0; level < depth; level++) {
      if (level == threadLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
      } else if (level == coreLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
      } else if (level == pkgLevel) {
        __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
      } else if (level > pkgLevel) {
        __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                            level - pkgLevel - 1);
      } else {
        __kmp_str_buf_print(&buf, "L%d ", level);
      }
      __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
               buf.str);
    __kmp_str_buf_free(&buf);
  }
}

#if KMP_USE_HWLOC

static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
  int proc;
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
  for (proc = 0; proc < len; proc++) {
    __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
                        addrP[proc].first.labels[0]);
    if (depth > 1) {
      int level = 1; // iterate over levels
      int label = 1; // iterate over labels
      if (__kmp_numa_detected)
        // node level follows package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
                              addrP[proc].first.labels[label++]);
      if (__kmp_tile_depth > 0)
        // tile level follows node if any, or package
        if (levels[level++] > 0)
          __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
                              addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // core level follows
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
                            addrP[proc].first.labels[label++]);
      if (levels[level++] > 0)
        // thread level is the last
        __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
                            addrP[proc].first.labels[label++]);
      KMP_DEBUG_ASSERT(label == depth);
    }
    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
    __kmp_str_buf_clear(&buf);
  }
  __kmp_str_buf_free(&buf);
}

static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;

// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology.  The most common example is when you
// have one thread context per core; we don't want the extra thread context
// level if it offers no unique labels, so it is removed.
// return value: the new depth of address2os
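// For example (illustrative): with levels {package, core, thread} and exactly
// one thread context per core, the thread level contributes no unique labels;
// it is removed and the returned depth drops from 3 to 2.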
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
  int level;
  int i;
  int radix1_detected;
  int new_depth = depth;
  for (level = depth - 1; level > 0; --level) {
    // Detect if this level is radix 1
    radix1_detected = 1;
    for (i = 1; i < nTh; ++i) {
      if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
        // There are differing label values for this level so it stays
        radix1_detected = 0;
        break;
      }
    }
    if (!radix1_detected)
      continue;
    // Radix 1 was detected
    --new_depth;
    levels[level] = -1; // mark level as not present in address2os array
    if (level == new_depth) {
      // "turn off" deepest level, just decrement the depth that removes
      // the level from address2os array
      for (i = 0; i < nTh; ++i) {
        addrP[i].first.depth--;
      }
    } else {
      // For other levels, we move labels over and also reduce the depth
      int j;
      for (j = level; j < new_depth; ++j) {
        for (i = 0; i < nTh; ++i) {
          addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
          addrP[i].first.depth--;
        }
        levels[j + 1] -= 1;
      }
    }
  }
  return new_depth;
}

// Returns the number of objects of type 'type' below 'obj' within the topology
// tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the SOCKET
// object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
  int retval = 0;
  hwloc_obj_t first;
  for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
                                           obj->logical_index, type, 0);
       first != NULL &&
       hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
           obj;
       first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
                                          first)) {
    ++retval;
  }
  return retval;
}

static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
  if (o->depth == depth) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
  if (!hwloc_compare_types(o->type, type)) {
    if (*f == NULL)
      *f = o; // output first descendant found
    return 1;
  }
  int sum = 0;
  for (unsigned i = 0; i < o->arity; i++)
    sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
  return sum; // will be 0 if no one found (as PU arity is 0)
}

static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
  hwloc_obj_t core = NULL;
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
  for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
    hwloc_obj_t pu = NULL;
    KMP_DEBUG_ASSERT(core != NULL);
    int num_active_threads = 0;
    int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
    // int NT = core->arity; pu = core->first_child; // faster?
    for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
      KMP_DEBUG_ASSERT(pu != NULL);
      if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
        continue; // skip inactive (inaccessible) unit
      Address addr(depth + 2);
      KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
                    obj->os_index, obj->logical_index, core->os_index,
                    core->logical_index, pu->os_index, pu->logical_index));
      for (int i = 0; i < depth; ++i)
        addr.labels[i] = labels[i]; // package, etc.
      addr.labels[depth] = core_id; // core
      addr.labels[depth + 1] = pu_id; // pu
      addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
      __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
      nActiveThreads++;
      ++num_active_threads; // count active threads per core
    }
    if (num_active_threads) { // were there any active threads on the core?
      ++__kmp_ncores; // count total active cores
      ++num_active_cores; // count active cores per socket
      if (num_active_threads > __kmp_nThreadsPerCore)
        __kmp_nThreadsPerCore = num_active_threads; // calc maximum
    }
  }
  return 0;
}

// Check if NUMA node detected below the package,
// and if tile object is detected and return its depth
static int __kmp_hwloc_check_numa() {
  hwloc_topology_t &tp = __kmp_hwloc_topology;
  hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
  int depth, l2cache_depth, package_depth;

  // Get some PU
  hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
  if (hT == NULL) // something has gone wrong
    return 1;

  // check NUMA node below PACKAGE
  hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
  hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
  KMP_DEBUG_ASSERT(hS != NULL);
  if (hN != NULL && hN->depth > hS->depth) {
    __kmp_numa_detected = TRUE; // socket includes node(s)
    if (__kmp_affinity_gran == affinity_gran_node) {
      __kmp_affinity_gran = affinity_gran_numa;
    }
  }

  package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
  l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
  // check tile, get object by depth because of multiple caches possible
  depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
  hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
  hC = NULL; // not used, but reset it here just in case
  if (hL != NULL &&
      __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
    __kmp_tile_depth = depth; // tile consists of multiple cores
  return 0;
}

static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
  hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  __kmp_get_system_affinity(oldMask, TRUE);
  __kmp_hwloc_check_numa();

  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
    hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
    if (o != NULL)
      nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
    else
      nCoresPerPkg = 1; // no PACKAGE found
    o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
    if (o != NULL)
      __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
    else
      __kmp_nThreadsPerCore = 1; // no CORE found
    __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
    if (nCoresPerPkg == 0)
      nCoresPerPkg = 1; // to prevent possible division by 0
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth = 3;
  int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
  int labels[3] = {0}; // package [,node] [,tile] - head of labels array
  if (__kmp_numa_detected)
    ++depth;
  if (__kmp_tile_depth)
    ++depth;
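  // For example (illustrative): if NUMA nodes were detected below each package
  // and no tile level was found, depth becomes 4 and the per-proc labels end
  // up as {package, node, core, thread}.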

  // Allocate the data structure to be returned.
  AddrUnsPair *retval =
      (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore,
  // nCoresPerPkg, & nPackages.  Make sure all these vars are set
  // correctly, and return if affinity is not enabled.

  hwloc_obj_t socket, node, tile;
  int nActiveThreads = 0;
  int socket_id = 0;
  // re-calculate globals to count only accessible resources
  __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
  nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
  for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
       socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
      socket_id++) {
    labels[0] = socket_id;
    if (__kmp_numa_detected) {
      int NN;
      int n_active_nodes = 0;
      node = NULL;
      NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
                                              &node);
      for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
        labels[1] = node_id;
        if (__kmp_tile_depth) {
          // NUMA + tiles
          int NT;
          int n_active_tiles = 0;
          tile = NULL;
          NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
                                                   &tile);
          for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
            labels[2] = tl_id;
            int n_active_cores = 0;
            __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                            n_active_cores, tile, 3, labels);
            if (n_active_cores) { // were there any active cores on the socket?
              ++n_active_tiles; // count active tiles per node
              if (n_active_cores > nCorePerTile)
                nCorePerTile = n_active_cores; // calc maximum
            }
          }
          if (n_active_tiles) { // were there any active tiles on the socket?
            ++n_active_nodes; // count active nodes per package
            if (n_active_tiles > nTilePerNode)
              nTilePerNode = n_active_tiles; // calc maximum
          }
        } else {
          // NUMA, no tiles
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, node, 2, labels);
          if (n_active_cores) { // were there any active cores on the socket?
            ++n_active_nodes; // count active nodes per package
            if (n_active_cores > nCorePerNode)
              nCorePerNode = n_active_cores; // calc maximum
          }
        }
      }
      if (n_active_nodes) { // were there any active nodes on the socket?
        ++nPackages; // count total active packages
        if (n_active_nodes > nNodePerPkg)
          nNodePerPkg = n_active_nodes; // calc maximum
      }
    } else {
      if (__kmp_tile_depth) {
        // no NUMA, tiles
        int NT;
        int n_active_tiles = 0;
        tile = NULL;
        NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
                                                 &tile);
        for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
          labels[1] = tl_id;
          int n_active_cores = 0;
          __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
                                          n_active_cores, tile, 2, labels);
          if (n_active_cores) { // were there any active cores on the socket?
            ++n_active_tiles; // count active tiles per package
            if (n_active_cores > nCorePerTile)
              nCorePerTile = n_active_cores; // calc maximum
          }
        }
        if (n_active_tiles) { // were there any active tiles on the socket?
          ++nPackages; // count total active packages
          if (n_active_tiles > nTilePerPkg)
            nTilePerPkg = n_active_tiles; // calc maximum
        }
      } else {
        // no NUMA, no tiles
        int n_active_cores = 0;
        __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
                                        socket, 1, labels);
        if (n_active_cores) { // were there any active cores on the socket?
          ++nPackages; // count total active packages
          if (n_active_cores > nCoresPerPkg)
            nCoresPerPkg = n_active_cores; // calc maximum
        }
      }
    }
  }

  // If there's only one thread context to bind to, return now.
  KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
  KMP_ASSERT(nActiveThreads > 0);
  if (nActiveThreads == 1) {
    __kmp_ncores = nPackages = 1;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      char buf[KMP_AFFIN_MASK_PRINT_LEN];
      __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

      KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
      if (__kmp_affinity_respect_mask) {
        KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
      } else {
        KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
      }
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
    }

    // Form an Address object which only includes the package level.
    Address addr(1);
    addr.labels[0] = retval[0].first.labels[0];
    retval[0].first = addr;

    if (__kmp_affinity_gran_levels < 0) {
      __kmp_affinity_gran_levels = 0;
    }

    if (__kmp_affinity_verbose) {
      __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
    }

    *address2os = retval;
    KMP_CPU_FREE(oldMask);
    return 1;
  }

  // Sort the table by physical Id.
  qsort(retval, nActiveThreads, sizeof(*retval),
        __kmp_affinity_cmp_Address_labels);

  // Check to see if the machine topology is uniform
  int nPUs = nPackages * __kmp_nThreadsPerCore;
  if (__kmp_numa_detected) {
    if (__kmp_tile_depth) { // NUMA + tiles
      nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
    } else { // NUMA, no tiles
      nPUs *= (nNodePerPkg * nCorePerNode);
    }
  } else {
    if (__kmp_tile_depth) { // no NUMA, tiles
      nPUs *= (nTilePerPkg * nCorePerTile);
    } else { // no NUMA, no tiles
      nPUs *= nCoresPerPkg;
    }
  }
  unsigned uniform = (nPUs == nActiveThreads);

  // Print the machine topology summary.
  if (__kmp_affinity_verbose) {
    char mask[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    if (uniform) {
      KMP_INFORM(Uniform, "KMP_AFFINITY");
    } else {
      KMP_INFORM(NonUniform, "KMP_AFFINITY");
    }
    if (__kmp_numa_detected) {
      if (__kmp_tile_depth) { // NUMA + tiles
        KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
                   __kmp_ncores);
      } else { // NUMA, no tiles
        KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
                   nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
        nPUs *= (nNodePerPkg * nCorePerNode);
      }
    } else {
      if (__kmp_tile_depth) { // no NUMA, tiles
        KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
                   nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
      } else { // no NUMA, no tiles
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        __kmp_str_buf_print(&buf, "%d", nPackages);
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
                   __kmp_nThreadsPerCore, __kmp_ncores);
        __kmp_str_buf_free(&buf);
      }
    }
  }

  if (__kmp_affinity_type == affinity_none) {
    __kmp_free(retval);
    KMP_CPU_FREE(oldMask);
    return 0;
  }

  int depth_full = depth; // number of levels before compressing
  // Find any levels with radix 1, and remove them from the map
  // (except for the package level).
  depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
                                                 levels);
  KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
  if (__kmp_affinity_gran_levels < 0) {
    // Set the granularity level based on what levels are modeled
    // in the machine topology map.
    __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
    if (__kmp_affinity_gran > affinity_gran_thread) {
      for (int i = 1; i <= depth_full; ++i) {
        if (__kmp_affinity_gran <= i) // only count deeper levels
          break;
        if (levels[depth_full - i] > 0)
          __kmp_affinity_gran_levels++;
      }
    }
    if (__kmp_affinity_gran > affinity_gran_package)
      __kmp_affinity_gran_levels++; // e.g. granularity = group
  }

  if (__kmp_affinity_verbose)
    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);

  KMP_CPU_FREE(oldMask);
  *address2os = retval;
  return depth;
}
#endif // KMP_USE_HWLOC

// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
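// For example (illustrative): on an 8-proc machine this produces depth-1
// Addresses whose single label equals the OS proc id, i.e. each OS proc is
// modeled as its own package.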
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Even if __kmp_affinity_type == affinity_none, this routine might still
  // be called to set __kmp_ncores, as well as
  // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  if (!KMP_AFFINITY_CAPABLE()) {
    KMP_ASSERT(__kmp_affinity_type == affinity_none);
    __kmp_ncores = nPackages = __kmp_xproc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      KMP_INFORM(Uniform, "KMP_AFFINITY");
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // When affinity is off, this routine will still be called to set
  // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
  // Make sure all these vars are set correctly, and return now if affinity is
  // not enabled.
  __kmp_ncores = nPackages = __kmp_avail_proc;
  __kmp_nThreadsPerCore = nCoresPerPkg = 1;
  if (__kmp_affinity_verbose) {
    char buf[KMP_AFFIN_MASK_PRINT_LEN];
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
                              __kmp_affin_fullMask);

    KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
    if (__kmp_affinity_respect_mask) {
      KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
    } else {
      KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
    }
    KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
    KMP_INFORM(Uniform, "KMP_AFFINITY");
    KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
               __kmp_nThreadsPerCore, __kmp_ncores);
  }
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  if (__kmp_affinity_type == affinity_none) {
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
      if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
        continue;
      __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
    }
    return 0;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(1);
    addr.labels[0] = i;
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
  }
  if (__kmp_affinity_verbose) {
    KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
  }

  if (__kmp_affinity_gran_levels < 0) {
    // Only the package level is modeled in the machine topology map,
    // so the #levels of granularity is either 0 or 1.
    if (__kmp_affinity_gran > affinity_gran_package) {
      __kmp_affinity_gran_levels = 1;
    } else {
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 1;
}

#if KMP_GROUP_AFFINITY

// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at level 1.
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
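// For example (assuming 64 logical processors per group, i.e.
// CHAR_BIT * sizeof(DWORD_PTR) == 64), OS proc 70 gets the labels {1, 6}:
// group 1, sixth proc within that group.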
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // If we aren't affinity capable, then return now.
  // The flat mapping will be used.
  if (!KMP_AFFINITY_CAPABLE()) {
    // FIXME set *msg_id
    return -1;
  }

  // Construct the data structure to be returned.
  *address2os =
      (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
  KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
  __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
  int avail_ct = 0;
  int i;
  KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
    // Skip this proc if it is not included in the machine model.
    if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
      continue;
    }
    __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
    Address addr(2);
    addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
    addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
    (*address2os)[avail_ct++] = AddrUnsPair(addr, i);

    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
                 addr.labels[1]);
    }
  }

  if (__kmp_affinity_gran_levels < 0) {
    if (__kmp_affinity_gran == affinity_gran_group) {
      __kmp_affinity_gran_levels = 1;
    } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
               (__kmp_affinity_gran == affinity_gran_thread)) {
      __kmp_affinity_gran_levels = 0;
    } else {
      const char *gran_str = NULL;
      if (__kmp_affinity_gran == affinity_gran_core) {
        gran_str = "core";
      } else if (__kmp_affinity_gran == affinity_gran_package) {
        gran_str = "package";
      } else if (__kmp_affinity_gran == affinity_gran_node) {
        gran_str = "node";
      } else {
        KMP_ASSERT(0);
      }

      // Warning: can't use affinity granularity \"gran\" with group topology
      // method, using "thread"
      __kmp_affinity_gran_levels = 0;
    }
  }
  return 2;
}

#endif /* KMP_GROUP_AFFINITY */

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
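
// Returns the number of bits needed to encode 'count' distinct values, i.e.
// ceil(log2(count)). For example, __kmp_cpuid_mask_width(6) == 3,
// __kmp_cpuid_mask_width(8) == 3, and __kmp_cpuid_mask_width(1) == 0.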
static int __kmp_cpuid_mask_width(int count) {
  int r = 0;

  while ((1 << r) < count)
    ++r;
  return r;
}

class apicThreadInfo {
public:
  unsigned osId; // param to __kmp_affinity_bind_thread
  unsigned apicId; // from cpuid after binding
  unsigned maxCoresPerPkg; //      ""
  unsigned maxThreadsPerPkg; //      ""
  unsigned pkgId; // inferred from above values
  unsigned coreId; //      ""
  unsigned threadId; //      ""
};

static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
  const apicThreadInfo *aa = (const apicThreadInfo *)a;
  const apicThreadInfo *bb = (const apicThreadInfo *)b;
  if (aa->pkgId < bb->pkgId)
    return -1;
  if (aa->pkgId > bb->pkgId)
    return 1;
  if (aa->coreId < bb->coreId)
    return -1;
  if (aa->coreId > bb->coreId)
    return 1;
  if (aa->threadId < bb->threadId)
    return -1;
  if (aa->threadId > bb->threadId)
    return 1;
  return 0;
}

// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
  kmp_cpuid buf;
  *address2os = NULL;
  *msg_id = kmp_i18n_null;

  // Check if cpuid leaf 4 is supported.
  __kmp_x86_cpuid(0, 0, &buf);
  if (buf.eax < 4) {
    *msg_id = kmp_i18n_str_NoLeaf4Support;
    return -1;
  }

  // The algorithm used starts by setting the affinity to each available thread
  // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
  // we need to do something else - use the defaults that we calculated from
  // issuing cpuid without binding to each proc.
  if (!KMP_AFFINITY_CAPABLE()) {
    // Hack to try and infer the machine topology using only the data
    // available from cpuid on the current thread, and __kmp_xproc.
    KMP_ASSERT(__kmp_affinity_type == affinity_none);

    // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
    // disabled, this value will be 2 on a single core chip. Usually, it will be
    // 2 if HT is enabled and 1 if HT is disabled.
    __kmp_x86_cpuid(1, 0, &buf);
    int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
    if (maxThreadsPerPkg == 0) {
      maxThreadsPerPkg = 1;
    }

    // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
    // value.
    //
    // The author of cpu_count.cpp treated this as only an upper bound on the
    // number of cores, but I haven't seen any cases where it was greater than
    // the actual number of cores, so we will treat it as exact in this block of
    // code.
    //
    // First, we need to check if cpuid(4) is supported on this chip. To see if
    // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
    // greater.
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax >= 4) {
      __kmp_x86_cpuid(4, 0, &buf);
      nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
    } else {
      nCoresPerPkg = 1;
    }

    // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread, and correlating the cpuid info, so
    // if the machine is not affinity capable, we assume that HT is off. We have
    // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
    // does not support HT.
    //
    // - Older OSes are usually found on machines with older chips, which do not
    //   support HT.
    // - The performance penalty for mistakenly identifying a machine as HT when
    //   it isn't (which results in blocktime being incorrectly set to 0) is
    //   greater than the penalty for mistakenly identifying a machine as
    //   being 1 thread/core when it is really HT enabled (which results in
    //   blocktime being incorrectly set to a positive value).
    __kmp_ncores = __kmp_xproc;
    nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
    __kmp_nThreadsPerCore = 1;
    if (__kmp_affinity_verbose) {
      KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (__kmp_affinity_uniform_topology()) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
      } else {
        KMP_INFORM(NonUniform, "KMP_AFFINITY");
      }
      KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
                 __kmp_nThreadsPerCore, __kmp_ncores);
    }
    return 0;
  }

  // From here on, we can assume that it is safe to call
  // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
  // __kmp_affinity_type = affinity_none.

  // Save the affinity mask for the current thread.
  kmp_affin_mask_t *oldMask;
  KMP_CPU_ALLOC(oldMask);
  KMP_ASSERT(oldMask != NULL);
  __kmp_get_system_affinity(oldMask, TRUE);

  // Run through each of the available contexts, binding the current thread
  // to it, and obtaining the pertinent information using the cpuid instr.
  //
  // The relevant information is:
  // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //     has a unique Apic Id, which is of the form pkg# : core# : thread#.
  // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
  //     of this field determines the width of the core# + thread# fields in the
  //     Apic Id. It is also an upper bound on the number of threads per
  //     package, but it has been verified that situations happen where it is
  //     not exact. In particular, on certain OS/chip combinations where Intel(R)
  //     Hyper-Threading Technology is supported by the chip but has been
  //     disabled, the value of this field will be 2 (for a single core chip).
  //     On other OS/chip combinations supporting Intel(R) Hyper-Threading
  //     Technology, the value of this field will be 1 when Intel(R)
  //     Hyper-Threading Technology is disabled and 2 when it is enabled.
  // - Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4). The value
  //     of this field (+1) determines the width of the core# field in the Apic
  //     Id. The comments in "cpucount.cpp" say that this value is an upper
  //     bound, but the IA-32 architecture manual says that it is exactly the
  //     number of cores per package, and I haven't seen any case where it
  //     wasn't.
  //
  // From this information, deduce the package Id, core Id, and thread Id,
  // and set the corresponding fields in the apicThreadInfo struct.
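  // Worked example (illustrative values only): with maxThreadsPerPkg == 16 and
  // maxCoresPerPkg == 8, widthCT == 4, widthC == 3 and widthT == 1; an apicId
  // of 0x1b then decodes as pkgId == 1, coreId == 5, threadId == 1.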
1191   unsigned i;
1192   apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1193       __kmp_avail_proc * sizeof(apicThreadInfo));
1194   unsigned nApics = 0;
1195   KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1196     // Skip this proc if it is not included in the machine model.
1197     if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1198       continue;
1199     }
1200     KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1201 
1202     __kmp_affinity_dispatch->bind_thread(i);
1203     threadInfo[nApics].osId = i;
1204 
1205     // The apic id and max threads per pkg come from cpuid(1).
1206     __kmp_x86_cpuid(1, 0, &buf);
1207     if (((buf.edx >> 9) & 1) == 0) {
1208       __kmp_set_system_affinity(oldMask, TRUE);
1209       __kmp_free(threadInfo);
1210       KMP_CPU_FREE(oldMask);
1211       *msg_id = kmp_i18n_str_ApicNotPresent;
1212       return -1;
1213     }
1214     threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1215     threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1216     if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1217       threadInfo[nApics].maxThreadsPerPkg = 1;
1218     }
1219 
1220     // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1221     // value.
1222     //
1223     // First, we need to check if cpuid(4) is supported on this chip. To see if
1224     // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1225     // or greater.
1226     __kmp_x86_cpuid(0, 0, &buf);
1227     if (buf.eax >= 4) {
1228       __kmp_x86_cpuid(4, 0, &buf);
1229       threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1230     } else {
1231       threadInfo[nApics].maxCoresPerPkg = 1;
1232     }
1233 
1234     // Infer the pkgId / coreId / threadId using only the info obtained locally.
1235     int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1236     threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1237 
1238     int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1239     int widthT = widthCT - widthC;
1240     if (widthT < 0) {
1241       // I've never seen this one happen, but I suppose it could, if the cpuid
1242       // instruction on a chip was really screwed up. Make sure to restore the
1243       // affinity mask before the tail call.
1244       __kmp_set_system_affinity(oldMask, TRUE);
1245       __kmp_free(threadInfo);
1246       KMP_CPU_FREE(oldMask);
1247       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1248       return -1;
1249     }
1250 
1251     int maskC = (1 << widthC) - 1;
1252     threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1253 
1254     int maskT = (1 << widthT) - 1;
1255     threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1256 
1257     nApics++;
1258   }
1259 
1260   // We've collected all the info we need.
1261   // Restore the old affinity mask for this thread.
1262   __kmp_set_system_affinity(oldMask, TRUE);
1263 
1264   // If there's only one thread context to bind to, form an Address object
1265   // with depth 1 and return immediately (or, if affinity is off, set
1266   // address2os to NULL and return).
1267   //
1268   // If it is configured to omit the package level when there is only a single
1269   // package, the logic at the end of this routine won't work if there is only
1270   // a single thread - it would try to form an Address object with depth 0.
1271   KMP_ASSERT(nApics > 0);
1272   if (nApics == 1) {
1273     __kmp_ncores = nPackages = 1;
1274     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1275     if (__kmp_affinity_verbose) {
1276       char buf[KMP_AFFIN_MASK_PRINT_LEN];
1277       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1278 
1279       KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1280       if (__kmp_affinity_respect_mask) {
1281         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1282       } else {
1283         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1284       }
1285       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1286       KMP_INFORM(Uniform, "KMP_AFFINITY");
1287       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1288                  __kmp_nThreadsPerCore, __kmp_ncores);
1289     }
1290 
1291     if (__kmp_affinity_type == affinity_none) {
1292       __kmp_free(threadInfo);
1293       KMP_CPU_FREE(oldMask);
1294       return 0;
1295     }
1296 
1297     *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
1298     Address addr(1);
1299     addr.labels[0] = threadInfo[0].pkgId;
1300     (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1301 
1302     if (__kmp_affinity_gran_levels < 0) {
1303       __kmp_affinity_gran_levels = 0;
1304     }
1305 
1306     if (__kmp_affinity_verbose) {
1307       __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1308     }
1309 
1310     __kmp_free(threadInfo);
1311     KMP_CPU_FREE(oldMask);
1312     return 1;
1313   }
1314 
1315   // Sort the threadInfo table by physical Id.
1316   qsort(threadInfo, nApics, sizeof(*threadInfo),
1317         __kmp_affinity_cmp_apicThreadInfo_phys_id);
1318 
1319   // The table is now sorted by pkgId / coreId / threadId, but we really don't
1320   // know the radix of any of the fields. pkgId's may be sparsely assigned among
1321   // the chips on a system. Although coreId's are usually assigned
1322   // [0 .. coresPerPkg-1] and threadId's are usually assigned
1323   // [0..threadsPerCore-1], we don't want to make any such assumptions.
1324   //
1325   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1326   // total # packages) are at this point - we want to determine that now. We
1327   // only have an upper bound on the first two figures.
1328   //
1329   // We also perform a consistency check at this point: the values returned by
1330   // the cpuid instruction for any thread bound to a given package had better
1331   // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
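  // For example (illustrative), for a sorted table from a 2-package x 2-core x
  // 2-thread machine, the loop below sees pkgId change once (pkgCt -> 2),
  // coreId change once within each package (nCoresPerPkg -> 2), and threadId
  // change once within each core (__kmp_nThreadsPerCore -> 2), while nCores
  // counts every distinct core (4).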
1332   nPackages = 1;
1333   nCoresPerPkg = 1;
1334   __kmp_nThreadsPerCore = 1;
1335   unsigned nCores = 1;
1336 
1337   unsigned pkgCt = 1; // to determine radii
1338   unsigned lastPkgId = threadInfo[0].pkgId;
1339   unsigned coreCt = 1;
1340   unsigned lastCoreId = threadInfo[0].coreId;
1341   unsigned threadCt = 1;
1342   unsigned lastThreadId = threadInfo[0].threadId;
1343 
1344   // intra-package consistency checks
1345   unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1346   unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1347 
1348   for (i = 1; i < nApics; i++) {
1349     if (threadInfo[i].pkgId != lastPkgId) {
1350       nCores++;
1351       pkgCt++;
1352       lastPkgId = threadInfo[i].pkgId;
1353       if ((int)coreCt > nCoresPerPkg)
1354         nCoresPerPkg = coreCt;
1355       coreCt = 1;
1356       lastCoreId = threadInfo[i].coreId;
1357       if ((int)threadCt > __kmp_nThreadsPerCore)
1358         __kmp_nThreadsPerCore = threadCt;
1359       threadCt = 1;
1360       lastThreadId = threadInfo[i].threadId;
1361 
1362       // This is a different package, so go on to the next iteration without
1363       // doing any consistency checks. Reset the consistency check vars, though.
1364       prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1365       prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1366       continue;
1367     }
1368 
1369     if (threadInfo[i].coreId != lastCoreId) {
1370       nCores++;
1371       coreCt++;
1372       lastCoreId = threadInfo[i].coreId;
1373       if ((int)threadCt > __kmp_nThreadsPerCore)
1374         __kmp_nThreadsPerCore = threadCt;
1375       threadCt = 1;
1376       lastThreadId = threadInfo[i].threadId;
1377     } else if (threadInfo[i].threadId != lastThreadId) {
1378       threadCt++;
1379       lastThreadId = threadInfo[i].threadId;
1380     } else {
1381       __kmp_free(threadInfo);
1382       KMP_CPU_FREE(oldMask);
1383       *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1384       return -1;
1385     }
1386 
1387     // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
1388     // fields agree for all of the threads bound to a given package.
1389     if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1390         (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1391       __kmp_free(threadInfo);
1392       KMP_CPU_FREE(oldMask);
1393       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1394       return -1;
1395     }
1396   }
1397   nPackages = pkgCt;
1398   if ((int)coreCt > nCoresPerPkg)
1399     nCoresPerPkg = coreCt;
1400   if ((int)threadCt > __kmp_nThreadsPerCore)
1401     __kmp_nThreadsPerCore = threadCt;
1402 
1403   // When affinity is off, this routine will still be called to set
1404   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1405   // Make sure all these vars are set correctly, and return now if affinity is
1406   // not enabled.
1407   __kmp_ncores = nCores;
1408   if (__kmp_affinity_verbose) {
1409     char buf[KMP_AFFIN_MASK_PRINT_LEN];
1410     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1411 
1412     KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1413     if (__kmp_affinity_respect_mask) {
1414       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1415     } else {
1416       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1417     }
1418     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1419     if (__kmp_affinity_uniform_topology()) {
1420       KMP_INFORM(Uniform, "KMP_AFFINITY");
1421     } else {
1422       KMP_INFORM(NonUniform, "KMP_AFFINITY");
1423     }
1424     KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1425                __kmp_nThreadsPerCore, __kmp_ncores);
1426   }
1427   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1428   KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
1429   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1430   for (i = 0; i < nApics; ++i) {
1431     __kmp_pu_os_idx[i] = threadInfo[i].osId;
1432   }
1433   if (__kmp_affinity_type == affinity_none) {
1434     __kmp_free(threadInfo);
1435     KMP_CPU_FREE(oldMask);
1436     return 0;
1437   }
1438 
1439   // Now that we've determined the number of packages, the number of cores per
1440   // package, and the number of threads per core, we can construct the data
1441   // structure that is to be returned.
1442   int pkgLevel = 0;
1443   int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1444   int threadLevel =
1445       (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1446   unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
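  // For example, on a multi-core package with SMT disabled, nCoresPerPkg > 1
  // but __kmp_nThreadsPerCore == 1, so coreLevel = 1, threadLevel = -1, and
  // depth = 2 (package and core levels only).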
1447 
1448   KMP_ASSERT(depth > 0);
1449   *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1450 
1451   for (i = 0; i < nApics; ++i) {
1452     Address addr(depth);
1453     unsigned os = threadInfo[i].osId;
1454     int d = 0;
1455 
1456     if (pkgLevel >= 0) {
1457       addr.labels[d++] = threadInfo[i].pkgId;
1458     }
1459     if (coreLevel >= 0) {
1460       addr.labels[d++] = threadInfo[i].coreId;
1461     }
1462     if (threadLevel >= 0) {
1463       addr.labels[d++] = threadInfo[i].threadId;
1464     }
1465     (*address2os)[i] = AddrUnsPair(addr, os);
1466   }
1467 
1468   if (__kmp_affinity_gran_levels < 0) {
1469     // Set the granularity level based on what levels are modeled in the machine
1470     // topology map.
1471     __kmp_affinity_gran_levels = 0;
1472     if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1473       __kmp_affinity_gran_levels++;
1474     }
1475     if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1476       __kmp_affinity_gran_levels++;
1477     }
1478     if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1479       __kmp_affinity_gran_levels++;
1480     }
1481   }
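  // Illustrative example: with a 3-level map and KMP_AFFINITY granularity=core,
  // only the thread level is finer than the requested granularity, so
  // __kmp_affinity_gran_levels becomes 1 and sibling hardware threads are later
  // folded into a single core-wide mask.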
1482 
1483   if (__kmp_affinity_verbose) {
1484     __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1485                                   coreLevel, threadLevel);
1486   }
1487 
1488   __kmp_free(threadInfo);
1489   KMP_CPU_FREE(oldMask);
1490   return depth;
1491 }
1492 
1493 // Intel(R) microarchitectures code named Nehalem, Dunnington, and later
1494 // support a newer interface for specifying the x2APIC Ids,
1495 // based on cpuid leaf 11.
1496 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1497                                               kmp_i18n_id_t *const msg_id) {
1498   kmp_cpuid buf;
1499   *address2os = NULL;
1500   *msg_id = kmp_i18n_null;
1501 
1502   // Check to see if cpuid leaf 11 is supported.
1503   __kmp_x86_cpuid(0, 0, &buf);
1504   if (buf.eax < 11) {
1505     *msg_id = kmp_i18n_str_NoLeaf11Support;
1506     return -1;
1507   }
1508   __kmp_x86_cpuid(11, 0, &buf);
1509   if (buf.ebx == 0) {
1510     *msg_id = kmp_i18n_str_NoLeaf11Support;
1511     return -1;
1512   }
1513 
1514   // Find the number of levels in the machine topology. While we're at it, get
1515   // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
1516   // get more accurate values later by explicitly counting them, but get
1517   // reasonable defaults now, in case we return early.
1518   int level;
1519   int threadLevel = -1;
1520   int coreLevel = -1;
1521   int pkgLevel = -1;
1522   __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1523 
1524   for (level = 0;; level++) {
1525     if (level > 31) {
1526       // FIXME: Hack for DPD200163180
1527       //
1528       // If level is big then something went wrong -> exiting
1529       //
1530       // There could actually be 32 valid levels in the machine topology, but so
1531       // far, the only machine we have seen which does not exit this loop before
1532       // iteration 32 has fubar x2APIC settings.
1533       //
1534       // For now, just reject this case based upon loop trip count.
1535       *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1536       return -1;
1537     }
1538     __kmp_x86_cpuid(11, level, &buf);
1539     if (buf.ebx == 0) {
1540       if (pkgLevel < 0) {
1541         // Will infer nPackages from __kmp_xproc
1542         pkgLevel = level;
1543         level++;
1544       }
1545       break;
1546     }
1547     int kind = (buf.ecx >> 8) & 0xff;
1548     if (kind == 1) {
1549       // SMT level
1550       threadLevel = level;
1551       coreLevel = -1;
1552       pkgLevel = -1;
1553       __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1554       if (__kmp_nThreadsPerCore == 0) {
1555         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1556         return -1;
1557       }
1558     } else if (kind == 2) {
1559       // core level
1560       coreLevel = level;
1561       pkgLevel = -1;
1562       nCoresPerPkg = buf.ebx & 0xffff;
1563       if (nCoresPerPkg == 0) {
1564         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1565         return -1;
1566       }
1567     } else {
1568       if (level <= 0) {
1569         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1570         return -1;
1571       }
1572       if (pkgLevel >= 0) {
1573         continue;
1574       }
1575       pkgLevel = level;
1576       nPackages = buf.ebx & 0xffff;
1577       if (nPackages == 0) {
1578         *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1579         return -1;
1580       }
1581     }
1582   }
1583   int depth = level;
1584 
1585   // In the above loop, "level" was counted from the finest level (usually
1586   // thread) to the coarsest.  The caller expects that we will place the labels
1587   // in (*address2os)[].first.labels[] in the inverse order, so we need to
1588   // invert the vars saying which level means what.
1589   if (threadLevel >= 0) {
1590     threadLevel = depth - threadLevel - 1;
1591   }
1592   if (coreLevel >= 0) {
1593     coreLevel = depth - coreLevel - 1;
1594   }
1595   KMP_DEBUG_ASSERT(pkgLevel >= 0);
1596   pkgLevel = depth - pkgLevel - 1;
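  // Example (illustrative): if the cpuid loop saw the SMT level at index 0, the
  // core level at 1, and the package level at 2 (depth == 3), the inversion
  // above yields threadLevel = 2, coreLevel = 1, pkgLevel = 0, matching the
  // coarsest-to-finest order used in (*address2os)[].first.labels[].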
1597 
1598   // The algorithm used starts by setting the affinity to each available thread
1599   // and retrieving info from the cpuid instruction, so if we are not capable of
1600   // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1601   // need to do something else - use the defaults that we calculated from
1602   // issuing cpuid without binding to each proc.
1603   if (!KMP_AFFINITY_CAPABLE()) {
1604     // Hack to try and infer the machine topology using only the data
1605     // available from cpuid on the current thread, and __kmp_xproc.
1606     KMP_ASSERT(__kmp_affinity_type == affinity_none);
1607 
1608     __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1609     nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1610     if (__kmp_affinity_verbose) {
1611       KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1612       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1613       if (__kmp_affinity_uniform_topology()) {
1614         KMP_INFORM(Uniform, "KMP_AFFINITY");
1615       } else {
1616         KMP_INFORM(NonUniform, "KMP_AFFINITY");
1617       }
1618       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1619                  __kmp_nThreadsPerCore, __kmp_ncores);
1620     }
1621     return 0;
1622   }
1623 
1624   // From here on, we can assume that it is safe to call
1625   // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1626   // __kmp_affinity_type = affinity_none.
1627 
1628   // Save the affinity mask for the current thread.
1629   kmp_affin_mask_t *oldMask;
1630   KMP_CPU_ALLOC(oldMask);
1631   __kmp_get_system_affinity(oldMask, TRUE);
1632 
1633   // Allocate the data structure to be returned.
1634   AddrUnsPair *retval =
1635       (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1636 
1637   // Run through each of the available contexts, binding the current thread
1638   // to it, and obtaining the pertinent information using the cpuid instr.
1639   unsigned int proc;
1640   int nApics = 0;
1641   KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1642     // Skip this proc if it is not included in the machine model.
1643     if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1644       continue;
1645     }
1646     KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1647 
1648     __kmp_affinity_dispatch->bind_thread(proc);
1649 
1650     // Extract labels for each level in the machine topology map from Apic ID.
1651     Address addr(depth);
1652     int prev_shift = 0;
1653 
1654     for (level = 0; level < depth; level++) {
1655       __kmp_x86_cpuid(11, level, &buf);
1656       unsigned apicId = buf.edx;
1657       if (buf.ebx == 0) {
1658         if (level != depth - 1) {
1659           KMP_CPU_FREE(oldMask);
1660           *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1661           return -1;
1662         }
1663         addr.labels[depth - level - 1] = apicId >> prev_shift;
1664         level++;
1665         break;
1666       }
1667       int shift = buf.eax & 0x1f;
1668       int mask = (1 << shift) - 1;
1669       addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1670       prev_shift = shift;
1671     }
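    // Worked example (illustrative) for the loop above: with an SMT shift of 1
    // and a core shift of 5 reported by leaf 11, an x2APIC id of 0x2b decodes
    // as thread label 0x2b & 0x1 = 1, core label (0x2b & 0x1f) >> 1 = 5, and
    // package label 0x2b >> 5 = 1, stored coarsest-first in addr.labels[].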
1672     if (level != depth) {
1673       KMP_CPU_FREE(oldMask);
1674       *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1675       return -1;
1676     }
1677 
1678     retval[nApics] = AddrUnsPair(addr, proc);
1679     nApics++;
1680   }
1681 
1682   // We've collected all the info we need.
1683   // Restore the old affinity mask for this thread.
1684   __kmp_set_system_affinity(oldMask, TRUE);
1685 
1686   // If there's only one thread context to bind to, return now.
1687   KMP_ASSERT(nApics > 0);
1688   if (nApics == 1) {
1689     __kmp_ncores = nPackages = 1;
1690     __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1691     if (__kmp_affinity_verbose) {
1692       char buf[KMP_AFFIN_MASK_PRINT_LEN];
1693       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1694 
1695       KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1696       if (__kmp_affinity_respect_mask) {
1697         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1698       } else {
1699         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1700       }
1701       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1702       KMP_INFORM(Uniform, "KMP_AFFINITY");
1703       KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1704                  __kmp_nThreadsPerCore, __kmp_ncores);
1705     }
1706 
1707     if (__kmp_affinity_type == affinity_none) {
1708       __kmp_free(retval);
1709       KMP_CPU_FREE(oldMask);
1710       return 0;
1711     }
1712 
1713     // Form an Address object which only includes the package level.
1714     Address addr(1);
1715     addr.labels[0] = retval[0].first.labels[pkgLevel];
1716     retval[0].first = addr;
1717 
1718     if (__kmp_affinity_gran_levels < 0) {
1719       __kmp_affinity_gran_levels = 0;
1720     }
1721 
1722     if (__kmp_affinity_verbose) {
1723       __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1724     }
1725 
1726     *address2os = retval;
1727     KMP_CPU_FREE(oldMask);
1728     return 1;
1729   }
1730 
1731   // Sort the table by physical Id.
1732   qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1733 
1734   // Find the radix at each of the levels.
1735   unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1736   unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1737   unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1738   unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1739   for (level = 0; level < depth; level++) {
1740     totals[level] = 1;
1741     maxCt[level] = 1;
1742     counts[level] = 1;
1743     last[level] = retval[0].first.labels[level];
1744   }
1745 
1746   // From here on, the iteration variable "level" runs from the finest level to
1747   // the coarsest, i.e. we iterate forward through
1748   // (*address2os)[].first.labels[] - in the previous loops, we iterated
1749   // backwards.
1750   for (proc = 1; (int)proc < nApics; proc++) {
1751     int level;
1752     for (level = 0; level < depth; level++) {
1753       if (retval[proc].first.labels[level] != last[level]) {
1754         int j;
1755         for (j = level + 1; j < depth; j++) {
1756           totals[j]++;
1757           counts[j] = 1;
1758           // The line below, if enabled, causes incorrect topology information
1759           // to be printed when the maximum count for some level (maxCt[level])
1760           // is encountered earlier in the array than a smaller count. For
1761           // example, if pkg0 has 4 cores and pkg1 has 2 cores, then maxCt[1]
1762           // would end up as 2,
1763           // whereas it should be 4.
1764           // TODO!!! Check if it can be commented safely
1765           // maxCt[j] = 1;
1766           last[j] = retval[proc].first.labels[j];
1767         }
1768         totals[level]++;
1769         counts[level]++;
1770         if (counts[level] > maxCt[level]) {
1771           maxCt[level] = counts[level];
1772         }
1773         last[level] = retval[proc].first.labels[level];
1774         break;
1775       } else if (level == depth - 1) {
1776         __kmp_free(last);
1777         __kmp_free(maxCt);
1778         __kmp_free(counts);
1779         __kmp_free(totals);
1780         __kmp_free(retval);
1781         KMP_CPU_FREE(oldMask);
1782         *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1783         return -1;
1784       }
1785     }
1786   }
1787 
1788   // When affinity is off, this routine will still be called to set
1789   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1790   // Make sure all these vars are set correctly, and return if affinity is not
1791   // enabled.
1792   if (threadLevel >= 0) {
1793     __kmp_nThreadsPerCore = maxCt[threadLevel];
1794   } else {
1795     __kmp_nThreadsPerCore = 1;
1796   }
1797   nPackages = totals[pkgLevel];
1798 
1799   if (coreLevel >= 0) {
1800     __kmp_ncores = totals[coreLevel];
1801     nCoresPerPkg = maxCt[coreLevel];
1802   } else {
1803     __kmp_ncores = nPackages;
1804     nCoresPerPkg = 1;
1805   }
1806 
1807   // Check to see if the machine topology is uniform
1808   unsigned prod = maxCt[0];
1809   for (level = 1; level < depth; level++) {
1810     prod *= maxCt[level];
1811   }
1812   bool uniform = (prod == totals[level - 1]);
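  // Illustrative example: 2 packages x 4 cores x 2 threads gives
  // maxCt = {2, 4, 2} and totals[depth - 1] = 16, so prod = 16 and the topology
  // is uniform; if one package had only 2 cores, prod (16) would exceed the 12
  // available procs and uniform would be false.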
1813 
1814   // Print the machine topology summary.
1815   if (__kmp_affinity_verbose) {
1816     char mask[KMP_AFFIN_MASK_PRINT_LEN];
1817     __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1818 
1819     KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1820     if (__kmp_affinity_respect_mask) {
1821       KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1822     } else {
1823       KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1824     }
1825     KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1826     if (uniform) {
1827       KMP_INFORM(Uniform, "KMP_AFFINITY");
1828     } else {
1829       KMP_INFORM(NonUniform, "KMP_AFFINITY");
1830     }
1831 
1832     kmp_str_buf_t buf;
1833     __kmp_str_buf_init(&buf);
1834 
1835     __kmp_str_buf_print(&buf, "%d", totals[0]);
1836     for (level = 1; level <= pkgLevel; level++) {
1837       __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1838     }
1839     KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1840                __kmp_nThreadsPerCore, __kmp_ncores);
1841 
1842     __kmp_str_buf_free(&buf);
1843   }
1844   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1845   KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1846   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1847   for (proc = 0; (int)proc < nApics; ++proc) {
1848     __kmp_pu_os_idx[proc] = retval[proc].second;
1849   }
1850   if (__kmp_affinity_type == affinity_none) {
1851     __kmp_free(last);
1852     __kmp_free(maxCt);
1853     __kmp_free(counts);
1854     __kmp_free(totals);
1855     __kmp_free(retval);
1856     KMP_CPU_FREE(oldMask);
1857     return 0;
1858   }
1859 
1860   // Find any levels with radix 1, and remove them from the map
1861   // (except for the package level).
1862   int new_depth = 0;
1863   for (level = 0; level < depth; level++) {
1864     if ((maxCt[level] == 1) && (level != pkgLevel)) {
1865       continue;
1866     }
1867     new_depth++;
1868   }
1869 
1870   // If we are removing any levels, allocate a new vector to return,
1871   // and copy the relevant information to it.
1872   if (new_depth != depth) {
1873     AddrUnsPair *new_retval =
1874         (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1875     for (proc = 0; (int)proc < nApics; proc++) {
1876       Address addr(new_depth);
1877       new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1878     }
1879     int new_level = 0;
1880     int newPkgLevel = -1;
1881     int newCoreLevel = -1;
1882     int newThreadLevel = -1;
1883     for (level = 0; level < depth; level++) {
1884       if ((maxCt[level] == 1) && (level != pkgLevel)) {
1885         // Remove this level. Never remove the package level
1886         continue;
1887       }
1888       if (level == pkgLevel) {
1889         newPkgLevel = new_level;
1890       }
1891       if (level == coreLevel) {
1892         newCoreLevel = new_level;
1893       }
1894       if (level == threadLevel) {
1895         newThreadLevel = new_level;
1896       }
1897       for (proc = 0; (int)proc < nApics; proc++) {
1898         new_retval[proc].first.labels[new_level] =
1899             retval[proc].first.labels[level];
1900       }
1901       new_level++;
1902     }
1903 
1904     __kmp_free(retval);
1905     retval = new_retval;
1906     depth = new_depth;
1907     pkgLevel = newPkgLevel;
1908     coreLevel = newCoreLevel;
1909     threadLevel = newThreadLevel;
1910   }
1911 
1912   if (__kmp_affinity_gran_levels < 0) {
1913     // Set the granularity level based on what levels are modeled
1914     // in the machine topology map.
1915     __kmp_affinity_gran_levels = 0;
1916     if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1917       __kmp_affinity_gran_levels++;
1918     }
1919     if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1920       __kmp_affinity_gran_levels++;
1921     }
1922     if (__kmp_affinity_gran > affinity_gran_package) {
1923       __kmp_affinity_gran_levels++;
1924     }
1925   }
1926 
1927   if (__kmp_affinity_verbose) {
1928     __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1929                                   threadLevel);
1930   }
1931 
1932   __kmp_free(last);
1933   __kmp_free(maxCt);
1934   __kmp_free(counts);
1935   __kmp_free(totals);
1936   KMP_CPU_FREE(oldMask);
1937   *address2os = retval;
1938   return depth;
1939 }
1940 
1941 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1942 
1943 #define osIdIndex 0
1944 #define threadIdIndex 1
1945 #define coreIdIndex 2
1946 #define pkgIdIndex 3
1947 #define nodeIdIndex 4
1948 
1949 typedef unsigned *ProcCpuInfo;
1950 static unsigned maxIndex = pkgIdIndex;
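// Each threadInfo[] record below is indexed by the constants above:
// [osIdIndex] holds the OS processor number, [threadIdIndex] / [coreIdIndex] /
// [pkgIdIndex] hold the values parsed from the "thread id" / "core id" /
// "physical id" fields, and [nodeIdIndex + n] holds the id parsed from a
// "node_<n> id" field, if present.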
1951 
1952 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1953                                                   const void *b) {
1954   unsigned i;
1955   const unsigned *aa = *(unsigned *const *)a;
1956   const unsigned *bb = *(unsigned *const *)b;
1957   for (i = maxIndex;; i--) {
1958     if (aa[i] < bb[i])
1959       return -1;
1960     if (aa[i] > bb[i])
1961       return 1;
1962     if (i == osIdIndex)
1963       break;
1964   }
1965   return 0;
1966 }
1967 
1968 #if KMP_USE_HIER_SCHED
1969 // Set the array sizes for the hierarchy layers
1970 static void __kmp_dispatch_set_hierarchy_values() {
1971   // Set the maximum number of L1's to number of cores
1972   // Set the maximum number of L2's to either the number of cores / 2 for
1973   // Intel(R) Xeon Phi(TM) coprocessors formerly codenamed Knights Landing,
1974   // or to the number of cores for Intel(R) Xeon(R) processors
1975   // Set the maximum number of NUMA nodes and L3's to number of packages
1976   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
1977       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1978   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
1979 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
1980     KMP_MIC_SUPPORTED
1981   if (__kmp_mic_type >= mic3)
1982     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
1983   else
1984 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
1985     __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
1986   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
1987   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
1988   __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
1989   // Set the number of threads per unit
1990   // Number of hardware threads per L1/L2/L3/NUMA/LOOP
1991   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
1992   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
1993       __kmp_nThreadsPerCore;
1994 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
1995     KMP_MIC_SUPPORTED
1996   if (__kmp_mic_type >= mic3)
1997     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1998         2 * __kmp_nThreadsPerCore;
1999   else
2000 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS)
2001     __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2002         __kmp_nThreadsPerCore;
2003   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2004       nCoresPerPkg * __kmp_nThreadsPerCore;
2005   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2006       nCoresPerPkg * __kmp_nThreadsPerCore;
2007   __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2008       nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2009 }
2010 
2011 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2012 // i.e., this thread's L1 or this thread's L2, etc.
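// Illustrative example: with __kmp_nThreadsPerCore == 2, a call such as
// __kmp_dispatch_get_index(5, kmp_hier_layer_e::LAYER_L1) returns
// (5 / 2) % __kmp_ncores, so hardware threads 4 and 5 map to the same L1 unit.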
2013 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2014   int index = type + 1;
2015   int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2016   KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2017   if (type == kmp_hier_layer_e::LAYER_THREAD)
2018     return tid;
2019   else if (type == kmp_hier_layer_e::LAYER_LOOP)
2020     return 0;
2021   KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2022   if (tid >= num_hw_threads)
2023     tid = tid % num_hw_threads;
2024   return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2025 }
2026 
2027 // Return the number of t1's per t2
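// Illustrative example: for a package with 8 cores x 2 threads,
// __kmp_dispatch_get_t1_per_t2(LAYER_L1, LAYER_L3) returns
// __kmp_hier_threads_per[L3 + 1] / __kmp_hier_threads_per[L1 + 1] = 16 / 2 = 8,
// i.e. 8 L1 units per L3.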
2028 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2029   int i1 = t1 + 1;
2030   int i2 = t2 + 1;
2031   KMP_DEBUG_ASSERT(i1 <= i2);
2032   KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2033   KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2034   KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2035   // (nthreads/t2) / (nthreads/t1) = t1 / t2
2036   return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2037 }
2038 #endif // KMP_USE_HIER_SCHED
2039 
2040 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2041 // affinity map.
2042 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
2043                                              int *line,
2044                                              kmp_i18n_id_t *const msg_id,
2045                                              FILE *f) {
2046   *address2os = NULL;
2047   *msg_id = kmp_i18n_null;
2048 
2049   // Scan the file once, counting the number of "processor" (osId) fields
2050   // and finding the highest value of <n> for a node_<n> field.
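  // An x86 Linux /proc/cpuinfo record typically looks like (illustrative):
  //   processor       : 0
  //   physical id     : 0
  //   core id         : 0
  // with records separated by blank lines; an alternate file in the same
  // format may also supply "thread id" and "node_<n> id" fields.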
2051   char buf[256];
2052   unsigned num_records = 0;
2053   while (!feof(f)) {
2054     buf[sizeof(buf) - 1] = 1;
2055     if (!fgets(buf, sizeof(buf), f)) {
2056       // Read errors presumably because of EOF
2057       break;
2058     }
2059 
2060     char s1[] = "processor";
2061     if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2062       num_records++;
2063       continue;
2064     }
2065 
2066     // FIXME - this will match "node_<n> <garbage>"
2067     unsigned level;
2068     if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2069       if (nodeIdIndex + level >= maxIndex) {
2070         maxIndex = nodeIdIndex + level;
2071       }
2072       continue;
2073     }
2074   }
2075 
2076   // Check for empty file / no valid processor records, or too many. The number
2077   // of records can't exceed the number of valid bits in the affinity mask.
2078   if (num_records == 0) {
2079     *line = 0;
2080     *msg_id = kmp_i18n_str_NoProcRecords;
2081     return -1;
2082   }
2083   if (num_records > (unsigned)__kmp_xproc) {
2084     *line = 0;
2085     *msg_id = kmp_i18n_str_TooManyProcRecords;
2086     return -1;
2087   }
2088 
2089   // Set the file pointer back to the beginning, so that we can scan the file
2090   // again, this time performing a full parse of the data. Allocate a vector of
2091   // ProcCpuInfo objects, where we will place the data. Adding an extra element
2092   // at the end allows us to remove a lot of extra checks for termination
2093   // conditions.
2094   if (fseek(f, 0, SEEK_SET) != 0) {
2095     *line = 0;
2096     *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2097     return -1;
2098   }
2099 
2100   // Allocate the array of records to store the proc info in.  The dummy
2101   // element at the end makes the logic in filling them out easier to code.
2102   unsigned **threadInfo =
2103       (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2104   unsigned i;
2105   for (i = 0; i <= num_records; i++) {
2106     threadInfo[i] =
2107         (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2108   }
2109 
2110 #define CLEANUP_THREAD_INFO                                                    \
2111   for (i = 0; i <= num_records; i++) {                                         \
2112     __kmp_free(threadInfo[i]);                                                 \
2113   }                                                                            \
2114   __kmp_free(threadInfo);
2115 
2116   // A value of UINT_MAX means that we didn't find the field
2117   unsigned __index;
2118 
2119 #define INIT_PROC_INFO(p)                                                      \
2120   for (__index = 0; __index <= maxIndex; __index++) {                          \
2121     (p)[__index] = UINT_MAX;                                                   \
2122   }
2123 
2124   for (i = 0; i <= num_records; i++) {
2125     INIT_PROC_INFO(threadInfo[i]);
2126   }
2127 
2128   unsigned num_avail = 0;
2129   *line = 0;
2130   while (!feof(f)) {
2131     // Create an inner scoping level, so that all the goto targets at the end of
2132     // the loop appear in an outer scoping level. This avoids warnings about
2133     // jumping past an initialization to a target in the same block.
2134     {
2135       buf[sizeof(buf) - 1] = 1;
2136       bool long_line = false;
2137       if (!fgets(buf, sizeof(buf), f)) {
2138         // Read errors presumably because of EOF
2139         // If there is valid data in threadInfo[num_avail], then fake
2140         // a blank line to ensure that the last address gets parsed.
2141         bool valid = false;
2142         for (i = 0; i <= maxIndex; i++) {
2143           if (threadInfo[num_avail][i] != UINT_MAX) {
2144             valid = true;
2145           }
2146         }
2147         if (!valid) {
2148           break;
2149         }
2150         buf[0] = 0;
2151       } else if (!buf[sizeof(buf) - 1]) {
2152         // The line is longer than the buffer. Set a flag so that we only
2153         // report an error if the line was one we would otherwise have parsed.
2154         long_line = true;
2155 
2156 #define CHECK_LINE                                                             \
2157   if (long_line) {                                                             \
2158     CLEANUP_THREAD_INFO;                                                       \
2159     *msg_id = kmp_i18n_str_LongLineCpuinfo;                                    \
2160     return -1;                                                                 \
2161   }
2162       }
2163       (*line)++;
2164 
2165       char s1[] = "processor";
2166       if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2167         CHECK_LINE;
2168         char *p = strchr(buf + sizeof(s1) - 1, ':');
2169         unsigned val;
2170         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2171           goto no_val;
2172         if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2173 #if KMP_ARCH_AARCH64
2174           // Handle the old AArch64 /proc/cpuinfo layout differently,
2175           // it contains all of the 'processor' entries listed in a
2176           // single 'Processor' section, therefore the normal check for
2177           // duplicates in that section will always fail.
2178           num_avail++;
2179 #else
2180           goto dup_field;
2181 #endif
2182         threadInfo[num_avail][osIdIndex] = val;
2183 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2184         char path[256];
2185         KMP_SNPRINTF(
2186             path, sizeof(path),
2187             "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2188             threadInfo[num_avail][osIdIndex]);
2189         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2190 
2191         KMP_SNPRINTF(path, sizeof(path),
2192                      "/sys/devices/system/cpu/cpu%u/topology/core_id",
2193                      threadInfo[num_avail][osIdIndex]);
2194         __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2195         continue;
2196 #else
2197       }
2198       char s2[] = "physical id";
2199       if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2200         CHECK_LINE;
2201         char *p = strchr(buf + sizeof(s2) - 1, ':');
2202         unsigned val;
2203         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2204           goto no_val;
2205         if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2206           goto dup_field;
2207         threadInfo[num_avail][pkgIdIndex] = val;
2208         continue;
2209       }
2210       char s3[] = "core id";
2211       if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2212         CHECK_LINE;
2213         char *p = strchr(buf + sizeof(s3) - 1, ':');
2214         unsigned val;
2215         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2216           goto no_val;
2217         if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2218           goto dup_field;
2219         threadInfo[num_avail][coreIdIndex] = val;
2220         continue;
2221 #endif // KMP_OS_LINUX && USE_SYSFS_INFO
2222       }
2223       char s4[] = "thread id";
2224       if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2225         CHECK_LINE;
2226         char *p = strchr(buf + sizeof(s4) - 1, ':');
2227         unsigned val;
2228         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2229           goto no_val;
2230         if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2231           goto dup_field;
2232         threadInfo[num_avail][threadIdIndex] = val;
2233         continue;
2234       }
2235       unsigned level;
2236       if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2237         CHECK_LINE;
2238         char *p = strchr(buf + sizeof(s4) - 1, ':');
2239         unsigned val;
2240         if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2241           goto no_val;
2242         KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2243         if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2244           goto dup_field;
2245         threadInfo[num_avail][nodeIdIndex + level] = val;
2246         continue;
2247       }
2248 
2249       // We didn't recognize the leading token on the line. There are lots of
2250       // leading tokens that we don't recognize - if the line isn't empty, go on
2251       // to the next line.
2252       if ((*buf != 0) && (*buf != '\n')) {
2253         // If the line is longer than the buffer, read characters
2254         // until we find a newline.
2255         if (long_line) {
2256           int ch;
2257           while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2258             ;
2259         }
2260         continue;
2261       }
2262 
2263       // A newline has signalled the end of the processor record.
2264       // Check that there aren't too many procs specified.
2265       if ((int)num_avail == __kmp_xproc) {
2266         CLEANUP_THREAD_INFO;
2267         *msg_id = kmp_i18n_str_TooManyEntries;
2268         return -1;
2269       }
2270 
2271       // Check for missing fields.  The osId field must be there, and we
2272       // currently require that the physical id field is specified, also.
2273       if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2274         CLEANUP_THREAD_INFO;
2275         *msg_id = kmp_i18n_str_MissingProcField;
2276         return -1;
2277       }
2278       if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2279         CLEANUP_THREAD_INFO;
2280         *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2281         return -1;
2282       }
2283 
2284       // Skip this proc if it is not included in the machine model.
2285       if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2286                          __kmp_affin_fullMask)) {
2287         INIT_PROC_INFO(threadInfo[num_avail]);
2288         continue;
2289       }
2290 
2291       // We have a successful parse of this proc's info.
2292       // Increment the counter, and prepare for the next proc.
2293       num_avail++;
2294       KMP_ASSERT(num_avail <= num_records);
2295       INIT_PROC_INFO(threadInfo[num_avail]);
2296     }
2297     continue;
2298 
2299   no_val:
2300     CLEANUP_THREAD_INFO;
2301     *msg_id = kmp_i18n_str_MissingValCpuinfo;
2302     return -1;
2303 
2304   dup_field:
2305     CLEANUP_THREAD_INFO;
2306     *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2307     return -1;
2308   }
2309   *line = 0;
2310 
2311 #if KMP_MIC && REDUCE_TEAM_SIZE
2312   unsigned teamSize = 0;
2313 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2314 
2315   // check for num_records == __kmp_xproc ???
2316 
2317   // If there's only one thread context to bind to, form an Address object with
2318   // depth 1 and return immediately (or, if affinity is off, set address2os to
2319   // NULL and return).
2320   //
2321   // If it is configured to omit the package level when there is only a single
2322   // package, the logic at the end of this routine won't work if there is only a
2323   // single thread - it would try to form an Address object with depth 0.
2324   KMP_ASSERT(num_avail > 0);
2325   KMP_ASSERT(num_avail <= num_records);
2326   if (num_avail == 1) {
2327     __kmp_ncores = 1;
2328     __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2329     if (__kmp_affinity_verbose) {
2330       if (!KMP_AFFINITY_CAPABLE()) {
2331         KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2332         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2333         KMP_INFORM(Uniform, "KMP_AFFINITY");
2334       } else {
2335         char buf[KMP_AFFIN_MASK_PRINT_LEN];
2336         __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2337                                   __kmp_affin_fullMask);
2338         KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2339         if (__kmp_affinity_respect_mask) {
2340           KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2341         } else {
2342           KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2343         }
2344         KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2345         KMP_INFORM(Uniform, "KMP_AFFINITY");
2346       }
2347       int index;
2348       kmp_str_buf_t buf;
2349       __kmp_str_buf_init(&buf);
2350       __kmp_str_buf_print(&buf, "1");
2351       for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2352         __kmp_str_buf_print(&buf, " x 1");
2353       }
2354       KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2355       __kmp_str_buf_free(&buf);
2356     }
2357 
2358     if (__kmp_affinity_type == affinity_none) {
2359       CLEANUP_THREAD_INFO;
2360       return 0;
2361     }
2362 
2363     *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
2364     Address addr(1);
2365     addr.labels[0] = threadInfo[0][pkgIdIndex];
2366     (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2367 
2368     if (__kmp_affinity_gran_levels < 0) {
2369       __kmp_affinity_gran_levels = 0;
2370     }
2371 
2372     if (__kmp_affinity_verbose) {
2373       __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2374     }
2375 
2376     CLEANUP_THREAD_INFO;
2377     return 1;
2378   }
2379 
2380   // Sort the threadInfo table by physical Id.
2381   qsort(threadInfo, num_avail, sizeof(*threadInfo),
2382         __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2383 
2384   // The table is now sorted by pkgId / coreId / threadId, but we really don't
2385   // know the radix of any of the fields. pkgId's may be sparsely assigned among
2386   // the chips on a system. Although coreId's are usually assigned
2387   // [0 .. coresPerPkg-1] and threadId's are usually assigned
2388   // [0..threadsPerCore-1], we don't want to make any such assumptions.
2389   //
2390   // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2391   // total # packages) are at this point - we want to determine that now. We
2392   // only have an upper bound on the first two figures.
2393   unsigned *counts =
2394       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2395   unsigned *maxCt =
2396       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2397   unsigned *totals =
2398       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2399   unsigned *lastId =
2400       (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2401 
2402   bool assign_thread_ids = false;
2403   unsigned threadIdCt;
2404   unsigned index;
2405 
2406 restart_radix_check:
2407   threadIdCt = 0;
2408 
2409   // Initialize the counter arrays with data from threadInfo[0].
2410   if (assign_thread_ids) {
2411     if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2412       threadInfo[0][threadIdIndex] = threadIdCt++;
2413     } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2414       threadIdCt = threadInfo[0][threadIdIndex] + 1;
2415     }
2416   }
2417   for (index = 0; index <= maxIndex; index++) {
2418     counts[index] = 1;
2419     maxCt[index] = 1;
2420     totals[index] = 1;
2421     lastId[index] = threadInfo[0][index];
2423   }
2424 
2425   // Run through the rest of the OS procs.
2426   for (i = 1; i < num_avail; i++) {
2427     // Find the most significant index whose id differs from the id for the
2428     // previous OS proc.
2429     for (index = maxIndex; index >= threadIdIndex; index--) {
2430       if (assign_thread_ids && (index == threadIdIndex)) {
2431         // Auto-assign the thread id field if it wasn't specified.
2432         if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2433           threadInfo[i][threadIdIndex] = threadIdCt++;
2434         }
2435         // Apparently the thread id field was specified for some entries and not
2436         // others. Start the thread id counter off at the next higher thread id.
2437         else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2438           threadIdCt = threadInfo[i][threadIdIndex] + 1;
2439         }
2440       }
2441       if (threadInfo[i][index] != lastId[index]) {
2442         // Run through all indices which are less significant, and reset the
2443         // counts to 1. At all levels up to and including index, we need to
2444         // increment the totals and record the last id.
2445         unsigned index2;
2446         for (index2 = threadIdIndex; index2 < index; index2++) {
2447           totals[index2]++;
2448           if (counts[index2] > maxCt[index2]) {
2449             maxCt[index2] = counts[index2];
2450           }
2451           counts[index2] = 1;
2452           lastId[index2] = threadInfo[i][index2];
2453         }
2454         counts[index]++;
2455         totals[index]++;
2456         lastId[index] = threadInfo[i][index];
2457 
2458         if (assign_thread_ids && (index > threadIdIndex)) {
2459 
2460 #if KMP_MIC && REDUCE_TEAM_SIZE
2461           // The default team size is the total #threads in the machine
2462           // minus 1 thread for every core that has 3 or more threads.
2463           teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2464 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2465 
2466           // Restart the thread counter, as we are on a new core.
2467           threadIdCt = 0;
2468 
2469           // Auto-assign the thread id field if it wasn't specified.
2470           if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2471             threadInfo[i][threadIdIndex] = threadIdCt++;
2472           }
2473 
2474           // Apparently the thread id field was specified for some entries and
2475           // not others. Start the thread id counter off at the next higher
2476           // thread id.
2477           else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2478             threadIdCt = threadInfo[i][threadIdIndex] + 1;
2479           }
2480         }
2481         break;
2482       }
2483     }
2484     if (index < threadIdIndex) {
2485       // If thread ids were specified, it is an error if they are not unique.
2486       // Also, check that we haven't already restarted the loop (to be safe -
2487       // shouldn't need to).
2488       if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2489         __kmp_free(lastId);
2490         __kmp_free(totals);
2491         __kmp_free(maxCt);
2492         __kmp_free(counts);
2493         CLEANUP_THREAD_INFO;
2494         *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2495         return -1;
2496       }
2497 
2498       // If the thread ids were not specified and we see entries that are
2499       // duplicates, start the loop over and assign the thread ids manually.
2500       assign_thread_ids = true;
2501       goto restart_radix_check;
2502     }
2503   }
2504 
2505 #if KMP_MIC && REDUCE_TEAM_SIZE
2506   // The default team size is the total #threads in the machine
2507   // minus 1 thread for every core that has 3 or more threads.
2508   teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2509 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2510 
2511   for (index = threadIdIndex; index <= maxIndex; index++) {
2512     if (counts[index] > maxCt[index]) {
2513       maxCt[index] = counts[index];
2514     }
2515   }
2516 
2517   __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2518   nCoresPerPkg = maxCt[coreIdIndex];
2519   nPackages = totals[pkgIdIndex];
2520 
2521   // Check to see if the machine topology is uniform
2522   unsigned prod = totals[maxIndex];
2523   for (index = threadIdIndex; index < maxIndex; index++) {
2524     prod *= maxCt[index];
2525   }
2526   bool uniform = (prod == totals[threadIdIndex]);
2527 
2528   // When affinity is off, this routine will still be called to set
2529   // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2530   // Make sure all these vars are set correctly, and return now if affinity is
2531   // not enabled.
2532   __kmp_ncores = totals[coreIdIndex];
2533 
2534   if (__kmp_affinity_verbose) {
2535     if (!KMP_AFFINITY_CAPABLE()) {
2536       KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2537       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2538       if (uniform) {
2539         KMP_INFORM(Uniform, "KMP_AFFINITY");
2540       } else {
2541         KMP_INFORM(NonUniform, "KMP_AFFINITY");
2542       }
2543     } else {
2544       char buf[KMP_AFFIN_MASK_PRINT_LEN];
2545       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2546                                 __kmp_affin_fullMask);
2547       KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2548       if (__kmp_affinity_respect_mask) {
2549         KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2550       } else {
2551         KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2552       }
2553       KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2554       if (uniform) {
2555         KMP_INFORM(Uniform, "KMP_AFFINITY");
2556       } else {
2557         KMP_INFORM(NonUniform, "KMP_AFFINITY");
2558       }
2559     }
2560     kmp_str_buf_t buf;
2561     __kmp_str_buf_init(&buf);
2562 
2563     __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2564     for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2565       __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2566     }
2567     KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2568                maxCt[threadIdIndex], __kmp_ncores);
2569 
2570     __kmp_str_buf_free(&buf);
2571   }
2572 
2573 #if KMP_MIC && REDUCE_TEAM_SIZE
2574   // Set the default team size.
2575   if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2576     __kmp_dflt_team_nth = teamSize;
2577     KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2578                   "__kmp_dflt_team_nth = %d\n",
2579                   __kmp_dflt_team_nth));
2580   }
2581 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2582 
2583   KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2584   KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
2585   __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2586   for (i = 0; i < num_avail; ++i) { // fill the os indices
2587     __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2588   }
2589 
2590   if (__kmp_affinity_type == affinity_none) {
2591     __kmp_free(lastId);
2592     __kmp_free(totals);
2593     __kmp_free(maxCt);
2594     __kmp_free(counts);
2595     CLEANUP_THREAD_INFO;
2596     return 0;
2597   }
2598 
2599   // Count the number of levels which have more nodes at that level than at the
2600   // parent's level (with there being an implicit root node of the top level).
2601   // This is equivalent to saying that there is at least one node at this level
2602   // which has a sibling. These levels are in the map, and the package level is
2603   // always in the map.
2604   bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2605   for (index = threadIdIndex; index < maxIndex; index++) {
2606     KMP_ASSERT(totals[index] >= totals[index + 1]);
2607     inMap[index] = (totals[index] > totals[index + 1]);
2608   }
2609   inMap[maxIndex] = (totals[maxIndex] > 1);
2610   inMap[pkgIdIndex] = true;
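  // For example, on a machine with one hardware thread per core,
  // totals[threadIdIndex] == totals[coreIdIndex], so inMap[threadIdIndex] is
  // false and the thread level is omitted from the resulting map.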
2611 
2612   int depth = 0;
2613   for (index = threadIdIndex; index <= maxIndex; index++) {
2614     if (inMap[index]) {
2615       depth++;
2616     }
2617   }
2618   KMP_ASSERT(depth > 0);
2619 
2620   // Construct the data structure that is to be returned.
2621   *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2622   int pkgLevel = -1;
2623   int coreLevel = -1;
2624   int threadLevel = -1;
2625 
2626   for (i = 0; i < num_avail; ++i) {
2627     Address addr(depth);
2628     unsigned os = threadInfo[i][osIdIndex];
2629     int src_index;
2630     int dst_index = 0;
2631 
2632     for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2633       if (!inMap[src_index]) {
2634         continue;
2635       }
2636       addr.labels[dst_index] = threadInfo[i][src_index];
2637       if (src_index == pkgIdIndex) {
2638         pkgLevel = dst_index;
2639       } else if (src_index == coreIdIndex) {
2640         coreLevel = dst_index;
2641       } else if (src_index == threadIdIndex) {
2642         threadLevel = dst_index;
2643       }
2644       dst_index++;
2645     }
2646     (*address2os)[i] = AddrUnsPair(addr, os);
2647   }
2648 
2649   if (__kmp_affinity_gran_levels < 0) {
2650     // Set the granularity level based on what levels are modeled
2651     // in the machine topology map.
2652     unsigned src_index;
2653     __kmp_affinity_gran_levels = 0;
2654     for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2655       if (!inMap[src_index]) {
2656         continue;
2657       }
2658       switch (src_index) {
2659       case threadIdIndex:
2660         if (__kmp_affinity_gran > affinity_gran_thread) {
2661           __kmp_affinity_gran_levels++;
2662         }
2663 
2664         break;
2665       case coreIdIndex:
2666         if (__kmp_affinity_gran > affinity_gran_core) {
2667           __kmp_affinity_gran_levels++;
2668         }
2669         break;
2670 
2671       case pkgIdIndex:
2672         if (__kmp_affinity_gran > affinity_gran_package) {
2673           __kmp_affinity_gran_levels++;
2674         }
2675         break;
2676       }
2677     }
2678   }
2679 
2680   if (__kmp_affinity_verbose) {
2681     __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2682                                   coreLevel, threadLevel);
2683   }
2684 
2685   __kmp_free(inMap);
2686   __kmp_free(lastId);
2687   __kmp_free(totals);
2688   __kmp_free(maxCt);
2689   __kmp_free(counts);
2690   CLEANUP_THREAD_INFO;
2691   return depth;
2692 }
2693 
2694 // Create and return a table of affinity masks, indexed by OS thread ID.
2695 // This routine handles OR'ing together all the affinity masks of threads
2696 // that are sufficiently close, if granularity > fine.
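// Illustrative example: with granularity=core (__kmp_affinity_gran_levels == 1)
// and 2 hardware threads per core, isClose() ignores the thread-level label, so
// both sibling OS procs accumulate into, and receive, the same two-bit mask.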
2697 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2698                                             unsigned *numUnique,
2699                                             AddrUnsPair *address2os,
2700                                             unsigned numAddrs) {
2701   // First form a table of affinity masks in order of OS thread id.
2702   unsigned depth;
2703   unsigned maxOsId;
2704   unsigned i;
2705 
2706   KMP_ASSERT(numAddrs > 0);
2707   depth = address2os[0].first.depth;
2708 
2709   maxOsId = 0;
2710   for (i = numAddrs - 1;; --i) {
2711     unsigned osId = address2os[i].second;
2712     if (osId > maxOsId) {
2713       maxOsId = osId;
2714     }
2715     if (i == 0)
2716       break;
2717   }
2718   kmp_affin_mask_t *osId2Mask;
2719   KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2720 
2721   // Sort the address2os table according to physical order. Doing so will put
2722   // all threads on the same core/package/node in consecutive locations.
2723   qsort(address2os, numAddrs, sizeof(*address2os),
2724         __kmp_affinity_cmp_Address_labels);
2725 
2726   KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2727   if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2728     KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2729   }
2730   if (__kmp_affinity_gran_levels >= (int)depth) {
2731     if (__kmp_affinity_verbose ||
2732         (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2733       KMP_WARNING(AffThreadsMayMigrate);
2734     }
2735   }
2736 
2737   // Run through the table, forming the masks for all threads on each core.
2738   // Threads on the same core will have identical "Address" objects, not
2739   // considering the last level, which must be the thread id. All threads on a
2740   // core will appear consecutively.
2741   unsigned unique = 0;
2742   unsigned j = 0; // index of 1st thread on core
2743   unsigned leader = 0;
2744   Address *leaderAddr = &(address2os[0].first);
2745   kmp_affin_mask_t *sum;
2746   KMP_CPU_ALLOC_ON_STACK(sum);
2747   KMP_CPU_ZERO(sum);
2748   KMP_CPU_SET(address2os[0].second, sum);
2749   for (i = 1; i < numAddrs; i++) {
2750     // If this thread is sufficiently close to the leader (within the
2751     // granularity setting), then set the bit for this os thread in the
2752     // affinity mask for this group, and go on to the next thread.
2753     if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
2754       KMP_CPU_SET(address2os[i].second, sum);
2755       continue;
2756     }
2757 
2758     // For every thread in this group, copy the mask to the thread's entry in
2759     // the osId2Mask table.  Mark the first address as a leader.
2760     for (; j < i; j++) {
2761       unsigned osId = address2os[j].second;
2762       KMP_DEBUG_ASSERT(osId <= maxOsId);
2763       kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2764       KMP_CPU_COPY(mask, sum);
2765       address2os[j].first.leader = (j == leader);
2766     }
2767     unique++;
2768 
2769     // Start a new mask.
2770     leader = i;
2771     leaderAddr = &(address2os[i].first);
2772     KMP_CPU_ZERO(sum);
2773     KMP_CPU_SET(address2os[i].second, sum);
2774   }
2775 
2776   // For every thread in last group, copy the mask to the thread's
2777   // entry in the osId2Mask table.
2778   for (; j < i; j++) {
2779     unsigned osId = address2os[j].second;
2780     KMP_DEBUG_ASSERT(osId <= maxOsId);
2781     kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2782     KMP_CPU_COPY(mask, sum);
2783     address2os[j].first.leader = (j == leader);
2784   }
2785   unique++;
2786   KMP_CPU_FREE_FROM_STACK(sum);
2787 
2788   *maxIndex = maxOsId;
2789   *numUnique = unique;
2790   return osId2Mask;
2791 }
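// Worked example (illustrative): with __kmp_affinity_gran_levels == 1 on a
// machine with two hardware threads per core, the two entries of a core differ
// only in their last (thread) label, so isClose() reports them as one group.
// Both OS proc ids are then set in the same mask, osId2Mask maps either
// sibling's OS id to that shared per-core mask, and numUnique counts cores.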
2792 
2793 // Stuff for the affinity proclist parsers.  It's easier to declare these vars
2794 // as file-static than to try and pass them through the calling sequence of
2795 // the recursive-descent OMP_PLACES parser.
2796 static kmp_affin_mask_t *newMasks;
2797 static int numNewMasks;
2798 static int nextNewMask;
2799 
2800 #define ADD_MASK(_mask)                                                        \
2801   {                                                                            \
2802     if (nextNewMask >= numNewMasks) {                                          \
2803       int i;                                                                   \
2804       numNewMasks *= 2;                                                        \
2805       kmp_affin_mask_t *temp;                                                  \
2806       KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks);                         \
2807       for (i = 0; i < numNewMasks / 2; i++) {                                  \
2808         kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);                    \
2809         kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i);                       \
2810         KMP_CPU_COPY(dest, src);                                               \
2811       }                                                                        \
2812       KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2);                  \
2813       newMasks = temp;                                                         \
2814     }                                                                          \
2815     KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask));               \
2816     nextNewMask++;                                                             \
2817   }
2818 
2819 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId)                             \
2820   {                                                                            \
2821     if (((_osId) > _maxOsId) ||                                                \
2822         (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) {     \
2823       if (__kmp_affinity_verbose ||                                            \
2824           (__kmp_affinity_warnings &&                                          \
2825            (__kmp_affinity_type != affinity_none))) {                          \
2826         KMP_WARNING(AffIgnoreInvalidProcID, _osId);                            \
2827       }                                                                        \
2828     } else {                                                                   \
2829       ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId)));                            \
2830     }                                                                          \
2831   }
2832 
2833 // Re-parse the proclist (for the explicit affinity type), and form the list
2834 // of affinity newMasks indexed by gtid.
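// For example (an illustrative sketch, not taken from any test): the proclist
// "3,0,{1,2},4-6" would produce the masks {3}, {0}, {1,2}, {4}, {5} and {6},
// in that order; a braces-enclosed set is OR'ed into a single mask, while a
// range contributes one mask per OS proc id.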
2835 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2836                                             unsigned int *out_numMasks,
2837                                             const char *proclist,
2838                                             kmp_affin_mask_t *osId2Mask,
2839                                             int maxOsId) {
2840   int i;
2841   const char *scan = proclist;
2842   const char *next = proclist;
2843 
2844   // The temporary mask vector (newMasks) starts small and is grown by the
2845   // ADD_MASK macro, which doubles its size whenever it fills up.
2846   numNewMasks = 2;
2847   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2848   nextNewMask = 0;
2849   kmp_affin_mask_t *sumMask;
2850   KMP_CPU_ALLOC(sumMask);
2851   int setSize = 0;
2852 
2853   for (;;) {
2854     int start, end, stride;
2855 
2856     SKIP_WS(scan);
2857     next = scan;
2858     if (*next == '\0') {
2859       break;
2860     }
2861 
2862     if (*next == '{') {
2863       int num;
2864       setSize = 0;
2865       next++; // skip '{'
2866       SKIP_WS(next);
2867       scan = next;
2868 
2869       // Read the first integer in the set.
2870       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2871       SKIP_DIGITS(next);
2872       num = __kmp_str_to_int(scan, *next);
2873       KMP_ASSERT2(num >= 0, "bad explicit proc list");
2874 
2875       // Copy the mask for that osId to the sum (union) mask.
2876       if ((num > maxOsId) ||
2877           (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2878         if (__kmp_affinity_verbose ||
2879             (__kmp_affinity_warnings &&
2880              (__kmp_affinity_type != affinity_none))) {
2881           KMP_WARNING(AffIgnoreInvalidProcID, num);
2882         }
2883         KMP_CPU_ZERO(sumMask);
2884       } else {
2885         KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2886         setSize = 1;
2887       }
2888 
2889       for (;;) {
2890         // Check for end of set.
2891         SKIP_WS(next);
2892         if (*next == '}') {
2893           next++; // skip '}'
2894           break;
2895         }
2896 
2897         // Skip optional comma.
2898         if (*next == ',') {
2899           next++;
2900         }
2901         SKIP_WS(next);
2902 
2903         // Read the next integer in the set.
2904         scan = next;
2905         KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2906 
2907         SKIP_DIGITS(next);
2908         num = __kmp_str_to_int(scan, *next);
2909         KMP_ASSERT2(num >= 0, "bad explicit proc list");
2910 
2911         // Add the mask for that osId to the sum mask.
2912         if ((num > maxOsId) ||
2913             (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2914           if (__kmp_affinity_verbose ||
2915               (__kmp_affinity_warnings &&
2916                (__kmp_affinity_type != affinity_none))) {
2917             KMP_WARNING(AffIgnoreInvalidProcID, num);
2918           }
2919         } else {
2920           KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2921           setSize++;
2922         }
2923       }
2924       if (setSize > 0) {
2925         ADD_MASK(sumMask);
2926       }
2927 
2928       SKIP_WS(next);
2929       if (*next == ',') {
2930         next++;
2931       }
2932       scan = next;
2933       continue;
2934     }
2935 
2936     // Read the first integer.
2937     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2938     SKIP_DIGITS(next);
2939     start = __kmp_str_to_int(scan, *next);
2940     KMP_ASSERT2(start >= 0, "bad explicit proc list");
2941     SKIP_WS(next);
2942 
2943     // If this isn't a range, then add a mask to the list and go on.
2944     if (*next != '-') {
2945       ADD_MASK_OSID(start, osId2Mask, maxOsId);
2946 
2947       // Skip optional comma.
2948       if (*next == ',') {
2949         next++;
2950       }
2951       scan = next;
2952       continue;
2953     }
2954 
2955     // This is a range.  Skip over the '-' and read in the 2nd int.
2956     next++; // skip '-'
2957     SKIP_WS(next);
2958     scan = next;
2959     KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2960     SKIP_DIGITS(next);
2961     end = __kmp_str_to_int(scan, *next);
2962     KMP_ASSERT2(end >= 0, "bad explicit proc list");
2963 
2964     // Check for a stride parameter
2965     stride = 1;
2966     SKIP_WS(next);
2967     if (*next == ':') {
2968       // A stride is specified.  Skip over the ':' and read the 3rd int.
2969       int sign = +1;
2970       next++; // skip ':'
2971       SKIP_WS(next);
2972       scan = next;
2973       if (*next == '-') {
2974         sign = -1;
2975         next++;
2976         SKIP_WS(next);
2977         scan = next;
2978       }
2979       KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2980       SKIP_DIGITS(next);
2981       stride = __kmp_str_to_int(scan, *next);
2982       KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2983       stride *= sign;
2984     }
2985 
2986     // Do some range checks.
2987     KMP_ASSERT2(stride != 0, "bad explicit proc list");
2988     if (stride > 0) {
2989       KMP_ASSERT2(start <= end, "bad explicit proc list");
2990     } else {
2991       KMP_ASSERT2(start >= end, "bad explicit proc list");
2992     }
2993     KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2994 
2995     // Add the mask for each OS proc # to the list.
2996     if (stride > 0) {
2997       do {
2998         ADD_MASK_OSID(start, osId2Mask, maxOsId);
2999         start += stride;
3000       } while (start <= end);
3001     } else {
3002       do {
3003         ADD_MASK_OSID(start, osId2Mask, maxOsId);
3004         start += stride;
3005       } while (start >= end);
3006     }
3007 
3008     // Skip optional comma.
3009     SKIP_WS(next);
3010     if (*next == ',') {
3011       next++;
3012     }
3013     scan = next;
3014   }
3015 
3016   *out_numMasks = nextNewMask;
3017   if (nextNewMask == 0) {
3018     *out_masks = NULL;
3019     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    KMP_CPU_FREE(sumMask); // also release the temporary union mask on this path
3020     return;
3021   }
3022   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3023   for (i = 0; i < nextNewMask; i++) {
3024     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3025     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3026     KMP_CPU_COPY(dest, src);
3027   }
3028   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3029   KMP_CPU_FREE(sumMask);
3030 }
3031 
3032 /*-----------------------------------------------------------------------------
3033 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3034 places.  Again, here is the grammar:
3035 
3036 place_list := place
3037 place_list := place , place_list
3038 place := num
3039 place := place : num
3040 place := place : num : signed
3041 place := { subplace_list }
3042 place := ! place                  // (lowest priority)
3043 subplace_list := subplace
3044 subplace_list := subplace , subplace_list
3045 subplace := num
3046 subplace := num : num
3047 subplace := num : num : signed
3048 signed := num
3049 signed := + signed
3050 signed := - signed
3051 -----------------------------------------------------------------------------*/
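// As an illustrative reading of this grammar (not an exhaustive spec): the
// places string "{0,1},{2,3}" names two explicit places, "{0:4}" abbreviates
// the subplace 0,1,2,3, and "{0,1}:3:2" takes the place {0,1} and replicates
// it 3 times with an OS-proc stride of 2, yielding {0,1}, {2,3} and {4,5}.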
3052 static void __kmp_process_subplace_list(const char **scan,
3053                                         kmp_affin_mask_t *osId2Mask,
3054                                         int maxOsId, kmp_affin_mask_t *tempMask,
3055                                         int *setSize) {
3056   const char *next;
3057 
3058   for (;;) {
3059     int start, count, stride, i;
3060 
3061     // Read in the starting proc id
3062     SKIP_WS(*scan);
3063     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3064     next = *scan;
3065     SKIP_DIGITS(next);
3066     start = __kmp_str_to_int(*scan, *next);
3067     KMP_ASSERT(start >= 0);
3068     *scan = next;
3069 
3070     // valid follow sets are ',' ':' and '}'
3071     SKIP_WS(*scan);
3072     if (**scan == '}' || **scan == ',') {
3073       if ((start > maxOsId) ||
3074           (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3075         if (__kmp_affinity_verbose ||
3076             (__kmp_affinity_warnings &&
3077              (__kmp_affinity_type != affinity_none))) {
3078           KMP_WARNING(AffIgnoreInvalidProcID, start);
3079         }
3080       } else {
3081         KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3082         (*setSize)++;
3083       }
3084       if (**scan == '}') {
3085         break;
3086       }
3087       (*scan)++; // skip ','
3088       continue;
3089     }
3090     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3091     (*scan)++; // skip ':'
3092 
3093     // Read count parameter
3094     SKIP_WS(*scan);
3095     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3096     next = *scan;
3097     SKIP_DIGITS(next);
3098     count = __kmp_str_to_int(*scan, *next);
3099     KMP_ASSERT(count >= 0);
3100     *scan = next;
3101 
3102     // valid follow sets are ',' ':' and '}'
3103     SKIP_WS(*scan);
3104     if (**scan == '}' || **scan == ',') {
3105       for (i = 0; i < count; i++) {
3106         if ((start > maxOsId) ||
3107             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3108           if (__kmp_affinity_verbose ||
3109               (__kmp_affinity_warnings &&
3110                (__kmp_affinity_type != affinity_none))) {
3111             KMP_WARNING(AffIgnoreInvalidProcID, start);
3112           }
3113           break; // don't proliferate warnings for large count
3114         } else {
3115           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3116           start++;
3117           (*setSize)++;
3118         }
3119       }
3120       if (**scan == '}') {
3121         break;
3122       }
3123       (*scan)++; // skip ','
3124       continue;
3125     }
3126     KMP_ASSERT2(**scan == ':', "bad explicit places list");
3127     (*scan)++; // skip ':'
3128 
3129     // Read stride parameter
3130     int sign = +1;
3131     for (;;) {
3132       SKIP_WS(*scan);
3133       if (**scan == '+') {
3134         (*scan)++; // skip '+'
3135         continue;
3136       }
3137       if (**scan == '-') {
3138         sign *= -1;
3139         (*scan)++; // skip '-'
3140         continue;
3141       }
3142       break;
3143     }
3144     SKIP_WS(*scan);
3145     KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3146     next = *scan;
3147     SKIP_DIGITS(next);
3148     stride = __kmp_str_to_int(*scan, *next);
3149     KMP_ASSERT(stride >= 0);
3150     *scan = next;
3151     stride *= sign;
3152 
3153     // valid follow sets are ',' and '}'
3154     SKIP_WS(*scan);
3155     if (**scan == '}' || **scan == ',') {
3156       for (i = 0; i < count; i++) {
3157         if ((start > maxOsId) ||
3158             (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3159           if (__kmp_affinity_verbose ||
3160               (__kmp_affinity_warnings &&
3161                (__kmp_affinity_type != affinity_none))) {
3162             KMP_WARNING(AffIgnoreInvalidProcID, start);
3163           }
3164           break; // don't proliferate warnings for large count
3165         } else {
3166           KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3167           start += stride;
3168           (*setSize)++;
3169         }
3170       }
3171       if (**scan == '}') {
3172         break;
3173       }
3174       (*scan)++; // skip ','
3175       continue;
3176     }
3177 
3178     KMP_ASSERT2(0, "bad explicit places list");
3179   }
3180 }
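// Illustrative behavior of the subplace forms handled above: inside braces,
// "5" unions OS proc 5 into tempMask, "0:4" unions procs 0-3, and "0:4:2"
// unions procs 0, 2, 4 and 6; invalid or unavailable ids draw a warning and
// are skipped rather than aborting the parse.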
3181 
3182 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3183                                 int maxOsId, kmp_affin_mask_t *tempMask,
3184                                 int *setSize) {
3185   const char *next;
3186 
3187   // valid follow sets are '{' '!' and num
3188   SKIP_WS(*scan);
3189   if (**scan == '{') {
3190     (*scan)++; // skip '{'
3191     __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3192     KMP_ASSERT2(**scan == '}', "bad explicit places list");
3193     (*scan)++; // skip '}'
3194   } else if (**scan == '!') {
3195     (*scan)++; // skip '!'
3196     __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3197     KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3198   } else if ((**scan >= '0') && (**scan <= '9')) {
3199     next = *scan;
3200     SKIP_DIGITS(next);
3201     int num = __kmp_str_to_int(*scan, *next);
3202     KMP_ASSERT(num >= 0);
3203     if ((num > maxOsId) ||
3204         (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3205       if (__kmp_affinity_verbose ||
3206           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3207         KMP_WARNING(AffIgnoreInvalidProcID, num);
3208       }
3209     } else {
3210       KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3211       (*setSize)++;
3212     }
3213     *scan = next; // skip num
3214   } else {
3215     KMP_ASSERT2(0, "bad explicit places list");
3216   }
3217 }
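// Example (illustrative): "!{0,1}" first builds the mask {0,1} via the
// recursive call and then complements it over the range 0..maxOsId, so the
// resulting place is every known OS proc id except 0 and 1.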
3218 
3219 // static void
3220 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3221                                       unsigned int *out_numMasks,
3222                                       const char *placelist,
3223                                       kmp_affin_mask_t *osId2Mask,
3224                                       int maxOsId) {
3225   int i, j, count, stride, sign;
3226   const char *scan = placelist;
3227   const char *next = placelist;
3228 
3229   numNewMasks = 2;
3230   KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3231   nextNewMask = 0;
3232 
3233   // tempMask is modified based on the previous or initial
3234   //   place to form the current place
3235   // previousMask contains the previous place
3236   kmp_affin_mask_t *tempMask;
3237   kmp_affin_mask_t *previousMask;
3238   KMP_CPU_ALLOC(tempMask);
3239   KMP_CPU_ZERO(tempMask);
3240   KMP_CPU_ALLOC(previousMask);
3241   KMP_CPU_ZERO(previousMask);
3242   int setSize = 0;
3243 
3244   for (;;) {
3245     __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3246 
3247     // valid follow sets are ',' ':' and EOL
3248     SKIP_WS(scan);
3249     if (*scan == '\0' || *scan == ',') {
3250       if (setSize > 0) {
3251         ADD_MASK(tempMask);
3252       }
3253       KMP_CPU_ZERO(tempMask);
3254       setSize = 0;
3255       if (*scan == '\0') {
3256         break;
3257       }
3258       scan++; // skip ','
3259       continue;
3260     }
3261 
3262     KMP_ASSERT2(*scan == ':', "bad explicit places list");
3263     scan++; // skip ':'
3264 
3265     // Read count parameter
3266     SKIP_WS(scan);
3267     KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3268     next = scan;
3269     SKIP_DIGITS(next);
3270     count = __kmp_str_to_int(scan, *next);
3271     KMP_ASSERT(count >= 0);
3272     scan = next;
3273 
3274     // valid follow sets are ',' ':' and EOL
3275     SKIP_WS(scan);
3276     if (*scan == '\0' || *scan == ',') {
3277       stride = +1;
3278     } else {
3279       KMP_ASSERT2(*scan == ':', "bad explicit places list");
3280       scan++; // skip ':'
3281 
3282       // Read stride parameter
3283       sign = +1;
3284       for (;;) {
3285         SKIP_WS(scan);
3286         if (*scan == '+') {
3287           scan++; // skip '+'
3288           continue;
3289         }
3290         if (*scan == '-') {
3291           sign *= -1;
3292           scan++; // skip '-'
3293           continue;
3294         }
3295         break;
3296       }
3297       SKIP_WS(scan);
3298       KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3299       next = scan;
3300       SKIP_DIGITS(next);
3301       stride = __kmp_str_to_int(scan, *next);
3302       KMP_DEBUG_ASSERT(stride >= 0);
3303       scan = next;
3304       stride *= sign;
3305     }
3306 
3307     // Add places determined by initial_place : count : stride
3308     for (i = 0; i < count; i++) {
3309       if (setSize == 0) {
3310         break;
3311       }
3312       // Add the current place, then build the next place (tempMask) from that
3313       KMP_CPU_COPY(previousMask, tempMask);
3314       ADD_MASK(previousMask);
3315       KMP_CPU_ZERO(tempMask);
3316       setSize = 0;
3317       KMP_CPU_SET_ITERATE(j, previousMask) {
3318         if (!KMP_CPU_ISSET(j, previousMask)) {
3319           continue;
3320         }
3321         if ((j + stride > maxOsId) || (j + stride < 0) ||
3322             (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3323             (!KMP_CPU_ISSET(j + stride,
3324                             KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3325           if ((__kmp_affinity_verbose ||
3326                (__kmp_affinity_warnings &&
3327                 (__kmp_affinity_type != affinity_none))) &&
3328               i < count - 1) {
3329             KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3330           }
3331           continue;
3332         }
3333         KMP_CPU_SET(j + stride, tempMask);
3334         setSize++;
3335       }
3336     }
3337     KMP_CPU_ZERO(tempMask);
3338     setSize = 0;
3339 
3340     // valid follow sets are ',' and EOL
3341     SKIP_WS(scan);
3342     if (*scan == '\0') {
3343       break;
3344     }
3345     if (*scan == ',') {
3346       scan++; // skip ','
3347       continue;
3348     }
3349 
3350     KMP_ASSERT2(0, "bad explicit places list");
3351   }
3352 
3353   *out_numMasks = nextNewMask;
3354   if (nextNewMask == 0) {
3355     *out_masks = NULL;
3356     KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
    KMP_CPU_FREE(tempMask); // also release the temporary masks on this path
    KMP_CPU_FREE(previousMask);
3357     return;
3358   }
3359   KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3360   KMP_CPU_FREE(tempMask);
3361   KMP_CPU_FREE(previousMask);
3362   for (i = 0; i < nextNewMask; i++) {
3363     kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3364     kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3365     KMP_CPU_COPY(dest, src);
3366   }
3367   KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3368 }
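// Putting the pieces together (an illustrative sketch): OMP_PLACES="{0:4},{4:4}"
// yields the two places {0,1,2,3} and {4,5,6,7}, while "{0:2}:4:2" starts from
// {0,1} and appends three shifted copies, giving {0,1}, {2,3}, {4,5} and {6,7}.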
3369 
3370 #undef ADD_MASK
3371 #undef ADD_MASK_OSID
3372 
3373 #if KMP_USE_HWLOC
3374 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
3375   // skip the PU descendants of object o
3376   int skipped = 0;
3377   hwloc_obj_t hT = NULL;
3378   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3379   for (int i = 0; i < N; ++i) {
3380     KMP_DEBUG_ASSERT(hT);
3381     unsigned idx = hT->os_index;
3382     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3383       KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3384       KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3385       ++skipped;
3386     }
3387     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3388   }
3389   return skipped; // count number of skipped units
3390 }
3391 
3392 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3393   // check if obj has PUs present in fullMask
3394   hwloc_obj_t hT = NULL;
3395   int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3396   for (int i = 0; i < N; ++i) {
3397     KMP_DEBUG_ASSERT(hT);
3398     unsigned idx = hT->os_index;
3399     if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3400       return 1; // found PU
3401     hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3402   }
3403   return 0; // no PUs found
3404 }
3405 #endif // KMP_USE_HWLOC
3406 
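// Restrict the machine topology to the subset requested via KMP_HW_SUBSET.
// As an illustrative example (values hypothetical): KMP_HW_SUBSET=2s,4c,2t keeps
// 2 sockets, 4 cores per socket and 2 threads per core; the __kmp_hws_* fields
// used below carry those counts plus any @offsets, and procs outside the subset
// are cleared from __kmp_affin_fullMask.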
3407 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3408   AddrUnsPair *newAddr;
3409   if (__kmp_hws_requested == 0)
3410     goto _exit; // no topology limiting actions requested, exit
3411 #if KMP_USE_HWLOC
3412   if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3413     // The number of subobjects is calculated dynamically, so this works for
3414     // any non-uniform topology.
3415     // L2 cache objects are located by depth; other objects by type.
3416     hwloc_topology_t tp = __kmp_hwloc_topology;
3417     int nS = 0, nN = 0, nL = 0, nC = 0,
3418         nT = 0; // logical index including skipped
3419     int nCr = 0, nTr = 0; // number of requested units
3420     int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3421     hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3422     int L2depth, idx;
3423 
3424     // check support of extensions ----------------------------------
3425     int numa_support = 0, tile_support = 0;
3426     if (__kmp_pu_os_idx)
3427       hT = hwloc_get_pu_obj_by_os_index(tp,
3428                                         __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3429     else
3430       hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3431     if (hT == NULL) { // something's gone wrong
3432       KMP_WARNING(AffHWSubsetUnsupported);
3433       goto _exit;
3434     }
3435     // check NUMA node
3436     hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3437     hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3438     if (hN != NULL && hN->depth > hS->depth) {
3439       numa_support = 1; // 1 in case socket includes node(s)
3440     } else if (__kmp_hws_node.num > 0) {
3441       // don't support sockets inside NUMA node (no such HW found for testing)
3442       KMP_WARNING(AffHWSubsetUnsupported);
3443       goto _exit;
3444     }
3445     // check the L2 cache; get the object by depth because there can be multiple cache levels
3446     L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3447     hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3448     if (hL != NULL &&
3449         __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3450       tile_support = 1; // no sense counting L2 if it contains a single core
3451     } else if (__kmp_hws_tile.num > 0) {
3452       if (__kmp_hws_core.num == 0) {
3453         __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3454         __kmp_hws_tile.num = 0;
3455       } else {
3456         // L2 and core are both requested, but represent same object
3457         KMP_WARNING(AffHWSubsetInvalid);
3458         goto _exit;
3459       }
3460     }
3461     // end of check of extensions -----------------------------------
3462 
3463     // fill in unset items, validate settings -----------------------
3464     if (__kmp_hws_socket.num == 0)
3465       __kmp_hws_socket.num = nPackages; // use all available sockets
3466     if (__kmp_hws_socket.offset >= nPackages) {
3467       KMP_WARNING(AffHWSubsetManySockets);
3468       goto _exit;
3469     }
3470     if (numa_support) {
3471       hN = NULL;
3472       int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3473                                                   &hN); // num nodes in socket
3474       if (__kmp_hws_node.num == 0)
3475         __kmp_hws_node.num = NN; // use all available nodes
3476       if (__kmp_hws_node.offset >= NN) {
3477         KMP_WARNING(AffHWSubsetManyNodes);
3478         goto _exit;
3479       }
3480       if (tile_support) {
3481         // get num tiles in node
3482         int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3483         if (__kmp_hws_tile.num == 0) {
3484           __kmp_hws_tile.num = NL + 1;
3485         } // use all available tiles; another node may have more tiles, hence the +1
3486         if (__kmp_hws_tile.offset >= NL) {
3487           KMP_WARNING(AffHWSubsetManyTiles);
3488           goto _exit;
3489         }
3490         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3491                                                     &hC); // num cores in tile
3492         if (__kmp_hws_core.num == 0)
3493           __kmp_hws_core.num = NC; // use all available cores
3494         if (__kmp_hws_core.offset >= NC) {
3495           KMP_WARNING(AffHWSubsetManyCores);
3496           goto _exit;
3497         }
3498       } else { // tile_support
3499         int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
3500                                                     &hC); // num cores in node
3501         if (__kmp_hws_core.num == 0)
3502           __kmp_hws_core.num = NC; // use all available cores
3503         if (__kmp_hws_core.offset >= NC) {
3504           KMP_WARNING(AffHWSubsetManyCores);
3505           goto _exit;
3506         }
3507       } // tile_support
3508     } else { // numa_support
3509       if (tile_support) {
3510         // get num tiles in socket
3511         int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3512         if (__kmp_hws_tile.num == 0)
3513           __kmp_hws_tile.num = NL; // use all available tiles
3514         if (__kmp_hws_tile.offset >= NL) {
3515           KMP_WARNING(AffHWSubsetManyTiles);
3516           goto _exit;
3517         }
3518         int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3519                                                     &hC); // num cores in tile
3520         if (__kmp_hws_core.num == 0)
3521           __kmp_hws_core.num = NC; // use all available cores
3522         if (__kmp_hws_core.offset >= NC) {
3523           KMP_WARNING(AffHWSubsetManyCores);
3524           goto _exit;
3525         }
3526       } else { // tile_support
3527         int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
3528                                                     &hC); // num cores in socket
3529         if (__kmp_hws_core.num == 0)
3530           __kmp_hws_core.num = NC; // use all available cores
3531         if (__kmp_hws_core.offset >= NC) {
3532           KMP_WARNING(AffHWSubsetManyCores);
3533           goto _exit;
3534         }
3535       } // tile_support
3536     }
3537     if (__kmp_hws_proc.num == 0)
3538       __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
3539     if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
3540       KMP_WARNING(AffHWSubsetManyProcs);
3541       goto _exit;
3542     }
3543     // end of validation --------------------------------------------
3544 
3545     if (pAddr) // pAddr is NULL in case of affinity_none
3546       newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
3547                                               __kmp_avail_proc); // max size
3548     // main loop to form HW subset ----------------------------------
3549     hS = NULL;
3550     int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
3551     for (int s = 0; s < NP; ++s) {
3552       // Check Socket -----------------------------------------------
3553       hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
3554       if (!__kmp_hwloc_obj_has_PUs(tp, hS))
3555         continue; // skip socket if all PUs are out of fullMask
3556       ++nS; // only count objects that have PUs in the affinity mask
3557       if (nS <= __kmp_hws_socket.offset ||
3558           nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
3559         n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
3560         continue; // move to next socket
3561       }
3562       nCr = 0; // count number of cores per socket
3563       // socket requested, go down the topology tree
3564       // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3565       if (numa_support) {
3566         nN = 0;
3567         hN = NULL;
3568         // num nodes in current socket
3569         int NN =
3570             __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
3571         for (int n = 0; n < NN; ++n) {
3572           // Check NUMA Node ----------------------------------------
3573           if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
3574             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3575             continue; // skip node if all PUs are out of fullMask
3576           }
3577           ++nN;
3578           if (nN <= __kmp_hws_node.offset ||
3579               nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
3580             // skip node as not requested
3581             n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
3582             hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3583             continue; // move to next node
3584           }
3585           // node requested, go down the topology tree
3586           if (tile_support) {
3587             nL = 0;
3588             hL = NULL;
3589             int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3590             for (int l = 0; l < NL; ++l) {
3591               // Check L2 (tile) ------------------------------------
3592               if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3593                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3594                 continue; // skip tile if all PUs are out of fullMask
3595               }
3596               ++nL;
3597               if (nL <= __kmp_hws_tile.offset ||
3598                   nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3599                 // skip tile as not requested
3600                 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3601                 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3602                 continue; // move to next tile
3603               }
3604               // tile requested, go down the topology tree
3605               nC = 0;
3606               hC = NULL;
3607               // num cores in current tile
3608               int NC = __kmp_hwloc_count_children_by_type(tp, hL,
3609                                                           HWLOC_OBJ_CORE, &hC);
3610               for (int c = 0; c < NC; ++c) {
3611                 // Check Core ---------------------------------------
3612                 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3613                   hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3614                   continue; // skip core if all PUs are out of fullMask
3615                 }
3616                 ++nC;
3617                 if (nC <= __kmp_hws_core.offset ||
3618                     nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3619                   // skip core as not requested
3620                   n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3621                   hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3622                   continue; // move to next core
3623                 }
3624                 // core requested, go down to PUs
3625                 nT = 0;
3626                 nTr = 0;
3627                 hT = NULL;
3628                 // num procs in current core
3629                 int NT = __kmp_hwloc_count_children_by_type(tp, hC,
3630                                                             HWLOC_OBJ_PU, &hT);
3631                 for (int t = 0; t < NT; ++t) {
3632                   // Check PU ---------------------------------------
3633                   idx = hT->os_index;
3634                   if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3635                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3636                     continue; // skip PU if not in fullMask
3637                   }
3638                   ++nT;
3639                   if (nT <= __kmp_hws_proc.offset ||
3640                       nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3641                     // skip PU
3642                     KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3643                     ++n_old;
3644                     KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3645                     hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3646                     continue; // move to next PU
3647                   }
3648                   ++nTr;
3649                   if (pAddr) // collect requested thread's data
3650                     newAddr[n_new] = (*pAddr)[n_old];
3651                   ++n_new;
3652                   ++n_old;
3653                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3654                 } // threads loop
3655                 if (nTr > 0) {
3656                   ++nCr; // num cores per socket
3657                   ++nCo; // total num cores
3658                   if (nTr > nTpC)
3659                     nTpC = nTr; // calc max threads per core
3660                 }
3661                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3662               } // cores loop
3663               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3664             } // tiles loop
3665           } else { // tile_support
3666             // no tiles, check cores
3667             nC = 0;
3668             hC = NULL;
3669             // num cores in current node
3670             int NC =
3671                 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
3672             for (int c = 0; c < NC; ++c) {
3673               // Check Core ---------------------------------------
3674               if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3675                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3676                 continue; // skip core if all PUs are out of fullMask
3677               }
3678               ++nC;
3679               if (nC <= __kmp_hws_core.offset ||
3680                   nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3681                 // skip core as not requested
3682                 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3683                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3684                 continue; // move to next core
3685               }
3686               // core requested, go down to PUs
3687               nT = 0;
3688               nTr = 0;
3689               hT = NULL;
3690               int NT =
3691                   __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3692               for (int t = 0; t < NT; ++t) {
3693                 // Check PU ---------------------------------------
3694                 idx = hT->os_index;
3695                 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3696                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3697                   continue; // skip PU if not in fullMask
3698                 }
3699                 ++nT;
3700                 if (nT <= __kmp_hws_proc.offset ||
3701                     nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3702                   // skip PU
3703                   KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3704                   ++n_old;
3705                   KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3706                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3707                   continue; // move to next PU
3708                 }
3709                 ++nTr;
3710                 if (pAddr) // collect requested thread's data
3711                   newAddr[n_new] = (*pAddr)[n_old];
3712                 ++n_new;
3713                 ++n_old;
3714                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3715               } // threads loop
3716               if (nTr > 0) {
3717                 ++nCr; // num cores per socket
3718                 ++nCo; // total num cores
3719                 if (nTr > nTpC)
3720                   nTpC = nTr; // calc max threads per core
3721               }
3722               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3723             } // cores loop
3724           } // tiles support
3725           hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3726         } // nodes loop
3727       } else { // numa_support
3728         // no NUMA support
3729         if (tile_support) {
3730           nL = 0;
3731           hL = NULL;
3732           // num tiles in current socket
3733           int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3734           for (int l = 0; l < NL; ++l) {
3735             // Check L2 (tile) ------------------------------------
3736             if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3737               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3738               continue; // skip tile if all PUs are out of fullMask
3739             }
3740             ++nL;
3741             if (nL <= __kmp_hws_tile.offset ||
3742                 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3743               // skip tile as not requested
3744               n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3745               hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3746               continue; // move to next tile
3747             }
3748             // tile requested, go down the topology tree
3749             nC = 0;
3750             hC = NULL;
3751             // num cores per tile
3752             int NC =
3753                 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
3754             for (int c = 0; c < NC; ++c) {
3755               // Check Core ---------------------------------------
3756               if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3757                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3758                 continue; // skip core if all PUs are out of fullMask
3759               }
3760               ++nC;
3761               if (nC <= __kmp_hws_core.offset ||
3762                   nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3763                 // skip core as not requested
3764                 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3765                 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3766                 continue; // move to next core
3767               }
3768               // core requested, go down to PUs
3769               nT = 0;
3770               nTr = 0;
3771               hT = NULL;
3772               // num procs per core
3773               int NT =
3774                   __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3775               for (int t = 0; t < NT; ++t) {
3776                 // Check PU ---------------------------------------
3777                 idx = hT->os_index;
3778                 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3779                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3780                   continue; // skip PU if not in fullMask
3781                 }
3782                 ++nT;
3783                 if (nT <= __kmp_hws_proc.offset ||
3784                     nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3785                   // skip PU
3786                   KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3787                   ++n_old;
3788                   KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3789                   hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3790                   continue; // move to next PU
3791                 }
3792                 ++nTr;
3793                 if (pAddr) // collect requested thread's data
3794                   newAddr[n_new] = (*pAddr)[n_old];
3795                 ++n_new;
3796                 ++n_old;
3797                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3798               } // threads loop
3799               if (nTr > 0) {
3800                 ++nCr; // num cores per socket
3801                 ++nCo; // total num cores
3802                 if (nTr > nTpC)
3803                   nTpC = nTr; // calc max threads per core
3804               }
3805               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3806             } // cores loop
3807             hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3808           } // tiles loop
3809         } else { // tile_support
3810           // no tiles, check cores
3811           nC = 0;
3812           hC = NULL;
3813           // num cores in socket
3814           int NC =
3815               __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
3816           for (int c = 0; c < NC; ++c) {
3817             // Check Core -------------------------------------------
3818             if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3819               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3820               continue; // skip core if all PUs are out of fullMask
3821             }
3822             ++nC;
3823             if (nC <= __kmp_hws_core.offset ||
3824                 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3825               // skip core as not requested
3826               n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3827               hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3828               continue; // move to next core
3829             }
3830             // core requested, go down to PUs
3831             nT = 0;
3832             nTr = 0;
3833             hT = NULL;
3834             // num procs per core
3835             int NT =
3836                 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3837             for (int t = 0; t < NT; ++t) {
3838               // Check PU ---------------------------------------
3839               idx = hT->os_index;
3840               if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3841                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3842                 continue; // skip PU if not in fullMask
3843               }
3844               ++nT;
3845               if (nT <= __kmp_hws_proc.offset ||
3846                   nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3847                 // skip PU
3848                 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3849                 ++n_old;
3850                 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3851                 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3852               continue; // move to next PU
3853               }
3854               ++nTr;
3855               if (pAddr) // collect requested thread's data
3856                 newAddr[n_new] = (*pAddr)[n_old];
3857               ++n_new;
3858               ++n_old;
3859               hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3860             } // threads loop
3861             if (nTr > 0) {
3862               ++nCr; // num cores per socket
3863               ++nCo; // total num cores
3864               if (nTr > nTpC)
3865                 nTpC = nTr; // calc max threads per core
3866             }
3867             hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3868           } // cores loop
3869         } // tiles support
3870       } // numa_support
3871       if (nCr > 0) { // found cores?
3872         ++nPkg; // num sockets
3873         if (nCr > nCpP)
3874           nCpP = nCr; // calc max cores per socket
3875       }
3876     } // sockets loop
3877 
3878     // check the subset is valid
3879     KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
3880     KMP_DEBUG_ASSERT(nPkg > 0);
3881     KMP_DEBUG_ASSERT(nCpP > 0);
3882     KMP_DEBUG_ASSERT(nTpC > 0);
3883     KMP_DEBUG_ASSERT(nCo > 0);
3884     KMP_DEBUG_ASSERT(nPkg <= nPackages);
3885     KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
3886     KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
3887     KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
3888 
3889     nPackages = nPkg; // correct num sockets
3890     nCoresPerPkg = nCpP; // correct num cores per socket
3891     __kmp_nThreadsPerCore = nTpC; // correct num threads per core
3892     __kmp_avail_proc = n_new; // correct num procs
3893     __kmp_ncores = nCo; // correct num cores
3894     // hwloc topology method end
3895   } else
3896 #endif // KMP_USE_HWLOC
3897   {
3898     int n_old = 0, n_new = 0, proc_num = 0;
3899     if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
3900       KMP_WARNING(AffHWSubsetNoHWLOC);
3901       goto _exit;
3902     }
3903     if (__kmp_hws_socket.num == 0)
3904       __kmp_hws_socket.num = nPackages; // use all available sockets
3905     if (__kmp_hws_core.num == 0)
3906       __kmp_hws_core.num = nCoresPerPkg; // use all available cores
3907     if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
3908       __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
3909     if (!__kmp_affinity_uniform_topology()) {
3910       KMP_WARNING(AffHWSubsetNonUniform);
3911       goto _exit; // don't support non-uniform topology
3912     }
3913     if (depth > 3) {
3914       KMP_WARNING(AffHWSubsetNonThreeLevel);
3915       goto _exit; // only 3-level (package/core/thread) topologies are supported
3916     }
3917     if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
3918       KMP_WARNING(AffHWSubsetManySockets);
3919       goto _exit;
3920     }
3921     if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
3922       KMP_WARNING(AffHWSubsetManyCores);
3923       goto _exit;
3924     }
3925     // Form the requested subset
3926     if (pAddr) // pAddr is NULL in case of affinity_none
3927       newAddr = (AddrUnsPair *)__kmp_allocate(
3928           sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
3929           __kmp_hws_proc.num);
3930     for (int i = 0; i < nPackages; ++i) {
3931       if (i < __kmp_hws_socket.offset ||
3932           i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
3933         // skip not-requested socket
3934         n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
3935         if (__kmp_pu_os_idx != NULL) {
3936           // walk through skipped socket
3937           for (int j = 0; j < nCoresPerPkg; ++j) {
3938             for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3939               KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3940               ++proc_num;
3941             }
3942           }
3943         }
3944       } else {
3945         // walk through requested socket
3946         for (int j = 0; j < nCoresPerPkg; ++j) {
3947           if (j < __kmp_hws_core.offset ||
3948               j >= __kmp_hws_core.offset +
3949                        __kmp_hws_core.num) { // skip not-requested core
3950             n_old += __kmp_nThreadsPerCore;
3951             if (__kmp_pu_os_idx != NULL) {
3952               for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3953                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3954                 ++proc_num;
3955               }
3956             }
3957           } else {
3958             // walk through requested core
3959             for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3960               if (k < __kmp_hws_proc.num) {
3961                 if (pAddr) // collect requested thread's data
3962                   newAddr[n_new] = (*pAddr)[n_old];
3963                 n_new++;
3964               } else {
3965                 if (__kmp_pu_os_idx != NULL)
3966                   KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3967               }
3968               n_old++;
3969               ++proc_num;
3970             }
3971           }
3972         }
3973       }
3974     }
3975     KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3976     KMP_DEBUG_ASSERT(n_new ==
3977                      __kmp_hws_socket.num * __kmp_hws_core.num *
3978                          __kmp_hws_proc.num);
3979     nPackages = __kmp_hws_socket.num; // correct nPackages
3980     nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
3981     __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
3982     __kmp_avail_proc = n_new; // correct avail_proc
3983     __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
3984   } // non-hwloc topology method
3985   if (pAddr) {
3986     __kmp_free(*pAddr);
3987     *pAddr = newAddr; // replace old topology with new one
3988   }
3989   if (__kmp_affinity_verbose) {
3990     char m[KMP_AFFIN_MASK_PRINT_LEN];
3991     __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
3992                               __kmp_affin_fullMask);
3993     if (__kmp_affinity_respect_mask) {
3994       KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
3995     } else {
3996       KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
3997     }
3998     KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
3999     kmp_str_buf_t buf;
4000     __kmp_str_buf_init(&buf);
4001     __kmp_str_buf_print(&buf, "%d", nPackages);
4002     KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
4003                __kmp_nThreadsPerCore, __kmp_ncores);
4004     __kmp_str_buf_free(&buf);
4005   }
4006 _exit:
4007   if (__kmp_pu_os_idx != NULL) {
4008     __kmp_free(__kmp_pu_os_idx);
4009     __kmp_pu_os_idx = NULL;
4010   }
4011 }
4012 
4013 // This function figures out the deepest level at which there is at least one
4014 // cluster/core with more than one processing unit bound to it.
4015 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
4016                                           int nprocs, int bottom_level) {
4017   int core_level = 0;
4018 
4019   for (int i = 0; i < nprocs; i++) {
4020     for (int j = bottom_level; j > 0; j--) {
4021       if (address2os[i].first.labels[j] > 0) {
4022         if (core_level < (j - 1)) {
4023           core_level = j - 1;
4024         }
4025       }
4026     }
4027   }
4028   return core_level;
4029 }
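// Illustrative example: with depth 3 (package/core/thread) and bottom_level 2,
// any proc whose thread label is nonzero pushes core_level up to 1 (the core
// level); if every core has a single thread but some package has a nonzero
// core label, the result stays at 0 (the package level).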
4030 
4031 // This function counts the number of clusters/cores at the given level.
4032 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
4033                                          int nprocs, int bottom_level,
4034                                          int core_level) {
4035   int ncores = 0;
4036   int i, j;
4037 
4038   j = bottom_level;
4039   for (i = 0; i < nprocs; i++) {
4040     for (j = bottom_level; j > core_level; j--) {
4041       if ((i + 1) < nprocs) {
4042         if (address2os[i + 1].first.labels[j] > 0) {
4043           break;
4044         }
4045       }
4046     }
4047     if (j == core_level) {
4048       ncores++;
4049     }
4050   }
4051   if (j > core_level) {
4052     // In case of ( nprocs < __kmp_avail_proc ) we may end up too deep and miss
4053     // one core. This may occur when called from __kmp_affinity_find_core().
4054     ncores++;
4055   }
4056   return ncores;
4057 }
4058 
4059 // This function finds the cluster/core to which the given processing unit is bound.
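// The result is a zero-based core index: the number of distinct cores among
// address2os[0..proc], minus one.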
4060 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
4061                                     int bottom_level, int core_level) {
4062   return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
4063                                        core_level) -
4064          1;
4065 }
4066 
4067 // This function finds the maximal number of processing units bound to a
4068 // cluster/core at the given level.
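// Since labels are zero-based child indices, labels[core_level + 1] + 1 for
// the last PU of a core equals that core's PU count; the maximum over all PUs
// therefore gives the machine-wide per-core maximum.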
4069 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
4070                                             int nprocs, int bottom_level,
4071                                             int core_level) {
4072   int maxprocpercore = 0;
4073 
4074   if (core_level < bottom_level) {
4075     for (int i = 0; i < nprocs; i++) {
4076       int percore = address2os[i].first.labels[core_level + 1] + 1;
4077 
4078       if (percore > maxprocpercore) {
4079         maxprocpercore = percore;
4080       }
4081     }
4082   } else {
4083     maxprocpercore = 1;
4084   }
4085   return maxprocpercore;
4086 }
4087 
4088 static AddrUnsPair *address2os = NULL;
4089 static int *procarr = NULL;
4090 static int __kmp_aff_depth = 0;
4091 
4092 #if KMP_USE_HIER_SCHED
4093 #define KMP_EXIT_AFF_NONE                                                      \
4094   KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
4095   KMP_ASSERT(address2os == NULL);                                              \
4096   __kmp_apply_thread_places(NULL, 0);                                          \
4097   __kmp_create_affinity_none_places();                                         \
4098   __kmp_dispatch_set_hierarchy_values();                                       \
4099   return;
4100 #else
4101 #define KMP_EXIT_AFF_NONE                                                      \
4102   KMP_ASSERT(__kmp_affinity_type == affinity_none);                            \
4103   KMP_ASSERT(address2os == NULL);                                              \
4104   __kmp_apply_thread_places(NULL, 0);                                          \
4105   __kmp_create_affinity_none_places();                                         \
4106   return;
4107 #endif
4108 
4109 // Create a one element mask array (set of places) which only contains the
4110 // initial process's affinity mask
4111 static void __kmp_create_affinity_none_places() {
4112   KMP_ASSERT(__kmp_affin_fullMask != NULL);
4113   KMP_ASSERT(__kmp_affinity_type == affinity_none);
4114   __kmp_affinity_num_masks = 1;
4115   KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4116   kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
4117   KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4118 }
4119 
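// Comparator for qsort() below. It compares two Addresses by a permutation of
// their topology labels controlled by __kmp_affinity_compact: the deepest
// __kmp_affinity_compact levels are the primary sort keys, followed by the
// remaining levels from the top down. With __kmp_affinity_compact == 0 this
// is a plain top-down comparison (compact placement); larger values promote
// deeper levels, spreading consecutive places across the upper levels.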
4120 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
4121   const Address *aa = &(((const AddrUnsPair *)a)->first);
4122   const Address *bb = &(((const AddrUnsPair *)b)->first);
4123   unsigned depth = aa->depth;
4124   unsigned i;
4125   KMP_DEBUG_ASSERT(depth == bb->depth);
4126   KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
4127   KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
4128   for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
4129     int j = depth - i - 1;
4130     if (aa->childNums[j] < bb->childNums[j])
4131       return -1;
4132     if (aa->childNums[j] > bb->childNums[j])
4133       return 1;
4134   }
4135   for (; i < depth; i++) {
4136     int j = i - __kmp_affinity_compact;
4137     if (aa->childNums[j] < bb->childNums[j])
4138       return -1;
4139     if (aa->childNums[j] > bb->childNums[j])
4140       return 1;
4141   }
4142   return 0;
4143 }
4144 
4145 static void __kmp_aux_affinity_initialize(void) {
4146   if (__kmp_affinity_masks != NULL) {
4147     KMP_ASSERT(__kmp_affin_fullMask != NULL);
4148     return;
4149   }
4150 
4151   // Create the "full" mask - this defines all of the processors that we
4152   // consider to be in the machine model. If respect is set, then it is the
4153   // initialization thread's affinity mask. Otherwise, it is all processors that
4154   // we know about on the machine.
4155   if (__kmp_affin_fullMask == NULL) {
4156     KMP_CPU_ALLOC(__kmp_affin_fullMask);
4157   }
4158   if (KMP_AFFINITY_CAPABLE()) {
4159     if (__kmp_affinity_respect_mask) {
4160       __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4161 
4162       // Count the number of available processors.
4163       unsigned i;
4164       __kmp_avail_proc = 0;
4165       KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4166         if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4167           continue;
4168         }
4169         __kmp_avail_proc++;
4170       }
4171       if (__kmp_avail_proc > __kmp_xproc) {
4172         if (__kmp_affinity_verbose ||
4173             (__kmp_affinity_warnings &&
4174              (__kmp_affinity_type != affinity_none))) {
4175           KMP_WARNING(ErrorInitializeAffinity);
4176         }
4177         __kmp_affinity_type = affinity_none;
4178         KMP_AFFINITY_DISABLE();
4179         return;
4180       }
4181     } else {
4182       __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4183       __kmp_avail_proc = __kmp_xproc;
4184     }
4185   }
4186 
4187   if (__kmp_affinity_gran == affinity_gran_tile &&
4188       // check if user's request is valid
4189       __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
4190     KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
4191     __kmp_affinity_gran = affinity_gran_package;
4192   }
4193 
4194   int depth = -1;
4195   kmp_i18n_id_t msg_id = kmp_i18n_null;
4196 
4197   // For backward compatibility, setting KMP_CPUINFO_FILE =>
4198   // KMP_TOPOLOGY_METHOD=cpuinfo
4199   if ((__kmp_cpuinfo_file != NULL) &&
4200       (__kmp_affinity_top_method == affinity_top_method_all)) {
4201     __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4202   }
4203 
4204   if (__kmp_affinity_top_method == affinity_top_method_all) {
4205     // In the default code path, errors are not fatal - we just try using
4206     // another method. We only emit a warning message if affinity is on, or the
4207     // verbose flag is set, and the nowarnings flag was not set.
4208     const char *file_name = NULL;
4209     int line = 0;
4210 #if KMP_USE_HWLOC
4211     if (depth < 0 &&
4212         __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4213       if (__kmp_affinity_verbose) {
4214         KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4215       }
4216       if (!__kmp_hwloc_error) {
4217         depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4218         if (depth == 0) {
4219           KMP_EXIT_AFF_NONE;
4220         } else if (depth < 0 && __kmp_affinity_verbose) {
4221           KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4222         }
4223       } else if (__kmp_affinity_verbose) {
4224         KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4225       }
4226     }
4227 #endif
4228 
4229 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4230 
4231     if (depth < 0) {
4232       if (__kmp_affinity_verbose) {
4233         KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4234       }
4235 
4236       file_name = NULL;
4237       depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4238       if (depth == 0) {
4239         KMP_EXIT_AFF_NONE;
4240       }
4241 
4242       if (depth < 0) {
4243         if (__kmp_affinity_verbose) {
4244           if (msg_id != kmp_i18n_null) {
4245             KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
4246                        __kmp_i18n_catgets(msg_id),
4247                        KMP_I18N_STR(DecodingLegacyAPIC));
4248           } else {
4249             KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
4250                        KMP_I18N_STR(DecodingLegacyAPIC));
4251           }
4252         }
4253 
4254         file_name = NULL;
4255         depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4256         if (depth == 0) {
4257           KMP_EXIT_AFF_NONE;
4258         }
4259       }
4260     }
4261 
4262 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4263 
4264 #if KMP_OS_LINUX
4265 
4266     if (depth < 0) {
4267       if (__kmp_affinity_verbose) {
4268         if (msg_id != kmp_i18n_null) {
4269           KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
4270                      __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
4271         } else {
4272           KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
4273         }
4274       }
4275 
4276       FILE *f = fopen("/proc/cpuinfo", "r");
4277       if (f == NULL) {
4278         msg_id = kmp_i18n_str_CantOpenCpuinfo;
4279       } else {
4280         file_name = "/proc/cpuinfo";
4281         depth =
4282             __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4283         fclose(f);
4284         if (depth == 0) {
4285           KMP_EXIT_AFF_NONE;
4286         }
4287       }
4288     }
4289 
4290 #endif /* KMP_OS_LINUX */
4291 
4292 #if KMP_GROUP_AFFINITY
4293 
4294     if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
4295       if (__kmp_affinity_verbose) {
4296         KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4297       }
4298 
4299       depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4300       KMP_ASSERT(depth != 0);
4301     }
4302 
4303 #endif /* KMP_GROUP_AFFINITY */
4304 
4305     if (depth < 0) {
4306       if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
4307         if (file_name == NULL) {
4308           KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
4309         } else if (line == 0) {
4310           KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
4311         } else {
4312           KMP_INFORM(UsingFlatOSFileLine, file_name, line,
4313                      __kmp_i18n_catgets(msg_id));
4314         }
4315       }
4316       // FIXME - print msg if msg_id = kmp_i18n_null ???
4317 
4318       file_name = "";
4319       depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4320       if (depth == 0) {
4321         KMP_EXIT_AFF_NONE;
4322       }
4323       KMP_ASSERT(depth > 0);
4324       KMP_ASSERT(address2os != NULL);
4325     }
4326   }
4327 
4328 #if KMP_USE_HWLOC
4329   else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4330     KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4331     if (__kmp_affinity_verbose) {
4332       KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4333     }
4334     depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4335     if (depth == 0) {
4336       KMP_EXIT_AFF_NONE;
4337     }
4338   }
4339 #endif // KMP_USE_HWLOC
4340 
4341 // If the user has specified that a particular topology discovery method is to be
4342 // used, then we abort if that method fails. The exception is group affinity,
4343 // which might have been implicitly set.
4344 
4345 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4346 
4347   else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
4348     if (__kmp_affinity_verbose) {
4349       KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4350     }
4351 
4352     depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4353     if (depth == 0) {
4354       KMP_EXIT_AFF_NONE;
4355     }
4356     if (depth < 0) {
4357       KMP_ASSERT(msg_id != kmp_i18n_null);
4358       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4359     }
4360   } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4361     if (__kmp_affinity_verbose) {
4362       KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
4363     }
4364 
4365     depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4366     if (depth == 0) {
4367       KMP_EXIT_AFF_NONE;
4368     }
4369     if (depth < 0) {
4370       KMP_ASSERT(msg_id != kmp_i18n_null);
4371       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4372     }
4373   }
4374 
4375 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4376 
4377   else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4378     const char *filename;
4379     if (__kmp_cpuinfo_file != NULL) {
4380       filename = __kmp_cpuinfo_file;
4381     } else {
4382       filename = "/proc/cpuinfo";
4383     }
4384 
4385     if (__kmp_affinity_verbose) {
4386       KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
4387     }
4388 
4389     FILE *f = fopen(filename, "r");
4390     if (f == NULL) {
4391       int code = errno;
4392       if (__kmp_cpuinfo_file != NULL) {
4393         __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4394                     KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
4395       } else {
4396         __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4397                     __kmp_msg_null);
4398       }
4399     }
4400     int line = 0;
4401     depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4402     fclose(f);
4403     if (depth < 0) {
4404       KMP_ASSERT(msg_id != kmp_i18n_null);
4405       if (line > 0) {
4406         KMP_FATAL(FileLineMsgExiting, filename, line,
4407                   __kmp_i18n_catgets(msg_id));
4408       } else {
4409         KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4410       }
4411     }
4412     if (__kmp_affinity_type == affinity_none) {
4413       KMP_ASSERT(depth == 0);
4414       KMP_EXIT_AFF_NONE;
4415     }
4416   }
4417 
4418 #if KMP_GROUP_AFFINITY
4419 
4420   else if (__kmp_affinity_top_method == affinity_top_method_group) {
4421     if (__kmp_affinity_verbose) {
4422       KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4423     }
4424 
4425     depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4426     KMP_ASSERT(depth != 0);
4427     if (depth < 0) {
4428       KMP_ASSERT(msg_id != kmp_i18n_null);
4429       KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4430     }
4431   }
4432 
4433 #endif /* KMP_GROUP_AFFINITY */
4434 
4435   else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4436     if (__kmp_affinity_verbose) {
4437       KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
4438     }
4439 
4440     depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4441     if (depth == 0) {
4442       KMP_EXIT_AFF_NONE;
4443     }
4444     // should not fail
4445     KMP_ASSERT(depth > 0);
4446     KMP_ASSERT(address2os != NULL);
4447   }
4448 
4449 #if KMP_USE_HIER_SCHED
4450   __kmp_dispatch_set_hierarchy_values();
4451 #endif
4452 
4453   if (address2os == NULL) {
4454     if (KMP_AFFINITY_CAPABLE() &&
4455         (__kmp_affinity_verbose ||
4456          (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
4457       KMP_WARNING(ErrorInitializeAffinity);
4458     }
4459     __kmp_affinity_type = affinity_none;
4460     __kmp_create_affinity_none_places();
4461     KMP_AFFINITY_DISABLE();
4462     return;
4463   }
4464 
4465   if (__kmp_affinity_gran == affinity_gran_tile
4466 #if KMP_USE_HWLOC
4467       && __kmp_tile_depth == 0
4468 #endif
4469       ) {
4470     // tiles were requested but not detected; warn the user
4471     KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
4472   }
4473 
4474   __kmp_apply_thread_places(&address2os, depth);
4475 
4476   // Create the table of masks, indexed by thread Id.
4477   unsigned maxIndex;
4478   unsigned numUnique;
4479   kmp_affin_mask_t *osId2Mask =
4480       __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
4481   if (__kmp_affinity_gran_levels == 0) {
4482     KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4483   }
4484 
4485   // Set the childNums vector in all Address objects. This must be done before
4486   // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
4487   // account the setting of __kmp_affinity_compact.
4488   __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
4489 
4490   switch (__kmp_affinity_type) {
4491 
4492   case affinity_explicit:
4493     KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
4494     if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4495       __kmp_affinity_process_proclist(
4496           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4497           __kmp_affinity_proclist, osId2Mask, maxIndex);
4498     } else {
4499       __kmp_affinity_process_placelist(
4500           &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4501           __kmp_affinity_proclist, osId2Mask, maxIndex);
4502     }
4503     if (__kmp_affinity_num_masks == 0) {
4504       if (__kmp_affinity_verbose ||
4505           (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
4506         KMP_WARNING(AffNoValidProcID);
4507       }
4508       __kmp_affinity_type = affinity_none;
4509       __kmp_create_affinity_none_places();
4510       return;
4511     }
4512     break;
4513 
4514   // The other affinity types rely on sorting the Addresses according to some
4515   // permutation of the machine topology tree. Set __kmp_affinity_compact and
4516   // __kmp_affinity_offset appropriately, then jump to a common code fragment
4517   // to do the sort and create the array of affinity masks.
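  // Illustrative example (hypothetical 2-package x 2-core x 2-thread machine,
  // depth == 3): KMP_AFFINITY=compact leaves __kmp_affinity_compact == 0, so
  // places follow the topology depth-first; KMP_AFFINITY=scatter converts it
  // to depth - 1 == 2, so the sort keys on the thread and core labels first
  // and adjacent places land on different packages.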
4518 
4519   case affinity_logical:
4520     __kmp_affinity_compact = 0;
4521     if (__kmp_affinity_offset) {
4522       __kmp_affinity_offset =
4523           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4524     }
4525     goto sortAddresses;
4526 
4527   case affinity_physical:
4528     if (__kmp_nThreadsPerCore > 1) {
4529       __kmp_affinity_compact = 1;
4530       if (__kmp_affinity_compact >= depth) {
4531         __kmp_affinity_compact = 0;
4532       }
4533     } else {
4534       __kmp_affinity_compact = 0;
4535     }
4536     if (__kmp_affinity_offset) {
4537       __kmp_affinity_offset =
4538           __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4539     }
4540     goto sortAddresses;
4541 
4542   case affinity_scatter:
4543     if (__kmp_affinity_compact >= depth) {
4544       __kmp_affinity_compact = 0;
4545     } else {
4546       __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4547     }
4548     goto sortAddresses;
4549 
4550   case affinity_compact:
4551     if (__kmp_affinity_compact >= depth) {
4552       __kmp_affinity_compact = depth - 1;
4553     }
4554     goto sortAddresses;
4555 
4556   case affinity_balanced:
4557     if (depth <= 1) {
4558       if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4559         KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4560       }
4561       __kmp_affinity_type = affinity_none;
4562       __kmp_create_affinity_none_places();
4563       return;
4564     } else if (!__kmp_affinity_uniform_topology()) {
4565       // Save the depth for further usage
4566       __kmp_aff_depth = depth;
4567 
4568       int core_level = __kmp_affinity_find_core_level(
4569           address2os, __kmp_avail_proc, depth - 1);
4570       int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
4571                                                  depth - 1, core_level);
4572       int maxprocpercore = __kmp_affinity_max_proc_per_core(
4573           address2os, __kmp_avail_proc, depth - 1, core_level);
4574 
4575       int nproc = ncores * maxprocpercore;
4576       if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4577         if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4578           KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4579         }
4580         __kmp_affinity_type = affinity_none;
4581         return;
4582       }
4583 
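      // procarr layout (descriptive): a flat ncores x maxprocpercore table in
      // which slot [core * maxprocpercore + k] holds the OS proc id of the
      // k-th PU bound to that core, or -1 if the slot is unused (possible on
      // non-uniform topologies).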
4584       procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4585       for (int i = 0; i < nproc; i++) {
4586         procarr[i] = -1;
4587       }
4588 
4589       int lastcore = -1;
4590       int inlastcore = 0;
4591       for (int i = 0; i < __kmp_avail_proc; i++) {
4592         int proc = address2os[i].second;
4593         int core =
4594             __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4595 
4596         if (core == lastcore) {
4597           inlastcore++;
4598         } else {
4599           inlastcore = 0;
4600         }
4601         lastcore = core;
4602 
4603         procarr[core * maxprocpercore + inlastcore] = proc;
4604       }
4605     }
4606     if (__kmp_affinity_compact >= depth) {
4607       __kmp_affinity_compact = depth - 1;
4608     }
4609 
4610   sortAddresses:
4611     // Allocate the gtid->affinity mask table.
4612     if (__kmp_affinity_dups) {
4613       __kmp_affinity_num_masks = __kmp_avail_proc;
4614     } else {
4615       __kmp_affinity_num_masks = numUnique;
4616     }
4617 
4618     if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4619         (__kmp_affinity_num_places > 0) &&
4620         ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
4621       __kmp_affinity_num_masks = __kmp_affinity_num_places;
4622     }
4623 
4624     KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4625 
4626     // Sort the address2os table according to the current setting of
4627     // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4628     qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4629           __kmp_affinity_cmp_Address_child_num);
4630     {
4631       int i;
4632       unsigned j;
4633       for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4634         if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
4635           continue;
4636         }
4637         unsigned osId = address2os[i].second;
4638         kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4639         kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4640         KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4641         KMP_CPU_COPY(dest, src);
4642         if (++j >= __kmp_affinity_num_masks) {
4643           break;
4644         }
4645       }
4646       KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4647     }
4648     break;
4649 
4650   default:
4651     KMP_ASSERT2(0, "Unexpected affinity setting");
4652   }
4653 
4654   KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
4655   machine_hierarchy.init(address2os, __kmp_avail_proc);
4656 }
4657 #undef KMP_EXIT_AFF_NONE
4658 
4659 void __kmp_affinity_initialize(void) {
4660   // Much of the code above was written assuming that if a machine was not
4661   // affinity capable, then __kmp_affinity_type == affinity_none.  We now
4662   // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4663   // There are too many checks for __kmp_affinity_type == affinity_none
4664   // in this code.  Instead of trying to change them all, check if
4665   // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4666   // affinity_none, call the real initialization routine, then restore
4667   // __kmp_affinity_type to affinity_disabled.
4668   int disabled = (__kmp_affinity_type == affinity_disabled);
4669   if (!KMP_AFFINITY_CAPABLE()) {
4670     KMP_ASSERT(disabled);
4671   }
4672   if (disabled) {
4673     __kmp_affinity_type = affinity_none;
4674   }
4675   __kmp_aux_affinity_initialize();
4676   if (disabled) {
4677     __kmp_affinity_type = affinity_disabled;
4678   }
4679 }
4680 
4681 void __kmp_affinity_uninitialize(void) {
4682   if (__kmp_affinity_masks != NULL) {
4683     KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4684     __kmp_affinity_masks = NULL;
4685   }
4686   if (__kmp_affin_fullMask != NULL) {
4687     KMP_CPU_FREE(__kmp_affin_fullMask);
4688     __kmp_affin_fullMask = NULL;
4689   }
4690   __kmp_affinity_num_masks = 0;
4691   __kmp_affinity_type = affinity_default;
4692   __kmp_affinity_num_places = 0;
4693   if (__kmp_affinity_proclist != NULL) {
4694     __kmp_free(__kmp_affinity_proclist);
4695     __kmp_affinity_proclist = NULL;
4696   }
4697   if (address2os != NULL) {
4698     __kmp_free(address2os);
4699     address2os = NULL;
4700   }
4701   if (procarr != NULL) {
4702     __kmp_free(procarr);
4703     procarr = NULL;
4704   }
4705 #if KMP_USE_HWLOC
4706   if (__kmp_hwloc_topology != NULL) {
4707     hwloc_topology_destroy(__kmp_hwloc_topology);
4708     __kmp_hwloc_topology = NULL;
4709   }
4710 #endif
4711   KMPAffinity::destroy_api();
4712 }
4713 
4714 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4715   if (!KMP_AFFINITY_CAPABLE()) {
4716     return;
4717   }
4718 
4719   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4720   if (th->th.th_affin_mask == NULL) {
4721     KMP_CPU_ALLOC(th->th.th_affin_mask);
4722   } else {
4723     KMP_CPU_ZERO(th->th.th_affin_mask);
4724   }
4725 
4726   // Copy the thread mask to the kmp_info_t structure. If
4727   // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
4728   // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
4729   // then the full mask is the same as the mask of the initialization thread.
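  // Note: when a place list exists, the initial place index chosen below is
  // (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks, i.e. initial
  // placement is round-robin over the place list starting at the requested
  // offset.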
4730   kmp_affin_mask_t *mask;
4731   int i;
4732 
4733   if (KMP_AFFINITY_NON_PROC_BIND) {
4734     if ((__kmp_affinity_type == affinity_none) ||
4735         (__kmp_affinity_type == affinity_balanced)) {
4736 #if KMP_GROUP_AFFINITY
4737       if (__kmp_num_proc_groups > 1) {
4738         return;
4739       }
4740 #endif
4741       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4742       i = 0;
4743       mask = __kmp_affin_fullMask;
4744     } else {
4745       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4746       i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4747       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4748     }
4749   } else {
4750     if ((!isa_root) ||
4751         (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4752 #if KMP_GROUP_AFFINITY
4753       if (__kmp_num_proc_groups > 1) {
4754         return;
4755       }
4756 #endif
4757       KMP_ASSERT(__kmp_affin_fullMask != NULL);
4758       i = KMP_PLACE_ALL;
4759       mask = __kmp_affin_fullMask;
4760     } else {
4761       // int i = some hash function or just a counter that doesn't
4762       // always start at 0.  Use gtid for now.
4763       KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4764       i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4765       mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4766     }
4767   }
4768 
4769   th->th.th_current_place = i;
4770   if (isa_root) {
4771     th->th.th_new_place = i;
4772     th->th.th_first_place = 0;
4773     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4774   } else if (KMP_AFFINITY_NON_PROC_BIND) {
4775     // When using a Non-OMP_PROC_BIND affinity method,
4776     // set all threads' place-partition-var to the entire place list
4777     th->th.th_first_place = 0;
4778     th->th.th_last_place = __kmp_affinity_num_masks - 1;
4779   }
4780 
4781   if (i == KMP_PLACE_ALL) {
4782     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4783                    gtid));
4784   } else {
4785     KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4786                    gtid, i));
4787   }
4788 
4789   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4790 
4791   if (__kmp_affinity_verbose
4792       /* to avoid duplicate printing (will be correctly printed on barrier) */
4793       && (__kmp_affinity_type == affinity_none ||
4794           (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
4795     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4796     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4797                               th->th.th_affin_mask);
4798     KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4799                __kmp_gettid(), gtid, buf);
4800   }
4801 
4802 #if KMP_OS_WINDOWS
4803   // On Windows* OS, the process affinity mask might have changed. If the user
4804   // didn't request affinity and this call fails, just continue silently.
4805   // See CQ171393.
4806   if (__kmp_affinity_type == affinity_none) {
4807     __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4808   } else
4809 #endif
4810     __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4811 }
4812 
4813 void __kmp_affinity_set_place(int gtid) {
4814   if (!KMP_AFFINITY_CAPABLE()) {
4815     return;
4816   }
4817 
4818   kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4819 
4820   KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4821                  "place = %d)\n",
4822                  gtid, th->th.th_new_place, th->th.th_current_place));
4823 
4824   // Check that the new place is within this thread's partition.
4825   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4826   KMP_ASSERT(th->th.th_new_place >= 0);
4827   KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4828   if (th->th.th_first_place <= th->th.th_last_place) {
4829     KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4830                (th->th.th_new_place <= th->th.th_last_place));
4831   } else {
4832     KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4833                (th->th.th_new_place >= th->th.th_last_place));
4834   }
4835 
4836   // Copy the thread mask to the kmp_info_t structure,
4837   // and set this thread's affinity.
4838   kmp_affin_mask_t *mask =
4839       KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4840   KMP_CPU_COPY(th->th.th_affin_mask, mask);
4841   th->th.th_current_place = th->th.th_new_place;
4842 
4843   if (__kmp_affinity_verbose) {
4844     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4845     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4846                               th->th.th_affin_mask);
4847     KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4848                __kmp_gettid(), gtid, buf);
4849   }
4850   __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4851 }
4852 
4853 int __kmp_aux_set_affinity(void **mask) {
4854   int gtid;
4855   kmp_info_t *th;
4856   int retval;
4857 
4858   if (!KMP_AFFINITY_CAPABLE()) {
4859     return -1;
4860   }
4861 
4862   gtid = __kmp_entry_gtid();
4863   KA_TRACE(1000, (""); {
4864     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4865     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4866                               (kmp_affin_mask_t *)(*mask));
4867     __kmp_debug_printf(
4868         "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4869         buf);
4870   });
4871 
4872   if (__kmp_env_consistency_check) {
4873     if ((mask == NULL) || (*mask == NULL)) {
4874       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4875     } else {
4876       unsigned proc;
4877       int num_procs = 0;
4878 
4879       KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4880         if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4881           KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4882         }
4883         if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4884           continue;
4885         }
4886         num_procs++;
4887       }
4888       if (num_procs == 0) {
4889         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4890       }
4891 
4892 #if KMP_GROUP_AFFINITY
4893       if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4894         KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4895       }
4896 #endif /* KMP_GROUP_AFFINITY */
4897     }
4898   }
4899 
4900   th = __kmp_threads[gtid];
4901   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4902   retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4903   if (retval == 0) {
4904     KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4905   }
4906 
4907   th->th.th_current_place = KMP_PLACE_UNDEFINED;
4908   th->th.th_new_place = KMP_PLACE_UNDEFINED;
4909   th->th.th_first_place = 0;
4910   th->th.th_last_place = __kmp_affinity_num_masks - 1;
4911 
4912   // Turn off 4.0 affinity for the current thread at this parallel level.
4913   th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4914 
4915   return retval;
4916 }
4917 
4918 int __kmp_aux_get_affinity(void **mask) {
4919   int gtid;
4920   int retval;
4921   kmp_info_t *th;
4922 
4923   if (!KMP_AFFINITY_CAPABLE()) {
4924     return -1;
4925   }
4926 
4927   gtid = __kmp_entry_gtid();
4928   th = __kmp_threads[gtid];
4929   KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4930 
4931   KA_TRACE(1000, (""); {
4932     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4933     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4934                               th->th.th_affin_mask);
4935     __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
4936                  gtid, buf);
4937   });
4938 
4939   if (__kmp_env_consistency_check) {
4940     if ((mask == NULL) || (*mask == NULL)) {
4941       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4942     }
4943   }
4944 
4945 #if !KMP_OS_WINDOWS
4946 
4947   retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4948   KA_TRACE(1000, (""); {
4949     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4950     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4951                               (kmp_affin_mask_t *)(*mask));
4952     __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
4953                  gtid, buf);
4954   });
4955   return retval;
4956 
4957 #else
4958 
4959   KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4960   return 0;
4961 
4962 #endif /* KMP_OS_WINDOWS */
4963 }
4964 
4965 int __kmp_aux_get_affinity_max_proc() {
4966   if (!KMP_AFFINITY_CAPABLE()) {
4967     return 0;
4968   }
4969 #if KMP_GROUP_AFFINITY
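  // On 64-bit Windows a processor group holds up to
  // sizeof(DWORD_PTR) * CHAR_BIT == 64 logical processors, hence the product
  // below.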
4970   if (__kmp_num_proc_groups > 1) {
4971     return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4972   }
4973 #endif
4974   return __kmp_xproc;
4975 }
4976 
4977 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4978   if (!KMP_AFFINITY_CAPABLE()) {
4979     return -1;
4980   }
4981 
4982   KA_TRACE(1000, (""); {
4983     int gtid = __kmp_entry_gtid();
4984     char buf[KMP_AFFIN_MASK_PRINT_LEN];
4985     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4986                               (kmp_affin_mask_t *)(*mask));
4987     __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4988                        "affinity mask for thread %d = %s\n",
4989                        proc, gtid, buf);
4990   });
4991 
4992   if (__kmp_env_consistency_check) {
4993     if ((mask == NULL) || (*mask == NULL)) {
4994       KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4995     }
4996   }
4997 
4998   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4999     return -1;
5000   }
5001   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5002     return -2;
5003   }
5004 
5005   KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5006   return 0;
5007 }
5008 
5009 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5010   if (!KMP_AFFINITY_CAPABLE()) {
5011     return -1;
5012   }
5013 
5014   KA_TRACE(1000, (""); {
5015     int gtid = __kmp_entry_gtid();
5016     char buf[KMP_AFFIN_MASK_PRINT_LEN];
5017     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5018                               (kmp_affin_mask_t *)(*mask));
5019     __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5020                        "affinity mask for thread %d = %s\n",
5021                        proc, gtid, buf);
5022   });
5023 
5024   if (__kmp_env_consistency_check) {
5025     if ((mask == NULL) || (*mask == NULL)) {
5026       KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5027     }
5028   }
5029 
5030   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5031     return -1;
5032   }
5033   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5034     return -2;
5035   }
5036 
5037   KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5038   return 0;
5039 }
5040 
5041 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5042   if (!KMP_AFFINITY_CAPABLE()) {
5043     return -1;
5044   }
5045 
5046   KA_TRACE(1000, (""); {
5047     int gtid = __kmp_entry_gtid();
5048     char buf[KMP_AFFIN_MASK_PRINT_LEN];
5049     __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5050                               (kmp_affin_mask_t *)(*mask));
5051     __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5052                        "affinity mask for thread %d = %s\n",
5053                        proc, gtid, buf);
5054   });
5055 
5056   if (__kmp_env_consistency_check) {
5057     if ((mask == NULL) || (*mask == NULL)) {
5058       KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5059     }
5060   }
5061 
5062   if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5063     return -1;
5064   }
5065   if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5066     return 0;
5067   }
5068 
5069   return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5070 }
5071 
5072 // Dynamic affinity settings - Affinity balanced
5073 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5074   KMP_DEBUG_ASSERT(th);
5075   bool fine_gran = true;
5076   int tid = th->th.th_info.ds.ds_tid;
5077 
5078   switch (__kmp_affinity_gran) {
5079   case affinity_gran_fine:
5080   case affinity_gran_thread:
5081     break;
5082   case affinity_gran_core:
5083     if (__kmp_nThreadsPerCore > 1) {
5084       fine_gran = false;
5085     }
5086     break;
5087   case affinity_gran_package:
5088     if (nCoresPerPkg > 1) {
5089       fine_gran = false;
5090     }
5091     break;
5092   default:
5093     fine_gran = false;
5094   }
5095 
5096   if (__kmp_affinity_uniform_topology()) {
5097     int coreID;
5098     int threadID;
5099     // Number of hyper threads per core in HT machine
5100     int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5101     // Number of cores
5102     int ncores = __kmp_ncores;
5103     if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5104       __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5105       ncores = nPackages;
5106     }
5107     // How many threads will be bound to each core
5108     int chunk = nthreads / ncores;
5109     // How many cores will have an additional thread bound to them - "big cores"
5110     int big_cores = nthreads % ncores;
5111     // Number of threads on the big cores
5112     int big_nth = (chunk + 1) * big_cores;
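    // Illustrative example: nthreads == 10 on ncores == 4 gives chunk == 2,
    // big_cores == 2 and big_nth == 6, so tids 0-5 land on cores 0-1 (three
    // threads each) and tids 6-9 on cores 2-3 (two threads each).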
5113     if (tid < big_nth) {
5114       coreID = tid / (chunk + 1);
5115       threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5116     } else { // tid >= big_nth
5117       coreID = (tid - big_cores) / chunk;
5118       threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5119     }
5120 
5121     KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5122                       "Illegal set affinity operation when not capable");
5123 
5124     kmp_affin_mask_t *mask = th->th.th_affin_mask;
5125     KMP_CPU_ZERO(mask);
5126 
5127     if (fine_gran) {
5128       int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
5129       KMP_CPU_SET(osID, mask);
5130     } else {
5131       for (int i = 0; i < __kmp_nth_per_core; i++) {
5132         int osID;
5133         osID = address2os[coreID * __kmp_nth_per_core + i].second;
5134         KMP_CPU_SET(osID, mask);
5135       }
5136     }
5137     if (__kmp_affinity_verbose) {
5138       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5139       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5140       KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5141                  __kmp_gettid(), tid, buf);
5142     }
5143     __kmp_set_system_affinity(mask, TRUE);
5144   } else { // Non-uniform topology
5145 
5146     kmp_affin_mask_t *mask = th->th.th_affin_mask;
5147     KMP_CPU_ZERO(mask);
5148 
5149     int core_level = __kmp_affinity_find_core_level(
5150         address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
5151     int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
5152                                                __kmp_aff_depth - 1, core_level);
5153     int nth_per_core = __kmp_affinity_max_proc_per_core(
5154         address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5155 
5156     // For performance gain consider the special case nthreads ==
5157     // __kmp_avail_proc
5158     if (nthreads == __kmp_avail_proc) {
5159       if (fine_gran) {
5160         int osID = address2os[tid].second;
5161         KMP_CPU_SET(osID, mask);
5162       } else {
5163         int core = __kmp_affinity_find_core(address2os, tid,
5164                                             __kmp_aff_depth - 1, core_level);
5165         for (int i = 0; i < __kmp_avail_proc; i++) {
5166           int osID = address2os[i].second;
5167           if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
5168                                        core_level) == core) {
5169             KMP_CPU_SET(osID, mask);
5170           }
5171         }
5172       }
5173     } else if (nthreads <= ncores) {
5174 
5175       int core = 0;
5176       for (int i = 0; i < ncores; i++) {
5177         // Check if this core from procarr[] is in the mask
5178         int in_mask = 0;
5179         for (int j = 0; j < nth_per_core; j++) {
5180           if (procarr[i * nth_per_core + j] != -1) {
5181             in_mask = 1;
5182             break;
5183           }
5184         }
5185         if (in_mask) {
5186           if (tid == core) {
5187             for (int j = 0; j < nth_per_core; j++) {
5188               int osID = procarr[i * nth_per_core + j];
5189               if (osID != -1) {
5190                 KMP_CPU_SET(osID, mask);
5191                 // For fine granularity it is enough to set the first available
5192                 // osID for this core
5193                 if (fine_gran) {
5194                   break;
5195                 }
5196               }
5197             }
5198             break;
5199           } else {
5200             core++;
5201           }
5202         }
5203       }
5204     } else { // nthreads > ncores
5205       // Array to save the number of processors at each core
5206       int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5207       // Array to save the number of cores with "x" available processors;
5208       int *ncores_with_x_procs =
5209           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5210       // Array to save the number of cores with # procs from x to nth_per_core
5211       int *ncores_with_x_to_max_procs =
5212           (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5213 
5214       for (int i = 0; i <= nth_per_core; i++) {
5215         ncores_with_x_procs[i] = 0;
5216         ncores_with_x_to_max_procs[i] = 0;
5217       }
5218 
5219       for (int i = 0; i < ncores; i++) {
5220         int cnt = 0;
5221         for (int j = 0; j < nth_per_core; j++) {
5222           if (procarr[i * nth_per_core + j] != -1) {
5223             cnt++;
5224           }
5225         }
5226         nproc_at_core[i] = cnt;
5227         ncores_with_x_procs[cnt]++;
5228       }
5229 
5230       for (int i = 0; i <= nth_per_core; i++) {
5231         for (int j = i; j <= nth_per_core; j++) {
5232           ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5233         }
5234       }
5235 
5236       // Max number of processors
5237       int nproc = nth_per_core * ncores;
5238       // An array to keep the number of threads per context
5239       int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5240       for (int i = 0; i < nproc; i++) {
5241         newarr[i] = 0;
5242       }
5243 
5244       int nth = nthreads;
5245       int flag = 0;
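      // Distribution sketch (descriptive): the first sweep (flag == 0) places
      // at most one thread on each free hardware context, so threads first
      // fill distinct contexts; once no free context remains, later sweeps
      // (flag == 1) stack additional threads onto already-used contexts until
      // all nthreads threads are accounted for.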
5246       while (nth > 0) {
5247         for (int j = 1; j <= nth_per_core; j++) {
5248           int cnt = ncores_with_x_to_max_procs[j];
5249           for (int i = 0; i < ncores; i++) {
5250             // Skip the core with 0 processors
5251             if (nproc_at_core[i] == 0) {
5252               continue;
5253             }
5254             for (int k = 0; k < nth_per_core; k++) {
5255               if (procarr[i * nth_per_core + k] != -1) {
5256                 if (newarr[i * nth_per_core + k] == 0) {
5257                   newarr[i * nth_per_core + k] = 1;
5258                   cnt--;
5259                   nth--;
5260                   break;
5261                 } else {
5262                   if (flag != 0) {
5263                     newarr[i * nth_per_core + k]++;
5264                     cnt--;
5265                     nth--;
5266                     break;
5267                   }
5268                 }
5269               }
5270             }
5271             if (cnt == 0 || nth == 0) {
5272               break;
5273             }
5274           }
5275           if (nth == 0) {
5276             break;
5277           }
5278         }
5279         flag = 1;
5280       }
5281       int sum = 0;
5282       for (int i = 0; i < nproc; i++) {
5283         sum += newarr[i];
5284         if (sum > tid) {
5285           if (fine_gran) {
5286             int osID = procarr[i];
5287             KMP_CPU_SET(osID, mask);
5288           } else {
5289             int coreID = i / nth_per_core;
5290             for (int ii = 0; ii < nth_per_core; ii++) {
5291               int osID = procarr[coreID * nth_per_core + ii];
5292               if (osID != -1) {
5293                 KMP_CPU_SET(osID, mask);
5294               }
5295             }
5296           }
5297           break;
5298         }
5299       }
5300       __kmp_free(newarr);
5301     }
5302 
5303     if (__kmp_affinity_verbose) {
5304       char buf[KMP_AFFIN_MASK_PRINT_LEN];
5305       __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5306       KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5307                  __kmp_gettid(), tid, buf);
5308     }
5309     __kmp_set_system_affinity(mask, TRUE);
5310   }
5311 }
5312 
5313 #if KMP_OS_LINUX || KMP_OS_FREEBSD
5314 // We don't need this entry for Windows because
5315 // the GetProcessAffinityMask() API is available there.
5316 //
5317 // The intended usage is indicated by these steps (see the sketch below):
5318 // 1) The user gets the current affinity mask
5319 // 2) Then sets the affinity by calling this function
5320 // 3) Error check the return value
5321 // 4) Use non-OpenMP parallelization
5322 // 5) Reset the affinity to what was stored in step 1)
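// A minimal usage sketch of the steps above (illustrative only; assumes
// glibc's pthread_getaffinity_np()/pthread_setaffinity_np() and a hypothetical
// run_non_openmp_parallel_work() helper; error handling omitted):
//   cpu_set_t saved;
//   pthread_getaffinity_np(pthread_self(), sizeof(saved), &saved); // step 1
//   if (kmp_set_thread_affinity_mask_initial() == 0)               // steps 2, 3
//     run_non_openmp_parallel_work();                              // step 4
//   pthread_setaffinity_np(pthread_self(), sizeof(saved), &saved); // step 5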
5323 #ifdef __cplusplus
5324 extern "C"
5325 #endif
5326     int
5327     kmp_set_thread_affinity_mask_initial()
5328 // the function returns 0 on success,
5329 //   -1 if we cannot bind thread
5330 //   >0 (errno) if an error happened during binding
5331 {
5332   int gtid = __kmp_get_gtid();
5333   if (gtid < 0) {
5334     // Do not touch non-omp threads
5335     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5336                   "non-omp thread, returning\n"));
5337     return -1;
5338   }
5339   if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5340     KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5341                   "affinity not initialized, returning\n"));
5342     return -1;
5343   }
5344   KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5345                 "set full mask for thread %d\n",
5346                 gtid));
5347   KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5348   return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5349 }
5350 #endif
5351 
5352 #endif // KMP_AFFINITY_SUPPORTED
5353