1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2015 Benoit Jacob <benoitjacob@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #include <iostream>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <vector>
14 #include <algorithm>
15 #include <fstream>
16 #include <string>
17 #include <cmath>
18 #include <cassert>
19 #include <cstring>
20 #include <memory>
21 
22 #include <Eigen/Core>
23 
24 using namespace std;
25 
26 const int default_precision = 4;
27 
28 // see --only-cubic-sizes
29 bool only_cubic_sizes = false;
30 
31 // see --dump-tables
32 bool dump_tables = false;
33 
log2_pot(size_t x)34 uint8_t log2_pot(size_t x) {
35   size_t l = 0;
36   while (x >>= 1) l++;
37   return l;
38 }
39 
compact_size_triple(size_t k,size_t m,size_t n)40 uint16_t compact_size_triple(size_t k, size_t m, size_t n)
41 {
42   return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
43 }
44 
45 // just a helper to store a triple of K,M,N sizes for matrix product
46 struct size_triple_t
47 {
48   uint16_t k, m, n;
size_triple_tsize_triple_t49   size_triple_t() : k(0), m(0), n(0) {}
size_triple_tsize_triple_t50   size_triple_t(size_t _k, size_t _m, size_t _n) : k(_k), m(_m), n(_n) {}
size_triple_tsize_triple_t51   size_triple_t(const size_triple_t& o) : k(o.k), m(o.m), n(o.n) {}
size_triple_tsize_triple_t52   size_triple_t(uint16_t compact)
53   {
54     k = 1 << ((compact & 0xf00) >> 8);
55     m = 1 << ((compact & 0x0f0) >> 4);
56     n = 1 << ((compact & 0x00f) >> 0);
57   }
is_cubicsize_triple_t58   bool is_cubic() const { return k == m && m == n; }
59 };
60 
operator <<(ostream & s,const size_triple_t & t)61 ostream& operator<<(ostream& s, const size_triple_t& t)
62 {
63   return s << "(" << t.k << ", " << t.m << ", " << t.n << ")";
64 }
65 
66 struct inputfile_entry_t
67 {
68   uint16_t product_size;
69   uint16_t pot_block_size;
70   size_triple_t nonpot_block_size;
71   float gflops;
72 };
73 
74 struct inputfile_t
75 {
76   enum class type_t {
77     unknown,
78     all_pot_sizes,
79     default_sizes
80   };
81 
82   string filename;
83   vector<inputfile_entry_t> entries;
84   type_t type;
85 
inputfile_tinputfile_t86   inputfile_t(const string& fname)
87     : filename(fname)
88     , type(type_t::unknown)
89   {
90     ifstream stream(filename);
91     if (!stream.is_open()) {
92       cerr << "couldn't open input file: " << filename << endl;
93       exit(1);
94     }
95     string line;
96     while (getline(stream, line)) {
97       if (line.empty()) continue;
98       if (line.find("BEGIN MEASUREMENTS ALL POT SIZES") == 0) {
99         if (type != type_t::unknown) {
100           cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
101           exit(1);
102         }
103         type = type_t::all_pot_sizes;
104         continue;
105       }
106       if (line.find("BEGIN MEASUREMENTS DEFAULT SIZES") == 0) {
107         if (type != type_t::unknown) {
108           cerr << "Input file " << filename << " contains redundant BEGIN MEASUREMENTS lines";
109           exit(1);
110         }
111         type = type_t::default_sizes;
112         continue;
113       }
114 
115 
116       if (type == type_t::unknown) {
117         continue;
118       }
119       switch(type) {
120         case type_t::all_pot_sizes: {
121           unsigned int product_size, block_size;
122           float gflops;
123           int sscanf_result =
124             sscanf(line.c_str(), "%x %x %f",
125                    &product_size,
126                    &block_size,
127                    &gflops);
128           if (3 != sscanf_result ||
129               !product_size ||
130               product_size > 0xfff ||
131               !block_size ||
132               block_size > 0xfff ||
133               !isfinite(gflops))
134           {
135             cerr << "ill-formed input file: " << filename << endl;
136             cerr << "offending line:" << endl << line << endl;
137             exit(1);
138           }
139           if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
140             continue;
141           }
142           inputfile_entry_t entry;
143           entry.product_size = uint16_t(product_size);
144           entry.pot_block_size = uint16_t(block_size);
145           entry.gflops = gflops;
146           entries.push_back(entry);
147           break;
148         }
149         case type_t::default_sizes: {
150           unsigned int product_size;
151           float gflops;
152           int bk, bm, bn;
153           int sscanf_result =
154             sscanf(line.c_str(), "%x default(%d, %d, %d) %f",
155                    &product_size,
156                    &bk, &bm, &bn,
157                    &gflops);
158           if (5 != sscanf_result ||
159               !product_size ||
160               product_size > 0xfff ||
161               !isfinite(gflops))
162           {
163             cerr << "ill-formed input file: " << filename << endl;
164             cerr << "offending line:" << endl << line << endl;
165             exit(1);
166           }
167           if (only_cubic_sizes && !size_triple_t(product_size).is_cubic()) {
168             continue;
169           }
170           inputfile_entry_t entry;
171           entry.product_size = uint16_t(product_size);
172           entry.pot_block_size = 0;
173           entry.nonpot_block_size = size_triple_t(bk, bm, bn);
174           entry.gflops = gflops;
175           entries.push_back(entry);
176           break;
177         }
178 
179         default:
180           break;
181       }
182     }
183     stream.close();
184     if (type == type_t::unknown) {
185       cerr << "Unrecognized input file " << filename << endl;
186       exit(1);
187     }
188     if (entries.empty()) {
189       cerr << "didn't find any measurements in input file: " << filename << endl;
190       exit(1);
191     }
192   }
193 };
194 
195 struct preprocessed_inputfile_entry_t
196 {
197   uint16_t product_size;
198   uint16_t block_size;
199 
200   float efficiency;
201 };
202 
lower_efficiency(const preprocessed_inputfile_entry_t & e1,const preprocessed_inputfile_entry_t & e2)203 bool lower_efficiency(const preprocessed_inputfile_entry_t& e1, const preprocessed_inputfile_entry_t& e2)
204 {
205   return e1.efficiency < e2.efficiency;
206 }
207 
208 struct preprocessed_inputfile_t
209 {
210   string filename;
211   vector<preprocessed_inputfile_entry_t> entries;
212 
preprocessed_inputfile_tpreprocessed_inputfile_t213   preprocessed_inputfile_t(const inputfile_t& inputfile)
214     : filename(inputfile.filename)
215   {
216     if (inputfile.type != inputfile_t::type_t::all_pot_sizes) {
217       abort();
218     }
219     auto it = inputfile.entries.begin();
220     auto it_first_with_given_product_size = it;
221     while (it != inputfile.entries.end()) {
222       ++it;
223       if (it == inputfile.entries.end() ||
224         it->product_size != it_first_with_given_product_size->product_size)
225       {
226         import_input_file_range_one_product_size(it_first_with_given_product_size, it);
227         it_first_with_given_product_size = it;
228       }
229     }
230   }
231 
232 private:
import_input_file_range_one_product_sizepreprocessed_inputfile_t233   void import_input_file_range_one_product_size(
234     const vector<inputfile_entry_t>::const_iterator& begin,
235     const vector<inputfile_entry_t>::const_iterator& end)
236   {
237     uint16_t product_size = begin->product_size;
238     float max_gflops = 0.0f;
239     for (auto it = begin; it != end; ++it) {
240       if (it->product_size != product_size) {
241         cerr << "Unexpected ordering of entries in " << filename << endl;
242         cerr << "(Expected all entries for product size " << hex << product_size << dec << " to be grouped)" << endl;
243         exit(1);
244       }
245       max_gflops = max(max_gflops, it->gflops);
246     }
247     for (auto it = begin; it != end; ++it) {
248       preprocessed_inputfile_entry_t entry;
249       entry.product_size = it->product_size;
250       entry.block_size = it->pot_block_size;
251       entry.efficiency = it->gflops / max_gflops;
252       entries.push_back(entry);
253     }
254   }
255 };
256 
check_all_files_in_same_exact_order(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles)257 void check_all_files_in_same_exact_order(
258        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles)
259 {
260   if (preprocessed_inputfiles.empty()) {
261     return;
262   }
263 
264   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[0];
265   const size_t num_entries = first_file.entries.size();
266 
267   for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
268     if (preprocessed_inputfiles[i].entries.size() != num_entries) {
269       cerr << "these files have different number of entries: "
270            << preprocessed_inputfiles[i].filename
271            << " and "
272            << first_file.filename
273            << endl;
274       exit(1);
275     }
276   }
277 
278   for (size_t entry_index = 0; entry_index < num_entries; entry_index++) {
279     const uint16_t entry_product_size = first_file.entries[entry_index].product_size;
280     const uint16_t entry_block_size = first_file.entries[entry_index].block_size;
281     for (size_t file_index = 0; file_index < preprocessed_inputfiles.size(); file_index++) {
282       const preprocessed_inputfile_t& cur_file = preprocessed_inputfiles[file_index];
283       if (cur_file.entries[entry_index].product_size != entry_product_size ||
284           cur_file.entries[entry_index].block_size != entry_block_size)
285       {
286         cerr << "entries not in same order between these files: "
287              << first_file.filename
288              << " and "
289              << cur_file.filename
290              << endl;
291         exit(1);
292       }
293     }
294   }
295 }
296 
efficiency_of_subset(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<size_t> & subset)297 float efficiency_of_subset(
298         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
299         const vector<size_t>& subset)
300 {
301   if (subset.size() <= 1) {
302     return 1.0f;
303   }
304   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
305   const size_t num_entries = first_file.entries.size();
306   float efficiency = 1.0f;
307   size_t entry_index = 0;
308   size_t first_entry_index_with_this_product_size = 0;
309   uint16_t product_size = first_file.entries[0].product_size;
310   while (entry_index < num_entries) {
311     ++entry_index;
312     if (entry_index == num_entries ||
313         first_file.entries[entry_index].product_size != product_size)
314     {
315       float efficiency_this_product_size = 0.0f;
316       for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
317         float efficiency_this_entry = 1.0f;
318         for (auto i = subset.begin(); i != subset.end(); ++i) {
319           efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
320         }
321         efficiency_this_product_size = max(efficiency_this_product_size, efficiency_this_entry);
322       }
323       efficiency = min(efficiency, efficiency_this_product_size);
324       if (entry_index < num_entries) {
325         first_entry_index_with_this_product_size = entry_index;
326         product_size = first_file.entries[entry_index].product_size;
327       }
328     }
329   }
330 
331   return efficiency;
332 }
333 
dump_table_for_subset(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<size_t> & subset)334 void dump_table_for_subset(
335         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
336         const vector<size_t>& subset)
337 {
338   const preprocessed_inputfile_t& first_file = preprocessed_inputfiles[subset[0]];
339   const size_t num_entries = first_file.entries.size();
340   size_t entry_index = 0;
341   size_t first_entry_index_with_this_product_size = 0;
342   uint16_t product_size = first_file.entries[0].product_size;
343   size_t i = 0;
344   size_triple_t min_product_size(first_file.entries.front().product_size);
345   size_triple_t max_product_size(first_file.entries.back().product_size);
346   if (!min_product_size.is_cubic() || !max_product_size.is_cubic()) {
347     abort();
348   }
349   if (only_cubic_sizes) {
350     cerr << "Can't generate tables with --only-cubic-sizes." << endl;
351     abort();
352   }
353   cout << "struct LookupTable {" << endl;
354   cout << "  static const size_t BaseSize = " << min_product_size.k << ";" << endl;
355   const size_t NumSizes = log2_pot(max_product_size.k / min_product_size.k) + 1;
356   const size_t TableSize = NumSizes * NumSizes * NumSizes;
357   cout << "  static const size_t NumSizes = " << NumSizes << ";" << endl;
358   cout << "  static const unsigned short* Data() {" << endl;
359   cout << "    static const unsigned short data[" << TableSize << "] = {";
360   while (entry_index < num_entries) {
361     ++entry_index;
362     if (entry_index == num_entries ||
363         first_file.entries[entry_index].product_size != product_size)
364     {
365       float best_efficiency_this_product_size = 0.0f;
366       uint16_t best_block_size_this_product_size = 0;
367       for (size_t e = first_entry_index_with_this_product_size; e < entry_index; e++) {
368         float efficiency_this_entry = 1.0f;
369         for (auto i = subset.begin(); i != subset.end(); ++i) {
370           efficiency_this_entry = min(efficiency_this_entry, preprocessed_inputfiles[*i].entries[e].efficiency);
371         }
372         if (efficiency_this_entry > best_efficiency_this_product_size) {
373           best_efficiency_this_product_size = efficiency_this_entry;
374           best_block_size_this_product_size = first_file.entries[e].block_size;
375         }
376       }
377       if ((i++) % NumSizes) {
378         cout << " ";
379       } else {
380         cout << endl << "      ";
381       }
382       cout << "0x" << hex << best_block_size_this_product_size << dec;
383       if (entry_index < num_entries) {
384         cout << ",";
385         first_entry_index_with_this_product_size = entry_index;
386         product_size = first_file.entries[entry_index].product_size;
387       }
388     }
389   }
390   if (i != TableSize) {
391     cerr << endl << "Wrote " << i << " table entries, expected " << TableSize << endl;
392     abort();
393   }
394   cout << endl << "    };" << endl;
395   cout << "    return data;" << endl;
396   cout << "  }" << endl;
397   cout << "};" << endl;
398 }
399 
efficiency_of_partition(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<vector<size_t>> & partition)400 float efficiency_of_partition(
401         const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
402         const vector<vector<size_t>>& partition)
403 {
404   float efficiency = 1.0f;
405   for (auto s = partition.begin(); s != partition.end(); ++s) {
406     efficiency = min(efficiency, efficiency_of_subset(preprocessed_inputfiles, *s));
407   }
408   return efficiency;
409 }
410 
make_first_subset(size_t subset_size,vector<size_t> & out_subset,size_t set_size)411 void make_first_subset(size_t subset_size, vector<size_t>& out_subset, size_t set_size)
412 {
413   assert(subset_size >= 1 && subset_size <= set_size);
414   out_subset.resize(subset_size);
415   for (size_t i = 0; i < subset_size; i++) {
416     out_subset[i] = i;
417   }
418 }
419 
is_last_subset(const vector<size_t> & subset,size_t set_size)420 bool is_last_subset(const vector<size_t>& subset, size_t set_size)
421 {
422   return subset[0] == set_size - subset.size();
423 }
424 
next_subset(vector<size_t> & inout_subset,size_t set_size)425 void next_subset(vector<size_t>& inout_subset, size_t set_size)
426 {
427   if (is_last_subset(inout_subset, set_size)) {
428     cerr << "iterating past the last subset" << endl;
429     abort();
430   }
431   size_t i = 1;
432   while (inout_subset[inout_subset.size() - i] == set_size - i) {
433     i++;
434     assert(i <= inout_subset.size());
435   }
436   size_t first_index_to_change = inout_subset.size() - i;
437   inout_subset[first_index_to_change]++;
438   size_t p = inout_subset[first_index_to_change];
439   for (size_t j = first_index_to_change + 1; j < inout_subset.size(); j++) {
440     inout_subset[j] = ++p;
441   }
442 }
443 
444 const size_t number_of_subsets_limit = 100;
445 const size_t always_search_subsets_of_size_at_least = 2;
446 
is_number_of_subsets_feasible(size_t n,size_t p)447 bool is_number_of_subsets_feasible(size_t n, size_t p)
448 {
449   assert(n>0 && p>0 && p<=n);
450   uint64_t numerator = 1, denominator = 1;
451   for (size_t i = 0; i < p; i++) {
452     numerator *= n - i;
453     denominator *= i + 1;
454     if (numerator > denominator * number_of_subsets_limit) {
455       return false;
456     }
457   }
458   return true;
459 }
460 
max_feasible_subset_size(size_t n)461 size_t max_feasible_subset_size(size_t n)
462 {
463   assert(n > 0);
464   const size_t minresult = min<size_t>(n-1, always_search_subsets_of_size_at_least);
465   for (size_t p = 1; p <= n - 1; p++) {
466     if (!is_number_of_subsets_feasible(n, p+1)) {
467       return max(p, minresult);
468     }
469   }
470   return n - 1;
471 }
472 
find_subset_with_efficiency_higher_than(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,float required_efficiency_to_beat,vector<size_t> & inout_remainder,vector<size_t> & out_subset)473 void find_subset_with_efficiency_higher_than(
474        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
475        float required_efficiency_to_beat,
476        vector<size_t>& inout_remainder,
477        vector<size_t>& out_subset)
478 {
479   out_subset.resize(0);
480 
481   if (required_efficiency_to_beat >= 1.0f) {
482     cerr << "can't beat efficiency 1." << endl;
483     abort();
484   }
485 
486   while (!inout_remainder.empty()) {
487 
488     vector<size_t> candidate_indices(inout_remainder.size());
489     for (size_t i = 0; i < candidate_indices.size(); i++) {
490       candidate_indices[i] = i;
491     }
492 
493     size_t candidate_indices_subset_size = max_feasible_subset_size(candidate_indices.size());
494     while (candidate_indices_subset_size >= 1) {
495       vector<size_t> candidate_indices_subset;
496       make_first_subset(candidate_indices_subset_size,
497                         candidate_indices_subset,
498                         candidate_indices.size());
499 
500       vector<size_t> best_candidate_indices_subset;
501       float best_efficiency = 0.0f;
502       vector<size_t> trial_subset = out_subset;
503       trial_subset.resize(out_subset.size() + candidate_indices_subset_size);
504       while (true)
505       {
506         for (size_t i = 0; i < candidate_indices_subset_size; i++) {
507           trial_subset[out_subset.size() + i] = inout_remainder[candidate_indices_subset[i]];
508         }
509 
510         float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
511         if (trial_efficiency > best_efficiency) {
512           best_efficiency = trial_efficiency;
513           best_candidate_indices_subset = candidate_indices_subset;
514         }
515         if (is_last_subset(candidate_indices_subset, candidate_indices.size())) {
516           break;
517         }
518         next_subset(candidate_indices_subset, candidate_indices.size());
519       }
520 
521       if (best_efficiency > required_efficiency_to_beat) {
522         for (size_t i = 0; i < best_candidate_indices_subset.size(); i++) {
523           candidate_indices[i] = candidate_indices[best_candidate_indices_subset[i]];
524         }
525         candidate_indices.resize(best_candidate_indices_subset.size());
526       }
527       candidate_indices_subset_size--;
528     }
529 
530     size_t candidate_index = candidate_indices[0];
531     auto candidate_iterator = inout_remainder.begin() + candidate_index;
532     vector<size_t> trial_subset = out_subset;
533 
534     trial_subset.push_back(*candidate_iterator);
535     float trial_efficiency = efficiency_of_subset(preprocessed_inputfiles, trial_subset);
536     if (trial_efficiency > required_efficiency_to_beat) {
537       out_subset.push_back(*candidate_iterator);
538       inout_remainder.erase(candidate_iterator);
539     } else {
540       break;
541     }
542   }
543 }
544 
find_partition_with_efficiency_higher_than(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,float required_efficiency_to_beat,vector<vector<size_t>> & out_partition)545 void find_partition_with_efficiency_higher_than(
546        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
547        float required_efficiency_to_beat,
548        vector<vector<size_t>>& out_partition)
549 {
550   out_partition.resize(0);
551 
552   vector<size_t> remainder;
553   for (size_t i = 0; i < preprocessed_inputfiles.size(); i++) {
554     remainder.push_back(i);
555   }
556 
557   while (!remainder.empty()) {
558     vector<size_t> new_subset;
559     find_subset_with_efficiency_higher_than(
560       preprocessed_inputfiles,
561       required_efficiency_to_beat,
562       remainder,
563       new_subset);
564     out_partition.push_back(new_subset);
565   }
566 }
567 
print_partition(const vector<preprocessed_inputfile_t> & preprocessed_inputfiles,const vector<vector<size_t>> & partition)568 void print_partition(
569        const vector<preprocessed_inputfile_t>& preprocessed_inputfiles,
570        const vector<vector<size_t>>& partition)
571 {
572   float efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
573   cout << "Partition into " << partition.size() << " subsets for " << efficiency * 100.0f << "% efficiency"  << endl;
574   for (auto subset = partition.begin(); subset != partition.end(); ++subset) {
575     cout << "  Subset " << (subset - partition.begin())
576          << ", efficiency " << efficiency_of_subset(preprocessed_inputfiles, *subset) * 100.0f << "%:"
577          << endl;
578     for (auto file = subset->begin(); file != subset->end(); ++file) {
579       cout << "    " << preprocessed_inputfiles[*file].filename << endl;
580     }
581     if (dump_tables) {
582       cout << "  Table:" << endl;
583       dump_table_for_subset(preprocessed_inputfiles, *subset);
584     }
585   }
586   cout << endl;
587 }
588 
589 struct action_t
590 {
invokation_nameaction_t591   virtual const char* invokation_name() const { abort(); return nullptr; }
runaction_t592   virtual void run(const vector<string>&) const { abort(); }
~action_taction_t593   virtual ~action_t() {}
594 };
595 
596 struct partition_action_t : action_t
597 {
invokation_namepartition_action_t598   virtual const char* invokation_name() const override { return "partition"; }
runpartition_action_t599   virtual void run(const vector<string>& input_filenames) const override
600   {
601     vector<preprocessed_inputfile_t> preprocessed_inputfiles;
602 
603     if (input_filenames.empty()) {
604       cerr << "The " << invokation_name() << " action needs a list of input files." << endl;
605       exit(1);
606     }
607 
608     for (auto it = input_filenames.begin(); it != input_filenames.end(); ++it) {
609       inputfile_t inputfile(*it);
610       switch (inputfile.type) {
611         case inputfile_t::type_t::all_pot_sizes:
612           preprocessed_inputfiles.emplace_back(inputfile);
613           break;
614         case inputfile_t::type_t::default_sizes:
615           cerr << "The " << invokation_name() << " action only uses measurements for all pot sizes, and "
616                << "has no use for " << *it << " which contains measurements for default sizes." << endl;
617           exit(1);
618           break;
619         default:
620           cerr << "Unrecognized input file: " << *it << endl;
621           exit(1);
622       }
623     }
624 
625     check_all_files_in_same_exact_order(preprocessed_inputfiles);
626 
627     float required_efficiency_to_beat = 0.0f;
628     vector<vector<vector<size_t>>> partitions;
629     cerr << "searching for partitions...\r" << flush;
630     while (true)
631     {
632       vector<vector<size_t>> partition;
633       find_partition_with_efficiency_higher_than(
634         preprocessed_inputfiles,
635         required_efficiency_to_beat,
636         partition);
637       float actual_efficiency = efficiency_of_partition(preprocessed_inputfiles, partition);
638       cerr << "partition " << preprocessed_inputfiles.size() << " files into " << partition.size()
639            << " subsets for " << 100.0f * actual_efficiency
640            << " % efficiency"
641            << "                  \r" << flush;
642       partitions.push_back(partition);
643       if (partition.size() == preprocessed_inputfiles.size() || actual_efficiency == 1.0f) {
644         break;
645       }
646       required_efficiency_to_beat = actual_efficiency;
647     }
648     cerr << "                                                                  " << endl;
649     while (true) {
650       bool repeat = false;
651       for (size_t i = 0; i < partitions.size() - 1; i++) {
652         if (partitions[i].size() >= partitions[i+1].size()) {
653           partitions.erase(partitions.begin() + i);
654           repeat = true;
655           break;
656         }
657       }
658       if (!repeat) {
659         break;
660       }
661     }
662     for (auto it = partitions.begin(); it != partitions.end(); ++it) {
663       print_partition(preprocessed_inputfiles, *it);
664     }
665   }
666 };
667 
668 struct evaluate_defaults_action_t : action_t
669 {
670   struct results_entry_t {
671     uint16_t product_size;
672     size_triple_t default_block_size;
673     uint16_t best_pot_block_size;
674     float default_gflops;
675     float best_pot_gflops;
676     float default_efficiency;
677   };
operator <<(ostream & s,const results_entry_t & entry)678   friend ostream& operator<<(ostream& s, const results_entry_t& entry)
679   {
680     return s
681       << "Product size " << size_triple_t(entry.product_size)
682       << ": default block size " << entry.default_block_size
683       << " -> " << entry.default_gflops
684       << " GFlop/s = " << entry.default_efficiency * 100.0f << " %"
685       << " of best POT block size " << size_triple_t(entry.best_pot_block_size)
686       << " -> " << entry.best_pot_gflops
687       << " GFlop/s" << dec;
688   }
lower_efficiencyevaluate_defaults_action_t689   static bool lower_efficiency(const results_entry_t& e1, const results_entry_t& e2) {
690     return e1.default_efficiency < e2.default_efficiency;
691   }
invokation_nameevaluate_defaults_action_t692   virtual const char* invokation_name() const override { return "evaluate-defaults"; }
show_usage_and_exitevaluate_defaults_action_t693   void show_usage_and_exit() const
694   {
695     cerr << "usage: " << invokation_name() << " default-sizes-data all-pot-sizes-data" << endl;
696     cerr << "checks how well the performance with default sizes compares to the best "
697          << "performance measured over all POT sizes." << endl;
698     exit(1);
699   }
runevaluate_defaults_action_t700   virtual void run(const vector<string>& input_filenames) const override
701   {
702     if (input_filenames.size() != 2) {
703       show_usage_and_exit();
704     }
705     inputfile_t inputfile_default_sizes(input_filenames[0]);
706     inputfile_t inputfile_all_pot_sizes(input_filenames[1]);
707     if (inputfile_default_sizes.type != inputfile_t::type_t::default_sizes) {
708       cerr << inputfile_default_sizes.filename << " is not an input file with default sizes." << endl;
709       show_usage_and_exit();
710     }
711     if (inputfile_all_pot_sizes.type != inputfile_t::type_t::all_pot_sizes) {
712       cerr << inputfile_all_pot_sizes.filename << " is not an input file with all POT sizes." << endl;
713       show_usage_and_exit();
714     }
715     vector<results_entry_t> results;
716     vector<results_entry_t> cubic_results;
717 
718     uint16_t product_size = 0;
719     auto it_all_pot_sizes = inputfile_all_pot_sizes.entries.begin();
720     for (auto it_default_sizes = inputfile_default_sizes.entries.begin();
721          it_default_sizes != inputfile_default_sizes.entries.end();
722          ++it_default_sizes)
723     {
724       if (it_default_sizes->product_size == product_size) {
725         continue;
726       }
727       product_size = it_default_sizes->product_size;
728       while (it_all_pot_sizes != inputfile_all_pot_sizes.entries.end() &&
729              it_all_pot_sizes->product_size != product_size)
730       {
731         ++it_all_pot_sizes;
732       }
733       if (it_all_pot_sizes == inputfile_all_pot_sizes.entries.end()) {
734         break;
735       }
736       uint16_t best_pot_block_size = 0;
737       float best_pot_gflops = 0;
738       for (auto it = it_all_pot_sizes;
739            it != inputfile_all_pot_sizes.entries.end() && it->product_size == product_size;
740            ++it)
741       {
742         if (it->gflops > best_pot_gflops) {
743           best_pot_gflops = it->gflops;
744           best_pot_block_size = it->pot_block_size;
745         }
746       }
747       results_entry_t entry;
748       entry.product_size = product_size;
749       entry.default_block_size = it_default_sizes->nonpot_block_size;
750       entry.best_pot_block_size = best_pot_block_size;
751       entry.default_gflops = it_default_sizes->gflops;
752       entry.best_pot_gflops = best_pot_gflops;
753       entry.default_efficiency = entry.default_gflops / entry.best_pot_gflops;
754       results.push_back(entry);
755 
756       size_triple_t t(product_size);
757       if (t.k == t.m && t.m == t.n) {
758         cubic_results.push_back(entry);
759       }
760     }
761 
762     cout << "All results:" << endl;
763     for (auto it = results.begin(); it != results.end(); ++it) {
764       cout << *it << endl;
765     }
766     cout << endl;
767 
768     sort(results.begin(), results.end(), lower_efficiency);
769 
770     const size_t n = min<size_t>(20, results.size());
771     cout << n << " worst results:" << endl;
772     for (size_t i = 0; i < n; i++) {
773       cout << results[i] << endl;
774     }
775     cout << endl;
776 
777     cout << "cubic results:" << endl;
778     for (auto it = cubic_results.begin(); it != cubic_results.end(); ++it) {
779       cout << *it << endl;
780     }
781     cout << endl;
782 
783     sort(cubic_results.begin(), cubic_results.end(), lower_efficiency);
784 
785     cout.precision(2);
786     vector<float> a = {0.5f, 0.20f, 0.10f, 0.05f, 0.02f, 0.01f};
787     for (auto it = a.begin(); it != a.end(); ++it) {
788       size_t n = min(results.size() - 1, size_t(*it * results.size()));
789       cout << (100.0f * n / (results.size() - 1))
790            << " % of product sizes have default efficiency <= "
791            << 100.0f * results[n].default_efficiency << " %" << endl;
792     }
793     cout.precision(default_precision);
794   }
795 };
796 
797 
show_usage_and_exit(int argc,char * argv[],const vector<unique_ptr<action_t>> & available_actions)798 void show_usage_and_exit(int argc, char* argv[],
799                          const vector<unique_ptr<action_t>>& available_actions)
800 {
801   cerr << "usage: " << argv[0] << " <action> [options...] <input files...>" << endl;
802   cerr << "available actions:" << endl;
803   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
804     cerr << "  " << (*it)->invokation_name() << endl;
805   }
806   cerr << "the input files should each contain an output of benchmark-blocking-sizes" << endl;
807   exit(1);
808 }
809 
main(int argc,char * argv[])810 int main(int argc, char* argv[])
811 {
812   cout.precision(default_precision);
813   cerr.precision(default_precision);
814 
815   vector<unique_ptr<action_t>> available_actions;
816   available_actions.emplace_back(new partition_action_t);
817   available_actions.emplace_back(new evaluate_defaults_action_t);
818 
819   vector<string> input_filenames;
820 
821   action_t* action = nullptr;
822 
823   if (argc < 2) {
824     show_usage_and_exit(argc, argv, available_actions);
825   }
826   for (int i = 1; i < argc; i++) {
827     bool arg_handled = false;
828     // Step 1. Try to match action invokation names.
829     for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
830       if (!strcmp(argv[i], (*it)->invokation_name())) {
831         if (!action) {
832           action = it->get();
833           arg_handled = true;
834           break;
835         } else {
836           cerr << "can't specify more than one action!" << endl;
837           show_usage_and_exit(argc, argv, available_actions);
838         }
839       }
840     }
841     if (arg_handled) {
842       continue;
843     }
844     // Step 2. Try to match option names.
845     if (argv[i][0] == '-') {
846       if (!strcmp(argv[i], "--only-cubic-sizes")) {
847         only_cubic_sizes = true;
848         arg_handled = true;
849       }
850       if (!strcmp(argv[i], "--dump-tables")) {
851         dump_tables = true;
852         arg_handled = true;
853       }
854       if (!arg_handled) {
855         cerr << "Unrecognized option: " << argv[i] << endl;
856         show_usage_and_exit(argc, argv, available_actions);
857       }
858     }
859     if (arg_handled) {
860       continue;
861     }
862     // Step 3. Default to interpreting args as input filenames.
863     input_filenames.emplace_back(argv[i]);
864   }
865 
866   if (dump_tables && only_cubic_sizes) {
867     cerr << "Incompatible options: --only-cubic-sizes and --dump-tables." << endl;
868     show_usage_and_exit(argc, argv, available_actions);
869   }
870 
871   if (!action) {
872     show_usage_and_exit(argc, argv, available_actions);
873   }
874 
875   action->run(input_filenames);
876 }
877