1 /*
2  * Copyright (C) 2023 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "gtest/gtest.h"
18 
// Callee for the EmptyFunc test. Declared with C linkage; the definition is
// not in this file (presumably an empty function - confirm where it is
// defined).
extern "C" {
void foo();
}
20 
namespace {

// Call targets for the FuncPtr test. They are deliberately trivial: each one
// just returns its own constant (f0 -> 1, f1 -> 2, f2 -> 3, f3 -> 4).
int f0() { return 1; }
int f1() { return 2; }
int f2() { return 3; }
int f3() { return 4; }

}  // namespace
40 
TEST(BerberisPerf, XorLoop) {
  // Tight ALU loop: one shift and one XOR per iteration.
  //
  // The value returns to its starting point after 32 iterations.  The
  // iteration count (1,000,000,000) is a multiple of 32, so the seed must
  // reappear once the loop is done.
  const int kIterations = 1000 * 1000 * 1000;
  unsigned c = 0xdeadbeef;

  for (int iter = 0; iter < kIterations; ++iter) {
    c ^= c << 1;
  }

  EXPECT_EQ(c, 0xdeadbeef);
}
52 
TEST(BerberisPerf, LoopWithCondition) {
  // Checks that a loop body which is split by unconditional branches is still
  // executed efficiently, e.g. that it does not end up as two translated
  // regions.
  //
  // A plain if-else would not be enough.  First, it can be replaced by a
  // conditional MOV instruction.  Second, its single unconditional branch can
  // be merged with the back branch of the loop.  That is why the body below is
  // intentionally an if / else-if / else chain.
  const int kIterations = 1000 * 1000 * 1000;
  unsigned res = 0xf00dfeed;

  for (int iter = 0; iter < kIterations; ++iter) {
    const int phase = iter % 4;
    if (phase == 0) {
      res ^= res << 1;
    } else if (phase == 1) {
      res ^= res << 2;
    } else if (phase == 2) {
      res ^= res << 3;
    } else {
      res ^= res << 4;
    }
  }

  // The test expects the starting value back once the loop has finished.
  EXPECT_EQ(res, 0xf00dfeed);
}
77 
TEST(BerberisPerf, Pi) {
  // Approximates the area of a circle of radius 10000 by visiting every
  // integer point of the enclosing 20000 x 20000 square and counting the
  // points that lie strictly inside the circle.
  const int kRadius = 10000;
  int c = 0;
  for (int x = -kRadius; x < kRadius; ++x) {
    for (int y = -kRadius; y < kRadius; ++y) {
      // The comparison yields 0 or 1, which is added straight to the count.
      c += (x * x + y * y) < kRadius * kRadius;
    }
  }
  EXPECT_EQ(c, 314159017);
}
90 
TEST(BerberisPerf, FuncPtr) {
  // Makes 100 million indirect calls through a four-entry table of function
  // pointers and sums up the returned values.
  using Callee = int (*)();
  static const Callee kCallees[4] = {f0, f1, f2, f3};
  const int kCalls = 100 * 1000 * 1000;

  int a = 0;
  for (int n = 0; n < kCalls; ++n) {
    // The selector sequence repeats with a period of 16.
    const int selector = (n ^ (n >> 2)) & 3;
    a += kCallees[selector]();
  }
  EXPECT_EQ(a, 250000000);
}
103 
TEST(BerberisPerf, StrlenFruits) {
  // Calls strlen on every suffix of the string below, 300,000 times over.
  // The string is 86 characters long, so that is about 26 million calls.
  // Advancing the start pointer one byte at a time exercises different
  // alignments.
  //
  // The explicit array size of 256 is deliberate: dropping it seems to change
  // the characteristics of the test, and the execution time would collapse
  // from 4000ms to 300ms.
  static const char fruits[256] =
      "banana apple orange strawberry pinapple grape lemon cherry pear melon watermelon peach";
  const int kPasses = 300 * 1000;
  const int length = strlen(fruits);
  unsigned result = 0;

  for (int pass = 0; pass < kPasses; ++pass) {
    for (int offset = 0; offset != length; ++offset) {
      result ^= strlen(fruits + offset);
    }
  }
  // Every pass XORs in the same set of lengths and the number of passes is
  // even, so the contributions cancel out.
  EXPECT_EQ(result, 0U);
}
122 
TEST(BerberisPerf, StrlenEmpty) {
  // Measures the overhead of the trampoline by calling strlen on an empty
  // string 30 million times.
  //
  // Each call's result is fed back into the next call's input buffer; keeping
  // "len" live this way prevents the compiler from optimizing the strlen
  // calls away.
  unsigned len = 0;
  for (int iter = 0; iter < 30 * 1000 * 1000; ++iter) {
    // One-byte buffer holding the previous length, which is always zero, so
    // the buffer is always an empty string.
    char buf[1] = {static_cast<char>(len)};
    len = strlen(buf);
  }
  EXPECT_EQ(len, 0U);
}
137 
TEST(BerberisPerf, HighRegPres) {
  // Register pressure stress test.
  //
  // On ARM the generated code needs no spills: the twelve accumulators, the
  // loop counter, SP, LR, and PC use up exactly 16 registers.
  unsigned acc0 = 0;
  unsigned acc1 = 1;
  unsigned acc2 = 2;
  unsigned acc3 = 3;
  unsigned acc4 = 4;
  unsigned acc5 = 5;
  unsigned acc6 = 6;
  unsigned acc7 = 7;
  unsigned acc8 = 8;
  unsigned acc9 = 9;
  unsigned acc10 = 10;
  unsigned acc11 = 11;
  volatile unsigned zero = 0;
  for (size_t n = 0; n < 100 * 1000 * 1000; ++n) {
    // Reading a volatile variable here disables auto vectorization of the
    // loop.  The value read is zero, so the counter is not actually changed.
    n += zero;

    acc0 += n ^ 3;
    acc1 += n ^ 4;
    acc2 += n ^ 5;
    acc3 += n ^ 6;
    acc4 += n ^ 7;
    acc5 += n ^ 8;
    acc6 += n ^ 9;
    acc7 += n ^ 10;
    acc8 += n ^ 11;
    acc9 += n ^ 12;
    acc10 += n ^ 13;
    acc11 += n ^ 14;
  }
  // Fold all accumulators into one value so that every one of them is used.
  unsigned result = acc0 ^ acc1 ^ acc2 ^ acc3 ^ acc4 ^ acc5 ^
                    acc6 ^ acc7 ^ acc8 ^ acc9 ^ acc10 ^ acc11;
  EXPECT_EQ(result, 0U);
}
176 
TEST(BerberisPerf, EmptyFunc) {
  // Measures pure call overhead: invokes the externally defined foo()
  // 500 million times in a row.
  for (size_t call = 0; call < 500 * 1000 * 1000; ++call) {
    foo();
  }
  // There is no result to verify; this assertion always passes.
  EXPECT_EQ(0, 0);
}
184 
TEST(BerberisPerf, ConvertF32I32) {
  // Converts float to int 100 million times, alternating between two inputs.
  // 0.5 converts to 0 and 1.2 converts to 1, so exactly half of the
  // iterations add one to the sum.
  static const float kInputs[] = {0.5, 1.2};
  const int kIterations = 100 * 1000 * 1000;

  int sum = 0;
  for (int iter = 0; iter < kIterations; ++iter) {
    sum += static_cast<int>(kInputs[iter & 1]);
  }
  EXPECT_EQ(sum, 50000000);
}
193 
194 #if defined __arm__
195 
TEST(BerberisPerf, ReadWriteFPSCR) {
  // Writes a value to FPSCR and reads it straight back, 0x1ffffff times,
  // checking on every iteration that the value read matches the value written.
  //
  // Remember the FPSCR value the test was entered with so that it can be put
  // back afterwards.  Otherwise the last pattern written by the loop
  // (0xc01f00) would stay in effect for whatever runs next in this process.
  uint32_t saved_fpscr;
  asm volatile("vmrs %0, fpscr\n" : "=r"(saved_fpscr));

  for (int i = 0; i < 0x1ffffff; i++) {
    // Filter-out bits which implementation does not support and exception bits.
    // If we set exception bits then we get FP-exception (correct behavior), but
    // its handling dwarfs the execution time by huge margin thus we couldn't do
    // that in perf test.
    uint32_t fpscr_in = i & 0xc01f00;
    uint32_t fpscr_out;
    // The write to FPSCR is a side effect that the operand list does not
    // express, so the statement must be volatile to keep the compiler from
    // moving or dropping it.
    asm volatile(
        "vmsr fpscr, %1\n"
        "vmrs %0, fpscr\n"
        : "=r"(fpscr_out)
        : "r"(fpscr_in));
    EXPECT_EQ(fpscr_in, fpscr_out);
  }

  // EXPECT_EQ is non-fatal, so this point is reached even after a mismatch.
  asm volatile("vmsr fpscr, %0\n" : : "r"(saved_fpscr));
}
211 
212 #endif  // defined __arm__