/* * Copyright (C) 2012 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include /* Code from now to qsort_local all copied from bionic source. * The code is duplicated here to remove dependency on optimized bionic */ static __inline char *med3(char *, char *, char *, int (*)(const void *, const void *)); static __inline void swapfunc(char *, char *, int, int); #define min(a, b) (a) < (b) ? a : b /* * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function". */ #define swapcode(TYPE, parmi, parmj, n) { \ long i = (n) / sizeof (TYPE); \ TYPE *pi = (TYPE *) (parmi); \ TYPE *pj = (TYPE *) (parmj); \ do { \ TYPE t = *pi; \ *pi++ = *pj; \ *pj++ = t; \ } while (--i > 0); \ } #define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1; static __inline void swapfunc(char *a, char *b, int n, int swaptype) { if (swaptype <= 1) swapcode(long, a, b, n) else swapcode(char, a, b, n) } #define swap(a, b) \ if (swaptype == 0) { \ long t = *(long *)(a); \ *(long *)(a) = *(long *)(b); \ *(long *)(b) = t; \ } else \ swapfunc(a, b, es, swaptype) #define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n, swaptype) static __inline char * med3(char *a, char *b, char *c, int (*cmp)(const void *, const void *)) { return cmp(a, b) < 0 ? (cmp(b, c) < 0 ? b : (cmp(a, c) < 0 ? c : a )) :(cmp(b, c) > 0 ? b : (cmp(a, c) < 0 ? a : c )); } void qsort_local(void *aa, size_t n, size_t es, int (*cmp)(const void *, const void *)) { char *pa, *pb, *pc, *pd, *pl, *pm, *pn; int d, r, swaptype, swap_cnt; char *a = (char*)aa; loop: SWAPINIT(a, es); swap_cnt = 0; if (n < 7) { for (pm = (char *)a + es; pm < (char *) a + n * es; pm += es) for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } pm = (char *)a + (n / 2) * es; if (n > 7) { pl = (char *)a; pn = (char *)a + (n - 1) * es; if (n > 40) { d = (n / 8) * es; pl = med3(pl, pl + d, pl + 2 * d, cmp); pm = med3(pm - d, pm, pm + d, cmp); pn = med3(pn - 2 * d, pn - d, pn, cmp); } pm = med3(pl, pm, pn, cmp); } swap(a, pm); pa = pb = (char *)a + es; pc = pd = (char *)a + (n - 1) * es; for (;;) { while (pb <= pc && (r = cmp(pb, a)) <= 0) { if (r == 0) { swap_cnt = 1; swap(pa, pb); pa += es; } pb += es; } while (pb <= pc && (r = cmp(pc, a)) >= 0) { if (r == 0) { swap_cnt = 1; swap(pc, pd); pd -= es; } pc -= es; } if (pb > pc) break; swap(pb, pc); swap_cnt = 1; pb += es; pc -= es; } if (swap_cnt == 0) { /* Switch to insertion sort */ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es) for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0; pl -= es) swap(pl, pl - es); return; } pn = (char *)a + n * es; r = min(pa - (char *)a, pb - pa); vecswap(a, pb - r, r); r = min(pd - pc, pn - pd - (int)es); vecswap(pb, pn - r, r); if ((r = pb - pa) > (int)es) qsort_local(a, r / es, es, cmp); if ((r = pd - pc) > (int)es) { /* Iterate rather than recurse to save stack space */ a = pn - r; n = r / es; goto loop; } /* qsort(pn - r, r / es, es, cmp); */ } /* code duplication ends here */ /** * Util for getting time stamp */ double currentTimeMillis() { struct timeval tv; gettimeofday(&tv, (struct timezone *) NULL); return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0; } /** * Initialize given array randomly for the given seed */ template void randomInitArray(T* array, int len, unsigned int seed) { srand(seed); for (int i = 0; i < len; i++) { array[i] = (T) rand(); } } /** * comparison function for int, for qsort */ int cmpint(const void* p1, const void* p2) { return *(int*)p1 - *(int*)p2; } extern "C" JNIEXPORT jdouble JNICALL Java_android_simplecpu_cts_CpuNative_runSort(JNIEnv* env, jclass clazz, jint numberElements, jint repetition) { int* data = new int[numberElements]; if (data == NULL) { env->ThrowNew(env->FindClass("java/lang/OutOfMemoryError"), "No memory"); return -1; } double totalTime = 0; for (int i = 0; i < repetition; i++) { randomInitArray(data, numberElements, 0); double start = currentTimeMillis(); qsort_local(data, numberElements, sizeof(int), cmpint); double end = currentTimeMillis(); totalTime += (end - start); } delete[] data; return totalTime; } /** * Do matrix multiplication, C = A x B with all matrices having dimension of n x n * The implementation is not in the most efficient, but it is good enough for benchmarking purpose. * @param n should be multiple of 8 */ void doMatrixMultiplication(float* A, float* B, float* C, int n) { // batch size const int M = 8; for (int i = 0; i < n; i++) { for (int j = 0; j < n; j += M) { float sum[M]; for (int k = 0; k < M; k++) { sum[k] = 0; } // re-use the whole cache line for accessing B. // otherwise, the whole line will be read and only one value will be used. for (int k = 0; k < n; k++) { float a = A[i * n + k]; sum[0] += a * B[k * n + j]; sum[1] += a * B[k * n + j + 1]; sum[2] += a * B[k * n + j + 2]; sum[3] += a * B[k * n + j + 3]; sum[4] += a * B[k * n + j + 4]; sum[5] += a * B[k * n + j + 5]; sum[6] += a * B[k * n + j + 6]; sum[7] += a * B[k * n + j + 7]; } for (int k = 0; k < M; k++) { C[i * n + j + k] = sum[k]; } } } } extern "C" JNIEXPORT jdouble JNICALL Java_android_simplecpu_cts_CpuNative_runMatrixMultiplication( JNIEnv* env, jclass clazz, jint n, jint repetition) { // C = A x B float* A = new float[n * n]; float* B = new float[n * n]; float* C = new float[n * n]; if ((A == NULL) || (B == NULL) || (C == NULL)) { delete[] A; delete[] B; delete[] C; env->ThrowNew(env->FindClass("java/lang/OutOfMemoryError"), "No memory"); return -1; } double totalTime = 0; for (int i = 0; i < repetition; i++) { randomInitArray(A, n * n, 0); randomInitArray(B, n * n, 1); double start = currentTimeMillis(); doMatrixMultiplication(A, B, C, n); double end = currentTimeMillis(); totalTime += (end - start); } delete[] A; delete[] B; delete[] C; return totalTime; }