;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

; This file provides SSSE3 version of the forward transformation. Part
; of the macro definitions are originally derived from the ffmpeg project.
; The current version applies to x86 64-bit only.

SECTION .text

%if ARCH_X86_64
; matrix transpose
;
; INTERLEAVE_2X gran, a, b, tmp
;   One interleave step of the transpose. After expansion, m<a> holds the
;   low-half interleave of (m<a>, m<b>) and m<b> holds the high-half
;   interleave; m<tmp> is clobbered as scratch.
;   %1 selects element granularity: wd (16-bit), dq (32-bit), qdq (64-bit).
%macro INTERLEAVE_2X 4
  punpckh%1          m%4, m%2, m%3      ; tmp = hi-interleave(a, b)
  punpckl%1          m%2, m%3           ; a   = lo-interleave(a, b)
  SWAP               %3,  %4            ; b  <- tmp
%endmacro

; TRANSPOSE8X8 r0, r1, ..., r7, tmp
;   Transposes the 8x8 matrix of 16-bit values held row-wise in registers
;   m%1..m%8, using m%9 as scratch: three interleave passes at 16-, 32-,
;   then 64-bit granularity, with the final SWAPs restoring row order.
%macro TRANSPOSE8X8 9
  INTERLEAVE_2X  wd, %1, %2, %9
  INTERLEAVE_2X  wd, %3, %4, %9
  INTERLEAVE_2X  wd, %5, %6, %9
  INTERLEAVE_2X  wd, %7, %8, %9

  INTERLEAVE_2X  dq, %1, %3, %9
  INTERLEAVE_2X  dq, %2, %4, %9
  INTERLEAVE_2X  dq, %5, %7, %9
  INTERLEAVE_2X  dq, %6, %8, %9

  INTERLEAVE_2X qdq, %1, %5, %9
  INTERLEAVE_2X qdq, %3, %7, %9
  INTERLEAVE_2X qdq, %2, %6, %9
  INTERLEAVE_2X qdq, %4, %8, %9

  SWAP               %2, %5
  SWAP               %4, %7
%endmacro

; HMD8_1D
;   One-dimensional 8-point Hadamard butterfly network applied to the eight
;   rows in m0..m7 (eight 16-bit lanes each, no normalization): three
;   add/subtract stages pairing registers at distances 1, 2 and 4.
;   Clobbers m8 and m9 as scratch.
%macro HMD8_1D 0
  psubw              m8, m0, m1         ; stage 1: distance-1 butterflies
  psubw              m9, m2, m3
  paddw              m0, m1
  paddw              m2, m3
  SWAP               1, 8
  SWAP               3, 9
  psubw              m8, m4, m5
  psubw              m9, m6, m7
  paddw              m4, m5
  paddw              m6, m7
  SWAP               5, 8
  SWAP               7, 9

  psubw              m8, m0, m2         ; stage 2: distance-2 butterflies
  psubw              m9, m1, m3
  paddw              m0, m2
  paddw              m1, m3
  SWAP               2, 8
  SWAP               3, 9
  psubw              m8, m4, m6
  psubw              m9, m5, m7
  paddw              m4, m6
  paddw              m5, m7
  SWAP               6, 8
  SWAP               7, 9

  psubw              m8, m0, m4         ; stage 3: distance-4 butterflies
  psubw              m9, m1, m5
  paddw              m0, m4
  paddw              m1, m5
  SWAP               4, 8
  SWAP               5, 9
  psubw              m8, m2, m6
  psubw              m9, m3, m7
  paddw              m2, m6
  paddw              m3, m7
  SWAP               6, 8
  SWAP               7, 9
%endmacro

INIT_XMM ssse3
;------------------------------------------------------------------------------
; void vp9_hadamard_8x8_ssse3(const int16_t *input, int stride,
;                             int16_t *output);
;   Two-pass (rows, transpose, columns) 8x8 Hadamard transform of the 16-bit
;   block at `input`; the 64 coefficients are stored contiguously at `output`
;   as eight 16-byte rows.
;   NOTE(review): `stride` appears to be in int16 units -- it is doubled to
;   form the row byte offset below; confirm against callers.
;   `input` rows and `output` must be 16-byte aligned (mova throughout).
;   cglobal: 3 args, 5 GPRs, 10 XMM regs (x86inc handles save/restore).
;------------------------------------------------------------------------------
cglobal hadamard_8x8, 3, 5, 10, input, stride, output
  lea                r3, [2 * strideq]  ; r3 = 1 row in bytes
  lea                r4, [4 * strideq]  ; r4 = 2 rows in bytes

  mova               m0, [inputq]       ; load the eight input rows,
  mova               m1, [inputq + r3]  ; advancing two rows at a time
  lea            inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea            inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea            inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  HMD8_1D                               ; 1-D Hadamard along rows
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
  HMD8_1D                               ; 1-D Hadamard along columns

  mova  [outputq +   0], m0             ; store 8x8 coefficient block
  mova  [outputq +  16], m1
  mova  [outputq +  32], m2
  mova  [outputq +  48], m3
  mova  [outputq +  64], m4
  mova  [outputq +  80], m5
  mova  [outputq +  96], m6
  mova  [outputq + 112], m7

  RET
%endif