/****************************************************************************** * * Copyright (C) 2015 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ***************************************************************************** * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore */ /** ******************************************************************************* * @file * impeg2_inter_pred_sse42_intr.c * * @brief * Contains Motion compensation function definitions for MPEG2 decoder * * @author * Mohit [100664] * * - impeg2_copy_mb_sse42() * - impeg2_interpolate_sse42() * - impeg2_mc_halfx_halfy_8x8_sse42() * - impeg2_mc_halfx_fully_8x8_sse42() * - impeg2_mc_fullx_halfy_8x8_sse42() * - impeg2_mc_fullx_fully_8x8_sse42() * * @remarks * None * ******************************************************************************* */ #include #include #include "iv_datatypedef.h" #include "impeg2_macros.h" #include "impeg2_defs.h" #include "impeg2_inter_pred.h" #include #include #include #include /******************************************************************************* * Function Name : impeg2_copy_mb * * Description : copies 3 components to the frame from mc_buf * * Arguments : * src_buf : Source Buffer * dst_buf : Destination Buffer * src_wd : Source Width * dst_wd : destination Width * * Values Returned : None *******************************************************************************/ void impeg2_copy_mb_sse42(yuv_buf_t *src_buf, yuv_buf_t *dst_buf, UWORD32 src_wd, UWORD32 dst_wd) { UWORD8 *src; UWORD8 *dst; __m128i src_r0, src_r1, src_r2, src_r3; /*******************************************************/ /* copy Y */ /*******************************************************/ src = src_buf->pu1_y; dst = dst_buf->pu1_y; // Row 0-3 src_r0 = _mm_loadu_si128((__m128i *) (src)); src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); _mm_storeu_si128((__m128i *) dst, src_r0); _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); // Row 4-7 src += 4 * src_wd; dst += 4 * dst_wd; src_r0 = _mm_loadu_si128((__m128i *) (src)); src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); _mm_storeu_si128((__m128i *) dst, src_r0); _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); // Row 8-11 src += 4 * src_wd; dst += 4 * dst_wd; src_r0 = _mm_loadu_si128((__m128i *) (src)); src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); _mm_storeu_si128((__m128i *) dst, src_r0); _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); // Row 12-15 src += 4 * src_wd; dst += 4 * dst_wd; src_r0 = _mm_loadu_si128((__m128i *) (src)); src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd)); src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd)); src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd)); _mm_storeu_si128((__m128i *) dst, src_r0); _mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1); _mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2); _mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3); src_wd >>= 1; dst_wd >>= 1; /*******************************************************/ /* copy U */ /*******************************************************/ src = src_buf->pu1_u; dst = dst_buf->pu1_u; // Row 0-3 src_r0 = _mm_loadl_epi64((__m128i *)src); src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); _mm_storel_epi64((__m128i *)dst, src_r0); _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); // Row 4-7 src += 4 * src_wd; dst += 4 * dst_wd; src_r0 = _mm_loadl_epi64((__m128i *)src); src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); _mm_storel_epi64((__m128i *)dst, src_r0); _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); /*******************************************************/ /* copy V */ /*******************************************************/ src = src_buf->pu1_v; dst = dst_buf->pu1_v; // Row 0-3 src_r0 = _mm_loadl_epi64((__m128i *)src); src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); _mm_storel_epi64((__m128i *)dst, src_r0); _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); // Row 4-7 src += 4 * src_wd; dst += 4 * dst_wd; src_r0 = _mm_loadl_epi64((__m128i *)src); src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd)); src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd)); src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd)); _mm_storel_epi64((__m128i *)dst, src_r0); _mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1); _mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2); _mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3); } /*****************************************************************************/ /* */ /* Function Name : impeg2_interpolate */ /* */ /* Description : averages the contents of buf_src1 and buf_src2 and stores*/ /* result in buf_dst */ /* */ /* Inputs : buf_src1 - First Source */ /* buf_src2 - Second Source */ /* */ /* Globals : None */ /* */ /* Processing : Avg the values from two sources and store the result in */ /* destination buffer */ /* */ /* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */ /* */ /* Returns : None */ /* */ /* Issues : Assumes that all 3 buffers are of same size */ /* */ /*****************************************************************************/ void impeg2_interpolate_sse42(yuv_buf_t *buf_src1, yuv_buf_t *buf_src2, yuv_buf_t *buf_dst, UWORD32 stride) { UWORD8 *src1, *src2; UWORD8 *dst; __m128i src1_r0, src1_r1, src1_r2, src1_r3; __m128i src2_r0, src2_r1, src2_r2, src2_r3; /*******************************************************/ /* interpolate Y */ /*******************************************************/ src1 = buf_src1->pu1_y; src2 = buf_src2->pu1_y; dst = buf_dst->pu1_y; // Row 0-3 src1_r0 = _mm_loadu_si128((__m128i *) (src1)); src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); src2_r0 = _mm_loadu_si128((__m128i *) (src2)); src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storeu_si128((__m128i *) dst, src1_r0); _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); // Row 4-7 src1 += 4 * 16; src2 += 4 * 16; dst += 4 * stride; src1_r0 = _mm_loadu_si128((__m128i *) (src1)); src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); src2_r0 = _mm_loadu_si128((__m128i *) (src2)); src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storeu_si128((__m128i *) dst, src1_r0); _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); // Row 8-11 src1 += 4 * 16; src2 += 4 * 16; dst += 4 * stride; src1_r0 = _mm_loadu_si128((__m128i *) (src1)); src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); src2_r0 = _mm_loadu_si128((__m128i *) (src2)); src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storeu_si128((__m128i *) dst, src1_r0); _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); // Row 12-15 src1 += 4 * 16; src2 += 4 * 16; dst += 4 * stride; src1_r0 = _mm_loadu_si128((__m128i *) (src1)); src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16)); src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16)); src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16)); src2_r0 = _mm_loadu_si128((__m128i *) (src2)); src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16)); src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16)); src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storeu_si128((__m128i *) dst, src1_r0); _mm_storeu_si128((__m128i *) (dst + stride), src1_r1); _mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2); _mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3); stride >>= 1; /*******************************************************/ /* interpolate U */ /*******************************************************/ src1 = buf_src1->pu1_u; src2 = buf_src2->pu1_u; dst = buf_dst->pu1_u; // Row 0-3 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storel_epi64((__m128i *) dst, src1_r0); _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); // Row 4-7 src1 += 4 * 8; src2 += 4 * 8; dst += 4 * stride; src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storel_epi64((__m128i *) dst, src1_r0); _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); /*******************************************************/ /* interpolate V */ /*******************************************************/ src1 = buf_src1->pu1_v; src2 = buf_src2->pu1_v; dst = buf_dst->pu1_v; // Row 0-3 src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storel_epi64((__m128i *) dst, src1_r0); _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); // Row 4-7 src1 += 4 * 8; src2 += 4 * 8; dst += 4 * stride; src1_r0 = _mm_loadl_epi64((__m128i *) (src1)); src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8)); src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8)); src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8)); src2_r0 = _mm_loadl_epi64((__m128i *) (src2)); src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8)); src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8)); src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8)); src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0); src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1); src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2); src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3); _mm_storel_epi64((__m128i *) dst, src1_r0); _mm_storel_epi64((__m128i *) (dst + stride), src1_r1); _mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2); _mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3); } /*****************************************************************************/ /* */ /* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */ /* */ /* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */ /* and the above block of size 8 x 8 will be placed as a */ /* block from the current position of out_buf */ /* */ /* Inputs : ref - Reference frame from which the block will be */ /* block will be extracted. */ /* ref_wid - WIdth of reference frame */ /* out_wid - WIdth of the output frame */ /* blk_width - width of the block */ /* blk_width - height of the block */ /* */ /* Globals : None */ /* */ /* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */ /* the ref frame.Interpolate these four values to get the */ /* value at(0.5,0.5).Repeat this to get an 8 x 8 block */ /* using 9 x 9 block from reference frame */ /* */ /* Outputs : out - Output containing the extracted block */ /* */ /* Returns : None */ /* */ /* Issues : None */ /* */ /*****************************************************************************/ void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out, UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid) { UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3; /* P0-P3 are the pixels in the reference frame and Q is the value being */ /* estimated */ /* P0 P1 Q P2 P3 */ __m128i src_r0, src_r0_1, src_r1, src_r1_1; __m128i tmp0, tmp1; __m128i value_2 = _mm_set1_epi16(2); ref_p0 = ref; ref_p1 = ref + 1; ref_p2 = ref + ref_wid; ref_p3 = ref + ref_wid + 1; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); src_r1 = _mm_cvtepu8_epi16(src_r1); src_r1_1 = _mm_cvtepu8_epi16(src_r1_1); tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation tmp0 = _mm_add_epi16(tmp0, value_2); tmp0 = _mm_srli_epi16(tmp0, 2); tmp0 = _mm_packus_epi16(tmp0, value_2); _mm_storel_epi64((__m128i *)out, tmp0); //Row 1 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 2 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation tmp1 = _mm_add_epi16(tmp1, value_2); tmp1 = _mm_srli_epi16(tmp1, 2); tmp1 = _mm_packus_epi16(tmp1, value_2); _mm_storel_epi64((__m128i *)out, tmp1); //Row 2 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation tmp0 = _mm_add_epi16(tmp0, value_2); tmp0 = _mm_srli_epi16(tmp0, 2); tmp0 = _mm_packus_epi16(tmp0, value_2); _mm_storel_epi64((__m128i *)out, tmp0); //Row 3 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation tmp1 = _mm_add_epi16(tmp1, value_2); tmp1 = _mm_srli_epi16(tmp1, 2); tmp1 = _mm_packus_epi16(tmp1, value_2); _mm_storel_epi64((__m128i *)out, tmp1); //Row 4 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical interpolation tmp0 = _mm_add_epi16(tmp0, value_2); tmp0 = _mm_srli_epi16(tmp0, 2); tmp0 = _mm_packus_epi16(tmp0, value_2); _mm_storel_epi64((__m128i *)out, tmp0); //Row 5 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation tmp1 = _mm_add_epi16(tmp1, value_2); tmp1 = _mm_srli_epi16(tmp1, 2); tmp1 = _mm_packus_epi16(tmp1, value_2); _mm_storel_epi64((__m128i *)out, tmp1); //Row 6 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation tmp0 = _mm_add_epi16(tmp0, value_2); tmp0 = _mm_srli_epi16(tmp0, 2); tmp0 = _mm_packus_epi16(tmp0, value_2); _mm_storel_epi64((__m128i *)out, tmp0); //Row 7 ref_p2 += ref_wid; ref_p3 += ref_wid; out += out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3)); src_r0 = _mm_cvtepu8_epi16(src_r0); src_r0_1 = _mm_cvtepu8_epi16(src_r0_1); tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation tmp1 = _mm_add_epi16(tmp1, value_2); tmp1 = _mm_srli_epi16(tmp1, 2); tmp1 = _mm_packus_epi16(tmp1, value_2); _mm_storel_epi64((__m128i *)out, tmp1); return; } /*****************************************************************************/ /* */ /* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */ /* */ /* Description : Gets the buffer from (0.5,0) to (8.5,8) */ /* and the above block of size 8 x 8 will be placed as a */ /* block from the current position of out_buf */ /* */ /* Inputs : ref - Reference frame from which the block will be */ /* block will be extracted. */ /* ref_wid - WIdth of reference frame */ /* out_wid - WIdth of the output frame */ /* blk_width - width of the block */ /* blk_width - height of the block */ /* */ /* Globals : None */ /* */ /* Processing : Point to the (0,0) and (1,0) position in the ref frame */ /* Interpolate these two values to get the value at(0.5,0) */ /* Repeat this to get an 8 x 8 block using 9 x 8 block from */ /* reference frame */ /* */ /* Outputs : out - Output containing the extracted block */ /* */ /* Returns : None */ /* */ /* Issues : None */ /* */ /*****************************************************************************/ void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out, UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid) { UWORD8 *ref_p0,*ref_p1; __m128i src_r0, src_r0_1, src_r1, src_r1_1; /* P0-P3 are the pixels in the reference frame and Q is the value being */ /* estimated */ /* P0 Q P1 */ ref_p0 = ref; ref_p1 = ref + 1; // Row 0 and 1 src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); src_r0 = _mm_avg_epu8(src_r0, src_r0_1); src_r1 = _mm_avg_epu8(src_r1, src_r1_1); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); // Row 2 and 3 ref_p0 += 2*ref_wid; ref_p1 += 2*ref_wid; out += 2*out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 3 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); src_r0 = _mm_avg_epu8(src_r0, src_r0_1); src_r1 = _mm_avg_epu8(src_r1, src_r1_1); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); // Row 4 and 5 ref_p0 += 2*ref_wid; ref_p1 += 2*ref_wid; out += 2*out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); src_r0 = _mm_avg_epu8(src_r0, src_r0_1); src_r1 = _mm_avg_epu8(src_r1, src_r1_1); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); // Row 6 and 7 ref_p0 += 2*ref_wid; ref_p1 += 2*ref_wid; out += 2*out_wid; src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6 src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1)); src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7 src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid)); src_r0 = _mm_avg_epu8(src_r0, src_r0_1); src_r1 = _mm_avg_epu8(src_r1, src_r1_1); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); return; } /*****************************************************************************/ /* */ /* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */ /* */ /* Description : Gets the buffer from (0,0.5) to (8,8.5) */ /* and the above block of size 8 x 8 will be placed as a */ /* block from the current position of out_buf */ /* */ /* Inputs : ref - Reference frame from which the block will be */ /* block will be extracted. */ /* ref_wid - WIdth of reference frame */ /* out_wid - WIdth of the output frame */ /* blk_width - width of the block */ /* blk_width - height of the block */ /* */ /* Globals : None */ /* */ /* Processing : Point to the (0,0) and (0,1) position in the ref frame */ /* Interpolate these two values to get the value at(0,0.5) */ /* Repeat this to get an 8 x 8 block using 8 x 9 block from */ /* reference frame */ /* */ /* Outputs : out - Output containing the extracted block */ /* */ /* Returns : None */ /* */ /* Issues : None */ /* */ /*****************************************************************************/ void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out, UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid) { __m128i src_r0, src_r1, src_r2, temp0, temp1; /* P0-P3 are the pixels in the reference frame and Q is the value being */ /* estimated */ /* P0 x P1 */ src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1 src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2 temp0 = _mm_avg_epu8(src_r0, src_r1); temp1 = _mm_avg_epu8(src_r1, src_r2); _mm_storel_epi64((__m128i *)out, temp0); //Row 0 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1 ref+= 3*ref_wid; out+= 2*out_wid; src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3 src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4 temp0 = _mm_avg_epu8(src_r2, src_r0); temp1 = _mm_avg_epu8(src_r0, src_r1); _mm_storel_epi64((__m128i *)out, temp0); //Row 2 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3 ref += 2*ref_wid; out+= 2*out_wid; src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5 src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6 temp0 = _mm_avg_epu8(src_r1, src_r2); temp1 = _mm_avg_epu8(src_r2, src_r0); _mm_storel_epi64((__m128i *)out, temp0); //Row 4 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5 ref += 2*ref_wid; out+= 2*out_wid; src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7 src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8 temp0 = _mm_avg_epu8(src_r0, src_r1); temp1 = _mm_avg_epu8(src_r1, src_r2); _mm_storel_epi64((__m128i *)out, temp0); //Row 6 _mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7 return; } /*****************************************************************************/ /* */ /* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */ /* */ /* Description : Gets the buffer from (x,y) to (x+8,y+8) */ /* and the above block of size 8 x 8 will be placed as a */ /* block from the current position of out_buf */ /* */ /* Inputs : ref - Reference frame from which the block will be */ /* block will be extracted. */ /* ref_wid - WIdth of reference frame */ /* out_wid - WIdth of the output frame */ /* blk_width - width of the block */ /* blk_width - height of the block */ /* */ /* Globals : None */ /* */ /* Processing : Point to the (0,0) position in the ref frame */ /* Get an 8 x 8 block from reference frame */ /* */ /* Outputs : out - Output containing the extracted block */ /* */ /* Returns : None */ /* */ /* Issues : None */ /* */ /*****************************************************************************/ void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out, UWORD8 *ref, UWORD32 ref_wid, UWORD32 out_wid) { __m128i src_r0, src_r1, src_r2, src_r3; // Row 0-3 src_r0 = _mm_loadl_epi64((__m128i *)ref); src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); // Row 4-7 ref += 4 * ref_wid; out += 4 * out_wid; src_r0 = _mm_loadl_epi64((__m128i *)ref); src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid)); _mm_storel_epi64((__m128i *)out, src_r0); _mm_storel_epi64((__m128i *)(out + out_wid), src_r1); _mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2); _mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3); return; }