1 /*
2 * Copyright (c) 2009-2011 Intel Corporation.  All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <smmintrin.h>
#include <x86intrin.h>
19 
/**
 * Copy `size` bytes from `src_buff` to `dst_buff`, reading the source with
 * SSE4.1 streaming (non-temporal) loads and writing the destination with
 * ordinary aligned 16-byte stores.
 *
 * The streaming path is taken only when BOTH pointers are 16-byte aligned;
 * otherwise the call is forwarded to memcpy(). The buffers must not overlap
 * (same contract as memcpy).
 *
 * NOTE(review): the original comments and variable names suggest the source
 * is expected to be write-combining memory - confirm against the callers.
 *
 * The function is `static inline` so that it is usable from a C translation
 * unit (a plain `inline` definition provides no external definition in
 * C99/C11) as well as from C++.
 *
 * When the translation unit is not built with SSE4.1 enabled, the function
 * is compiled for that target on its own via the attribute below; the CPU
 * executing it must still support SSE4.1.
 *
 * @param dst_buff  destination buffer, at least `size` bytes
 * @param src_buff  source buffer, at least `size` bytes
 * @param size      number of bytes to copy
 */
#if defined(__GNUC__) && !defined(__SSE4_1__)
__attribute__((target("sse4.1")))
#endif
static inline void stream_memcpy(void *dst_buff, const void *src_buff, size_t size)
{
    const size_t vec_bytes = sizeof(__m128i);   /* 16 bytes per XMM register */
    const size_t vecs_per_block = 8;            /* unrolled main loop moves 8 registers */
    const size_t block_bytes = vec_bytes * vecs_per_block;

    /* Streaming loads and aligned stores require 16-byte aligned addresses. */
    if ((((uintptr_t)src_buff | (uintptr_t)dst_buff) & (uintptr_t)(vec_bytes - 1)) != 0) {
        memcpy(dst_buff, src_buff, size);
        return;
    }

    /*
     * Some compilers declare _mm_stream_load_si128() with a non-const
     * pointer parameter, hence the cast; the source is only ever read.
     */
    __m128i *src = (__m128i *)src_buff;
    __m128i *dst = (__m128i *)dst_buff;

    size_t blocks = size / block_bytes;                 /* full 128-byte blocks  */
    size_t vecs = (size % block_bytes) / vec_bytes;     /* leftover 16-byte vecs */
    size_t tail = size % vec_bytes;                     /* leftover bytes (<16)  */

    /* Full fence ahead of the first streaming load (kept from the original,
     * whose comment describes it as syncing the WC memory data). */
    _mm_mfence();

    /* Main loop: issue all eight loads first, then all eight stores. */
    for (; blocks != 0; --blocks) {
        __m128i v0 = _mm_stream_load_si128(src);
        __m128i v1 = _mm_stream_load_si128(src + 1);
        __m128i v2 = _mm_stream_load_si128(src + 2);
        __m128i v3 = _mm_stream_load_si128(src + 3);
        __m128i v4 = _mm_stream_load_si128(src + 4);
        __m128i v5 = _mm_stream_load_si128(src + 5);
        __m128i v6 = _mm_stream_load_si128(src + 6);
        __m128i v7 = _mm_stream_load_si128(src + 7);

        _mm_store_si128(dst, v0);
        _mm_store_si128(dst + 1, v1);
        _mm_store_si128(dst + 2, v2);
        _mm_store_si128(dst + 3, v3);
        _mm_store_si128(dst + 4, v4);
        _mm_store_si128(dst + 5, v5);
        _mm_store_si128(dst + 6, v6);
        _mm_store_si128(dst + 7, v7);

        src += vecs_per_block;
        dst += vecs_per_block;
    }

    /* Remaining whole 16-byte vectors, one at a time. */
    for (; vecs != 0; --vecs) {
        _mm_store_si128(dst, _mm_stream_load_si128(src));
        ++src;
        ++dst;
    }

    /*
     * Final 1..15 bytes: copy exactly `tail` bytes. A 16-byte streaming load
     * here would read past the end of the source buffer.
     */
    if (tail != 0) {
        memcpy(dst, src, tail);
    }
}
94