1 /*
2 * Copyright (c) 2009-2011 Intel Corporation. All rights reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
#include <x86intrin.h>
19
/**
 * Copy `size` bytes from `src_buff` to `dst_buff` using SSE4.1 streaming
 * (non-temporal) loads and ordinary aligned stores.
 *
 * The fast path is taken only when BOTH pointers are 16-byte aligned;
 * otherwise the call is forwarded unchanged to memcpy(). The regions must
 * not overlap.
 *
 * @param dst_buff  destination buffer, at least `size` bytes.
 * @param src_buff  source buffer, at least `size` bytes. The original code
 *                  comment indicates it is meant to be write-combining (WC)
 *                  memory, but any readable memory works.
 * @param size      number of bytes to copy; 0 is allowed.
 *
 * Exactly `size` bytes are read from the source and written to the
 * destination; no access is made beyond either buffer.
 *
 * `static inline` is used so the definition is usable from a header in C
 * (a plain `inline` definition provides no external symbol in C99/C11).
 * The target attribute lets the SSE4.1 intrinsic compile even when the
 * translation unit is not built with -msse4.1; the caller is still
 * responsible for running on a CPU that supports SSE4.1.
 */
static inline __attribute__((target("sse4.1")))
void stream_memcpy(void* dst_buff, const void* src_buff, size_t size)
{
    enum {
        XMM_BYTES   = 16,                      /* bytes per XMM register     */
        XMM_PER_BLK = 8,                       /* registers per unrolled step */
        BLK_BYTES   = XMM_BYTES * XMM_PER_BLK  /* 128 bytes per unrolled step */
    };

    /* The streaming load and the aligned store both require 16-byte
     * alignment; fall back to the generic copy when either side lacks it. */
    if ((((uintptr_t)src_buff | (uintptr_t)dst_buff) & (uintptr_t)(XMM_BYTES - 1)) != 0) {
        memcpy(dst_buff, src_buff, size);
        return;
    }

    /* const is dropped only because some intrinsic headers declare the
     * _mm_stream_load_si128 parameter as a non-const pointer; the source is
     * never written through this pointer. */
    __m128i* src = (__m128i*)src_buff;
    __m128i* dst = (__m128i*)dst_buff;

    size_t tail = size % BLK_BYTES;
    __m128i* const dst_blk_end = dst + (size - tail) / XMM_BYTES;

    /* Order all earlier memory operations before the streaming loads start
     * (kept from the original: "sync the wc memory data"). */
    _mm_mfence();

    /* Main loop: 128 bytes per iteration, all loads issued before stores. */
    while (dst < dst_blk_end) {
        __m128i x0 = _mm_stream_load_si128(src);
        __m128i x1 = _mm_stream_load_si128(src + 1);
        __m128i x2 = _mm_stream_load_si128(src + 2);
        __m128i x3 = _mm_stream_load_si128(src + 3);
        __m128i x4 = _mm_stream_load_si128(src + 4);
        __m128i x5 = _mm_stream_load_si128(src + 5);
        __m128i x6 = _mm_stream_load_si128(src + 6);
        __m128i x7 = _mm_stream_load_si128(src + 7);
        src += XMM_PER_BLK;

        _mm_store_si128(dst,     x0);
        _mm_store_si128(dst + 1, x1);
        _mm_store_si128(dst + 2, x2);
        _mm_store_si128(dst + 3, x3);
        _mm_store_si128(dst + 4, x4);
        _mm_store_si128(dst + 5, x5);
        _mm_store_si128(dst + 6, x6);
        _mm_store_si128(dst + 7, x7);
        dst += XMM_PER_BLK;
    }

    /* Remaining whole 16-byte chunks (at most 7 of them). */
    for (; tail >= XMM_BYTES; tail -= XMM_BYTES) {
        _mm_store_si128(dst++, _mm_stream_load_si128(src++));
    }

    /* Final 1..15 bytes: copy exactly that many. A full 16-byte streaming
     * load here would read past the end of the source buffer. */
    if (tail != 0) {
        memcpy(dst, src, tail);
    }
}
94