1 /* AesOpt.c -- Intel's AES
2 2013-11-12 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include "CpuArch.h"
7 
/* Build the AES-NI code path only for x86/x64 targets (MY_CPU_X86_OR_AMD64)
   whose compiler reports _MSC_VER >= 1500, i.e. Visual C++ 2008 or newer.
   Everything else gets the forwarding wrappers in the #else branch below. */
#if defined(MY_CPU_X86_OR_AMD64) && defined(_MSC_VER) && (_MSC_VER >= 1500)
#define USE_INTEL_AES
#endif
13 
14 #ifdef USE_INTEL_AES
15 
16 #include <wmmintrin.h>
17 
AesCbc_Encode_Intel(__m128i * p,__m128i * data,size_t numBlocks)18 void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
19 {
20   __m128i m = *p;
21   for (; numBlocks != 0; numBlocks--, data++)
22   {
23     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
24     const __m128i *w = p + 3;
25     m = _mm_xor_si128(m, *data);
26     m = _mm_xor_si128(m, p[2]);
27     do
28     {
29       m = _mm_aesenc_si128(m, w[0]);
30       m = _mm_aesenc_si128(m, w[1]);
31       w += 2;
32     }
33     while (--numRounds2 != 0);
34     m = _mm_aesenc_si128(m, w[0]);
35     m = _mm_aesenclast_si128(m, w[1]);
36     *data = m;
37   }
38   *p = m;
39 }
40 
/* Number of blocks handled per pass of the multi-block loops in this file.
   AES_OP_W is written for exactly three lanes (m0, m1, m2), so the two have to
   be changed together. */
#define NUM_WAYS 3

/* Loads round key w[n] once and applies the two-operand intrinsic `op` with it
   to each of the three lanes.  Expands to a braced block, so call sites use it
   without a trailing semicolon, and it expects `w`, `m0`, `m1` and `m2` to be
   in scope where it is expanded. */
#define AES_OP_W(op, n) \
  { \
    const __m128i rk = w[n]; \
    m0 = op(m0, rk); \
    m1 = op(m1, rk); \
    m2 = op(m2, rk); \
  }

/* One decryption / encryption round, or the final round, on all three lanes. */
#define AES_DEC(n)      AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n)      AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
54 
AesCbc_Decode_Intel(__m128i * p,__m128i * data,size_t numBlocks)55 void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
56 {
57   __m128i iv = *p;
58   for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
59   {
60     UInt32 numRounds2 = *(const UInt32 *)(p + 1);
61     const __m128i *w = p + numRounds2 * 2;
62     __m128i m0, m1, m2;
63     {
64       const __m128i t = w[2];
65       m0 = _mm_xor_si128(t, data[0]);
66       m1 = _mm_xor_si128(t, data[1]);
67       m2 = _mm_xor_si128(t, data[2]);
68     }
69     numRounds2--;
70     do
71     {
72       AES_DEC(1)
73       AES_DEC(0)
74       w -= 2;
75     }
76     while (--numRounds2 != 0);
77     AES_DEC(1)
78     AES_DEC_LAST(0)
79 
80     {
81       __m128i t;
82       t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
83       t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
84       t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
85     }
86   }
87   for (; numBlocks != 0; numBlocks--, data++)
88   {
89     UInt32 numRounds2 = *(const UInt32 *)(p + 1);
90     const __m128i *w = p + numRounds2 * 2;
91     __m128i m = _mm_xor_si128(w[2], *data);
92     numRounds2--;
93     do
94     {
95       m = _mm_aesdec_si128(m, w[1]);
96       m = _mm_aesdec_si128(m, w[0]);
97       w -= 2;
98     }
99     while (--numRounds2 != 0);
100     m = _mm_aesdec_si128(m, w[1]);
101     m = _mm_aesdeclast_si128(m, w[0]);
102 
103     m = _mm_xor_si128(m, iv);
104     iv = *data;
105     *data = m;
106   }
107   *p = iv;
108 }
109 
AesCtr_Code_Intel(__m128i * p,__m128i * data,size_t numBlocks)110 void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
111 {
112   __m128i ctr = *p;
113   __m128i one;
114   one.m128i_u64[0] = 1;
115   one.m128i_u64[1] = 0;
116   for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
117   {
118     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
119     const __m128i *w = p;
120     __m128i m0, m1, m2;
121     {
122       const __m128i t = w[2];
123       ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
124       ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
125       ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
126     }
127     w += 3;
128     do
129     {
130       AES_ENC(0)
131       AES_ENC(1)
132       w += 2;
133     }
134     while (--numRounds2 != 0);
135     AES_ENC(0)
136     AES_ENC_LAST(1)
137     data[0] = _mm_xor_si128(data[0], m0);
138     data[1] = _mm_xor_si128(data[1], m1);
139     data[2] = _mm_xor_si128(data[2], m2);
140   }
141   for (; numBlocks != 0; numBlocks--, data++)
142   {
143     UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
144     const __m128i *w = p;
145     __m128i m;
146     ctr = _mm_add_epi64(ctr, one);
147     m = _mm_xor_si128(ctr, p[2]);
148     w += 3;
149     do
150     {
151       m = _mm_aesenc_si128(m, w[0]);
152       m = _mm_aesenc_si128(m, w[1]);
153       w += 2;
154     }
155     while (--numRounds2 != 0);
156     m = _mm_aesenc_si128(m, w[0]);
157     m = _mm_aesenclast_si128(m, w[1]);
158     *data = _mm_xor_si128(*data, m);
159   }
160   *p = ctr;
161 }
162 
163 #else
164 
165 void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
166 void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
167 void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
168 
AesCbc_Encode_Intel(UInt32 * p,Byte * data,size_t numBlocks)169 void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
170 {
171   AesCbc_Encode(p, data, numBlocks);
172 }
173 
AesCbc_Decode_Intel(UInt32 * p,Byte * data,size_t numBlocks)174 void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
175 {
176   AesCbc_Decode(p, data, numBlocks);
177 }
178 
AesCtr_Code_Intel(UInt32 * p,Byte * data,size_t numBlocks)179 void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
180 {
181   AesCtr_Code(p, data, numBlocks);
182 }
183 
184 #endif
185