salsa.cpp

00001 // salsa.cpp - written and placed in the public domain by Wei Dai
00002 
00003 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM salsa.cpp" to generate MASM code
00004 
00005 #include "pch.h"
00006 
00007 #ifndef CRYPTOPP_GENERATE_X64_MASM
00008 
00009 #include "salsa.h"
00010 #include "misc.h"
00011 #include "argnames.h"
00012 #include "cpu.h"
00013 
00014 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00015 #include <emmintrin.h>
00016 #endif
00017 
00018 NAMESPACE_BEGIN(CryptoPP)
00019 
// Default-constructs a Salsa20::Encryption object and lets it go out of scope.
// Nothing else happens here; judging by the name, the function exists only so
// that this type gets instantiated when the file is compiled.
00020 void Salsa20_TestInstantiations()
00021 {
00022         Salsa20::Encryption instantiated;
00023 }
00024 
// Installs a key and the round count into the cipher state.
//
// params : may carry Name::Rounds(); when absent, 20 rounds are used.
// key    : key bytes, read as little-endian 32-bit words.
// length : key length in bytes. A value of 16 selects the "expand 16-byte k"
//          constants; every other value selects the "expand 32-byte k" ones.
//          NOTE(review): nothing here checks length, and the second read
//          starts at key + length - 16, so the caller is presumably expected
//          to have validated it as 16 or 32 already - confirm.
//
// Throws InvalidRounds when the requested round count is not 8, 12 or 20.
// The count is validated before it is stored, so a rejected call leaves
// m_rounds at its previous value instead of latching the invalid request.
00025 void Salsa20_Policy::CipherSetKey(const NameValuePairs &params, const byte *key, size_t length)
00026 {
00027         const int rounds = params.GetIntValueWithDefault(Name::Rounds(), 20);
00028 
00029         if (!(rounds == 8 || rounds == 12 || rounds == 20))
00030                 throw InvalidRounds(StaticAlgorithmName(), rounds);
00031         m_rounds = rounds;      // latch only a validated value
00032         // m_state is reordered for SSE2
              // first 16 key bytes -> m_state[13], [10], [7], [4]
00033         GetBlock<word32, LittleEndian, false> get1(key);
00034         get1(m_state[13])(m_state[10])(m_state[7])(m_state[4]);
              // last 16 key bytes -> m_state[15], [12], [9], [6]
              // (these are the same bytes as above when length == 16)
00035         GetBlock<word32, LittleEndian, false> get2(key + length - 16);
00036         get2(m_state[15])(m_state[12])(m_state[9])(m_state[6]);
00037 
00038         // "expand 16-byte k" or "expand 32-byte k"
00039         m_state[0] = 0x61707865;
00040         m_state[1] = (length == 16) ? 0x3120646e : 0x3320646e;
00041         m_state[2] = (length == 16) ? 0x79622d36 : 0x79622d32;
00042         m_state[3] = 0x6b206574;
00043 }
00044 
// Loads a new IV and rewinds the block counter to zero.
//
// keystreamBuffer : not used by this implementation.
// IV              : two little-endian 32-bit words are read from it and stored
//                   in m_state[14] and m_state[11].
// m_state[8] and m_state[5] (the words SeekToIteration() also writes) are
// cleared.
00045 void Salsa20_Policy::CipherResynchronize(byte *keystreamBuffer, const byte *IV)
00046 {
00047         GetBlock<word32, LittleEndian, false> ivWords(IV);
00048         ivWords(m_state[14])(m_state[11]);
00049         m_state[5] = m_state[8] = 0;
00050 }
00051 
// Positions the block counter at iterationCount.
// The low 32 bits go to m_state[8]; the bits above them (shifted down by 32
// through SafeRightShift) go to m_state[5].
00052 void Salsa20_Policy::SeekToIteration(lword iterationCount)
00053 {
00054         m_state[5] = static_cast<word32>(SafeRightShift<32>(iterationCount));
00055         m_state[8] = static_cast<word32>(iterationCount);
00056 }
00057 
00058 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
// Reports the data alignment this policy asks for: 16 when the SSE2 assembly
// is compiled in and HasSSE2() is true at run time, otherwise 1.
00059 unsigned int Salsa20_Policy::GetAlignment() const
00060 {
00061         unsigned int alignment = 1;
00062 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00063         if (HasSSE2())
00064                 alignment = 16;
00065 #endif
00066         return alignment;
00067 }
00068 
// Reports the preferred processing granularity in bytes:
// 4*BYTES_PER_ITERATION when the SSE2 assembly is compiled in and HasSSE2()
// is true at run time, otherwise BYTES_PER_ITERATION.
00069 unsigned int Salsa20_Policy::GetOptimalBlockSize() const
00070 {
00071         unsigned int blockSize = BYTES_PER_ITERATION;
00072 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00073         if (HasSSE2())
00074                 blockSize = 4*BYTES_PER_ITERATION;
00075 #endif
00076         return blockSize;
00077 }
00078 #endif
00079 
00080 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
// C-linkage keystream routine defined outside this translation unit.
// When CRYPTOPP_X64_MASM_AVAILABLE is defined, Salsa20_Policy::OperateKeystream()
// forwards to it, passing m_rounds as `rounds` and m_state.data() as `state`.
// The PROC of the same name emitted under CRYPTOPP_GENERATE_X64_MASM appears to
// be its implementation, so the parameter order here must stay in step with the
// registers that PROC reads.
00081 extern "C" {
00082 void Salsa20_OperateKeystream(byte *output, const byte *input, size_t iterationCount, int rounds, void *state);
00083 }
00084 #endif
00085 
00086 #pragma warning(disable: 4731)  // frame pointer register 'ebp' modified by inline assembly code
00087 
// Produces keystream for iterationCount iterations and advances the block
// counter held in m_state[8] (low word) and m_state[5] (high word).
//
// operation      : consumed by CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH in the portable
//                  path; the assembly paths do not name it and instead test
//                  `input` for null before advancing it.
// output / input : destination and optional source buffers, handed to the
//                  output macros (AS_XMM_OUTPUT4 / CRYPTOPP_KEYSTREAM_OUTPUT_WORD).
// iterationCount : number of iterations to generate.
//
// The same text is built in several ways, selected by the preprocessor:
//  - CRYPTOPP_X64_MASM_AVAILABLE: forward to the external
//    Salsa20_OperateKeystream routine and return.
//  - CRYPTOPP_GENERATE_X64_MASM: the text between the C++ prologue and the
//    portable loop is preprocessed into the MASM source of that routine.
//  - CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE: inline SSE2 assembly, used when
//    HasSSE2() is true. It works on four iterations at a time while at least
//    four remain (labels 1/6/2), then one at a time (labels 5/0).
//  - otherwise, or when HasSSE2() is false: the portable C++ loop at the end.
00088 void Salsa20_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00089 {
00090 #endif  // #ifdef CRYPTOPP_GENERATE_X64_MASM
00091 
      // Pre-assembled x64 MASM build: delegate everything to the external routine.
00092 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00093         Salsa20_OperateKeystream(output, input, iterationCount, m_rounds, m_state.data());
00094         return;
00095 #endif
00096 
00097 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00098 #ifdef CRYPTOPP_GENERATE_X64_MASM
00099                 ALIGN   8
00100         Salsa20_OperateKeystream        PROC FRAME
00101                 mov             r10, [rsp + 5*8]                        ; state
00102                 alloc_stack(10*16 + 32*16 + 8)
00103                 save_xmm128 xmm6, 0200h
00104                 save_xmm128 xmm7, 0210h
00105                 save_xmm128 xmm8, 0220h
00106                 save_xmm128 xmm9, 0230h
00107                 save_xmm128 xmm10, 0240h
00108                 save_xmm128 xmm11, 0250h
00109                 save_xmm128 xmm12, 0260h
00110                 save_xmm128 xmm13, 0270h
00111                 save_xmm128 xmm14, 0280h
00112                 save_xmm128 xmm15, 0290h
00113                 .endprolog
00114 
              // Register roles for the generated MASM routine. The first three
              // arguments are taken from rcx, rdx and r8; the fourth (rounds) is
              // the 32-bit view of r9, i.e. r9d; state was loaded into r10 above.
00115         #define REG_output                      rcx
00116         #define REG_input                       rdx
00117         #define REG_iterationCount      r8
00118         #define REG_state                       r10
00119         #define REG_rounds                      r9d
00120         #define REG_roundsLeft          eax
00121         #define REG_temp32                      r11d
00122         #define REG_temp                        r11
00123         #define SSE2_WORKSPACE          rsp
00124 #else
00125         if (HasSSE2())
00126         {
00127         #if CRYPTOPP_BOOL_X64
                      // GCC-style x64 inline assembly: the names below are operand
                      // placeholders bound in the constraint list at the end of the
                      // asm statement.
00128                 #define REG_output                      %4
00129                 #define REG_input                       %1
00130                 #define REG_iterationCount      %2
00131                 #define REG_state                       %3
00132                 #define REG_rounds                      %0
00133                 #define REG_roundsLeft          eax
00134                 #define REG_temp32                      edx
00135                 #define REG_temp                        rdx
00136                 #define SSE2_WORKSPACE          %5
00137 
00138                 __m128i workspace[32];
00139         #else
                      // 32-bit x86: fixed registers; the workspace lives on the stack.
00140                 #define REG_output                      edi
00141                 #define REG_input                       eax
00142                 #define REG_iterationCount      ecx
00143                 #define REG_state                       esi
00144                 #define REG_rounds                      edx
00145                 #define REG_roundsLeft          ebx
00146                 #define REG_temp32                      ebp
00147                 #define REG_temp                        ebp
00148                 #define SSE2_WORKSPACE          esp + WORD_SZ
00149         #endif
00150 
00151         #ifdef __GNUC__
00152                 __asm__ __volatile__
00153                 (
00154                         ".intel_syntax noprefix;"
00155                         AS_PUSH_IF86(   bx)
00156         #else
00157                 void *s = m_state.data();
00158                 word32 r = m_rounds;
00159 
00160                 AS2(    mov             REG_iterationCount, iterationCount)
00161                 AS2(    mov             REG_input, input)
00162                 AS2(    mov             REG_output, output)
00163                 AS2(    mov             REG_state, s)
00164                 AS2(    mov             REG_rounds, r)
00165         #endif
00166 #endif  // #ifndef CRYPTOPP_GENERATE_X64_MASM
00167 
00168                 AS_PUSH_IF86(   bp)
                      // Fewer than four iterations requested: go straight to the
                      // one-at-a-time loop at label 5.
00169                 AS2(    cmp             REG_iterationCount, 4)
00170                 ASJ(    jl,             5, f)
00171 
              // 32-bit x86 only: reserve a 16-byte aligned 32*16-byte workspace on the
              // stack. The previous esp is kept in ebx and pushed, which is why
              // SSE2_WORKSPACE is esp + WORD_SZ in this build.
00172 #if CRYPTOPP_BOOL_X86
00173                 AS2(    mov             ebx, esp)
00174                 AS2(    and             esp, -16)
00175                 AS2(    sub             esp, 32*16)
00176                 AS1(    push    ebx)
00177 #endif
00178 
      // SSE2_EXPAND_S(i, j): replicate 32-bit word j of state row xmm<i> across a
      // whole register (pshufd j,j,j,j) and save it in workspace slot i*4+j of the
      // block that starts at offset 256.
00179 #define SSE2_EXPAND_S(i, j)             \
00180         ASS(    pshufd  xmm4, xmm##i, j, j, j, j)       \
00181         AS2(    movdqa  [SSE2_WORKSPACE + (i*4+j)*16 + 256], xmm4)
00182 
00183                 AS2(    movdqa  xmm0, [REG_state + 0*16])
00184                 AS2(    movdqa  xmm1, [REG_state + 1*16])
00185                 AS2(    movdqa  xmm2, [REG_state + 2*16])
00186                 AS2(    movdqa  xmm3, [REG_state + 3*16])
                      // Slots 5 and 8 (the counter words) are not expanded here; they
                      // are filled per lane by SSE2_EXPAND_S85 below.
00187                 SSE2_EXPAND_S(0, 0)
00188                 SSE2_EXPAND_S(0, 1)
00189                 SSE2_EXPAND_S(0, 2)
00190                 SSE2_EXPAND_S(0, 3)
00191                 SSE2_EXPAND_S(1, 0)
00192                 SSE2_EXPAND_S(1, 2)
00193                 SSE2_EXPAND_S(1, 3)
00194                 SSE2_EXPAND_S(2, 1)
00195                 SSE2_EXPAND_S(2, 2)
00196                 SSE2_EXPAND_S(2, 3)
00197                 SSE2_EXPAND_S(3, 0)
00198                 SSE2_EXPAND_S(3, 1)
00199                 SSE2_EXPAND_S(3, 2)
00200                 SSE2_EXPAND_S(3, 3)
00201 
      // SSE2_EXPAND_S85(i): write the current counter (low word in REG_roundsLeft,
      // high word in REG_temp32) into lane i of slots 8 and 5, then increment the
      // counter with carry.
00202 #define SSE2_EXPAND_S85(i)              \
00203                 AS2(    mov             dword ptr [SSE2_WORKSPACE + 8*16 + i*4 + 256], REG_roundsLeft)  \
00204                 AS2(    mov             dword ptr [SSE2_WORKSPACE + 5*16 + i*4 + 256], REG_temp32)      \
00205                 AS2(    add             REG_roundsLeft, 1)      \
00206                 AS2(    adc             REG_temp32, 0)
00207 
                      // Top of the four-at-a-time loop: give the four lanes four
                      // consecutive counter values and store the counter, advanced
                      // by four, back into the state.
00208                 ASL(1)
00209                 AS2(    mov             REG_roundsLeft, dword ptr [REG_state + 8*4])
00210                 AS2(    mov             REG_temp32, dword ptr [REG_state + 5*4])
00211                 SSE2_EXPAND_S85(0)
00212                 SSE2_EXPAND_S85(1)
00213                 SSE2_EXPAND_S85(2)
00214                 SSE2_EXPAND_S85(3)
00215                 AS2(    mov             dword ptr [REG_state + 8*4], REG_roundsLeft)
00216                 AS2(    mov             dword ptr [REG_state + 5*4], REG_temp32)
00217 
      // SSE2_QUARTER_ROUND(a, b, d, i): xmm<b> ^= rotl32(xmm<a> + xmm<d>, i) in
      // every lane, using xmm4 and xmm5 as scratch. Used by the one-at-a-time loop.
00218 #define SSE2_QUARTER_ROUND(a, b, d, i)          \
00219         AS2(    movdqa  xmm4, xmm##d)                   \
00220         AS2(    paddd   xmm4, xmm##a)                   \
00221         AS2(    movdqa  xmm5, xmm4)                             \
00222         AS2(    pslld   xmm4, i)                                \
00223         AS2(    psrld   xmm5, 32-i)                             \
00224         AS2(    pxor    xmm##b, xmm4)                   \
00225         AS2(    pxor    xmm##b, xmm5)
00226 
      // L01..L32 are the individual instruction steps of one quarter round
      // (rotations 7, 9, 13, 18) over workspace slots a, b, c, d with scratch
      // registers xmm<A>..xmm<D>. Inputs are read from the block at offset i*256
      // and results are always stored to the block at offset 0. Keeping the steps
      // as separate macros lets the X8/X16 macros below interleave several
      // quarter rounds instruction by instruction.
00227 #define L01(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##A, [SSE2_WORKSPACE + d*16 + i*256])        /* y3 */
00228 #define L02(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##C, [SSE2_WORKSPACE + a*16 + i*256])        /* y0 */
00229 #define L03(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##C)         /* y0+y3 */
00230 #define L04(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)
00231 #define L05(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 7)
00232 #define L06(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##B, 32-7)
00233 #define L07(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + b*16 + i*256])
00234 #define L08(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##B)         /* z1 */
00235 #define L09(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + b*16], xmm##A)
00236 #define L10(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)
00237 #define L11(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##C)         /* z1+y0 */
00238 #define L12(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)
00239 #define L13(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 9)
00240 #define L14(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##D, 32-9)
00241 #define L15(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + c*16 + i*256])
00242 #define L16(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##D)         /* z2 */
00243 #define L17(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + c*16], xmm##A)
00244 #define L18(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)
00245 #define L19(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##B)         /* z2+z1 */
00246 #define L20(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##B, xmm##A)
00247 #define L21(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 13)
00248 #define L22(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##B, 32-13)
00249 #define L23(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, [SSE2_WORKSPACE + d*16 + i*256])
00250 #define L24(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##B)         /* z3 */
00251 #define L25(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + d*16], xmm##A)
00252 #define L26(A,B,C,D,a,b,c,d,i)          AS2(    paddd   xmm##A, xmm##D)         /* z3+z2 */
00253 #define L27(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  xmm##D, xmm##A)
00254 #define L28(A,B,C,D,a,b,c,d,i)          AS2(    pslld   xmm##A, 18)
00255 #define L29(A,B,C,D,a,b,c,d,i)          AS2(    psrld   xmm##D, 32-18)
00256 #define L30(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##C)         /* xor y0 */
00257 #define L31(A,B,C,D,a,b,c,d,i)          AS2(    pxor    xmm##A, xmm##D)         /* z0 */
00258 #define L32(A,B,C,D,a,b,c,d,i)          AS2(    movdqa  [SSE2_WORKSPACE + a*16], xmm##A)
00259 
      // Two quarter rounds interleaved step by step, using xmm0-3 and xmm4-7.
00260 #define SSE2_QUARTER_ROUND_X8(i, a, b, c, d, e, f, g, h)        \
00261         L01(0,1,2,3, a,b,c,d, i)        L01(4,5,6,7, e,f,g,h, i)        \
00262         L02(0,1,2,3, a,b,c,d, i)        L02(4,5,6,7, e,f,g,h, i)        \
00263         L03(0,1,2,3, a,b,c,d, i)        L03(4,5,6,7, e,f,g,h, i)        \
00264         L04(0,1,2,3, a,b,c,d, i)        L04(4,5,6,7, e,f,g,h, i)        \
00265         L05(0,1,2,3, a,b,c,d, i)        L05(4,5,6,7, e,f,g,h, i)        \
00266         L06(0,1,2,3, a,b,c,d, i)        L06(4,5,6,7, e,f,g,h, i)        \
00267         L07(0,1,2,3, a,b,c,d, i)        L07(4,5,6,7, e,f,g,h, i)        \
00268         L08(0,1,2,3, a,b,c,d, i)        L08(4,5,6,7, e,f,g,h, i)        \
00269         L09(0,1,2,3, a,b,c,d, i)        L09(4,5,6,7, e,f,g,h, i)        \
00270         L10(0,1,2,3, a,b,c,d, i)        L10(4,5,6,7, e,f,g,h, i)        \
00271         L11(0,1,2,3, a,b,c,d, i)        L11(4,5,6,7, e,f,g,h, i)        \
00272         L12(0,1,2,3, a,b,c,d, i)        L12(4,5,6,7, e,f,g,h, i)        \
00273         L13(0,1,2,3, a,b,c,d, i)        L13(4,5,6,7, e,f,g,h, i)        \
00274         L14(0,1,2,3, a,b,c,d, i)        L14(4,5,6,7, e,f,g,h, i)        \
00275         L15(0,1,2,3, a,b,c,d, i)        L15(4,5,6,7, e,f,g,h, i)        \
00276         L16(0,1,2,3, a,b,c,d, i)        L16(4,5,6,7, e,f,g,h, i)        \
00277         L17(0,1,2,3, a,b,c,d, i)        L17(4,5,6,7, e,f,g,h, i)        \
00278         L18(0,1,2,3, a,b,c,d, i)        L18(4,5,6,7, e,f,g,h, i)        \
00279         L19(0,1,2,3, a,b,c,d, i)        L19(4,5,6,7, e,f,g,h, i)        \
00280         L20(0,1,2,3, a,b,c,d, i)        L20(4,5,6,7, e,f,g,h, i)        \
00281         L21(0,1,2,3, a,b,c,d, i)        L21(4,5,6,7, e,f,g,h, i)        \
00282         L22(0,1,2,3, a,b,c,d, i)        L22(4,5,6,7, e,f,g,h, i)        \
00283         L23(0,1,2,3, a,b,c,d, i)        L23(4,5,6,7, e,f,g,h, i)        \
00284         L24(0,1,2,3, a,b,c,d, i)        L24(4,5,6,7, e,f,g,h, i)        \
00285         L25(0,1,2,3, a,b,c,d, i)        L25(4,5,6,7, e,f,g,h, i)        \
00286         L26(0,1,2,3, a,b,c,d, i)        L26(4,5,6,7, e,f,g,h, i)        \
00287         L27(0,1,2,3, a,b,c,d, i)        L27(4,5,6,7, e,f,g,h, i)        \
00288         L28(0,1,2,3, a,b,c,d, i)        L28(4,5,6,7, e,f,g,h, i)        \
00289         L29(0,1,2,3, a,b,c,d, i)        L29(4,5,6,7, e,f,g,h, i)        \
00290         L30(0,1,2,3, a,b,c,d, i)        L30(4,5,6,7, e,f,g,h, i)        \
00291         L31(0,1,2,3, a,b,c,d, i)        L31(4,5,6,7, e,f,g,h, i)        \
00292         L32(0,1,2,3, a,b,c,d, i)        L32(4,5,6,7, e,f,g,h, i)
00293 
      // Four quarter rounds interleaved step by step, using xmm0-15 (x64 builds).
00294 #define SSE2_QUARTER_ROUND_X16(i, a, b, c, d, e, f, g, h, A, B, C, D, E, F, G, H)       \
00295         L01(0,1,2,3, a,b,c,d, i)        L01(4,5,6,7, e,f,g,h, i)        L01(8,9,10,11, A,B,C,D, i)      L01(12,13,14,15, E,F,G,H, i)    \
00296         L02(0,1,2,3, a,b,c,d, i)        L02(4,5,6,7, e,f,g,h, i)        L02(8,9,10,11, A,B,C,D, i)      L02(12,13,14,15, E,F,G,H, i)    \
00297         L03(0,1,2,3, a,b,c,d, i)        L03(4,5,6,7, e,f,g,h, i)        L03(8,9,10,11, A,B,C,D, i)      L03(12,13,14,15, E,F,G,H, i)    \
00298         L04(0,1,2,3, a,b,c,d, i)        L04(4,5,6,7, e,f,g,h, i)        L04(8,9,10,11, A,B,C,D, i)      L04(12,13,14,15, E,F,G,H, i)    \
00299         L05(0,1,2,3, a,b,c,d, i)        L05(4,5,6,7, e,f,g,h, i)        L05(8,9,10,11, A,B,C,D, i)      L05(12,13,14,15, E,F,G,H, i)    \
00300         L06(0,1,2,3, a,b,c,d, i)        L06(4,5,6,7, e,f,g,h, i)        L06(8,9,10,11, A,B,C,D, i)      L06(12,13,14,15, E,F,G,H, i)    \
00301         L07(0,1,2,3, a,b,c,d, i)        L07(4,5,6,7, e,f,g,h, i)        L07(8,9,10,11, A,B,C,D, i)      L07(12,13,14,15, E,F,G,H, i)    \
00302         L08(0,1,2,3, a,b,c,d, i)        L08(4,5,6,7, e,f,g,h, i)        L08(8,9,10,11, A,B,C,D, i)      L08(12,13,14,15, E,F,G,H, i)    \
00303         L09(0,1,2,3, a,b,c,d, i)        L09(4,5,6,7, e,f,g,h, i)        L09(8,9,10,11, A,B,C,D, i)      L09(12,13,14,15, E,F,G,H, i)    \
00304         L10(0,1,2,3, a,b,c,d, i)        L10(4,5,6,7, e,f,g,h, i)        L10(8,9,10,11, A,B,C,D, i)      L10(12,13,14,15, E,F,G,H, i)    \
00305         L11(0,1,2,3, a,b,c,d, i)        L11(4,5,6,7, e,f,g,h, i)        L11(8,9,10,11, A,B,C,D, i)      L11(12,13,14,15, E,F,G,H, i)    \
00306         L12(0,1,2,3, a,b,c,d, i)        L12(4,5,6,7, e,f,g,h, i)        L12(8,9,10,11, A,B,C,D, i)      L12(12,13,14,15, E,F,G,H, i)    \
00307         L13(0,1,2,3, a,b,c,d, i)        L13(4,5,6,7, e,f,g,h, i)        L13(8,9,10,11, A,B,C,D, i)      L13(12,13,14,15, E,F,G,H, i)    \
00308         L14(0,1,2,3, a,b,c,d, i)        L14(4,5,6,7, e,f,g,h, i)        L14(8,9,10,11, A,B,C,D, i)      L14(12,13,14,15, E,F,G,H, i)    \
00309         L15(0,1,2,3, a,b,c,d, i)        L15(4,5,6,7, e,f,g,h, i)        L15(8,9,10,11, A,B,C,D, i)      L15(12,13,14,15, E,F,G,H, i)    \
00310         L16(0,1,2,3, a,b,c,d, i)        L16(4,5,6,7, e,f,g,h, i)        L16(8,9,10,11, A,B,C,D, i)      L16(12,13,14,15, E,F,G,H, i)    \
00311         L17(0,1,2,3, a,b,c,d, i)        L17(4,5,6,7, e,f,g,h, i)        L17(8,9,10,11, A,B,C,D, i)      L17(12,13,14,15, E,F,G,H, i)    \
00312         L18(0,1,2,3, a,b,c,d, i)        L18(4,5,6,7, e,f,g,h, i)        L18(8,9,10,11, A,B,C,D, i)      L18(12,13,14,15, E,F,G,H, i)    \
00313         L19(0,1,2,3, a,b,c,d, i)        L19(4,5,6,7, e,f,g,h, i)        L19(8,9,10,11, A,B,C,D, i)      L19(12,13,14,15, E,F,G,H, i)    \
00314         L20(0,1,2,3, a,b,c,d, i)        L20(4,5,6,7, e,f,g,h, i)        L20(8,9,10,11, A,B,C,D, i)      L20(12,13,14,15, E,F,G,H, i)    \
00315         L21(0,1,2,3, a,b,c,d, i)        L21(4,5,6,7, e,f,g,h, i)        L21(8,9,10,11, A,B,C,D, i)      L21(12,13,14,15, E,F,G,H, i)    \
00316         L22(0,1,2,3, a,b,c,d, i)        L22(4,5,6,7, e,f,g,h, i)        L22(8,9,10,11, A,B,C,D, i)      L22(12,13,14,15, E,F,G,H, i)    \
00317         L23(0,1,2,3, a,b,c,d, i)        L23(4,5,6,7, e,f,g,h, i)        L23(8,9,10,11, A,B,C,D, i)      L23(12,13,14,15, E,F,G,H, i)    \
00318         L24(0,1,2,3, a,b,c,d, i)        L24(4,5,6,7, e,f,g,h, i)        L24(8,9,10,11, A,B,C,D, i)      L24(12,13,14,15, E,F,G,H, i)    \
00319         L25(0,1,2,3, a,b,c,d, i)        L25(4,5,6,7, e,f,g,h, i)        L25(8,9,10,11, A,B,C,D, i)      L25(12,13,14,15, E,F,G,H, i)    \
00320         L26(0,1,2,3, a,b,c,d, i)        L26(4,5,6,7, e,f,g,h, i)        L26(8,9,10,11, A,B,C,D, i)      L26(12,13,14,15, E,F,G,H, i)    \
00321         L27(0,1,2,3, a,b,c,d, i)        L27(4,5,6,7, e,f,g,h, i)        L27(8,9,10,11, A,B,C,D, i)      L27(12,13,14,15, E,F,G,H, i)    \
00322         L28(0,1,2,3, a,b,c,d, i)        L28(4,5,6,7, e,f,g,h, i)        L28(8,9,10,11, A,B,C,D, i)      L28(12,13,14,15, E,F,G,H, i)    \
00323         L29(0,1,2,3, a,b,c,d, i)        L29(4,5,6,7, e,f,g,h, i)        L29(8,9,10,11, A,B,C,D, i)      L29(12,13,14,15, E,F,G,H, i)    \
00324         L30(0,1,2,3, a,b,c,d, i)        L30(4,5,6,7, e,f,g,h, i)        L30(8,9,10,11, A,B,C,D, i)      L30(12,13,14,15, E,F,G,H, i)    \
00325         L31(0,1,2,3, a,b,c,d, i)        L31(4,5,6,7, e,f,g,h, i)        L31(8,9,10,11, A,B,C,D, i)      L31(12,13,14,15, E,F,G,H, i)    \
00326         L32(0,1,2,3, a,b,c,d, i)        L32(4,5,6,7, e,f,g,h, i)        L32(8,9,10,11, A,B,C,D, i)      L32(12,13,14,15, E,F,G,H, i)
00327 
      // First group of quarter rounds: i = 1, so the inputs come from the saved
      // copy at offset 256 and the results land in the working block at offset 0.
      // Control then enters the round loop at label 2.
00328 #if CRYPTOPP_BOOL_X64
00329                 SSE2_QUARTER_ROUND_X16(1, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00330 #else
00331                 SSE2_QUARTER_ROUND_X8(1, 2, 6, 10, 14, 3, 7, 11, 15)
00332                 SSE2_QUARTER_ROUND_X8(1, 0, 4, 8, 12, 1, 5, 9, 13)
00333 #endif
00334                 AS2(    mov             REG_roundsLeft, REG_rounds)
00335                 ASJ(    jmp,    2, f)
00336 
                      // Local subroutine, reached by the call in SSE2_OUTPUT_4:
                      // interleaves xmm4..xmm7 with unpack instructions into
                      // xmm4, xmm2, xmm0, xmm6 (marked e, f, g, h) and hands them
                      // to AS_XMM_OUTPUT4.
00337                 ASL(SSE2_Salsa_Output)
00338                 AS2(    movdqa          xmm0, xmm4)
00339                 AS2(    punpckldq       xmm4, xmm5)
00340                 AS2(    movdqa          xmm1, xmm6)
00341                 AS2(    punpckldq       xmm6, xmm7)
00342                 AS2(    movdqa          xmm2, xmm4)
00343                 AS2(    punpcklqdq      xmm4, xmm6)     // e
00344                 AS2(    punpckhqdq      xmm2, xmm6)     // f
00345                 AS2(    punpckhdq       xmm0, xmm5)
00346                 AS2(    punpckhdq       xmm1, xmm7)
00347                 AS2(    movdqa          xmm6, xmm0)
00348                 AS2(    punpcklqdq      xmm0, xmm1)     // g
00349                 AS2(    punpckhqdq      xmm6, xmm1)     // h
00350                 AS_XMM_OUTPUT4(SSE2_Salsa_Output_A, REG_input, REG_output, 4, 2, 0, 6, 1, 0, 4, 8, 12, 1)
00351                 AS1(    ret)
00352 
                      // Round loop: each trip runs two groups of quarter rounds on
                      // the working block (i = 0) and subtracts 2 from
                      // REG_roundsLeft. The first trip is entered at label 2
                      // because its first group was already done above.
00353                 ASL(6)
00354 #if CRYPTOPP_BOOL_X64
00355                 SSE2_QUARTER_ROUND_X16(0, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15)
00356                 ASL(2)
00357                 SSE2_QUARTER_ROUND_X16(0, 0, 13, 10, 7, 1, 14, 11, 4, 2, 15, 8, 5, 3, 12, 9, 6)
00358 #else
00359                 SSE2_QUARTER_ROUND_X8(0, 2, 6, 10, 14, 3, 7, 11, 15)
00360                 SSE2_QUARTER_ROUND_X8(0, 0, 4, 8, 12, 1, 5, 9, 13)
00361                 ASL(2)
00362                 SSE2_QUARTER_ROUND_X8(0, 2, 15, 8, 5, 3, 12, 9, 6)
00363                 SSE2_QUARTER_ROUND_X8(0, 0, 13, 10, 7, 1, 14, 11, 4)
00364 #endif
00365                 AS2(    sub             REG_roundsLeft, 2)
00366                 ASJ(    jnz,    6, b)
00367 
      // SSE2_OUTPUT_4(a, b, c, d): for each of the four slots, add the saved input
      // word (offset 256) to the round result (offset 0), then call the output
      // subroutine above.
00368 #define SSE2_OUTPUT_4(a, b, c, d)       \
00369         AS2(    movdqa          xmm4, [SSE2_WORKSPACE + a*16 + 256])\
00370         AS2(    paddd           xmm4, [SSE2_WORKSPACE + a*16])\
00371         AS2(    movdqa          xmm5, [SSE2_WORKSPACE + b*16 + 256])\
00372         AS2(    paddd           xmm5, [SSE2_WORKSPACE + b*16])\
00373         AS2(    movdqa          xmm6, [SSE2_WORKSPACE + c*16 + 256])\
00374         AS2(    paddd           xmm6, [SSE2_WORKSPACE + c*16])\
00375         AS2(    movdqa          xmm7, [SSE2_WORKSPACE + d*16 + 256])\
00376         AS2(    paddd           xmm7, [SSE2_WORKSPACE + d*16])\
00377         ASC(    call,           SSE2_Salsa_Output)
00378 
00379                 SSE2_OUTPUT_4(0, 13, 10, 7)
00380                 SSE2_OUTPUT_4(4, 1, 14, 11)
00381                 SSE2_OUTPUT_4(8, 5, 2, 15)
00382                 SSE2_OUTPUT_4(12, 9, 6, 3)
                      // Advance the input pointer only when one was supplied, then
                      // the output pointer; repeat while at least four iterations
                      // remain.
00383                 AS2(    test    REG_input, REG_input)
00384                 ASJ(    jz,             9, f)
00385                 AS2(    add             REG_input, 12*16)
00386                 ASL(9)
00387                 AS2(    add             REG_output, 12*16)
00388                 AS2(    sub             REG_iterationCount, 4)
00389                 AS2(    cmp             REG_iterationCount, 4)
00390                 ASJ(    jge,    1, b)
00391                 AS_POP_IF86(    sp)
00392 
                      // One-at-a-time loop for the remaining iterations: the whole
                      // state is held in xmm0..xmm3, one row per register.
00393                 ASL(5)
00394                 AS2(    sub             REG_iterationCount, 1)
00395                 ASJ(    jl,             4, f)
00396                 AS2(    movdqa  xmm0, [REG_state + 0*16])
00397                 AS2(    movdqa  xmm1, [REG_state + 1*16])
00398                 AS2(    movdqa  xmm2, [REG_state + 2*16])
00399                 AS2(    movdqa  xmm3, [REG_state + 3*16])
00400                 AS2(    mov             REG_roundsLeft, REG_rounds)
00401 
                      // Two groups of quarter rounds per trip, with lane rotations
                      // (pshufd) between and after them.
00402                 ASL(0)
00403                 SSE2_QUARTER_ROUND(0, 1, 3, 7)
00404                 SSE2_QUARTER_ROUND(1, 2, 0, 9)
00405                 SSE2_QUARTER_ROUND(2, 3, 1, 13)
00406                 SSE2_QUARTER_ROUND(3, 0, 2, 18)
00407                 ASS(    pshufd  xmm1, xmm1, 2, 1, 0, 3)
00408                 ASS(    pshufd  xmm2, xmm2, 1, 0, 3, 2)
00409                 ASS(    pshufd  xmm3, xmm3, 0, 3, 2, 1)
00410                 SSE2_QUARTER_ROUND(0, 3, 1, 7)
00411                 SSE2_QUARTER_ROUND(3, 2, 0, 9)
00412                 SSE2_QUARTER_ROUND(2, 1, 3, 13)
00413                 SSE2_QUARTER_ROUND(1, 0, 2, 18)
00414                 ASS(    pshufd  xmm1, xmm1, 0, 3, 2, 1)
00415                 ASS(    pshufd  xmm2, xmm2, 1, 0, 3, 2)
00416                 ASS(    pshufd  xmm3, xmm3, 2, 1, 0, 3)
00417                 AS2(    sub             REG_roundsLeft, 2)
00418                 ASJ(    jnz,    0, b)
00419 
                      // Add the original state words to the round result.
00420                 AS2(    paddd   xmm0, [REG_state + 0*16])
00421                 AS2(    paddd   xmm1, [REG_state + 1*16])
00422                 AS2(    paddd   xmm2, [REG_state + 2*16])
00423                 AS2(    paddd   xmm3, [REG_state + 3*16])
00424 
                      // Increment the block counter (low word, then carry into the
                      // high word).
00425                 AS2(    add             dword ptr [REG_state + 8*4], 1)
00426                 AS2(    adc             dword ptr [REG_state + 5*4], 0)
00427 
                      // Regroup the words with masks and shuffles into the order
                      // noted on each line before writing them out.
00428                 AS2(    pcmpeqb xmm6, xmm6)                     // all ones
00429                 AS2(    psrlq   xmm6, 32)                       // lo32 mask
00430                 ASS(    pshufd  xmm7, xmm6, 0, 1, 2, 3)         // hi32 mask
00431                 AS2(    movdqa  xmm4, xmm0)
00432                 AS2(    movdqa  xmm5, xmm3)
00433                 AS2(    pand    xmm0, xmm7)
00434                 AS2(    pand    xmm4, xmm6)
00435                 AS2(    pand    xmm3, xmm6)
00436                 AS2(    pand    xmm5, xmm7)
00437                 AS2(    por             xmm4, xmm5)                     // 0,13,2,15
00438                 AS2(    movdqa  xmm5, xmm1)
00439                 AS2(    pand    xmm1, xmm7)
00440                 AS2(    pand    xmm5, xmm6)
00441                 AS2(    por             xmm0, xmm5)                     // 4,1,6,3
00442                 AS2(    pand    xmm6, xmm2)
00443                 AS2(    pand    xmm2, xmm7)
00444                 AS2(    por             xmm1, xmm6)                     // 8,5,10,7
00445                 AS2(    por             xmm2, xmm3)                     // 12,9,14,11
00446 
00447                 AS2(    movdqa  xmm5, xmm4)
00448                 AS2(    movdqa  xmm6, xmm0)
00449                 AS3(    shufpd  xmm4, xmm1, 2)          // 0,13,10,7
00450                 AS3(    shufpd  xmm0, xmm2, 2)          // 4,1,14,11
00451                 AS3(    shufpd  xmm1, xmm5, 2)          // 8,5,2,15
00452                 AS3(    shufpd  xmm2, xmm6, 2)          // 12,9,6,3
00453 
00454                 // output keystream
00455                 AS_XMM_OUTPUT4(SSE2_Salsa_Output_B, REG_input, REG_output, 4, 0, 1, 2, 3, 0, 1, 2, 3, 4)
00456                 ASJ(    jmp,    5, b)
00457                 ASL(4)
00458 
00459                 AS_POP_IF86(    bp)
00460 #ifdef __GNUC__
00461                 AS_POP_IF86(    bx)
00462                 ".att_syntax prefix;"
00463                         : 
00464         #if CRYPTOPP_BOOL_X64
00465                         : "r" (m_rounds), "r" (input), "r" (iterationCount), "r" (m_state.data()), "r" (output), "r" (workspace)
00466                         : "%eax", "%edx", "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15"
00467         #else
00468                         : "d" (m_rounds), "a" (input), "c" (iterationCount), "S" (m_state.data()), "D" (output)
00469                         : "memory", "cc"
00470         #endif
00471                 );
00472 #endif
      // MASM epilogue: restore xmm6..xmm15 from the slots used by save_xmm128 in
      // the prologue and release the stack frame.
00473 #ifdef CRYPTOPP_GENERATE_X64_MASM
00474         movdqa  xmm6, [rsp + 0200h]
00475         movdqa  xmm7, [rsp + 0210h]
00476         movdqa  xmm8, [rsp + 0220h]
00477         movdqa  xmm9, [rsp + 0230h]
00478         movdqa  xmm10, [rsp + 0240h]
00479         movdqa  xmm11, [rsp + 0250h]
00480         movdqa  xmm12, [rsp + 0260h]
00481         movdqa  xmm13, [rsp + 0270h]
00482         movdqa  xmm14, [rsp + 0280h]
00483         movdqa  xmm15, [rsp + 0290h]
00484         add             rsp, 10*16 + 32*16 + 8
00485         ret
00486 Salsa20_OperateKeystream ENDP
00487 #else
00488         }
00489         else
00490 #endif
00491 #endif
      // Portable C++ path: used when no SSE2 assembly is compiled in, or as the
      // else-branch when HasSSE2() is false.
00492 #ifndef CRYPTOPP_GENERATE_X64_MASM
00493         {
00494                 word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
00495 
00496                 while (iterationCount--)
00497                 {
                              // Work on a copy so m_state can be added back afterwards.
00498                         x0 = m_state[0];
00499                         x1 = m_state[1];
00500                         x2 = m_state[2];
00501                         x3 = m_state[3];
00502                         x4 = m_state[4];
00503                         x5 = m_state[5];
00504                         x6 = m_state[6];
00505                         x7 = m_state[7];
00506                         x8 = m_state[8];
00507                         x9 = m_state[9];
00508                         x10 = m_state[10];
00509                         x11 = m_state[11];
00510                         x12 = m_state[12];
00511                         x13 = m_state[13];
00512                         x14 = m_state[14];
00513                         x15 = m_state[15];
00514 
                              // Each trip performs two groups of four quarter rounds
                              // and counts as two rounds.
00515                         for (int i=m_rounds; i>0; i-=2)
00516                         {
00517                                 #define QUARTER_ROUND(a, b, c, d)       \
00518                                         b = b ^ rotlFixed(a + d, 7);    \
00519                                         c = c ^ rotlFixed(b + a, 9);    \
00520                                         d = d ^ rotlFixed(c + b, 13);   \
00521                                         a = a ^ rotlFixed(d + c, 18);
00522 
00523                                 QUARTER_ROUND(x0, x4, x8, x12)
00524                                 QUARTER_ROUND(x1, x5, x9, x13)
00525                                 QUARTER_ROUND(x2, x6, x10, x14)
00526                                 QUARTER_ROUND(x3, x7, x11, x15)
00527 
00528                                 QUARTER_ROUND(x0, x13, x10, x7)
00529                                 QUARTER_ROUND(x1, x14, x11, x4)
00530                                 QUARTER_ROUND(x2, x15, x8, x5)
00531                                 QUARTER_ROUND(x3, x12, x9, x6)
00532                         }
00533 
                              // Output word k is the sum of a working word and the
                              // matching m_state word; the index order below undoes
                              // the reordering of m_state.
00534                         #define SALSA_OUTPUT(x) {\
00535                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\
00536                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x13 + m_state[13]);\
00537                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x10 + m_state[10]);\
00538                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x7 + m_state[7]);\
00539                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\
00540                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x1 + m_state[1]);\
00541                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x14 + m_state[14]);\
00542                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x11 + m_state[11]);\
00543                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\
00544                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x5 + m_state[5]);\
00545                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x2 + m_state[2]);\
00546                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x15 + m_state[15]);\
00547                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\
00548                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x9 + m_state[9]);\
00549                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x6 + m_state[6]);\
00550                                 CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x3 + m_state[3]);}
00551 
00552 #ifndef CRYPTOPP_DOXYGEN_PROCESSING
00553                         CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SALSA_OUTPUT, BYTES_PER_ITERATION);
00554 #endif
00555 
                              // Increment the block counter: low word in m_state[8],
                              // carry into m_state[5] on wrap-around.
00556                         if (++m_state[8] == 0)
00557                                 ++m_state[5];
00558                 }
00559         }
00560 }       // see comment above if an internal compiler error occurs here
00561 
00562 NAMESPACE_END
00563 
00564 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

Generated on Fri Feb 6 00:56:25 2009 for Crypto++ by  doxygen 1.4.7