sosemanuk.cpp

00001 // sosemanuk.cpp - written and placed in the public domain by Wei Dai
00002 
00003 // use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM sosemanuk.cpp" to generate MASM code
00004 
00005 #include "pch.h"
00006 
00007 #ifndef CRYPTOPP_GENERATE_X64_MASM
00008 
00009 #include "sosemanuk.h"
00010 #include "misc.h"
00011 #include "cpu.h"
00012 
00013 #include "serpentp.h"
00014 
00015 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
00016 #include <emmintrin.h>
00017 #endif
00018 
00019 NAMESPACE_BEGIN(CryptoPP)
00020 
00021 void SosemanukPolicy::CipherSetKey(const NameValuePairs &params, const byte *userKey, size_t keylen)
00022 {
00023         Serpent_KeySchedule(m_key, 24, userKey, keylen);
00024 }
00025 
// Rebuilds the 12-word cipher state in m_state from an IV.
//   keystreamBuffer - not used by this function.
//   iv              - four little-endian 32-bit words are read from it.
// The IV words are pushed through three passes of eight round-macro groups
// (24 rounds in total) keyed from m_key; intermediate values are captured
// after the 12th and 18th rounds and the remaining words after the last one.
// NOTE(review): beforeS0/afterS0..afterS7, KX, LT and S0..S7 are macros defined
// outside this file (presumably serpentp.h). They appear to refer to the locals
// a, b, c, d, e and k by name, so those names must not change -- confirm.
00026 void SosemanukPolicy::CipherResynchronize(byte *keystreamBuffer, const byte *iv)
00027 {
00028         word32 a, b, c, d, e;
00029         
00030         typedef BlockGetAndPut<word32, LittleEndian> Block;
              // load the IV as four little-endian words into a..d
00031         Block::Get(iv)(a)(b)(c)(d);
00032 
              // k walks the expanded key; it is advanced by 32 words per pass
00033         const word32 *k = m_key;
              // i counts passes (1..3); each pass applies eight rounds
00034         unsigned int i=1;
00035 
00036         do
00037         {
00038                 beforeS0(KX); beforeS0(S0); afterS0(LT);
00039                 afterS0(KX); afterS0(S1); afterS1(LT);
00040                 if (i == 3)     // after 18th round
00041                 {
00042                         m_state[4] = b;
00043                         m_state[5] = e;
00044                         m_state[10] = c;
00045                         m_state[11] = a;
00046                 }
00047                 afterS1(KX); afterS1(S2); afterS2(LT);
00048                 afterS2(KX); afterS2(S3); afterS3(LT);
00049                 if (i == 2)     // after 12th round
00050                 {
00051                         m_state[6] = c;
00052                         m_state[7] = d;
00053                         m_state[8] = b;
00054                         m_state[9] = e;
00055                 }
00056                 afterS3(KX); afterS3(S4); afterS4(LT);
00057                 afterS4(KX); afterS4(S5); afterS5(LT);
00058                 afterS5(KX); afterS5(S6); afterS6(LT);
00059                 afterS6(KX); afterS6(S7); afterS7(LT);
00060 
                      // third pass done: leave the loop without permuting the variables
00061                 if (i == 3)
00062                         break;
00063 
                      // Permute the working variables before the next pass:
                      // new (a,b,c,d,e) = old (d,e,b,a,d). Presumably this puts the
                      // words back in the order beforeS0 expects -- confirm.
00064                 ++i;
00065                 c = b;
00066                 b = e;
00067                 e = d;
00068                 d = a;
00069                 a = e;
00070                 k += 32;
00071         }
00072         while (true);
00073 
              // final key mixing after the last pass
00074         afterS7(KX);
00075 
00076         m_state[0] = a;
00077         m_state[1] = b;
00078         m_state[2] = e;
00079         m_state[3] = d;
00080 
      // XMUX(c, x, y) yields x when the low bit of c is 0 and x ^ y when it is 1
      // (0 - (c & 1) is an all-zero or all-one mask). The macro stays defined and
      // is reused by the STEP macro further down in this file.
00081 #define XMUX(c, x, y)   (x ^ (y & (0 - (c & 1))))
              // one update of state words 11 and 10, using the same formulas that
              // STEP applies to its r1/r2 pair
00082         m_state[11] += XMUX(m_state[10], m_state[1], m_state[8]);
00083         m_state[10] = rotlFixed(m_state[10] * 0x54655307, 7);
00084 }
00085 
// Lookup tables used by the keystream generator.
// Entries 0..255 are read by MUL_A and entries 256..511 by DIV_A (the assembly
// path addresses the second half at byte offset 1024). The array has C linkage
// and is referenced by name from the assembly code in OperateKeystream.
00086 extern "C" {
00087 word32 s_sosemanukMulTables[512] = {
      // First half, x86/x64 flavour: MUL_A rotates the word left by 8 and indexes
      // with the resulting low byte, so each entry here equals the generic entry
      // with its own index XORed into the low byte (e.g. ...CF12 vs ...CF13 at
      // index 1, ...B514 vs ...B5EB at index 255).
00088 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
00089         0x00000000, 0xE19FCF12, 0x6B973724, 0x8A08F836, 
00090         0xD6876E48, 0x3718A15A, 0xBD10596C, 0x5C8F967E, 
00091         0x05A7DC90, 0xE4381382, 0x6E30EBB4, 0x8FAF24A6, 
00092         0xD320B2D8, 0x32BF7DCA, 0xB8B785FC, 0x59284AEE, 
00093         0x0AE71189, 0xEB78DE9B, 0x617026AD, 0x80EFE9BF, 
00094         0xDC607FC1, 0x3DFFB0D3, 0xB7F748E5, 0x566887F7, 
00095         0x0F40CD19, 0xEEDF020B, 0x64D7FA3D, 0x8548352F, 
00096         0xD9C7A351, 0x38586C43, 0xB2509475, 0x53CF5B67, 
00097         0x146722BB, 0xF5F8EDA9, 0x7FF0159F, 0x9E6FDA8D, 
00098         0xC2E04CF3, 0x237F83E1, 0xA9777BD7, 0x48E8B4C5, 
00099         0x11C0FE2B, 0xF05F3139, 0x7A57C90F, 0x9BC8061D, 
00100         0xC7479063, 0x26D85F71, 0xACD0A747, 0x4D4F6855, 
00101         0x1E803332, 0xFF1FFC20, 0x75170416, 0x9488CB04, 
00102         0xC8075D7A, 0x29989268, 0xA3906A5E, 0x420FA54C, 
00103         0x1B27EFA2, 0xFAB820B0, 0x70B0D886, 0x912F1794, 
00104         0xCDA081EA, 0x2C3F4EF8, 0xA637B6CE, 0x47A879DC, 
00105         0x28CE44DF, 0xC9518BCD, 0x435973FB, 0xA2C6BCE9, 
00106         0xFE492A97, 0x1FD6E585, 0x95DE1DB3, 0x7441D2A1, 
00107         0x2D69984F, 0xCCF6575D, 0x46FEAF6B, 0xA7616079, 
00108         0xFBEEF607, 0x1A713915, 0x9079C123, 0x71E60E31, 
00109         0x22295556, 0xC3B69A44, 0x49BE6272, 0xA821AD60, 
00110         0xF4AE3B1E, 0x1531F40C, 0x9F390C3A, 0x7EA6C328, 
00111         0x278E89C6, 0xC61146D4, 0x4C19BEE2, 0xAD8671F0, 
00112         0xF109E78E, 0x1096289C, 0x9A9ED0AA, 0x7B011FB8, 
00113         0x3CA96664, 0xDD36A976, 0x573E5140, 0xB6A19E52, 
00114         0xEA2E082C, 0x0BB1C73E, 0x81B93F08, 0x6026F01A, 
00115         0x390EBAF4, 0xD89175E6, 0x52998DD0, 0xB30642C2, 
00116         0xEF89D4BC, 0x0E161BAE, 0x841EE398, 0x65812C8A, 
00117         0x364E77ED, 0xD7D1B8FF, 0x5DD940C9, 0xBC468FDB, 
00118         0xE0C919A5, 0x0156D6B7, 0x8B5E2E81, 0x6AC1E193, 
00119         0x33E9AB7D, 0xD276646F, 0x587E9C59, 0xB9E1534B, 
00120         0xE56EC535, 0x04F10A27, 0x8EF9F211, 0x6F663D03, 
00121         0x50358817, 0xB1AA4705, 0x3BA2BF33, 0xDA3D7021, 
00122         0x86B2E65F, 0x672D294D, 0xED25D17B, 0x0CBA1E69, 
00123         0x55925487, 0xB40D9B95, 0x3E0563A3, 0xDF9AACB1, 
00124         0x83153ACF, 0x628AF5DD, 0xE8820DEB, 0x091DC2F9, 
00125         0x5AD2999E, 0xBB4D568C, 0x3145AEBA, 0xD0DA61A8, 
00126         0x8C55F7D6, 0x6DCA38C4, 0xE7C2C0F2, 0x065D0FE0, 
00127         0x5F75450E, 0xBEEA8A1C, 0x34E2722A, 0xD57DBD38, 
00128         0x89F22B46, 0x686DE454, 0xE2651C62, 0x03FAD370, 
00129         0x4452AAAC, 0xA5CD65BE, 0x2FC59D88, 0xCE5A529A, 
00130         0x92D5C4E4, 0x734A0BF6, 0xF942F3C0, 0x18DD3CD2, 
00131         0x41F5763C, 0xA06AB92E, 0x2A624118, 0xCBFD8E0A, 
00132         0x97721874, 0x76EDD766, 0xFCE52F50, 0x1D7AE042, 
00133         0x4EB5BB25, 0xAF2A7437, 0x25228C01, 0xC4BD4313, 
00134         0x9832D56D, 0x79AD1A7F, 0xF3A5E249, 0x123A2D5B, 
00135         0x4B1267B5, 0xAA8DA8A7, 0x20855091, 0xC11A9F83, 
00136         0x9D9509FD, 0x7C0AC6EF, 0xF6023ED9, 0x179DF1CB, 
00137         0x78FBCCC8, 0x996403DA, 0x136CFBEC, 0xF2F334FE, 
00138         0xAE7CA280, 0x4FE36D92, 0xC5EB95A4, 0x24745AB6, 
00139         0x7D5C1058, 0x9CC3DF4A, 0x16CB277C, 0xF754E86E, 
00140         0xABDB7E10, 0x4A44B102, 0xC04C4934, 0x21D38626, 
00141         0x721CDD41, 0x93831253, 0x198BEA65, 0xF8142577, 
00142         0xA49BB309, 0x45047C1B, 0xCF0C842D, 0x2E934B3F, 
00143         0x77BB01D1, 0x9624CEC3, 0x1C2C36F5, 0xFDB3F9E7, 
00144         0xA13C6F99, 0x40A3A08B, 0xCAAB58BD, 0x2B3497AF, 
00145         0x6C9CEE73, 0x8D032161, 0x070BD957, 0xE6941645, 
00146         0xBA1B803B, 0x5B844F29, 0xD18CB71F, 0x3013780D, 
00147         0x693B32E3, 0x88A4FDF1, 0x02AC05C7, 0xE333CAD5, 
00148         0xBFBC5CAB, 0x5E2393B9, 0xD42B6B8F, 0x35B4A49D, 
00149         0x667BFFFA, 0x87E430E8, 0x0DECC8DE, 0xEC7307CC, 
00150         0xB0FC91B2, 0x51635EA0, 0xDB6BA696, 0x3AF46984, 
00151         0x63DC236A, 0x8243EC78, 0x084B144E, 0xE9D4DB5C, 
00152         0xB55B4D22, 0x54C48230, 0xDECC7A06, 0x3F53B514,
      // First half, generic flavour: MUL_A indexes with the top byte (x >> 24)
      // and XORs the entry into (x << 8).
00153 #else
00154         0x00000000, 0xE19FCF13, 0x6B973726, 0x8A08F835,
00155         0xD6876E4C, 0x3718A15F, 0xBD10596A, 0x5C8F9679,
00156         0x05A7DC98, 0xE438138B, 0x6E30EBBE, 0x8FAF24AD,
00157         0xD320B2D4, 0x32BF7DC7, 0xB8B785F2, 0x59284AE1,
00158         0x0AE71199, 0xEB78DE8A, 0x617026BF, 0x80EFE9AC,
00159         0xDC607FD5, 0x3DFFB0C6, 0xB7F748F3, 0x566887E0,
00160         0x0F40CD01, 0xEEDF0212, 0x64D7FA27, 0x85483534,
00161         0xD9C7A34D, 0x38586C5E, 0xB250946B, 0x53CF5B78,
00162         0x1467229B, 0xF5F8ED88, 0x7FF015BD, 0x9E6FDAAE,
00163         0xC2E04CD7, 0x237F83C4, 0xA9777BF1, 0x48E8B4E2,
00164         0x11C0FE03, 0xF05F3110, 0x7A57C925, 0x9BC80636,
00165         0xC747904F, 0x26D85F5C, 0xACD0A769, 0x4D4F687A,
00166         0x1E803302, 0xFF1FFC11, 0x75170424, 0x9488CB37,
00167         0xC8075D4E, 0x2998925D, 0xA3906A68, 0x420FA57B,
00168         0x1B27EF9A, 0xFAB82089, 0x70B0D8BC, 0x912F17AF,
00169         0xCDA081D6, 0x2C3F4EC5, 0xA637B6F0, 0x47A879E3,
00170         0x28CE449F, 0xC9518B8C, 0x435973B9, 0xA2C6BCAA,
00171         0xFE492AD3, 0x1FD6E5C0, 0x95DE1DF5, 0x7441D2E6,
00172         0x2D699807, 0xCCF65714, 0x46FEAF21, 0xA7616032,
00173         0xFBEEF64B, 0x1A713958, 0x9079C16D, 0x71E60E7E,
00174         0x22295506, 0xC3B69A15, 0x49BE6220, 0xA821AD33,
00175         0xF4AE3B4A, 0x1531F459, 0x9F390C6C, 0x7EA6C37F,
00176         0x278E899E, 0xC611468D, 0x4C19BEB8, 0xAD8671AB,
00177         0xF109E7D2, 0x109628C1, 0x9A9ED0F4, 0x7B011FE7,
00178         0x3CA96604, 0xDD36A917, 0x573E5122, 0xB6A19E31,
00179         0xEA2E0848, 0x0BB1C75B, 0x81B93F6E, 0x6026F07D,
00180         0x390EBA9C, 0xD891758F, 0x52998DBA, 0xB30642A9,
00181         0xEF89D4D0, 0x0E161BC3, 0x841EE3F6, 0x65812CE5,
00182         0x364E779D, 0xD7D1B88E, 0x5DD940BB, 0xBC468FA8,
00183         0xE0C919D1, 0x0156D6C2, 0x8B5E2EF7, 0x6AC1E1E4,
00184         0x33E9AB05, 0xD2766416, 0x587E9C23, 0xB9E15330,
00185         0xE56EC549, 0x04F10A5A, 0x8EF9F26F, 0x6F663D7C,
00186         0x50358897, 0xB1AA4784, 0x3BA2BFB1, 0xDA3D70A2,
00187         0x86B2E6DB, 0x672D29C8, 0xED25D1FD, 0x0CBA1EEE,
00188         0x5592540F, 0xB40D9B1C, 0x3E056329, 0xDF9AAC3A,
00189         0x83153A43, 0x628AF550, 0xE8820D65, 0x091DC276,
00190         0x5AD2990E, 0xBB4D561D, 0x3145AE28, 0xD0DA613B,
00191         0x8C55F742, 0x6DCA3851, 0xE7C2C064, 0x065D0F77,
00192         0x5F754596, 0xBEEA8A85, 0x34E272B0, 0xD57DBDA3,
00193         0x89F22BDA, 0x686DE4C9, 0xE2651CFC, 0x03FAD3EF,
00194         0x4452AA0C, 0xA5CD651F, 0x2FC59D2A, 0xCE5A5239,
00195         0x92D5C440, 0x734A0B53, 0xF942F366, 0x18DD3C75,
00196         0x41F57694, 0xA06AB987, 0x2A6241B2, 0xCBFD8EA1,
00197         0x977218D8, 0x76EDD7CB, 0xFCE52FFE, 0x1D7AE0ED,
00198         0x4EB5BB95, 0xAF2A7486, 0x25228CB3, 0xC4BD43A0,
00199         0x9832D5D9, 0x79AD1ACA, 0xF3A5E2FF, 0x123A2DEC,
00200         0x4B12670D, 0xAA8DA81E, 0x2085502B, 0xC11A9F38,
00201         0x9D950941, 0x7C0AC652, 0xF6023E67, 0x179DF174,
00202         0x78FBCC08, 0x9964031B, 0x136CFB2E, 0xF2F3343D,
00203         0xAE7CA244, 0x4FE36D57, 0xC5EB9562, 0x24745A71,
00204         0x7D5C1090, 0x9CC3DF83, 0x16CB27B6, 0xF754E8A5,
00205         0xABDB7EDC, 0x4A44B1CF, 0xC04C49FA, 0x21D386E9,
00206         0x721CDD91, 0x93831282, 0x198BEAB7, 0xF81425A4,
00207         0xA49BB3DD, 0x45047CCE, 0xCF0C84FB, 0x2E934BE8,
00208         0x77BB0109, 0x9624CE1A, 0x1C2C362F, 0xFDB3F93C,
00209         0xA13C6F45, 0x40A3A056, 0xCAAB5863, 0x2B349770,
00210         0x6C9CEE93, 0x8D032180, 0x070BD9B5, 0xE69416A6,
00211         0xBA1B80DF, 0x5B844FCC, 0xD18CB7F9, 0x301378EA,
00212         0x693B320B, 0x88A4FD18, 0x02AC052D, 0xE333CA3E,
00213         0xBFBC5C47, 0x5E239354, 0xD42B6B61, 0x35B4A472,
00214         0x667BFF0A, 0x87E43019, 0x0DECC82C, 0xEC73073F,
00215         0xB0FC9146, 0x51635E55, 0xDB6BA660, 0x3AF46973,
00216         0x63DC2392, 0x8243EC81, 0x084B14B4, 0xE9D4DBA7,
00217         0xB55B4DDE, 0x54C482CD, 0xDECC7AF8, 0x3F53B5EB,
00218 #endif
      // Second half (entries 256..511), shared by all platforms: DIV_A indexes it
      // with the low byte of its argument and XORs the entry into (x >> 8).
00219         0x00000000, 0x180F40CD, 0x301E8033, 0x2811C0FE,
00220         0x603CA966, 0x7833E9AB, 0x50222955, 0x482D6998,
00221         0xC078FBCC, 0xD877BB01, 0xF0667BFF, 0xE8693B32,
00222         0xA04452AA, 0xB84B1267, 0x905AD299, 0x88559254,
00223         0x29F05F31, 0x31FF1FFC, 0x19EEDF02, 0x01E19FCF,
00224         0x49CCF657, 0x51C3B69A, 0x79D27664, 0x61DD36A9,
00225         0xE988A4FD, 0xF187E430, 0xD99624CE, 0xC1996403,
00226         0x89B40D9B, 0x91BB4D56, 0xB9AA8DA8, 0xA1A5CD65,
00227         0x5249BE62, 0x4A46FEAF, 0x62573E51, 0x7A587E9C,
00228         0x32751704, 0x2A7A57C9, 0x026B9737, 0x1A64D7FA,
00229         0x923145AE, 0x8A3E0563, 0xA22FC59D, 0xBA208550,
00230         0xF20DECC8, 0xEA02AC05, 0xC2136CFB, 0xDA1C2C36,
00231         0x7BB9E153, 0x63B6A19E, 0x4BA76160, 0x53A821AD,
00232         0x1B854835, 0x038A08F8, 0x2B9BC806, 0x339488CB,
00233         0xBBC11A9F, 0xA3CE5A52, 0x8BDF9AAC, 0x93D0DA61,
00234         0xDBFDB3F9, 0xC3F2F334, 0xEBE333CA, 0xF3EC7307,
00235         0xA492D5C4, 0xBC9D9509, 0x948C55F7, 0x8C83153A,
00236         0xC4AE7CA2, 0xDCA13C6F, 0xF4B0FC91, 0xECBFBC5C,
00237         0x64EA2E08, 0x7CE56EC5, 0x54F4AE3B, 0x4CFBEEF6,
00238         0x04D6876E, 0x1CD9C7A3, 0x34C8075D, 0x2CC74790,
00239         0x8D628AF5, 0x956DCA38, 0xBD7C0AC6, 0xA5734A0B,
00240         0xED5E2393, 0xF551635E, 0xDD40A3A0, 0xC54FE36D,
00241         0x4D1A7139, 0x551531F4, 0x7D04F10A, 0x650BB1C7,
00242         0x2D26D85F, 0x35299892, 0x1D38586C, 0x053718A1,
00243         0xF6DB6BA6, 0xEED42B6B, 0xC6C5EB95, 0xDECAAB58,
00244         0x96E7C2C0, 0x8EE8820D, 0xA6F942F3, 0xBEF6023E,
00245         0x36A3906A, 0x2EACD0A7, 0x06BD1059, 0x1EB25094,
00246         0x569F390C, 0x4E9079C1, 0x6681B93F, 0x7E8EF9F2,
00247         0xDF2B3497, 0xC724745A, 0xEF35B4A4, 0xF73AF469,
00248         0xBF179DF1, 0xA718DD3C, 0x8F091DC2, 0x97065D0F,
00249         0x1F53CF5B, 0x075C8F96, 0x2F4D4F68, 0x37420FA5,
00250         0x7F6F663D, 0x676026F0, 0x4F71E60E, 0x577EA6C3,
00251         0xE18D0321, 0xF98243EC, 0xD1938312, 0xC99CC3DF,
00252         0x81B1AA47, 0x99BEEA8A, 0xB1AF2A74, 0xA9A06AB9,
00253         0x21F5F8ED, 0x39FAB820, 0x11EB78DE, 0x09E43813,
00254         0x41C9518B, 0x59C61146, 0x71D7D1B8, 0x69D89175,
00255         0xC87D5C10, 0xD0721CDD, 0xF863DC23, 0xE06C9CEE,
00256         0xA841F576, 0xB04EB5BB, 0x985F7545, 0x80503588,
00257         0x0805A7DC, 0x100AE711, 0x381B27EF, 0x20146722,
00258         0x68390EBA, 0x70364E77, 0x58278E89, 0x4028CE44,
00259         0xB3C4BD43, 0xABCBFD8E, 0x83DA3D70, 0x9BD57DBD,
00260         0xD3F81425, 0xCBF754E8, 0xE3E69416, 0xFBE9D4DB,
00261         0x73BC468F, 0x6BB30642, 0x43A2C6BC, 0x5BAD8671,
00262         0x1380EFE9, 0x0B8FAF24, 0x239E6FDA, 0x3B912F17,
00263         0x9A34E272, 0x823BA2BF, 0xAA2A6241, 0xB225228C,
00264         0xFA084B14, 0xE2070BD9, 0xCA16CB27, 0xD2198BEA,
00265         0x5A4C19BE, 0x42435973, 0x6A52998D, 0x725DD940,
00266         0x3A70B0D8, 0x227FF015, 0x0A6E30EB, 0x12617026,
00267         0x451FD6E5, 0x5D109628, 0x750156D6, 0x6D0E161B,
00268         0x25237F83, 0x3D2C3F4E, 0x153DFFB0, 0x0D32BF7D,
00269         0x85672D29, 0x9D686DE4, 0xB579AD1A, 0xAD76EDD7,
00270         0xE55B844F, 0xFD54C482, 0xD545047C, 0xCD4A44B1,
00271         0x6CEF89D4, 0x74E0C919, 0x5CF109E7, 0x44FE492A,
00272         0x0CD320B2, 0x14DC607F, 0x3CCDA081, 0x24C2E04C,
00273         0xAC977218, 0xB49832D5, 0x9C89F22B, 0x8486B2E6,
00274         0xCCABDB7E, 0xD4A49BB3, 0xFCB55B4D, 0xE4BA1B80,
00275         0x17566887, 0x0F59284A, 0x2748E8B4, 0x3F47A879,
00276         0x776AC1E1, 0x6F65812C, 0x477441D2, 0x5F7B011F,
00277         0xD72E934B, 0xCF21D386, 0xE7301378, 0xFF3F53B5,
00278         0xB7123A2D, 0xAF1D7AE0, 0x870CBA1E, 0x9F03FAD3,
00279         0x3EA637B6, 0x26A9777B, 0x0EB8B785, 0x16B7F748,
00280         0x5E9A9ED0, 0x4695DE1D, 0x6E841EE3, 0x768B5E2E,
00281         0xFEDECC7A, 0xE6D18CB7, 0xCEC04C49, 0xD6CF0C84,
00282         0x9EE2651C, 0x86ED25D1, 0xAEFCE52F, 0xB6F3A5E2
00283 };
00284 }
00285 
00286 #if CRYPTOPP_BOOL_X86 || CRYPTOPP_BOOL_X64
00287 unsigned int SosemanukPolicy::GetAlignment() const
00288 {
00289 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00290 #ifdef __INTEL_COMPILER
00291         if (HasSSE2() && !IsP4())       // Intel compiler produces faster code for this algorithm on the P4
00292 #else
00293         if (HasSSE2())
00294 #endif
00295                 return 16;
00296         else
00297 #endif
00298                 return 1;
00299 }
00300 
00301 unsigned int SosemanukPolicy::GetOptimalBlockSize() const
00302 {
00303 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00304 #ifdef __INTEL_COMPILER
00305         if (HasSSE2() && !IsP4())       // Intel compiler produces faster code for this algorithm on the P4
00306 #else
00307         if (HasSSE2())
00308 #endif
00309                 return 4*BYTES_PER_ITERATION;
00310         else
00311 #endif
00312                 return BYTES_PER_ITERATION;
00313 }
00314 #endif
00315 
// Declaration of the stand-alone assembly routine that OperateKeystream calls
// when CRYPTOPP_X64_MASM_AVAILABLE is defined. Its body is the MASM text this
// same file produces when preprocessed with CRYPTOPP_GENERATE_X64_MASM.
// Arguments, in order: iteration count, input pointer, output pointer, and a
// pointer to the cipher state words.
00316 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00317 extern "C" {
00318 void Sosemanuk_OperateKeystream(size_t iterationCount, const byte *input, byte *output, word32 *state);
00319 }
00320 #endif
00321 
00322 #pragma warning(disable: 4731)  // frame pointer register 'ebp' modified by inline assembly code
00323 
// Produces keystream and advances the cipher state in m_state.
//   operation      - selects how the keystream is delivered; it is consumed by
//                    the CRYPTOPP_KEYSTREAM_OUTPUT_* macros in the portable path.
//   output         - destination buffer.
//   input          - data to XOR with the keystream; the assembly path treats a
//                    null pointer as "write raw keystream" (see label 4).
//   iterationCount - number of iterations; each one yields 20 32-bit words.
// The body is assembled from three alternatives chosen by the preprocessor:
//   1. CRYPTOPP_X64_MASM_AVAILABLE: call the external MASM routine and return.
//   2. CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE: inline SSE2 assembly (GCC or MSVC
//      syntax), taken at run time when HasSSE2() holds.
//   3. Portable C++ built from the STEP/OUTPUT4 macros.
// When the file is preprocessed with CRYPTOPP_GENERATE_X64_MASM only the
// assembly text between the markers below survives and forms the MASM source.
00324 void SosemanukPolicy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
00325 {
      // the #endif below closes the #ifndef CRYPTOPP_GENERATE_X64_MASM opened near
      // the top of the file
00326 #endif  // #ifdef CRYPTOPP_GENERATE_X64_MASM
00327 
00328 #ifdef CRYPTOPP_X64_MASM_AVAILABLE
00329         Sosemanuk_OperateKeystream(iterationCount, input, output, m_state.data());
00330         return;
00331 #endif
00332 
      // MASM build: procedure prologue. It reserves the workspace plus room for
      // xmm6/xmm7, then copies r8 into rdi and r9 into rax so that output and state
      // sit in the registers the shared code below expects (presumably r8/r9 carry
      // the 3rd/4th arguments under the Win64 convention -- confirm).
00333 #if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE
00334 #ifdef CRYPTOPP_GENERATE_X64_MASM
00335                 ALIGN   8
00336         Sosemanuk_OperateKeystream      PROC FRAME
00337                 rex_push_reg rsi
00338                 push_reg rdi
00339                 alloc_stack(80*4*2+12*4+8*WORD_SZ + 2*16+8)
00340                 save_xmm128 xmm6, 02f0h
00341                 save_xmm128 xmm7, 0300h
00342                 .endprolog
00343                 mov             rdi, r8
00344                 mov             rax, r9
00345 #else
00346 #ifdef __INTEL_COMPILER
00347         if (HasSSE2() && !IsP4())       // Intel compiler produces faster code for this algorithm on the P4
00348 #else
00349         if (HasSSE2())
00350 #endif
00351         {
00352 #ifdef __GNUC__
              // GCC on x64 cannot carve the workspace out of the stack inside the asm
              // block, so a local array is passed in as operand %5 instead
00353         #if CRYPTOPP_BOOL_X64
00354                 __m128i workspace[(80*4*2+12*4+8*WORD_SZ)/16];
00355         #endif
00356                 __asm__ __volatile__
00357                 (
00358                 ".intel_syntax noprefix;"
00359                 AS_PUSH_IF86(   bx)
00360 #else
                      // MSVC inline assembly: load the arguments into the registers that
                      // the GCC build receives through its operand constraints
00361                 word32 *state = m_state;
00362                 AS2(    mov             WORD_REG(ax), state)
00363                 AS2(    mov             WORD_REG(di), output)
00364                 AS2(    mov             WORD_REG(dx), input)
00365                 AS2(    mov             WORD_REG(cx), iterationCount)
00366 #endif
00367 #endif  // #ifdef CRYPTOPP_GENERATE_X64_MASM
00368 
      // Base address of the scratch workspace: asm operand %5 for GCC/x64, the
      // stack pointer otherwise.
00369 #if defined(__GNUC__) && CRYPTOPP_BOOL_X64
00370         #define SSE2_workspace %5
00371 #else
00372         #define SSE2_workspace WORD_REG(sp)
00373 #endif
00374 
      // Workspace layout: slot 0 holds the saved esp on x86, slots 1..7 are the
      // pointer-sized locals named below, then a 12-word copy of the state,
      // then the u and v arrays (80 words each).
00375 #define SSE2_output                     WORD_PTR [SSE2_workspace+1*WORD_SZ]
00376 #define SSE2_input                      WORD_PTR [SSE2_workspace+2*WORD_SZ]
00377 #define SSE2_wordsLeft          WORD_PTR [SSE2_workspace+3*WORD_SZ]
00378 #define SSE2_diEnd                      WORD_PTR [SSE2_workspace+4*WORD_SZ]
00379 #define SSE2_pMulTables         WORD_PTR [SSE2_workspace+5*WORD_SZ]
00380 #define SSE2_state                      WORD_PTR [SSE2_workspace+6*WORD_SZ]
00381 #define SSE2_wordsLeft2         WORD_PTR [SSE2_workspace+7*WORD_SZ]
00382 #define SSE2_stateCopy          SSE2_workspace + 8*WORD_SZ
00383 #define SSE2_uvStart            SSE2_stateCopy + 12*4
00384 
      // x86 only: save the frame pointer, align esp to 16 bytes, reserve the
      // workspace and store the original esp in slot 0 so it can be restored later
00385 #if CRYPTOPP_BOOL_X86
00386                 AS_PUSH_IF86(   bp)
00387                 AS2(    mov             AS_REG_6, esp)
00388                 AS2(    and             esp, -16)
00389                 AS2(    sub             esp, 80*4*2+12*4+8*WORD_SZ)     // 80 v's, 80 u's, 12 state, 8 locals
00390                 AS2(    mov             [esp], AS_REG_6)
00391 #endif
00392                 AS2(    mov             SSE2_output, WORD_REG(di))
00393                 AS2(    mov             SSE2_input, WORD_REG(dx))
00394                 AS2(    mov             SSE2_state, WORD_REG(ax))
00395 #ifndef _MSC_VER
00396                 AS2(    mov             SSE2_pMulTables, WORD_REG(si))
00397 #endif
                      // wordsLeft = iterationCount * 20 (cx*5, then *4)
00398                 AS2(    lea             WORD_REG(cx), [4*WORD_REG(cx)+WORD_REG(cx)])
00399                 AS2(    lea             WORD_REG(si), [4*WORD_REG(cx)])
00400                 AS2(    mov             SSE2_wordsLeft, WORD_REG(si))
00401                 AS2(    movdqa  xmm0, [WORD_REG(ax)+0*16])              // copy state to stack to save a register
00402                 AS2(    movdqa  [SSE2_stateCopy+0*16], xmm0)
00403                 AS2(    movdqa  xmm0, [WORD_REG(ax)+1*16])
00404                 AS2(    movdqa  [SSE2_stateCopy+1*16], xmm0)
00405                 AS2(    movq    xmm0, QWORD PTR [WORD_REG(ax)+2*16])
00406                 AS2(    movq    QWORD PTR [SSE2_stateCopy+2*16], xmm0)
00407                 AS2(    psrlq   xmm0, 32)
00408                 AS2(    movd    AS_REG_6d, xmm0)                                // s(9)
                      // state words 10 and 11 live in ecx/edx for the whole routine
00409                 AS2(    mov             ecx, [WORD_REG(ax)+10*4])
00410                 AS2(    mov             edx, [WORD_REG(ax)+11*4])
00411                 AS2(    pcmpeqb xmm7, xmm7)                             // all ones
00412 
      // s(i): word i (mod 10) of the state copy. u(j)/v(j): slots in the u and v
      // arrays, laid out as four rows of 20 words so that the second inner loop can
      // read four consecutive words of each row with one aligned load.
00413 #define s(i)    SSE2_stateCopy + ASM_MOD(i,10)*4
00414 #define u(j)    WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4
00415 #define v(j)    WORD_REG(di) + (ASM_MOD(j,4)*20 + (j/4)) * 4 + 80*4
00416 
      // R1<j>/R2<j> name the two registers holding state words 10/11; the roles of
      // ecx and edx swap with the parity j, like r1(i)/r2(i) in the C++ path.
00417 #define R10 ecx
00418 #define R11 edx
00419 #define R20 edx
00420 #define R21 ecx
00421 
      // SSE2_STEP(i, j): one state update step in scalar registers. It stores
      // v(i) = s(i) and u(i) = (s9 + R2) ^ R1, updates R1 and R2, and writes the new
      // word back to s(i). AS_REG_6d carries the most recently produced word.
00422 #define SSE2_STEP(i, j) \
00423         AS2(    mov             eax, [s(i+0)])\
00424         AS2(    mov             [v(i)], eax)\
00425         AS2(    rol             eax, 8)\
00426         AS2(    lea             AS_REG_7d, [AS_REG_6d + R2##j])\
00427         AS2(    xor             AS_REG_7d, R1##j)\
00428         AS2(    mov             [u(i)], AS_REG_7d)\
00429         AS2(    mov             AS_REG_7d, 1)\
00430         AS2(    and             AS_REG_7d, R2##j)\
00431         AS1(    neg             AS_REG_7d)\
00432         AS2(    and             AS_REG_7d, AS_REG_6d)\
00433         AS2(    xor             AS_REG_6d, eax)\
00434         AS2(    movzx   eax, al)\
00435         AS2(    xor             AS_REG_6d, [WORD_REG(si)+WORD_REG(ax)*4])\
00436         AS2(    mov             eax, [s(i+3)])\
00437         AS2(    xor             AS_REG_7d, [s(i+2)])\
00438         AS2(    add             R1##j, AS_REG_7d)\
00439         AS2(    movzx   AS_REG_7d, al)\
00440         AS2(    shr             eax, 8)\
00441         AS2(    xor             AS_REG_6d, [WORD_REG(si)+1024+AS_REG_7*4])\
00442         AS2(    xor             AS_REG_6d, eax)\
00443         AS2(    imul    R2##j, AS_HEX(54655307))\
00444         AS2(    rol             R2##j, 7)\
00445         AS2(    mov             [s(i+0)], AS_REG_6d)\
00446 
00447                 ASL(2)  // outer loop, each iteration of this processes 80 words
00448                 AS2(    lea             WORD_REG(di), [SSE2_uvStart])   // start of v and u
                      // wordsLeft2 = min(wordsLeft, 80)
00449                 AS2(    mov             WORD_REG(ax), 80)
00450                 AS2(    cmp             WORD_REG(si), 80)
00451                 AS2(    cmovg   WORD_REG(si), WORD_REG(ax))
00452                 AS2(    mov             SSE2_wordsLeft2, WORD_REG(si))
00453                 AS2(    lea             WORD_REG(si), [WORD_REG(di)+WORD_REG(si)])              // use to end first inner loop
00454                 AS2(    mov             SSE2_diEnd, WORD_REG(si))
                      // si now points at the multiplication tables for the SSE2_STEP lookups
00455 #ifdef _MSC_VER
00456                 AS2(    lea             WORD_REG(si), s_sosemanukMulTables)
00457 #else
00458                 AS2(    mov             WORD_REG(si), SSE2_pMulTables)
00459 #endif
00460 
00461                 ASL(0)  // first inner loop, 20 words each, 4 iterations
00462                 SSE2_STEP(0, 0)
00463                 SSE2_STEP(1, 1)
00464                 SSE2_STEP(2, 0)
00465                 SSE2_STEP(3, 1)
00466                 SSE2_STEP(4, 0)
00467                 SSE2_STEP(5, 1)
00468                 SSE2_STEP(6, 0)
00469                 SSE2_STEP(7, 1)
00470                 SSE2_STEP(8, 0)
00471                 SSE2_STEP(9, 1)
00472                 SSE2_STEP(10, 0)
00473                 SSE2_STEP(11, 1)
00474                 SSE2_STEP(12, 0)
00475                 SSE2_STEP(13, 1)
00476                 SSE2_STEP(14, 0)
00477                 SSE2_STEP(15, 1)
00478                 SSE2_STEP(16, 0)
00479                 SSE2_STEP(17, 1)
00480                 SSE2_STEP(18, 0)
00481                 SSE2_STEP(19, 1)
00482                 // loop
00483                 AS2(    add             WORD_REG(di), 5*4)
00484                 AS2(    cmp             WORD_REG(di), SSE2_diEnd)
00485                 ASJ(    jne,    0, b)
00486 
                      // set up the output phase: ax = input, AS_REG_7 = output,
                      // di = start of u/v, si = words to emit in this outer iteration
00487                 AS2(    mov             WORD_REG(ax), SSE2_input)
00488                 AS2(    mov             AS_REG_7, SSE2_output)
00489                 AS2(    lea             WORD_REG(di), [SSE2_uvStart])           // start of v and u
00490                 AS2(    mov             WORD_REG(si), SSE2_wordsLeft2)
00491 
00492                 ASL(1)  // second inner loop, 16 words each, 5 iterations
                      // load four words from each of the four u rows
00493                 AS2(    movdqa  xmm0, [WORD_REG(di)+0*20*4])
00494                 AS2(    movdqa  xmm2, [WORD_REG(di)+2*20*4])
00495                 AS2(    movdqa  xmm3, [WORD_REG(di)+3*20*4])
00496                 AS2(    movdqa  xmm1, [WORD_REG(di)+1*20*4])
00497                 // S2
00498                 AS2(    movdqa  xmm4, xmm0)
00499                 AS2(    pand    xmm0, xmm2)
00500                 AS2(    pxor    xmm0, xmm3)
00501                 AS2(    pxor    xmm2, xmm1)
00502                 AS2(    pxor    xmm2, xmm0)
00503                 AS2(    por             xmm3, xmm4)
00504                 AS2(    pxor    xmm3, xmm1)
00505                 AS2(    pxor    xmm4, xmm2)
00506                 AS2(    movdqa  xmm1, xmm3)
00507                 AS2(    por             xmm3, xmm4)
00508                 AS2(    pxor    xmm3, xmm0)
00509                 AS2(    pand    xmm0, xmm1)
00510                 AS2(    pxor    xmm4, xmm0)
00511                 AS2(    pxor    xmm1, xmm3)
00512                 AS2(    pxor    xmm1, xmm4)
                      // bitwise NOT of xmm4 (xmm7 is all ones)
00513                 AS2(    pxor    xmm4, xmm7)
00514                 // xor with v
00515                 AS2(    pxor    xmm2, [WORD_REG(di)+80*4])
00516                 AS2(    pxor    xmm3, [WORD_REG(di)+80*5])
00517                 AS2(    pxor    xmm1, [WORD_REG(di)+80*6])
00518                 AS2(    pxor    xmm4, [WORD_REG(di)+80*7])
00519                 // exit loop early if less than 16 words left to output
00520                 // this is necessary because block size is 20 words, and we output 16 words in each iteration of this loop
00521                 AS2(    cmp             WORD_REG(si), 16)
00522                 ASJ(    jl,             4, f)
00523                 // unpack
00524                 AS2(    movdqa          xmm6, xmm2)
00525                 AS2(    punpckldq       xmm2, xmm3)
00526                 AS2(    movdqa          xmm5, xmm1)
00527                 AS2(    punpckldq       xmm1, xmm4)
00528                 AS2(    movdqa          xmm0, xmm2)
00529                 AS2(    punpcklqdq      xmm2, xmm1)
00530                 AS2(    punpckhqdq      xmm0, xmm1)
00531                 AS2(    punpckhdq       xmm6, xmm3)
00532                 AS2(    punpckhdq       xmm5, xmm4)
00533                 AS2(    movdqa          xmm3, xmm6)
00534                 AS2(    punpcklqdq      xmm6, xmm5)
00535                 AS2(    punpckhqdq      xmm3, xmm5)
00536                 // output keystream
00537                 AS_XMM_OUTPUT4(SSE2_Sosemanuk_Output, WORD_REG(ax), AS_REG_7, 2,0,6,3, 1, 0,1,2,3, 4)
00538 
00539                 // loop
00540                 AS2(    add             WORD_REG(di), 4*4)
00541                 AS2(    sub             WORD_REG(si), 16)
00542                 ASJ(    jnz,    1, b)
00543 
00544                 // outer loop
                      // wordsLeft -= 80; finish when nothing remains, otherwise store the
                      // advanced input/output pointers and start the next 80-word batch
00545                 AS2(    mov             WORD_REG(si), SSE2_wordsLeft)
00546                 AS2(    sub             WORD_REG(si), 80)
00547                 ASJ(    jz,             6, f)
00548                 AS2(    mov             SSE2_wordsLeft, WORD_REG(si))
00549                 AS2(    mov             SSE2_input, WORD_REG(ax))
00550                 AS2(    mov             SSE2_output, AS_REG_7)
00551                 ASJ(    jmp,    2, b)
00552 
00553                 ASL(4)  // final output of less than 16 words
                      // a null input pointer skips the XOR with input and writes the
                      // keystream words as they are
00554                 AS2(    test    WORD_REG(ax), WORD_REG(ax))
00555                 ASJ(    jz,             5, f)
00556                 AS2(    movd    xmm0, dword ptr [WORD_REG(ax)+0*4])
00557                 AS2(    pxor    xmm2, xmm0)
00558                 AS2(    movd    xmm0, dword ptr [WORD_REG(ax)+1*4])
00559                 AS2(    pxor    xmm3, xmm0)
00560                 AS2(    movd    xmm0, dword ptr [WORD_REG(ax)+2*4])
00561                 AS2(    pxor    xmm1, xmm0)
00562                 AS2(    movd    xmm0, dword ptr [WORD_REG(ax)+3*4])
00563                 AS2(    pxor    xmm4, xmm0)
00564                 AS2(    add             WORD_REG(ax), 16)
00565                 ASL(5)
                      // write the low dword of each of the four registers, then shift
                      // every register down by one dword and repeat until si reaches 0
00566                 AS2(    movd    dword ptr [AS_REG_7+0*4], xmm2)
00567                 AS2(    movd    dword ptr [AS_REG_7+1*4], xmm3)
00568                 AS2(    movd    dword ptr [AS_REG_7+2*4], xmm1)
00569                 AS2(    movd    dword ptr [AS_REG_7+3*4], xmm4)
00570                 AS2(    sub             WORD_REG(si), 4)
00571                 ASJ(    jz,             6, f)
00572                 AS2(    add             AS_REG_7, 16)
00573                 AS2(    psrldq  xmm2, 4)
00574                 AS2(    psrldq  xmm3, 4)
00575                 AS2(    psrldq  xmm1, 4)
00576                 AS2(    psrldq  xmm4, 4)
00577                 ASJ(    jmp,    4, b)
00578 
00579                 ASL(6)  // save state
                      // copy the ten-word state copy back to the caller's state, followed by
                      // words 10 and 11 from ecx/edx
00580                 AS2(    mov             AS_REG_6, SSE2_state)
00581                 AS2(    movdqa  xmm0, [SSE2_stateCopy+0*16])
00582                 AS2(    movdqa  [AS_REG_6+0*16], xmm0)
00583                 AS2(    movdqa  xmm0, [SSE2_stateCopy+1*16])
00584                 AS2(    movdqa  [AS_REG_6+1*16], xmm0)
00585                 AS2(    movq    xmm0, QWORD PTR [SSE2_stateCopy+2*16])
00586                 AS2(    movq    QWORD PTR [AS_REG_6+2*16], xmm0)
00587                 AS2(    mov             [AS_REG_6+10*4], ecx)
00588                 AS2(    mov             [AS_REG_6+11*4], edx)
00589 
                      // x86 only: pop the original esp saved in workspace slot 0, then the
                      // saved frame pointer
00590                 AS_POP_IF86(    sp)
00591                 AS_POP_IF86(    bp)
00592 
      // GCC: close the asm statement. Inputs: state in ax, iterationCount in cx,
      // the tables in si, output in di, input in dx and, on x64, the workspace
      // as operand %5.
00593 #ifdef __GNUC__
00594                 AS_POP_IF86(    bx)
00595                 ".att_syntax prefix;"
00596                         :
00597                         : "a" (m_state.m_ptr), "c" (iterationCount), "S" (s_sosemanukMulTables), "D" (output), "d" (input)
00598         #if CRYPTOPP_BOOL_X64
00599                         , "r" (workspace)
00600                         : "memory", "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
00601         #else
00602                         : "memory", "cc"
00603         #endif
00604                 );
00605 #endif
      // MASM build: epilogue restoring xmm6/xmm7, the stack and rdi/rsi.
      // C++ build: close the SSE2 branch and fall into the portable branch via else.
00606 #ifdef CRYPTOPP_GENERATE_X64_MASM
00607         movdqa  xmm6, [rsp + 02f0h]
00608         movdqa  xmm7, [rsp + 0300h]
00609         add             rsp, 80*4*2+12*4+8*WORD_SZ + 2*16+8
00610         pop             rdi
00611         pop             rsi
00612         ret
00613         Sosemanuk_OperateKeystream ENDP
00614 #else
00615         }
00616         else
00617 #endif
00618 #endif
00619 #ifndef CRYPTOPP_GENERATE_X64_MASM
00620         {
      // MUL_A(x): table-driven transform using the first 256 table entries.
      // The x86/x64 form rotates x in place (it modifies its argument) and
      // indexes with the new low byte; the generic form indexes with the top byte.
00621 #if CRYPTOPP_BOOL_X86 | CRYPTOPP_BOOL_X64
00622 #define MUL_A(x)    (x = rotlFixed(x, 8), x ^ s_sosemanukMulTables[byte(x)])
00623 #else
00624 #define MUL_A(x)    (((x) << 8) ^ s_sosemanukMulTables[(x) >> 24])
00625 #endif
00626 
      // DIV_A(x): companion transform using table entries 256..511
00627 #define DIV_A(x)    (((x) >> 8) ^ s_sosemanukMulTables[256 + byte(x)])
00628 
      // r1(i)/r2(i): pick reg1/reg2 with their roles swapped on odd i, so no data
      // has to move between the two registers from step to step
00629 #define r1(i) ((i%2) ? reg2 : reg1)
00630 #define r2(i) ((i%2) ? reg1 : reg2)
00631 
      // STEP: one state update step. Only x0, x2, x3 and x9 are used; they name
      // the s<n> locals acting as positions 0, 2, 3 and 9 for this step. It
      // produces u and v, replaces s<x0> with the new word and updates r1/r2.
00632 #define STEP(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, v, u)      \
00633                 u = (s##x9 + r2(x0)) ^ r1(x0);\
00634                 v = s##x0;\
00635                 s##x0 = MUL_A(s##x0) ^ DIV_A(s##x3) ^ s##x9;\
00636                 r1(x0) += XMUX(r2(x0), s##x2, s##x9);\
00637                 r2(x0) = rotlFixed(r2(x0) * 0x54655307, 7);\
00638 
      // SOSEMANUK_OUTPUT: emits four little-endian keystream words, pairing the
      // S2 results (held in u2, u3, u1, u4 after OUTPUT4 runs S2) with v0..v3
00639 #define SOSEMANUK_OUTPUT(x)     \
00640         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, u2 ^ v0);\
00641         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, u3 ^ v1);\
00642         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, u1 ^ v2);\
00643         CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, u4 ^ v3);
00644 
      // OUTPUT4: apply S2 to u0..u3 (u4 is the extra working variable) and emit
      // 16 bytes of keystream
00645 #define OUTPUT4 \
00646         S2(0, u0, u1, u2, u3, u4);\
00647         CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SOSEMANUK_OUTPUT, 4*4);
00648 
              // work on local copies of the twelve state words
00649         word32 s0 = m_state[0];
00650         word32 s1 = m_state[1];
00651         word32 s2 = m_state[2];
00652         word32 s3 = m_state[3];
00653         word32 s4 = m_state[4];
00654         word32 s5 = m_state[5];
00655         word32 s6 = m_state[6];
00656         word32 s7 = m_state[7];
00657         word32 s8 = m_state[8];
00658         word32 s9 = m_state[9];
00659         word32 reg1 = m_state[10];
00660         word32 reg2 = m_state[11];
00661         word32 u0, u1, u2, u3, u4, v0, v1, v2, v3;
00662 
              // Each pass performs 20 steps and emits 5 x 4 words. The argument lists
              // rotate by one position per step, so after 20 steps the s<n> locals are
              // back in their original roles.
              // NOTE(review): this do/while runs at least once; iterationCount == 0
              // would wrap around instead of producing nothing -- presumably callers
              // never pass 0; confirm.
00663         do
00664         {
00665                 STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v0, u0)
00666                 STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v1, u1)
00667                 STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v2, u2)
00668                 STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v3, u3)
00669                 OUTPUT4
00670                 STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v0, u0)
00671                 STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v1, u1)
00672                 STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v2, u2)
00673                 STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v3, u3)
00674                 OUTPUT4
00675                 STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v0, u0)
00676                 STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v1, u1)
00677                 STEP(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, v2, u2)
00678                 STEP(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, v3, u3)
00679                 OUTPUT4
00680                 STEP(2, 3, 4, 5, 6, 7, 8, 9, 0, 1, v0, u0)
00681                 STEP(3, 4, 5, 6, 7, 8, 9, 0, 1, 2, v1, u1)
00682                 STEP(4, 5, 6, 7, 8, 9, 0, 1, 2, 3, v2, u2)
00683                 STEP(5, 6, 7, 8, 9, 0, 1, 2, 3, 4, v3, u3)
00684                 OUTPUT4
00685                 STEP(6, 7, 8, 9, 0, 1, 2, 3, 4, 5, v0, u0)
00686                 STEP(7, 8, 9, 0, 1, 2, 3, 4, 5, 6, v1, u1)
00687                 STEP(8, 9, 0, 1, 2, 3, 4, 5, 6, 7, v2, u2)
00688                 STEP(9, 0, 1, 2, 3, 4, 5, 6, 7, 8, v3, u3)
00689                 OUTPUT4
00690         }
00691         while (--iterationCount);
00692 
              // write the advanced state back
00693         m_state[0] = s0;
00694         m_state[1] = s1;
00695         m_state[2] = s2;
00696         m_state[3] = s3;
00697         m_state[4] = s4;
00698         m_state[5] = s5;
00699         m_state[6] = s6;
00700         m_state[7] = s7;
00701         m_state[8] = s8;
00702         m_state[9] = s9;
00703         m_state[10] = reg1;
00704         m_state[11] = reg2;
00705         }
00706 }
00707 
00708 NAMESPACE_END
00709 
00710 #endif // #ifndef CRYPTOPP_GENERATE_X64_MASM

Generated on Fri Feb 6 00:56:25 2009 for Crypto++ by  doxygen 1.4.7