Crypto++  5.6.4
Free C++ class library of cryptographic schemes
blake2.cpp
// blake2.cpp - written and placed in the public domain by Jeffrey Walton and Zooko
// Wilcox-O'Hearn. Copyright assigned to the Crypto++ project.
// Based on Aumasson, Neves, Wilcox-O'Hearn and Winnerlein's reference BLAKE2
// implementation at http://github.com/BLAKE2/BLAKE2.

#include "pch.h"
#include "config.h"
#include "cryptlib.h"
#include "argnames.h"
#include "algparam.h"
#include "blake2.h"
#include "cpu.h"

NAMESPACE_BEGIN(CryptoPP)

// Uncomment for benchmarking C++ against SSE2 or NEON
// #undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
// #undef CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE

// Apple Clang 6.0/Clang 3.5 does not have SSSE3 intrinsics
// http://llvm.org/bugs/show_bug.cgi?id=20213
#if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500))
# undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
#endif

// Sun Studio 12.3 and earlier lack SSE2's _mm_set_epi64x. Win32 lacks _mm_set_epi64x (Win64 supplies it except for VS2008).
// Also see http://stackoverflow.com/a/38547909/608639
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (_MSC_VER >= 1200 && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600))
inline __m128i _mm_set_epi64x(const word64 a, const word64 b)
{
    union INT_128_64x2 {
        __m128i v128;
        word64 v64[2];
    };

    INT_128_64x2 val;
    val.v64[0] = b; val.v64[1] = a;
    return val.v128;
}
#endif

// C/C++ implementation
static void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
static void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state);

// Also see http://github.com/weidai11/cryptopp/issues/247 for singling out SunCC 5.12
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
# if (__SUNPRO_CC != 0x5120)
static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
# endif
#endif

#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
#endif

#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state);
static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state);
#endif

#ifndef CRYPTOPP_DOXYGEN_PROCESSING

// IV and Sigma are a better fit as part of BLAKE2_Base, but that places
// the constants out of reach for the NEON, SSE2 and SSE4 implementations.
template<bool T_64bit>
struct CRYPTOPP_NO_VTABLE BLAKE2_IV {};

//! \brief BLAKE2s initialization vector specialization
template<>
struct CRYPTOPP_NO_VTABLE BLAKE2_IV<false>
{
    CRYPTOPP_CONSTANT(IVSIZE = 8)
    // Always align for NEON and SSE
    CRYPTOPP_ALIGN_DATA(16) static const word32 iv[8];
};

CRYPTOPP_ALIGN_DATA(16)
const word32 BLAKE2_IV<false>::iv[8] = {
    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
};

#define BLAKE2S_IV(n) BLAKE2_IV<false>::iv[n]

template<>
struct CRYPTOPP_NO_VTABLE BLAKE2_IV<true>
{
    CRYPTOPP_CONSTANT(IVSIZE = 8)
    // Always align for NEON and SSE
    CRYPTOPP_ALIGN_DATA(16) static const word64 iv[8];
};

CRYPTOPP_ALIGN_DATA(16)
const word64 BLAKE2_IV<true>::iv[8] = {
    W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
    W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
    W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
    W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
};

#define BLAKE2B_IV(n) BLAKE2_IV<true>::iv[n]
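
// Note (informational): these IV words are the SHA-256 (BLAKE2s) and
// SHA-512 (BLAKE2b) initial hash values; see RFC 7693, Section 2.6.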

// IV and Sigma are a better fit as part of BLAKE2_Base, but that places
// the constants out of reach for the NEON, SSE2 and SSE4 implementations.
template<bool T_64bit>
struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {};

template<>
struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<false>
{
    // Always align for NEON and SSE
    CRYPTOPP_ALIGN_DATA(16) static const byte sigma[10][16];
};

CRYPTOPP_ALIGN_DATA(16)
const byte BLAKE2_Sigma<false>::sigma[10][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

//! \brief BLAKE2b sigma table specialization
template<>
struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<true>
{
    // Always align for NEON and SSE
    CRYPTOPP_ALIGN_DATA(16) static const byte sigma[12][16];
};

CRYPTOPP_ALIGN_DATA(16)
const byte BLAKE2_Sigma<true>::sigma[12][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
};

typedef void (*pfnCompress32)(const byte*, BLAKE2_State<word32, false>&);
typedef void (*pfnCompress64)(const byte*, BLAKE2_State<word64, true>&);

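// The compress routine is selected once at runtime from the CPU feature
// flags reported by cpu.h (HasSSE4/HasSSE2/HasNEON), with the portable
// C++ routine as the fallback.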
pfnCompress64 InitializeCompress64Fn()
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
    if (HasSSE4())
        return &BLAKE2_SSE4_Compress64;
    else
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
# if (__SUNPRO_CC != 0x5120)
    if (HasSSE2())
        return &BLAKE2_SSE2_Compress64;
    else
# endif
#endif
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
    if (HasNEON())
        return &BLAKE2_NEON_Compress64;
    else
#endif
        return &BLAKE2_CXX_Compress64;
}

pfnCompress32 InitializeCompress32Fn()
{
#if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
    if (HasSSE4())
        return &BLAKE2_SSE4_Compress32;
    else
#endif
#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
    if (HasSSE2())
        return &BLAKE2_SSE2_Compress32;
    else
#endif
#if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
    if (HasNEON())
        return &BLAKE2_NEON_Compress32;
    else
#endif
        return &BLAKE2_CXX_Compress32;
}

#endif // CRYPTOPP_DOXYGEN_PROCESSING

BLAKE2_ParameterBlock<false>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
    const byte* saltStr, size_t saltLen,
    const byte* personalizationStr, size_t personalizationLen)
{
    // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
    digestLength = (byte)digestLen;
    keyLength = (byte)keyLen;
    fanout = depth = 1;
    nodeDepth = innerLength = 0;

    memset(leafLength, 0x00, COUNTOF(leafLength));
    memset(nodeOffset, 0x00, COUNTOF(nodeOffset));

    if (saltStr && saltLen)
    {
        memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
        const size_t rem = COUNTOF(salt) - saltLen;
        const size_t off = COUNTOF(salt) - rem;
        if (rem)
            memset(salt+off, 0x00, rem);
    }
    else
    {
        memset(salt, 0x00, COUNTOF(salt));
    }

    if (personalizationStr && personalizationLen)
    {
        memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
        const size_t rem = COUNTOF(personalization) - personalizationLen;
        const size_t off = COUNTOF(personalization) - rem;
        if (rem)
            memset(personalization+off, 0x00, rem);
    }
    else
    {
        memset(personalization, 0x00, COUNTOF(personalization));
    }
}

BLAKE2_ParameterBlock<true>::BLAKE2_ParameterBlock(size_t digestLen, size_t keyLen,
    const byte* saltStr, size_t saltLen,
    const byte* personalizationStr, size_t personalizationLen)
{
    // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
    digestLength = (byte)digestLen;
    keyLength = (byte)keyLen;
    fanout = depth = 1;
    nodeDepth = innerLength = 0;

    memset(rfu, 0x00, COUNTOF(rfu));
    memset(leafLength, 0x00, COUNTOF(leafLength));
    memset(nodeOffset, 0x00, COUNTOF(nodeOffset));

    if (saltStr && saltLen)
    {
        memcpy_s(salt, COUNTOF(salt), saltStr, saltLen);
        const size_t rem = COUNTOF(salt) - saltLen;
        const size_t off = COUNTOF(salt) - rem;
        if (rem)
            memset(salt+off, 0x00, rem);
    }
    else
    {
        memset(salt, 0x00, COUNTOF(salt));
    }

    if (personalizationStr && personalizationLen)
    {
        memcpy_s(personalization, COUNTOF(personalization), personalizationStr, personalizationLen);
        const size_t rem = COUNTOF(personalization) - personalizationLen;
        const size_t off = COUNTOF(personalization) - rem;
        if (rem)
            memset(personalization+off, 0x00, rem);
    }
    else
    {
        memset(personalization, 0x00, COUNTOF(personalization));
    }
}

template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::UncheckedSetKey(const byte *key, unsigned int length, const CryptoPP::NameValuePairs& params)
{
    if (key && length)
    {
        AlignedSecByteBlock temp(BLOCKSIZE);
        memcpy_s(temp, BLOCKSIZE, key, length);

        const size_t rem = BLOCKSIZE - length;
        if (rem)
            memset(temp+length, 0x00, rem);

        m_key.swap(temp);
    }
    else
    {
        m_key.resize(0);
    }

#if defined(__COVERITY__)
    // Avoid Coverity finding SIZEOF_MISMATCH/suspicious_sizeof
    ParameterBlock& block = *m_block.data();
    memset(m_block.data(), 0x00, sizeof(ParameterBlock));
#else
    // Set Head bytes; Tail bytes are set below
    ParameterBlock& block = *m_block.data();
    memset(m_block.data(), 0x00, T_64bit ? 32 : 16);
#endif

    block.keyLength = (byte)length;
    block.digestLength = (byte)params.GetIntValueWithDefault(Name::DigestSize(), DIGESTSIZE);
    block.fanout = block.depth = 1;

    ConstByteArrayParameter t;
    if (params.GetValue(Name::Salt(), t) && t.begin() && t.size())
    {
        memcpy_s(block.salt, COUNTOF(block.salt), t.begin(), t.size());
        const size_t rem = COUNTOF(block.salt) - t.size();
        const size_t off = COUNTOF(block.salt) - rem;
        if (rem)
            memset(block.salt+off, 0x00, rem);
    }
    else
    {
        memset(block.salt, 0x00, COUNTOF(block.salt));
    }

    if (params.GetValue(Name::Personalization(), t) && t.begin() && t.size())
    {
        memcpy_s(block.personalization, COUNTOF(block.personalization), t.begin(), t.size());
        const size_t rem = COUNTOF(block.personalization) - t.size();
        const size_t off = COUNTOF(block.personalization) - rem;
        if (rem)
            memset(block.personalization+off, 0x00, rem);
    }
    else
    {
        memset(block.personalization, 0x00, COUNTOF(block.personalization));
    }
}

template <class W, bool T_64bit>
BLAKE2_Base<W, T_64bit>::BLAKE2_Base() : m_state(1), m_block(1), m_digestSize(DIGESTSIZE), m_treeMode(false)
{
    UncheckedSetKey(NULL, 0, g_nullNameValuePairs);
    Restart();
}

template <class W, bool T_64bit>
BLAKE2_Base<W, T_64bit>::BLAKE2_Base(bool treeMode, unsigned int digestSize) : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
{
    assert(digestSize <= DIGESTSIZE);

    UncheckedSetKey(NULL, 0, g_nullNameValuePairs);
    Restart();
}

template <class W, bool T_64bit>
BLAKE2_Base<W, T_64bit>::BLAKE2_Base(const byte *key, size_t keyLength, const byte* salt, size_t saltLength,
    const byte* personalization, size_t personalizationLength, bool treeMode, unsigned int digestSize)
    : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
{
    assert(keyLength <= MAX_KEYLENGTH);
    assert(digestSize <= DIGESTSIZE);
    assert(saltLength <= SALTSIZE);
    assert(personalizationLength <= PERSONALIZATIONSIZE);

    UncheckedSetKey(key, static_cast<unsigned int>(keyLength), MakeParameters(Name::DigestSize(),(int)digestSize)(Name::TreeMode(),treeMode, false)
        (Name::Salt(), ConstByteArrayParameter(salt, saltLength))(Name::Personalization(), ConstByteArrayParameter(personalization, personalizationLength)));
    Restart();
}
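
// Example usage (a sketch, not part of the original file; key, msg and
// digest are caller-supplied buffers of the stated sizes):
//   BLAKE2b mac(key, 32);            // keyed BLAKE2b, default 64-byte digest
//   mac.Update(msg, msgLen);
//   mac.TruncatedFinal(digest, 32);  // emit the first 32 bytes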

template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::Restart()
{
    static const W zero[2] = {0,0};
    Restart(*m_block.data(), zero);
}

template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::Restart(const BLAKE2_ParameterBlock<T_64bit>& block, const W counter[2])
{
    // We take a parameter block as a parameter to allow customized state.
    // Avoid the copy of the parameter block when we are passing our own block.
    if (&block != m_block.data())
    {
        memcpy_s(m_block.data(), sizeof(ParameterBlock), &block, sizeof(ParameterBlock));
        m_block.data()->digestLength = (byte)m_digestSize;
        m_block.data()->keyLength = (byte)m_key.size();
    }

    State& state = *m_state.data();
    state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;

    if (counter != NULL)
    {
        state.t[0] = counter[0];
        state.t[1] = counter[1];
    }

    PutBlock<W, LittleEndian, true> put(m_block.data(), &state.h[0]);
    put(BLAKE2_IV<T_64bit>::iv[0])(BLAKE2_IV<T_64bit>::iv[1])(BLAKE2_IV<T_64bit>::iv[2])(BLAKE2_IV<T_64bit>::iv[3]);
    put(BLAKE2_IV<T_64bit>::iv[4])(BLAKE2_IV<T_64bit>::iv[5])(BLAKE2_IV<T_64bit>::iv[6])(BLAKE2_IV<T_64bit>::iv[7]);

    // When BLAKE2 is keyed, the input stream is simply {key||message}. Key it
    // during Restart to avoid FirstPut and friends. Key size == 0 means no key.
    if (m_key.size())
        Update(m_key, m_key.size());
}

template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::Update(const byte *input, size_t length)
{
    State& state = *m_state.data();
    if (state.length + length > BLOCKSIZE)
    {
        // Complete current block
        const size_t fill = BLOCKSIZE - state.length;
        memcpy_s(&state.buffer[state.length], fill, input, fill);

        IncrementCounter();
        Compress(state.buffer);
        state.length = 0;

        length -= fill, input += fill;

        // Compress in-place to avoid copies
        while (length > BLOCKSIZE)
        {
            IncrementCounter();
            Compress(input);
            length -= BLOCKSIZE, input += BLOCKSIZE;
        }
    }

    // Copy tail bytes
    if (input && length)
    {
        assert(length <= BLOCKSIZE - state.length);
        memcpy_s(&state.buffer[state.length], length, input, length);
        state.length += static_cast<unsigned int>(length);
    }
}

template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::TruncatedFinal(byte *hash, size_t size)
{
    this->ThrowIfInvalidTruncatedSize(size);

    // Set last block unconditionally
    State& state = *m_state.data();
    state.f[0] = static_cast<W>(-1);

    // Set last node if tree mode
    if (m_treeMode)
        state.f[1] = static_cast<W>(-1);

    // Increment counter for tail bytes only
    IncrementCounter(state.length);

    memset(state.buffer + state.length, 0x00, BLOCKSIZE - state.length);
    Compress(state.buffer);

    // Copy to caller buffer
    memcpy_s(hash, size, &state.h[0], size);

    Restart();
}

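// The message counter t is a 2*W-bit integer held in t[0] (low word) and
// t[1] (high word); the !!(t[0] < count) test below detects unsigned
// wrap-around of the low word and propagates the carry.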
template <class W, bool T_64bit>
void BLAKE2_Base<W, T_64bit>::IncrementCounter(size_t count)
{
    State& state = *m_state.data();
    state.t[0] += static_cast<W>(count);
    state.t[1] += !!(state.t[0] < count);
}

template <>
void BLAKE2_Base<word64, true>::Compress(const byte *input)
{
    // Selects the most advanced implementation at runtime
    static const pfnCompress64 s_pfn = InitializeCompress64Fn();
    s_pfn(input, *m_state.data());
}

template <>
void BLAKE2_Base<word32, false>::Compress(const byte *input)
{
    // Selects the most advanced implementation at runtime
    static const pfnCompress32 s_pfn = InitializeCompress32Fn();
    s_pfn(input, *m_state.data());
}

void BLAKE2_CXX_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
{
    #undef BLAKE2_G
    #undef BLAKE2_ROUND

    #define BLAKE2_G(r,i,a,b,c,d) \
        do { \
            a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+0]]; \
            d = rotrVariable<word64>(d ^ a, 32); \
            c = c + d; \
            b = rotrVariable<word64>(b ^ c, 24); \
            a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+1]]; \
            d = rotrVariable<word64>(d ^ a, 16); \
            c = c + d; \
            b = rotrVariable<word64>(b ^ c, 63); \
        } while(0)
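
    // Rotations by 32, 24, 16 and 63 are the BLAKE2b G-function constants
    // (RFC 7693, Section 3.1).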

    #define BLAKE2_ROUND(r) \
        do { \
            BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
            BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
            BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
            BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
            BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
            BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
            BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
            BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
        } while(0)

    word64 m[16], v[16];

    GetBlock<word64, LittleEndian, true> get1(input);
    get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);

    GetBlock<word64, LittleEndian, true> get2(&state.h[0]);
    get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);

    v[ 8] = BLAKE2B_IV(0);
    v[ 9] = BLAKE2B_IV(1);
    v[10] = BLAKE2B_IV(2);
    v[11] = BLAKE2B_IV(3);
    v[12] = state.t[0] ^ BLAKE2B_IV(4);
    v[13] = state.t[1] ^ BLAKE2B_IV(5);
    v[14] = state.f[0] ^ BLAKE2B_IV(6);
    v[15] = state.f[1] ^ BLAKE2B_IV(7);

    BLAKE2_ROUND( 0 );
    BLAKE2_ROUND( 1 );
    BLAKE2_ROUND( 2 );
    BLAKE2_ROUND( 3 );
    BLAKE2_ROUND( 4 );
    BLAKE2_ROUND( 5 );
    BLAKE2_ROUND( 6 );
    BLAKE2_ROUND( 7 );
    BLAKE2_ROUND( 8 );
    BLAKE2_ROUND( 9 );
    BLAKE2_ROUND( 10 );
    BLAKE2_ROUND( 11 );

    for(unsigned int i = 0; i < 8; ++i)
        state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
}

void BLAKE2_CXX_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
{
    #undef BLAKE2_G
    #undef BLAKE2_ROUND

    #define BLAKE2_G(r,i,a,b,c,d) \
        do { \
            a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+0]]; \
            d = rotrVariable<word32>(d ^ a, 16); \
            c = c + d; \
            b = rotrVariable<word32>(b ^ c, 12); \
            a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+1]]; \
            d = rotrVariable<word32>(d ^ a, 8); \
            c = c + d; \
            b = rotrVariable<word32>(b ^ c, 7); \
        } while(0)
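
    // Rotations by 16, 12, 8 and 7 are the BLAKE2s G-function constants
    // (RFC 7693, Section 3.1).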

    #define BLAKE2_ROUND(r) \
        do { \
            BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
            BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
            BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \
            BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \
            BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \
            BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \
            BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
            BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \
        } while(0)

    word32 m[16], v[16];

    GetBlock<word32, LittleEndian, true> get1(input);
    get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);

    GetBlock<word32, LittleEndian, true> get2(&state.h[0]);
    get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);

    v[ 8] = BLAKE2S_IV(0);
    v[ 9] = BLAKE2S_IV(1);
    v[10] = BLAKE2S_IV(2);
    v[11] = BLAKE2S_IV(3);
    v[12] = state.t[0] ^ BLAKE2S_IV(4);
    v[13] = state.t[1] ^ BLAKE2S_IV(5);
    v[14] = state.f[0] ^ BLAKE2S_IV(6);
    v[15] = state.f[1] ^ BLAKE2S_IV(7);

    BLAKE2_ROUND( 0 );
    BLAKE2_ROUND( 1 );
    BLAKE2_ROUND( 2 );
    BLAKE2_ROUND( 3 );
    BLAKE2_ROUND( 4 );
    BLAKE2_ROUND( 5 );
    BLAKE2_ROUND( 6 );
    BLAKE2_ROUND( 7 );
    BLAKE2_ROUND( 8 );
    BLAKE2_ROUND( 9 );

    for(unsigned int i = 0; i < 8; ++i)
        state.h[i] = state.h[i] ^ ConditionalByteReverse(LittleEndian::ToEnum(), v[i] ^ v[i + 8]);
}

#if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
static void BLAKE2_SSE2_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
{
    word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
    GetBlock<word32, LittleEndian, true> get(input);
    get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);

    __m128i row1,row2,row3,row4;
    __m128i buf1,buf2,buf3,buf4;
    __m128i ff0,ff1;

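    // Layout: row1 = h[0..3], row2 = h[4..7], row3 = IV[0..3] and
    // row4 = IV[4..7] ^ {t[0],t[1],f[0],f[1]}. SSE2 has no rotate
    // instruction, so each rotation is a shift-right/shift-left XOR pair.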
    row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
    row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
    row3 = _mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3));
    row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
    buf1 = _mm_set_epi32(m6,m4,m2,m0);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m7,m5,m3,m1);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m14,m12,m10,m8);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m15,m13,m11,m9);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m13,m9,m4,m14);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m6,m15,m8,m10);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m5,m11,m0,m1);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m3,m7,m2,m12);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m15,m5,m12,m11);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m13,m2,m0,m8);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m9,m7,m3,m10);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m4,m1,m6,m14);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m11,m13,m3,m7);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m14,m12,m1,m9);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m15,m4,m5,m2);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m8,m0,m10,m6);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m10,m2,m5,m9);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m15,m4,m7,m0);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m3,m6,m11,m14);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m13,m8,m12,m1);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m8,m0,m6,m2);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m3,m11,m10,m12);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m1,m15,m7,m4);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m9,m14,m5,m13);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m4,m14,m1,m12);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m10,m13,m15,m5);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m8,m9,m6,m0);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m11,m2,m3,m7);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m3,m12,m7,m13);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m9,m1,m14,m11);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m2,m8,m15,m5);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m10,m6,m4,m0);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m0,m11,m14,m6);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m8,m3,m9,m15);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m10,m1,m13,m12);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m5,m4,m7,m2);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    buf1 = _mm_set_epi32(m1,m7,m8,m10);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf2 = _mm_set_epi32(m5,m6,m4,m2);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));

    buf3 = _mm_set_epi32(m13,m3,m9,m15);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));

    buf4 = _mm_set_epi32(m0,m12,m14,m11);
    row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
    row4 = _mm_xor_si128(row4,row1);
    row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
    row3 = _mm_add_epi32(row3,row4);
    row2 = _mm_xor_si128(row2,row3);
    row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));

    row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
    row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
    row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));

    _mm_storeu_si128((__m128i *)(void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
    _mm_storeu_si128((__m128i *)(void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
}

# if (__SUNPRO_CC != 0x5120)
static void BLAKE2_SSE2_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
{
    word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
    GetBlock<word64, LittleEndian, true> get(input);
    get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);

    __m128i row1l, row1h, row2l, row2h;
    __m128i row3l, row3h, row4l, row4h;
    __m128i b0, b1, t0, t1;

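    // Each 128-bit register holds two 64-bit lanes, so the 4x4 state uses
    // low/high register pairs, and the diagonalization between half-rounds
    // is done with unpacklo/unpackhi shuffles. The rotations (32, 24, 16,
    // 63) are again emulated with paired shifts since SSE2 lacks a rotate.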
    row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
    row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
    row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
    row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
    row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
    row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
    row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
    row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));

    b0 = _mm_set_epi64x(m2, m0);
    b1 = _mm_set_epi64x(m6, m4);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m3, m1);
    b1 = _mm_set_epi64x(m7, m5);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));

    b0 = _mm_set_epi64x(m10, m8);
    b1 = _mm_set_epi64x(m14, m12);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m11, m9);
    b1 = _mm_set_epi64x(m15, m13);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));

    b0 = _mm_set_epi64x(m4, m14);
    b1 = _mm_set_epi64x(m13, m9);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m8, m10);
    b1 = _mm_set_epi64x(m6, m15);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
    b0 = _mm_set_epi64x(m0, m1);
    b1 = _mm_set_epi64x(m5, m11);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m2, m12);
    b1 = _mm_set_epi64x(m3, m7);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));

    b0 = _mm_set_epi64x(m12, m11);
    b1 = _mm_set_epi64x(m15, m5);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m0, m8);
    b1 = _mm_set_epi64x(m13, m2);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
    b0 = _mm_set_epi64x(m3, m10);
    b1 = _mm_set_epi64x(m9, m7);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m6, m14);
    b1 = _mm_set_epi64x(m4, m1);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));

    b0 = _mm_set_epi64x(m3, m7);
    b1 = _mm_set_epi64x(m11, m13);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m1, m9);
    b1 = _mm_set_epi64x(m14, m12);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
    b0 = _mm_set_epi64x(m5, m2);
    b1 = _mm_set_epi64x(m15, m4);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m10, m6);
    b1 = _mm_set_epi64x(m8, m0);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
    row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
    row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
    row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
    row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));

    b0 = _mm_set_epi64x(m5, m9);
    b1 = _mm_set_epi64x(m10, m2);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m7, m0);
    b1 = _mm_set_epi64x(m15, m4);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

    t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
    row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
    row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
    row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
    row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
    b0 = _mm_set_epi64x(m11, m14);
    b1 = _mm_set_epi64x(m3, m6);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));

    b0 = _mm_set_epi64x(m12, m1);
    b1 = _mm_set_epi64x(m13, m8);
    row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
    row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
    row4l = _mm_xor_si128(row4l, row1l);
    row4h = _mm_xor_si128(row4h, row1h);
    row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
    row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
    row3l = _mm_add_epi64(row3l, row4l);
    row3h = _mm_add_epi64(row3h, row4h);
    row2l = _mm_xor_si128(row2l, row3l);
    row2h = _mm_xor_si128(row2h, row3h);
    row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
    row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));

1411  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1412  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1413  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1414  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1415  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1416 
1417  b0 = _mm_set_epi64x(m6, m2);
1418  b1 = _mm_set_epi64x(m8, m0);
1419  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1420  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1421  row4l = _mm_xor_si128(row4l, row1l);
1422  row4h = _mm_xor_si128(row4h, row1h);
1423  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1424  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1425  row3l = _mm_add_epi64(row3l, row4l);
1426  row3h = _mm_add_epi64(row3h, row4h);
1427  row2l = _mm_xor_si128(row2l, row3l);
1428  row2h = _mm_xor_si128(row2h, row3h);
1429  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1430  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1431 
1432  b0 = _mm_set_epi64x(m10, m12);
1433  b1 = _mm_set_epi64x(m3, m11);
1434  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1435  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1436  row4l = _mm_xor_si128(row4l, row1l);
1437  row4h = _mm_xor_si128(row4h, row1h);
1438  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1439  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1440  row3l = _mm_add_epi64(row3l, row4l);
1441  row3h = _mm_add_epi64(row3h, row4h);
1442  row2l = _mm_xor_si128(row2l, row3l);
1443  row2h = _mm_xor_si128(row2h, row3h);
1444  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1445  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1446 
1447  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1448  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1449  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1450  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1451  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1452  b0 = _mm_set_epi64x(m7, m4);
1453  b1 = _mm_set_epi64x(m1, m15);
1454  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1455  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1456  row4l = _mm_xor_si128(row4l, row1l);
1457  row4h = _mm_xor_si128(row4h, row1h);
1458  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1459  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1460  row3l = _mm_add_epi64(row3l, row4l);
1461  row3h = _mm_add_epi64(row3h, row4h);
1462  row2l = _mm_xor_si128(row2l, row3l);
1463  row2h = _mm_xor_si128(row2h, row3h);
1464  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1465  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1466 
1467  b0 = _mm_set_epi64x(m5, m13);
1468  b1 = _mm_set_epi64x(m9, m14);
1469  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1470  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1471  row4l = _mm_xor_si128(row4l, row1l);
1472  row4h = _mm_xor_si128(row4h, row1h);
1473  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1474  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1475  row3l = _mm_add_epi64(row3l, row4l);
1476  row3h = _mm_add_epi64(row3h, row4h);
1477  row2l = _mm_xor_si128(row2l, row3l);
1478  row2h = _mm_xor_si128(row2h, row3h);
1479  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1480  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1481 
1482  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1483  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1484  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1485  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1486  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1487 
1488  b0 = _mm_set_epi64x(m1, m12);
1489  b1 = _mm_set_epi64x(m4, m14);
1490  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1491  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1492  row4l = _mm_xor_si128(row4l, row1l);
1493  row4h = _mm_xor_si128(row4h, row1h);
1494  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1495  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1496  row3l = _mm_add_epi64(row3l, row4l);
1497  row3h = _mm_add_epi64(row3h, row4h);
1498  row2l = _mm_xor_si128(row2l, row3l);
1499  row2h = _mm_xor_si128(row2h, row3h);
1500  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1501  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1502 
1503  b0 = _mm_set_epi64x(m15, m5);
1504  b1 = _mm_set_epi64x(m10, m13);
1505  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1506  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1507  row4l = _mm_xor_si128(row4l, row1l);
1508  row4h = _mm_xor_si128(row4h, row1h);
1509  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1510  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1511  row3l = _mm_add_epi64(row3l, row4l);
1512  row3h = _mm_add_epi64(row3h, row4h);
1513  row2l = _mm_xor_si128(row2l, row3l);
1514  row2h = _mm_xor_si128(row2h, row3h);
1515  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1516  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1517 
1518  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1519  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1520  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1521  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1522  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1523  b0 = _mm_set_epi64x(m6, m0);
1524  b1 = _mm_set_epi64x(m8, m9);
1525  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1526  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1527  row4l = _mm_xor_si128(row4l, row1l);
1528  row4h = _mm_xor_si128(row4h, row1h);
1529  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1530  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1531  row3l = _mm_add_epi64(row3l, row4l);
1532  row3h = _mm_add_epi64(row3h, row4h);
1533  row2l = _mm_xor_si128(row2l, row3l);
1534  row2h = _mm_xor_si128(row2h, row3h);
1535  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1536  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1537 
1538  b0 = _mm_set_epi64x(m3, m7);
1539  b1 = _mm_set_epi64x(m11, m2);
1540  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1541  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1542  row4l = _mm_xor_si128(row4l, row1l);
1543  row4h = _mm_xor_si128(row4h, row1h);
1544  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1545  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1546  row3l = _mm_add_epi64(row3l, row4l);
1547  row3h = _mm_add_epi64(row3h, row4h);
1548  row2l = _mm_xor_si128(row2l, row3l);
1549  row2h = _mm_xor_si128(row2h, row3h);
1550  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1551  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1552 
1553  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1554  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1555  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1556  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1557  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1558 
1559  b0 = _mm_set_epi64x(m7, m13);
1560  b1 = _mm_set_epi64x(m3, m12);
1561  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1562  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1563  row4l = _mm_xor_si128(row4l, row1l);
1564  row4h = _mm_xor_si128(row4h, row1h);
1565  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1566  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1567  row3l = _mm_add_epi64(row3l, row4l);
1568  row3h = _mm_add_epi64(row3h, row4h);
1569  row2l = _mm_xor_si128(row2l, row3l);
1570  row2h = _mm_xor_si128(row2h, row3h);
1571  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1572  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1573 
1574  b0 = _mm_set_epi64x(m14, m11);
1575  b1 = _mm_set_epi64x(m9, m1);
1576  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1577  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1578  row4l = _mm_xor_si128(row4l, row1l);
1579  row4h = _mm_xor_si128(row4h, row1h);
1580  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1581  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1582  row3l = _mm_add_epi64(row3l, row4l);
1583  row3h = _mm_add_epi64(row3h, row4h);
1584  row2l = _mm_xor_si128(row2l, row3l);
1585  row2h = _mm_xor_si128(row2h, row3h);
1586  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1587  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1588 
1589  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1590  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1591  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1592  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1593  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1594  b0 = _mm_set_epi64x(m15, m5);
1595  b1 = _mm_set_epi64x(m2, m8);
1596  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1597  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1598  row4l = _mm_xor_si128(row4l, row1l);
1599  row4h = _mm_xor_si128(row4h, row1h);
1600  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1601  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1602  row3l = _mm_add_epi64(row3l, row4l);
1603  row3h = _mm_add_epi64(row3h, row4h);
1604  row2l = _mm_xor_si128(row2l, row3l);
1605  row2h = _mm_xor_si128(row2h, row3h);
1606  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1607  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1608 
1609  b0 = _mm_set_epi64x(m4, m0);
1610  b1 = _mm_set_epi64x(m10, m6);
1611  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1612  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1613  row4l = _mm_xor_si128(row4l, row1l);
1614  row4h = _mm_xor_si128(row4h, row1h);
1615  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1616  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1617  row3l = _mm_add_epi64(row3l, row4l);
1618  row3h = _mm_add_epi64(row3h, row4h);
1619  row2l = _mm_xor_si128(row2l, row3l);
1620  row2h = _mm_xor_si128(row2h, row3h);
1621  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1622  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1623 
1624  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1625  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1626  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1627  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1628  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1629 
1630  b0 = _mm_set_epi64x(m14, m6);
1631  b1 = _mm_set_epi64x(m0, m11);
1632  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1633  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1634  row4l = _mm_xor_si128(row4l, row1l);
1635  row4h = _mm_xor_si128(row4h, row1h);
1636  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1637  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1638  row3l = _mm_add_epi64(row3l, row4l);
1639  row3h = _mm_add_epi64(row3h, row4h);
1640  row2l = _mm_xor_si128(row2l, row3l);
1641  row2h = _mm_xor_si128(row2h, row3h);
1642  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1643  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1644 
1645  b0 = _mm_set_epi64x(m9, m15);
1646  b1 = _mm_set_epi64x(m8, m3);
1647  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1648  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1649  row4l = _mm_xor_si128(row4l, row1l);
1650  row4h = _mm_xor_si128(row4h, row1h);
1651  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1652  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1653  row3l = _mm_add_epi64(row3l, row4l);
1654  row3h = _mm_add_epi64(row3h, row4h);
1655  row2l = _mm_xor_si128(row2l, row3l);
1656  row2h = _mm_xor_si128(row2h, row3h);
1657  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1658  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1659 
1660  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1661  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1662  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1663  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1664  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1665  b0 = _mm_set_epi64x(m13, m12);
1666  b1 = _mm_set_epi64x(m10, m1);
1667  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1668  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1669  row4l = _mm_xor_si128(row4l, row1l);
1670  row4h = _mm_xor_si128(row4h, row1h);
1671  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1672  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1673  row3l = _mm_add_epi64(row3l, row4l);
1674  row3h = _mm_add_epi64(row3h, row4h);
1675  row2l = _mm_xor_si128(row2l, row3l);
1676  row2h = _mm_xor_si128(row2h, row3h);
1677  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1678  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1679 
1680  b0 = _mm_set_epi64x(m7, m2);
1681  b1 = _mm_set_epi64x(m5, m4);
1682  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1683  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1684  row4l = _mm_xor_si128(row4l, row1l);
1685  row4h = _mm_xor_si128(row4h, row1h);
1686  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1687  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1688  row3l = _mm_add_epi64(row3l, row4l);
1689  row3h = _mm_add_epi64(row3h, row4h);
1690  row2l = _mm_xor_si128(row2l, row3l);
1691  row2h = _mm_xor_si128(row2h, row3h);
1692  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1693  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1694 
1695  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1696  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1697  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1698  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1699  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1700 
1701  b0 = _mm_set_epi64x(m8, m10);
1702  b1 = _mm_set_epi64x(m1, m7);
1703  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1704  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1705  row4l = _mm_xor_si128(row4l, row1l);
1706  row4h = _mm_xor_si128(row4h, row1h);
1707  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1708  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1709  row3l = _mm_add_epi64(row3l, row4l);
1710  row3h = _mm_add_epi64(row3h, row4h);
1711  row2l = _mm_xor_si128(row2l, row3l);
1712  row2h = _mm_xor_si128(row2h, row3h);
1713  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1714  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1715 
1716  b0 = _mm_set_epi64x(m4, m2);
1717  b1 = _mm_set_epi64x(m5, m6);
1718  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1719  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1720  row4l = _mm_xor_si128(row4l, row1l);
1721  row4h = _mm_xor_si128(row4h, row1h);
1722  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1723  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1724  row3l = _mm_add_epi64(row3l, row4l);
1725  row3h = _mm_add_epi64(row3h, row4h);
1726  row2l = _mm_xor_si128(row2l, row3l);
1727  row2h = _mm_xor_si128(row2h, row3h);
1728  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1729  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1730 
1731  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1732  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1733  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1734  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1735  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1736  b0 = _mm_set_epi64x(m9, m15);
1737  b1 = _mm_set_epi64x(m13, m3);
1738  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1739  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1740  row4l = _mm_xor_si128(row4l, row1l);
1741  row4h = _mm_xor_si128(row4h, row1h);
1742  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1743  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1744  row3l = _mm_add_epi64(row3l, row4l);
1745  row3h = _mm_add_epi64(row3h, row4h);
1746  row2l = _mm_xor_si128(row2l, row3l);
1747  row2h = _mm_xor_si128(row2h, row3h);
1748  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1749  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1750 
1751  b0 = _mm_set_epi64x(m14, m11);
1752  b1 = _mm_set_epi64x(m0, m12);
1753  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1754  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1755  row4l = _mm_xor_si128(row4l, row1l);
1756  row4h = _mm_xor_si128(row4h, row1h);
1757  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1758  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1759  row3l = _mm_add_epi64(row3l, row4l);
1760  row3h = _mm_add_epi64(row3h, row4h);
1761  row2l = _mm_xor_si128(row2l, row3l);
1762  row2h = _mm_xor_si128(row2h, row3h);
1763  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1764  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1765 
1766  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1767  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1768  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1769  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1770  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1771 
1772  b0 = _mm_set_epi64x(m2, m0);
1773  b1 = _mm_set_epi64x(m6, m4);
1774  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1775  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1776  row4l = _mm_xor_si128(row4l, row1l);
1777  row4h = _mm_xor_si128(row4h, row1h);
1778  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1779  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1780  row3l = _mm_add_epi64(row3l, row4l);
1781  row3h = _mm_add_epi64(row3h, row4h);
1782  row2l = _mm_xor_si128(row2l, row3l);
1783  row2h = _mm_xor_si128(row2h, row3h);
1784  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1785  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1786 
1787  b0 = _mm_set_epi64x(m3, m1);
1788  b1 = _mm_set_epi64x(m7, m5);
1789  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1790  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1791  row4l = _mm_xor_si128(row4l, row1l);
1792  row4h = _mm_xor_si128(row4h, row1h);
1793  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1794  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1795  row3l = _mm_add_epi64(row3l, row4l);
1796  row3h = _mm_add_epi64(row3h, row4h);
1797  row2l = _mm_xor_si128(row2l, row3l);
1798  row2h = _mm_xor_si128(row2h, row3h);
1799  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1800  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1801 
1802  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1803  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1804  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1805  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1806  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1807 
1808  b0 = _mm_set_epi64x(m10, m8);
1809  b1 = _mm_set_epi64x(m14, m12);
1810  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1811  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1812  row4l = _mm_xor_si128(row4l, row1l);
1813  row4h = _mm_xor_si128(row4h, row1h);
1814  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1815  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1816  row3l = _mm_add_epi64(row3l, row4l);
1817  row3h = _mm_add_epi64(row3h, row4h);
1818  row2l = _mm_xor_si128(row2l, row3l);
1819  row2h = _mm_xor_si128(row2h, row3h);
1820  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1821  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1822 
1823  b0 = _mm_set_epi64x(m11, m9);
1824  b1 = _mm_set_epi64x(m15, m13);
1825  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1826  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1827  row4l = _mm_xor_si128(row4l, row1l);
1828  row4h = _mm_xor_si128(row4h, row1h);
1829  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1830  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1831  row3l = _mm_add_epi64(row3l, row4l);
1832  row3h = _mm_add_epi64(row3h, row4h);
1833  row2l = _mm_xor_si128(row2l, row3l);
1834  row2h = _mm_xor_si128(row2h, row3h);
1835  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1836  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1837 
1838  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1839  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1840  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1841  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1842  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1843 
1844  b0 = _mm_set_epi64x(m4, m14);
1845  b1 = _mm_set_epi64x(m13, m9);
1846  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1847  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1848  row4l = _mm_xor_si128(row4l, row1l);
1849  row4h = _mm_xor_si128(row4h, row1h);
1850  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1851  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1852  row3l = _mm_add_epi64(row3l, row4l);
1853  row3h = _mm_add_epi64(row3h, row4h);
1854  row2l = _mm_xor_si128(row2l, row3l);
1855  row2h = _mm_xor_si128(row2h, row3h);
1856  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1857  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1858 
1859  b0 = _mm_set_epi64x(m8, m10);
1860  b1 = _mm_set_epi64x(m6, m15);
1861  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1862  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1863  row4l = _mm_xor_si128(row4l, row1l);
1864  row4h = _mm_xor_si128(row4h, row1h);
1865  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1866  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1867  row3l = _mm_add_epi64(row3l, row4l);
1868  row3h = _mm_add_epi64(row3h, row4h);
1869  row2l = _mm_xor_si128(row2l, row3l);
1870  row2h = _mm_xor_si128(row2h, row3h);
1871  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1872  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1873 
1874  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1875  row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1876  row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1877  row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1878  row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1879  b0 = _mm_set_epi64x(m0, m1);
1880  b1 = _mm_set_epi64x(m5, m11);
1881  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1882  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1883  row4l = _mm_xor_si128(row4l, row1l);
1884  row4h = _mm_xor_si128(row4h, row1h);
1885  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1886  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1887  row3l = _mm_add_epi64(row3l, row4l);
1888  row3h = _mm_add_epi64(row3h, row4h);
1889  row2l = _mm_xor_si128(row2l, row3l);
1890  row2h = _mm_xor_si128(row2h, row3h);
1891  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1892  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1893 
1894  b0 = _mm_set_epi64x(m2, m12);
1895  b1 = _mm_set_epi64x(m3, m7);
1896  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1897  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1898  row4l = _mm_xor_si128(row4l, row1l);
1899  row4h = _mm_xor_si128(row4h, row1h);
1900  row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1901  row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1902  row3l = _mm_add_epi64(row3l, row4l);
1903  row3h = _mm_add_epi64(row3h, row4h);
1904  row2l = _mm_xor_si128(row2l, row3l);
1905  row2h = _mm_xor_si128(row2h, row3h);
1906  row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1907  row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1908 
1909  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1910  row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1911  row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1912  row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1913  row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1914 
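 // Feed-forward: h[0..3] ^= row1 ^ row3 and h[4..7] ^= row2 ^ row4,
 // folded into the chaining value 128 bits at a time.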
1915  row1l = _mm_xor_si128(row3l, row1l);
1916  row1h = _mm_xor_si128(row3h, row1h);
1917  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
1918  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
1919 
1920  row2l = _mm_xor_si128(row4l, row2l);
1921  row2h = _mm_xor_si128(row4h, row2h);
1922  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
1923  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
1924 }
1925 # endif // (__SUNPRO_CC != 0x5120)
1926 #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE
1927 
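// The SSE4.1 compressors keep the state entirely in XMM registers. BLAKE2s
// holds its 4x4 word32 state in row1..row4; BLAKE2b splits each 4x64-bit row
// across a register pair (rowNl/rowNh). Each round gathers the message words
// required by that round's sigma permutation into buffers using blends,
// shuffles and unpacks, then runs the G half-steps with a diagonalize and
// undiagonalize between the column and diagonal steps.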
1928 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
1929 static void BLAKE2_SSE4_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
1930 {
1931  __m128i row1, row2, row3, row4;
1932  __m128i buf1, buf2, buf3, buf4;
1933 
1934  __m128i t0, t1, t2;
1935  __m128i ff0, ff1;
1936 
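 // pshufb control masks: r8 rotates each 32-bit lane right by 8 bits and
 // r16 by 16, covering the byte-aligned rotations in one instruction.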
1937  const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
1938  const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
1939 
1940  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
1941  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
1942  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
1943  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
1944 
1945  row1 = ff0 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
1946  row2 = ff1 = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
1947  row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3));
1948  row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
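 // row1/row2 load the chaining value h[0..7]; row3 is IV[0..3] and row4 is
 // IV[4..7] xored with the counter and finalization words t[0], t[1], f[0],
 // f[1], which are adjacent in BLAKE2_State. Round 0's sigma is the identity
 // permutation, so shuffle_ps selects gather the even and odd message words
 // directly from the four input loads m0..m3.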
1949  buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
1950 
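 // Each block below is one G half-step over all four columns (or
 // diagonals): row1 += message + row2; row4 = rotr16(row4 ^ row1);
 // row3 += row4; row2 = rotr12(row2 ^ row3). The second half-step rotates
 // by 8 (pshufb) and 7 (shift/xor pair) instead of 16 and 12.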
1951  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1952  row4 = _mm_xor_si128(row4, row1);
1953  row4 = _mm_shuffle_epi8(row4,r16);
1954  row3 = _mm_add_epi32(row3, row4);
1955  row2 = _mm_xor_si128(row2, row3);
1956  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1957 
1958  buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
1959 
1960  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
1961  row4 = _mm_xor_si128(row4, row1);
1962  row4 = _mm_shuffle_epi8(row4,r8);
1963  row3 = _mm_add_epi32(row3, row4);
1964  row2 = _mm_xor_si128(row2, row3);
1965  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1966 
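 // Diagonalize: permute rows 2-4 by one, two and three lanes so the next
 // two half-steps act on the state's diagonals.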
1967  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
1968  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1969  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
1970 
1971  buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
1972 
1973  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
1974  row4 = _mm_xor_si128(row4, row1);
1975  row4 = _mm_shuffle_epi8(row4,r16);
1976  row3 = _mm_add_epi32(row3, row4);
1977  row2 = _mm_xor_si128(row2, row3);
1978  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1979 
1980  buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
1981 
1982  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
1983  row4 = _mm_xor_si128(row4, row1);
1984  row4 = _mm_shuffle_epi8(row4,r8);
1985  row3 = _mm_add_epi32(row3, row4);
1986  row2 = _mm_xor_si128(row2, row3);
1987  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1988 
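 // Undiagonalize: the inverse permutation, restoring column order.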
1989  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
1990  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1991  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
1992 
1993  t0 = _mm_blend_epi16(m1, m2, 0x0C);
1994  t1 = _mm_slli_si128(m3, 4);
1995  t2 = _mm_blend_epi16(t0, t1, 0xF0);
1996  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
1997 
1998  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1999  row4 = _mm_xor_si128(row4, row1);
2000  row4 = _mm_shuffle_epi8(row4,r16);
2001  row3 = _mm_add_epi32(row3, row4);
2002  row2 = _mm_xor_si128(row2, row3);
2003  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2004 
2005  t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
2006  t1 = _mm_blend_epi16(m1,m3,0xC0);
2007  t2 = _mm_blend_epi16(t0, t1, 0xF0);
2008  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2009 
2010  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2011  row4 = _mm_xor_si128(row4, row1);
2012  row4 = _mm_shuffle_epi8(row4,r8);
2013  row3 = _mm_add_epi32(row3, row4);
2014  row2 = _mm_xor_si128(row2, row3);
2015  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2016 
2017  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2018  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2019  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2020 
2021  t0 = _mm_slli_si128(m1, 4);
2022  t1 = _mm_blend_epi16(m2, t0, 0x30);
2023  t2 = _mm_blend_epi16(m0, t1, 0xF0);
2024  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2025 
2026  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2027  row4 = _mm_xor_si128(row4, row1);
2028  row4 = _mm_shuffle_epi8(row4,r16);
2029  row3 = _mm_add_epi32(row3, row4);
2030  row2 = _mm_xor_si128(row2, row3);
2031  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2032 
2033  t0 = _mm_unpackhi_epi32(m0,m1);
2034  t1 = _mm_slli_si128(m3, 4);
2035  t2 = _mm_blend_epi16(t0, t1, 0x0C);
2036  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2037 
2038  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2039  row4 = _mm_xor_si128(row4, row1);
2040  row4 = _mm_shuffle_epi8(row4,r8);
2041  row3 = _mm_add_epi32(row3, row4);
2042  row2 = _mm_xor_si128(row2, row3);
2043  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2044 
2045  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2046  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2047  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2048 
2049  t0 = _mm_unpackhi_epi32(m2,m3);
2050  t1 = _mm_blend_epi16(m3,m1,0x0C);
2051  t2 = _mm_blend_epi16(t0, t1, 0x0F);
2052  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2053 
2054  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2055  row4 = _mm_xor_si128(row4, row1);
2056  row4 = _mm_shuffle_epi8(row4,r16);
2057  row3 = _mm_add_epi32(row3, row4);
2058  row2 = _mm_xor_si128(row2, row3);
2059  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2060 
2061  t0 = _mm_unpacklo_epi32(m2,m0);
2062  t1 = _mm_blend_epi16(t0, m0, 0xF0);
2063  t2 = _mm_slli_si128(m3, 8);
2064  buf2 = _mm_blend_epi16(t1, t2, 0xC0);
2065 
2066  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2067  row4 = _mm_xor_si128(row4, row1);
2068  row4 = _mm_shuffle_epi8(row4,r8);
2069  row3 = _mm_add_epi32(row3, row4);
2070  row2 = _mm_xor_si128(row2, row3);
2071  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2072 
2073  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2074  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2075  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2076 
2077  t0 = _mm_blend_epi16(m0, m2, 0x3C);
2078  t1 = _mm_srli_si128(m1, 12);
2079  t2 = _mm_blend_epi16(t0,t1,0x03);
2080  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
2081 
2082  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2083  row4 = _mm_xor_si128(row4, row1);
2084  row4 = _mm_shuffle_epi8(row4,r16);
2085  row3 = _mm_add_epi32(row3, row4);
2086  row2 = _mm_xor_si128(row2, row3);
2087  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2088 
2089  t0 = _mm_slli_si128(m3, 4);
2090  t1 = _mm_blend_epi16(m0, m1, 0x33);
2091  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2092  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
2093 
2094  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2095  row4 = _mm_xor_si128(row4, row1);
2096  row4 = _mm_shuffle_epi8(row4,r8);
2097  row3 = _mm_add_epi32(row3, row4);
2098  row2 = _mm_xor_si128(row2, row3);
2099  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2100 
2101  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2102  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2103  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2104 
2105  t0 = _mm_unpackhi_epi32(m0,m1);
2106  t1 = _mm_unpackhi_epi32(t0, m2);
2107  t2 = _mm_blend_epi16(t1, m3, 0x0C);
2108  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2109 
2110  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2111  row4 = _mm_xor_si128(row4, row1);
2112  row4 = _mm_shuffle_epi8(row4,r16);
2113  row3 = _mm_add_epi32(row3, row4);
2114  row2 = _mm_xor_si128(row2, row3);
2115  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2116 
2117  t0 = _mm_slli_si128(m2, 8);
2118  t1 = _mm_blend_epi16(m3,m0,0x0C);
2119  t2 = _mm_blend_epi16(t1, t0, 0xC0);
2120  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2121 
2122  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2123  row4 = _mm_xor_si128(row4, row1);
2124  row4 = _mm_shuffle_epi8(row4,r8);
2125  row3 = _mm_add_epi32(row3, row4);
2126  row2 = _mm_xor_si128(row2, row3);
2127  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2128 
2129  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2130  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2131  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2132 
2133  t0 = _mm_blend_epi16(m0,m1,0x0F);
2134  t1 = _mm_blend_epi16(t0, m3, 0xC0);
2135  buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2136 
2137  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2138  row4 = _mm_xor_si128(row4, row1);
2139  row4 = _mm_shuffle_epi8(row4,r16);
2140  row3 = _mm_add_epi32(row3, row4);
2141  row2 = _mm_xor_si128(row2, row3);
2142  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2143 
2144  t0 = _mm_unpacklo_epi32(m0,m2);
2145  t1 = _mm_unpackhi_epi32(m1,m2);
2146  buf4 = _mm_unpacklo_epi64(t1,t0);
2147 
2148  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2149  row4 = _mm_xor_si128(row4, row1);
2150  row4 = _mm_shuffle_epi8(row4,r8);
2151  row3 = _mm_add_epi32(row3, row4);
2152  row2 = _mm_xor_si128(row2, row3);
2153  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2154 
2155  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2156  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2157  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2158 
2159  t0 = _mm_unpacklo_epi64(m1,m2);
2160  t1 = _mm_unpackhi_epi64(m0,m2);
2161  t2 = _mm_blend_epi16(t0,t1,0x33);
2162  buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2163 
2164  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2165  row4 = _mm_xor_si128(row4, row1);
2166  row4 = _mm_shuffle_epi8(row4,r16);
2167  row3 = _mm_add_epi32(row3, row4);
2168  row2 = _mm_xor_si128(row2, row3);
2169  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2170 
2171  t0 = _mm_unpackhi_epi64(m1,m3);
2172  t1 = _mm_unpacklo_epi64(m0,m1);
2173  buf2 = _mm_blend_epi16(t0,t1,0x33);
2174 
2175  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2176  row4 = _mm_xor_si128(row4, row1);
2177  row4 = _mm_shuffle_epi8(row4,r8);
2178  row3 = _mm_add_epi32(row3, row4);
2179  row2 = _mm_xor_si128(row2, row3);
2180  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2181 
2182  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2183  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2184  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2185 
2186  t0 = _mm_unpackhi_epi64(m3,m1);
2187  t1 = _mm_unpackhi_epi64(m2,m0);
2188  buf3 = _mm_blend_epi16(t1,t0,0x33);
2189 
2190  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2191  row4 = _mm_xor_si128(row4, row1);
2192  row4 = _mm_shuffle_epi8(row4,r16);
2193  row3 = _mm_add_epi32(row3, row4);
2194  row2 = _mm_xor_si128(row2, row3);
2195  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2196 
2197  t0 = _mm_blend_epi16(m0,m2,0x03);
2198  t1 = _mm_slli_si128(t0, 8);
2199  t2 = _mm_blend_epi16(t1,m3,0x0F);
2200  buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
2201 
2202  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2203  row4 = _mm_xor_si128(row4, row1);
2204  row4 = _mm_shuffle_epi8(row4,r8);
2205  row3 = _mm_add_epi32(row3, row4);
2206  row2 = _mm_xor_si128(row2, row3);
2207  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2208 
2209  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2210  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2211  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2212 
2213  t0 = _mm_unpackhi_epi32(m0,m1);
2214  t1 = _mm_unpacklo_epi32(m0,m2);
2215  buf1 = _mm_unpacklo_epi64(t0,t1);
2216 
2217  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2218  row4 = _mm_xor_si128(row4, row1);
2219  row4 = _mm_shuffle_epi8(row4,r16);
2220  row3 = _mm_add_epi32(row3, row4);
2221  row2 = _mm_xor_si128(row2, row3);
2222  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2223 
2224  t0 = _mm_srli_si128(m2, 4);
2225  t1 = _mm_blend_epi16(m0,m3,0x03);
2226  buf2 = _mm_blend_epi16(t1,t0,0x3C);
2227 
2228  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2229  row4 = _mm_xor_si128(row4, row1);
2230  row4 = _mm_shuffle_epi8(row4,r8);
2231  row3 = _mm_add_epi32(row3, row4);
2232  row2 = _mm_xor_si128(row2, row3);
2233  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2234 
2235  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2236  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2237  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2238 
2239  t0 = _mm_blend_epi16(m1,m0,0x0C);
2240  t1 = _mm_srli_si128(m3, 4);
2241  t2 = _mm_blend_epi16(t0,t1,0x30);
2242  buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
2243 
2244  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2245  row4 = _mm_xor_si128(row4, row1);
2246  row4 = _mm_shuffle_epi8(row4,r16);
2247  row3 = _mm_add_epi32(row3, row4);
2248  row2 = _mm_xor_si128(row2, row3);
2249  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2250 
2251  t0 = _mm_unpacklo_epi64(m1,m2);
2252  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
2253  buf4 = _mm_blend_epi16(t0,t1,0x33);
2254 
2255  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2256  row4 = _mm_xor_si128(row4, row1);
2257  row4 = _mm_shuffle_epi8(row4,r8);
2258  row3 = _mm_add_epi32(row3, row4);
2259  row2 = _mm_xor_si128(row2, row3);
2260  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2261 
2262  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2263  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2264  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2265 
2266  t0 = _mm_slli_si128(m1, 12);
2267  t1 = _mm_blend_epi16(m0,m3,0x33);
2268  buf1 = _mm_blend_epi16(t1,t0,0xC0);
2269 
2270  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2271  row4 = _mm_xor_si128(row4, row1);
2272  row4 = _mm_shuffle_epi8(row4,r16);
2273  row3 = _mm_add_epi32(row3, row4);
2274  row2 = _mm_xor_si128(row2, row3);
2275  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2276 
2277  t0 = _mm_blend_epi16(m3,m2,0x30);
2278  t1 = _mm_srli_si128(m1, 4);
2279  t2 = _mm_blend_epi16(t0,t1,0x03);
2280  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
2281 
2282  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2283  row4 = _mm_xor_si128(row4, row1);
2284  row4 = _mm_shuffle_epi8(row4,r8);
2285  row3 = _mm_add_epi32(row3, row4);
2286  row2 = _mm_xor_si128(row2, row3);
2287  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2288 
2289  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2290  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2291  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2292 
2293  t0 = _mm_unpacklo_epi64(m0,m2);
2294  t1 = _mm_srli_si128(m1, 4);
2295  buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
2296 
2297  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2298  row4 = _mm_xor_si128(row4, row1);
2299  row4 = _mm_shuffle_epi8(row4,r16);
2300  row3 = _mm_add_epi32(row3, row4);
2301  row2 = _mm_xor_si128(row2, row3);
2302  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2303 
2304  t0 = _mm_unpackhi_epi32(m1,m2);
2305  t1 = _mm_unpackhi_epi64(m0,t0);
2306  buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2307 
2308  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2309  row4 = _mm_xor_si128(row4, row1);
2310  row4 = _mm_shuffle_epi8(row4,r8);
2311  row3 = _mm_add_epi32(row3, row4);
2312  row2 = _mm_xor_si128(row2, row3);
2313  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2314 
2315  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2316  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2317  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2318 
2319  t0 = _mm_unpackhi_epi32(m0,m1);
2320  t1 = _mm_blend_epi16(t0,m3,0x0F);
2321  buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
2322 
2323  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2324  row4 = _mm_xor_si128(row4, row1);
2325  row4 = _mm_shuffle_epi8(row4,r16);
2326  row3 = _mm_add_epi32(row3, row4);
2327  row2 = _mm_xor_si128(row2, row3);
2328  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2329 
2330  t0 = _mm_blend_epi16(m2,m3,0x30);
2331  t1 = _mm_srli_si128(m0,4);
2332  t2 = _mm_blend_epi16(t0,t1,0x03);
2333  buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
2334 
2335  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2336  row4 = _mm_xor_si128(row4, row1);
2337  row4 = _mm_shuffle_epi8(row4,r8);
2338  row3 = _mm_add_epi32(row3, row4);
2339  row2 = _mm_xor_si128(row2, row3);
2340  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2341 
2342  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2343  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2344  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2345 
2346  t0 = _mm_unpackhi_epi64(m0,m3);
2347  t1 = _mm_unpacklo_epi64(m1,m2);
2348  t2 = _mm_blend_epi16(t0,t1,0x3C);
2349  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
2350 
2351  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2352  row4 = _mm_xor_si128(row4, row1);
2353  row4 = _mm_shuffle_epi8(row4,r16);
2354  row3 = _mm_add_epi32(row3, row4);
2355  row2 = _mm_xor_si128(row2, row3);
2356  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2357 
2358  t0 = _mm_unpacklo_epi32(m0,m1);
2359  t1 = _mm_unpackhi_epi32(m1,m2);
2360  buf4 = _mm_unpacklo_epi64(t0,t1);
2361 
2362  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2363  row4 = _mm_xor_si128(row4, row1);
2364  row4 = _mm_shuffle_epi8(row4,r8);
2365  row3 = _mm_add_epi32(row3, row4);
2366  row2 = _mm_xor_si128(row2, row3);
2367  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2368 
2369  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2370  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2371  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2372 
2373  t0 = _mm_unpackhi_epi32(m1,m3);
2374  t1 = _mm_unpacklo_epi64(t0,m0);
2375  t2 = _mm_blend_epi16(t1,m2,0xC0);
2376  buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
2377 
2378  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2379  row4 = _mm_xor_si128(row4, row1);
2380  row4 = _mm_shuffle_epi8(row4,r16);
2381  row3 = _mm_add_epi32(row3, row4);
2382  row2 = _mm_xor_si128(row2, row3);
2383  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2384 
2385  t0 = _mm_unpackhi_epi32(m0,m3);
2386  t1 = _mm_blend_epi16(m2,t0,0xF0);
2387  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
2388 
2389  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2390  row4 = _mm_xor_si128(row4, row1);
2391  row4 = _mm_shuffle_epi8(row4,r8);
2392  row3 = _mm_add_epi32(row3, row4);
2393  row2 = _mm_xor_si128(row2, row3);
2394  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2395 
2396  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2397  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2398  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2399 
2400  t0 = _mm_blend_epi16(m2,m0,0x0C);
2401  t1 = _mm_slli_si128(t0,4);
2402  buf3 = _mm_blend_epi16(t1,m3,0x0F);
2403 
2404  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2405  row4 = _mm_xor_si128(row4, row1);
2406  row4 = _mm_shuffle_epi8(row4,r16);
2407  row3 = _mm_add_epi32(row3, row4);
2408  row2 = _mm_xor_si128(row2, row3);
2409  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2410 
2411  t0 = _mm_blend_epi16(m1,m0,0x30);
2412  buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
2413 
2414  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2415  row4 = _mm_xor_si128(row4, row1);
2416  row4 = _mm_shuffle_epi8(row4,r8);
2417  row3 = _mm_add_epi32(row3, row4);
2418  row2 = _mm_xor_si128(row2, row3);
2419  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2420 
2421  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2422  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2423  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2424 
2425  t0 = _mm_blend_epi16(m0,m2,0x03);
2426  t1 = _mm_blend_epi16(m1,m2,0x30);
2427  t2 = _mm_blend_epi16(t1,t0,0x0F);
2428  buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
2429 
2430  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2431  row4 = _mm_xor_si128(row4, row1);
2432  row4 = _mm_shuffle_epi8(row4,r16);
2433  row3 = _mm_add_epi32(row3, row4);
2434  row2 = _mm_xor_si128(row2, row3);
2435  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2436 
2437  t0 = _mm_slli_si128(m0,4);
2438  t1 = _mm_blend_epi16(m1,t0,0xC0);
2439  buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
2440 
2441  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2442  row4 = _mm_xor_si128(row4, row1);
2443  row4 = _mm_shuffle_epi8(row4,r8);
2444  row3 = _mm_add_epi32(row3, row4);
2445  row2 = _mm_xor_si128(row2, row3);
2446  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2447 
2448  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2449  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2450  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2451 
2452  t0 = _mm_unpackhi_epi32(m0,m3);
2453  t1 = _mm_unpacklo_epi32(m2,m3);
2454  t2 = _mm_unpackhi_epi64(t0,t1);
2455  buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
2456 
2457  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2458  row4 = _mm_xor_si128(row4, row1);
2459  row4 = _mm_shuffle_epi8(row4,r16);
2460  row3 = _mm_add_epi32(row3, row4);
2461  row2 = _mm_xor_si128(row2, row3);
2462  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2463 
2464  t0 = _mm_blend_epi16(m3,m2,0xC0);
2465  t1 = _mm_unpacklo_epi32(m0,m3);
2466  t2 = _mm_blend_epi16(t0,t1,0x0F);
2467  buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
2468 
2469  row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2470  row4 = _mm_xor_si128(row4, row1);
2471  row4 = _mm_shuffle_epi8(row4,r8);
2472  row3 = _mm_add_epi32(row3, row4);
2473  row2 = _mm_xor_si128(row2, row3);
2474  row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2475 
2476  row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2477  row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2478  row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2479 
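 // Feed-forward: fold the working state into the chaining value,
 // h[0..3] ^= row1 ^ row3 and h[4..7] ^= row2 ^ row4.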
2480  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
2481  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
2482 }
2483 
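// BLAKE2b under SSE4.1: the same structure as above, but each state row is a
// pair of XMM registers and the message block is sixteen 64-bit words.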
2484 static void BLAKE2_SSE4_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
2485 {
2486  __m128i row1l, row1h;
2487  __m128i row2l, row2h;
2488  __m128i row3l, row3h;
2489  __m128i row4l, row4h;
2490  __m128i b0, b1, t0, t1;
2491 
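 // pshufb masks for the byte-aligned BLAKE2b rotations: r16 rotates each
 // 64-bit lane right by 16 bits, r24 by 24. rotr32 uses _mm_shuffle_epi32
 // and rotr63 a shift/add pair; see below.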
2492  const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
2493  const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
2494 
2495  const __m128i m0 = _mm_loadu_si128((const __m128i*)(const void*)(input + 00));
2496  const __m128i m1 = _mm_loadu_si128((const __m128i*)(const void*)(input + 16));
2497  const __m128i m2 = _mm_loadu_si128((const __m128i*)(const void*)(input + 32));
2498  const __m128i m3 = _mm_loadu_si128((const __m128i*)(const void*)(input + 48));
2499  const __m128i m4 = _mm_loadu_si128((const __m128i*)(const void*)(input + 64));
2500  const __m128i m5 = _mm_loadu_si128((const __m128i*)(const void*)(input + 80));
2501  const __m128i m6 = _mm_loadu_si128((const __m128i*)(const void*)(input + 96));
2502  const __m128i m7 = _mm_loadu_si128((const __m128i*)(const void*)(input + 112));
2503 
2504  row1l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[0]));
2505  row1h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[2]));
2506  row2l = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[4]));
2507  row2h = _mm_loadu_si128((const __m128i*)(const void*)(&state.h[6]));
2508  row3l = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(0)));
2509  row3h = _mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(2)));
2510  row4l = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((const __m128i*)(const void*)(&state.t[0])));
2511  row4h = _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((const __m128i*)(const void*)(&state.f[0])));
2512 
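// Twelve rounds follow, fully unrolled. Each round runs the G function on the
// four columns and then the four diagonals of the 4x4 state; b0 and b1 gather
// the message words prescribed by that round's sigma permutation.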
2513  b0 = _mm_unpacklo_epi64(m0, m1);
2514  b1 = _mm_unpacklo_epi64(m2, m3);
2515  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2516  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2517  row4l = _mm_xor_si128(row4l, row1l);
2518  row4h = _mm_xor_si128(row4h, row1h);
2519  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2520  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2521  row3l = _mm_add_epi64(row3l, row4l);
2522  row3h = _mm_add_epi64(row3h, row4h);
2523  row2l = _mm_xor_si128(row2l, row3l);
2524  row2h = _mm_xor_si128(row2h, row3h);
2525  row2l = _mm_shuffle_epi8(row2l, r24);
2526  row2h = _mm_shuffle_epi8(row2h, r24);
2527 
2528  b0 = _mm_unpackhi_epi64(m0, m1);
2529  b1 = _mm_unpackhi_epi64(m2, m3);
2530 
2531  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2532  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2533  row4l = _mm_xor_si128(row4l, row1l);
2534  row4h = _mm_xor_si128(row4h, row1h);
2535  row4l = _mm_shuffle_epi8(row4l, r16);
2536  row4h = _mm_shuffle_epi8(row4h, r16);
2537  row3l = _mm_add_epi64(row3l, row4l);
2538  row3h = _mm_add_epi64(row3h, row4h);
2539  row2l = _mm_xor_si128(row2l, row3l);
2540  row2h = _mm_xor_si128(row2h, row3h);
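// rotr(x,63) has no direct SSE counterpart; it equals rotl(x,1) and is
// computed as (x >> 63) ^ (x + x), the addition doubling as a left shift.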
2541  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2542  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2543 
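// Diagonalize: PALIGNR rotates rows 2 and 4 across their 128-bit halves and
// the halves of row 3 are swapped, so the next G applications work on the
// diagonals of the 4x4 state.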
2544  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2545  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2546  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2547  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2548  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2549  row4l = t1, row4h = t0;
2550 
2551  b0 = _mm_unpacklo_epi64(m4, m5);
2552  b1 = _mm_unpacklo_epi64(m6, m7);
2553 
2554  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2555  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2556  row4l = _mm_xor_si128(row4l, row1l);
2557  row4h = _mm_xor_si128(row4h, row1h);
2558  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2559  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2560  row3l = _mm_add_epi64(row3l, row4l);
2561  row3h = _mm_add_epi64(row3h, row4h);
2562  row2l = _mm_xor_si128(row2l, row3l);
2563  row2h = _mm_xor_si128(row2h, row3h);
2564  row2l = _mm_shuffle_epi8(row2l, r24);
2565  row2h = _mm_shuffle_epi8(row2h, r24);
2566 
2567  b0 = _mm_unpackhi_epi64(m4, m5);
2568  b1 = _mm_unpackhi_epi64(m6, m7);
2569 
2570  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2571  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2572  row4l = _mm_xor_si128(row4l, row1l);
2573  row4h = _mm_xor_si128(row4h, row1h);
2574  row4l = _mm_shuffle_epi8(row4l, r16);
2575  row4h = _mm_shuffle_epi8(row4h, r16);
2576  row3l = _mm_add_epi64(row3l, row4l);
2577  row3h = _mm_add_epi64(row3h, row4h);
2578  row2l = _mm_xor_si128(row2l, row3l);
2579  row2h = _mm_xor_si128(row2h, row3h);
2580  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2581  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2582 
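// Un-diagonalize: the inverse PALIGNR rotation restores column order before
// the next column step.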
2583  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2584  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2585  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2586  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2587  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2588  row4l = t1, row4h = t0;
2589 
2590  b0 = _mm_unpacklo_epi64(m7, m2);
2591  b1 = _mm_unpackhi_epi64(m4, m6);
2592 
2593  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2594  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2595  row4l = _mm_xor_si128(row4l, row1l);
2596  row4h = _mm_xor_si128(row4h, row1h);
2597  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2598  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2599  row3l = _mm_add_epi64(row3l, row4l);
2600  row3h = _mm_add_epi64(row3h, row4h);
2601  row2l = _mm_xor_si128(row2l, row3l);
2602  row2h = _mm_xor_si128(row2h, row3h);
2603  row2l = _mm_shuffle_epi8(row2l, r24);
2604  row2h = _mm_shuffle_epi8(row2h, r24);
2605 
2606  b0 = _mm_unpacklo_epi64(m5, m4);
2607  b1 = _mm_alignr_epi8(m3, m7, 8);
2608 
2609  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2610  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2611  row4l = _mm_xor_si128(row4l, row1l);
2612  row4h = _mm_xor_si128(row4h, row1h);
2613  row4l = _mm_shuffle_epi8(row4l, r16);
2614  row4h = _mm_shuffle_epi8(row4h, r16);
2615  row3l = _mm_add_epi64(row3l, row4l);
2616  row3h = _mm_add_epi64(row3h, row4h);
2617  row2l = _mm_xor_si128(row2l, row3l);
2618  row2h = _mm_xor_si128(row2h, row3h);
2619  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2620  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2621 
2622  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2623  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2624  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2625  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2626  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2627  row4l = t1, row4h = t0;
2628 
2629  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
2630  b1 = _mm_unpackhi_epi64(m5, m2);
2631 
2632  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2633  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2634  row4l = _mm_xor_si128(row4l, row1l);
2635  row4h = _mm_xor_si128(row4h, row1h);
2636  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2637  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2638  row3l = _mm_add_epi64(row3l, row4l);
2639  row3h = _mm_add_epi64(row3h, row4h);
2640  row2l = _mm_xor_si128(row2l, row3l);
2641  row2h = _mm_xor_si128(row2h, row3h);
2642  row2l = _mm_shuffle_epi8(row2l, r24);
2643  row2h = _mm_shuffle_epi8(row2h, r24);
2644 
2645  b0 = _mm_unpacklo_epi64(m6, m1);
2646  b1 = _mm_unpackhi_epi64(m3, m1);
2647 
2648  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2649  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2650  row4l = _mm_xor_si128(row4l, row1l);
2651  row4h = _mm_xor_si128(row4h, row1h);
2652  row4l = _mm_shuffle_epi8(row4l, r16);
2653  row4h = _mm_shuffle_epi8(row4h, r16);
2654  row3l = _mm_add_epi64(row3l, row4l);
2655  row3h = _mm_add_epi64(row3h, row4h);
2656  row2l = _mm_xor_si128(row2l, row3l);
2657  row2h = _mm_xor_si128(row2h, row3h);
2658  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2659  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2660 
2661  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2662  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2663  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2664  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2665  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2666  row4l = t1, row4h = t0;
2667 
2668  b0 = _mm_alignr_epi8(m6, m5, 8);
2669  b1 = _mm_unpackhi_epi64(m2, m7);
2670 
2671  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2672  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2673  row4l = _mm_xor_si128(row4l, row1l);
2674  row4h = _mm_xor_si128(row4h, row1h);
2675  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2676  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2677  row3l = _mm_add_epi64(row3l, row4l);
2678  row3h = _mm_add_epi64(row3h, row4h);
2679  row2l = _mm_xor_si128(row2l, row3l);
2680  row2h = _mm_xor_si128(row2h, row3h);
2681  row2l = _mm_shuffle_epi8(row2l, r24);
2682  row2h = _mm_shuffle_epi8(row2h, r24);
2683 
2684  b0 = _mm_unpacklo_epi64(m4, m0);
2685  b1 = _mm_blend_epi16(m1, m6, 0xF0);
2686 
2687  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2688  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2689  row4l = _mm_xor_si128(row4l, row1l);
2690  row4h = _mm_xor_si128(row4h, row1h);
2691  row4l = _mm_shuffle_epi8(row4l, r16);
2692  row4h = _mm_shuffle_epi8(row4h, r16);
2693  row3l = _mm_add_epi64(row3l, row4l);
2694  row3h = _mm_add_epi64(row3h, row4h);
2695  row2l = _mm_xor_si128(row2l, row3l);
2696  row2h = _mm_xor_si128(row2h, row3h);
2697  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2698  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2699 
2700  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2701  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2702  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2703  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2704  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2705  row4l = t1, row4h = t0;
2706 
2707  b0 = _mm_blend_epi16(m5, m1, 0xF0);
2708  b1 = _mm_unpackhi_epi64(m3, m4);
2709 
2710  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2711  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2712  row4l = _mm_xor_si128(row4l, row1l);
2713  row4h = _mm_xor_si128(row4h, row1h);
2714  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2715  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2716  row3l = _mm_add_epi64(row3l, row4l);
2717  row3h = _mm_add_epi64(row3h, row4h);
2718  row2l = _mm_xor_si128(row2l, row3l);
2719  row2h = _mm_xor_si128(row2h, row3h);
2720  row2l = _mm_shuffle_epi8(row2l, r24);
2721  row2h = _mm_shuffle_epi8(row2h, r24);
2722 
2723  b0 = _mm_unpacklo_epi64(m7, m3);
2724  b1 = _mm_alignr_epi8(m2, m0, 8);
2725 
2726  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2727  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2728  row4l = _mm_xor_si128(row4l, row1l);
2729  row4h = _mm_xor_si128(row4h, row1h);
2730  row4l = _mm_shuffle_epi8(row4l, r16);
2731  row4h = _mm_shuffle_epi8(row4h, r16);
2732  row3l = _mm_add_epi64(row3l, row4l);
2733  row3h = _mm_add_epi64(row3h, row4h);
2734  row2l = _mm_xor_si128(row2l, row3l);
2735  row2h = _mm_xor_si128(row2h, row3h);
2736  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2737  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2738 
2739  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2740  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2741  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2742  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2743  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2744  row4l = t1, row4h = t0;
2745 
2746  b0 = _mm_unpackhi_epi64(m3, m1);
2747  b1 = _mm_unpackhi_epi64(m6, m5);
2748 
2749  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2750  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2751  row4l = _mm_xor_si128(row4l, row1l);
2752  row4h = _mm_xor_si128(row4h, row1h);
2753  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2754  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2755  row3l = _mm_add_epi64(row3l, row4l);
2756  row3h = _mm_add_epi64(row3h, row4h);
2757  row2l = _mm_xor_si128(row2l, row3l);
2758  row2h = _mm_xor_si128(row2h, row3h);
2759  row2l = _mm_shuffle_epi8(row2l, r24);
2760  row2h = _mm_shuffle_epi8(row2h, r24);
2761 
2762  b0 = _mm_unpackhi_epi64(m4, m0);
2763  b1 = _mm_unpacklo_epi64(m6, m7);
2764 
2765  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2766  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2767  row4l = _mm_xor_si128(row4l, row1l);
2768  row4h = _mm_xor_si128(row4h, row1h);
2769  row4l = _mm_shuffle_epi8(row4l, r16);
2770  row4h = _mm_shuffle_epi8(row4h, r16);
2771  row3l = _mm_add_epi64(row3l, row4l);
2772  row3h = _mm_add_epi64(row3h, row4h);
2773  row2l = _mm_xor_si128(row2l, row3l);
2774  row2h = _mm_xor_si128(row2h, row3h);
2775  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2776  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2777 
2778  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2779  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2780  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2781  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2782  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2783  row4l = t1, row4h = t0;
2784 
2785  b0 = _mm_blend_epi16(m1, m2, 0xF0);
2786  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2787 
2788  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2789  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2790  row4l = _mm_xor_si128(row4l, row1l);
2791  row4h = _mm_xor_si128(row4h, row1h);
2792  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2793  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2794  row3l = _mm_add_epi64(row3l, row4l);
2795  row3h = _mm_add_epi64(row3h, row4h);
2796  row2l = _mm_xor_si128(row2l, row3l);
2797  row2h = _mm_xor_si128(row2h, row3h);
2798  row2l = _mm_shuffle_epi8(row2l, r24);
2799  row2h = _mm_shuffle_epi8(row2h, r24);
2800 
2801  b0 = _mm_unpacklo_epi64(m3, m5);
2802  b1 = _mm_unpacklo_epi64(m0, m4);
2803 
2804  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2805  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2806  row4l = _mm_xor_si128(row4l, row1l);
2807  row4h = _mm_xor_si128(row4h, row1h);
2808  row4l = _mm_shuffle_epi8(row4l, r16);
2809  row4h = _mm_shuffle_epi8(row4h, r16);
2810  row3l = _mm_add_epi64(row3l, row4l);
2811  row3h = _mm_add_epi64(row3h, row4h);
2812  row2l = _mm_xor_si128(row2l, row3l);
2813  row2h = _mm_xor_si128(row2h, row3h);
2814  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2815  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2816 
2817  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2818  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2819  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2820  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2821  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2822  row4l = t1, row4h = t0;
2823 
2824  b0 = _mm_unpackhi_epi64(m4, m2);
2825  b1 = _mm_unpacklo_epi64(m1, m5);
2826 
2827  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2828  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2829  row4l = _mm_xor_si128(row4l, row1l);
2830  row4h = _mm_xor_si128(row4h, row1h);
2831  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2832  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2833  row3l = _mm_add_epi64(row3l, row4l);
2834  row3h = _mm_add_epi64(row3h, row4h);
2835  row2l = _mm_xor_si128(row2l, row3l);
2836  row2h = _mm_xor_si128(row2h, row3h);
2837  row2l = _mm_shuffle_epi8(row2l, r24);
2838  row2h = _mm_shuffle_epi8(row2h, r24);
2839 
2840  b0 = _mm_blend_epi16(m0, m3, 0xF0);
2841  b1 = _mm_blend_epi16(m2, m7, 0xF0);
2842 
2843  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2844  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2845  row4l = _mm_xor_si128(row4l, row1l);
2846  row4h = _mm_xor_si128(row4h, row1h);
2847  row4l = _mm_shuffle_epi8(row4l, r16);
2848  row4h = _mm_shuffle_epi8(row4h, r16);
2849  row3l = _mm_add_epi64(row3l, row4l);
2850  row3h = _mm_add_epi64(row3h, row4h);
2851  row2l = _mm_xor_si128(row2l, row3l);
2852  row2h = _mm_xor_si128(row2h, row3h);
2853  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2854  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2855 
2856  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2857  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2858  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2859  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2860  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2861  row4l = t1, row4h = t0;
2862 
2863  b0 = _mm_blend_epi16(m7, m5, 0xF0);
2864  b1 = _mm_blend_epi16(m3, m1, 0xF0);
2865 
2866  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2867  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2868  row4l = _mm_xor_si128(row4l, row1l);
2869  row4h = _mm_xor_si128(row4h, row1h);
2870  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2871  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2872  row3l = _mm_add_epi64(row3l, row4l);
2873  row3h = _mm_add_epi64(row3h, row4h);
2874  row2l = _mm_xor_si128(row2l, row3l);
2875  row2h = _mm_xor_si128(row2h, row3h);
2876  row2l = _mm_shuffle_epi8(row2l, r24);
2877  row2h = _mm_shuffle_epi8(row2h, r24);
2878 
2879  b0 = _mm_alignr_epi8(m6, m0, 8);
2880  b1 = _mm_blend_epi16(m4, m6, 0xF0);
2881 
2882  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2883  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2884  row4l = _mm_xor_si128(row4l, row1l);
2885  row4h = _mm_xor_si128(row4h, row1h);
2886  row4l = _mm_shuffle_epi8(row4l, r16);
2887  row4h = _mm_shuffle_epi8(row4h, r16);
2888  row3l = _mm_add_epi64(row3l, row4l);
2889  row3h = _mm_add_epi64(row3h, row4h);
2890  row2l = _mm_xor_si128(row2l, row3l);
2891  row2h = _mm_xor_si128(row2h, row3h);
2892  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2893  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2894 
2895  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2896  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2897  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2898  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2899  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2900  row4l = t1, row4h = t0;
2901 
2902  b0 = _mm_unpacklo_epi64(m1, m3);
2903  b1 = _mm_unpacklo_epi64(m0, m4);
2904 
2905  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2906  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2907  row4l = _mm_xor_si128(row4l, row1l);
2908  row4h = _mm_xor_si128(row4h, row1h);
2909  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2910  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2911  row3l = _mm_add_epi64(row3l, row4l);
2912  row3h = _mm_add_epi64(row3h, row4h);
2913  row2l = _mm_xor_si128(row2l, row3l);
2914  row2h = _mm_xor_si128(row2h, row3h);
2915  row2l = _mm_shuffle_epi8(row2l, r24);
2916  row2h = _mm_shuffle_epi8(row2h, r24);
2917 
2918  b0 = _mm_unpacklo_epi64(m6, m5);
2919  b1 = _mm_unpackhi_epi64(m5, m1);
2920 
2921  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2922  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2923  row4l = _mm_xor_si128(row4l, row1l);
2924  row4h = _mm_xor_si128(row4h, row1h);
2925  row4l = _mm_shuffle_epi8(row4l, r16);
2926  row4h = _mm_shuffle_epi8(row4h, r16);
2927  row3l = _mm_add_epi64(row3l, row4l);
2928  row3h = _mm_add_epi64(row3h, row4h);
2929  row2l = _mm_xor_si128(row2l, row3l);
2930  row2h = _mm_xor_si128(row2h, row3h);
2931  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2932  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2933 
2934  t0 = _mm_alignr_epi8(row2h, row2l, 8);
2935  t1 = _mm_alignr_epi8(row2l, row2h, 8);
2936  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2937  t0 = _mm_alignr_epi8(row4h, row4l, 8);
2938  t1 = _mm_alignr_epi8(row4l, row4h, 8);
2939  row4l = t1, row4h = t0;
2940 
2941  b0 = _mm_blend_epi16(m2, m3, 0xF0);
2942  b1 = _mm_unpackhi_epi64(m7, m0);
2943 
2944  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2945  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2946  row4l = _mm_xor_si128(row4l, row1l);
2947  row4h = _mm_xor_si128(row4h, row1h);
2948  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2949  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2950  row3l = _mm_add_epi64(row3l, row4l);
2951  row3h = _mm_add_epi64(row3h, row4h);
2952  row2l = _mm_xor_si128(row2l, row3l);
2953  row2h = _mm_xor_si128(row2h, row3h);
2954  row2l = _mm_shuffle_epi8(row2l, r24);
2955  row2h = _mm_shuffle_epi8(row2h, r24);
2956 
2957  b0 = _mm_unpackhi_epi64(m6, m2);
2958  b1 = _mm_blend_epi16(m7, m4, 0xF0);
2959 
2960  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2961  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2962  row4l = _mm_xor_si128(row4l, row1l);
2963  row4h = _mm_xor_si128(row4h, row1h);
2964  row4l = _mm_shuffle_epi8(row4l, r16);
2965  row4h = _mm_shuffle_epi8(row4h, r16);
2966  row3l = _mm_add_epi64(row3l, row4l);
2967  row3h = _mm_add_epi64(row3h, row4h);
2968  row2l = _mm_xor_si128(row2l, row3l);
2969  row2h = _mm_xor_si128(row2h, row3h);
2970  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2971  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2972 
2973  t0 = _mm_alignr_epi8(row2l, row2h, 8);
2974  t1 = _mm_alignr_epi8(row2h, row2l, 8);
2975  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2976  t0 = _mm_alignr_epi8(row4l, row4h, 8);
2977  t1 = _mm_alignr_epi8(row4h, row4l, 8);
2978  row4l = t1, row4h = t0;
2979 
2980  b0 = _mm_blend_epi16(m6, m0, 0xF0);
2981  b1 = _mm_unpacklo_epi64(m7, m2);
2982 
2983  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2984  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2985  row4l = _mm_xor_si128(row4l, row1l);
2986  row4h = _mm_xor_si128(row4h, row1h);
2987  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2988  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2989  row3l = _mm_add_epi64(row3l, row4l);
2990  row3h = _mm_add_epi64(row3h, row4h);
2991  row2l = _mm_xor_si128(row2l, row3l);
2992  row2h = _mm_xor_si128(row2h, row3h);
2993  row2l = _mm_shuffle_epi8(row2l, r24);
2994  row2h = _mm_shuffle_epi8(row2h, r24);
2995 
2996  b0 = _mm_unpackhi_epi64(m2, m7);
2997  b1 = _mm_alignr_epi8(m5, m6, 8);
2998 
2999  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3000  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3001  row4l = _mm_xor_si128(row4l, row1l);
3002  row4h = _mm_xor_si128(row4h, row1h);
3003  row4l = _mm_shuffle_epi8(row4l, r16);
3004  row4h = _mm_shuffle_epi8(row4h, r16);
3005  row3l = _mm_add_epi64(row3l, row4l);
3006  row3h = _mm_add_epi64(row3h, row4h);
3007  row2l = _mm_xor_si128(row2l, row3l);
3008  row2h = _mm_xor_si128(row2h, row3h);
3009  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3010  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3011 
3012  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3013  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3014  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3015  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3016  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3017  row4l = t1, row4h = t0;
3018 
3019  b0 = _mm_unpacklo_epi64(m0, m3);
3020  b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
3021 
3022  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3023  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3024  row4l = _mm_xor_si128(row4l, row1l);
3025  row4h = _mm_xor_si128(row4h, row1h);
3026  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3027  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3028  row3l = _mm_add_epi64(row3l, row4l);
3029  row3h = _mm_add_epi64(row3h, row4h);
3030  row2l = _mm_xor_si128(row2l, row3l);
3031  row2h = _mm_xor_si128(row2h, row3h);
3032  row2l = _mm_shuffle_epi8(row2l, r24);
3033  row2h = _mm_shuffle_epi8(row2h, r24);
3034 
3035  b0 = _mm_unpackhi_epi64(m3, m1);
3036  b1 = _mm_blend_epi16(m1, m5, 0xF0);
3037 
3038  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3039  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3040  row4l = _mm_xor_si128(row4l, row1l);
3041  row4h = _mm_xor_si128(row4h, row1h);
3042  row4l = _mm_shuffle_epi8(row4l, r16);
3043  row4h = _mm_shuffle_epi8(row4h, r16);
3044  row3l = _mm_add_epi64(row3l, row4l);
3045  row3h = _mm_add_epi64(row3h, row4h);
3046  row2l = _mm_xor_si128(row2l, row3l);
3047  row2h = _mm_xor_si128(row2h, row3h);
3048  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3049  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3050 
3051  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3052  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3053  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3054  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3055  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3056  row4l = t1, row4h = t0;
3057 
3058  b0 = _mm_unpackhi_epi64(m6, m3);
3059  b1 = _mm_blend_epi16(m6, m1, 0xF0);
3060 
3061  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3062  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3063  row4l = _mm_xor_si128(row4l, row1l);
3064  row4h = _mm_xor_si128(row4h, row1h);
3065  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3066  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3067  row3l = _mm_add_epi64(row3l, row4l);
3068  row3h = _mm_add_epi64(row3h, row4h);
3069  row2l = _mm_xor_si128(row2l, row3l);
3070  row2h = _mm_xor_si128(row2h, row3h);
3071  row2l = _mm_shuffle_epi8(row2l, r24);
3072  row2h = _mm_shuffle_epi8(row2h, r24);
3073 
3074  b0 = _mm_alignr_epi8(m7, m5, 8);
3075  b1 = _mm_unpackhi_epi64(m0, m4);
3076 
3077  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3078  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3079  row4l = _mm_xor_si128(row4l, row1l);
3080  row4h = _mm_xor_si128(row4h, row1h);
3081  row4l = _mm_shuffle_epi8(row4l, r16);
3082  row4h = _mm_shuffle_epi8(row4h, r16);
3083  row3l = _mm_add_epi64(row3l, row4l);
3084  row3h = _mm_add_epi64(row3h, row4h);
3085  row2l = _mm_xor_si128(row2l, row3l);
3086  row2h = _mm_xor_si128(row2h, row3h);
3087  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3088  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3089 
3090  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3091  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3092  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3093  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3094  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3095  row4l = t1, row4h = t0;
3096 
3097  b0 = _mm_unpackhi_epi64(m2, m7);
3098  b1 = _mm_unpacklo_epi64(m4, m1);
3099 
3100  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3101  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3102  row4l = _mm_xor_si128(row4l, row1l);
3103  row4h = _mm_xor_si128(row4h, row1h);
3104  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3105  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3106  row3l = _mm_add_epi64(row3l, row4l);
3107  row3h = _mm_add_epi64(row3h, row4h);
3108  row2l = _mm_xor_si128(row2l, row3l);
3109  row2h = _mm_xor_si128(row2h, row3h);
3110  row2l = _mm_shuffle_epi8(row2l, r24);
3111  row2h = _mm_shuffle_epi8(row2h, r24);
3112 
3113  b0 = _mm_unpacklo_epi64(m0, m2);
3114  b1 = _mm_unpacklo_epi64(m3, m5);
3115 
3116  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3117  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3118  row4l = _mm_xor_si128(row4l, row1l);
3119  row4h = _mm_xor_si128(row4h, row1h);
3120  row4l = _mm_shuffle_epi8(row4l, r16);
3121  row4h = _mm_shuffle_epi8(row4h, r16);
3122  row3l = _mm_add_epi64(row3l, row4l);
3123  row3h = _mm_add_epi64(row3h, row4h);
3124  row2l = _mm_xor_si128(row2l, row3l);
3125  row2h = _mm_xor_si128(row2h, row3h);
3126  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3127  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3128 
3129  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3130  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3131  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3132  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3133  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3134  row4l = t1, row4h = t0;
3135 
3136  b0 = _mm_unpacklo_epi64(m3, m7);
3137  b1 = _mm_alignr_epi8(m0, m5, 8);
3138 
3139  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3140  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3141  row4l = _mm_xor_si128(row4l, row1l);
3142  row4h = _mm_xor_si128(row4h, row1h);
3143  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3144  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3145  row3l = _mm_add_epi64(row3l, row4l);
3146  row3h = _mm_add_epi64(row3h, row4h);
3147  row2l = _mm_xor_si128(row2l, row3l);
3148  row2h = _mm_xor_si128(row2h, row3h);
3149  row2l = _mm_shuffle_epi8(row2l, r24);
3150  row2h = _mm_shuffle_epi8(row2h, r24);
3151 
3152  b0 = _mm_unpackhi_epi64(m7, m4);
3153  b1 = _mm_alignr_epi8(m4, m1, 8);
3154 
3155  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3156  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3157  row4l = _mm_xor_si128(row4l, row1l);
3158  row4h = _mm_xor_si128(row4h, row1h);
3159  row4l = _mm_shuffle_epi8(row4l, r16);
3160  row4h = _mm_shuffle_epi8(row4h, r16);
3161  row3l = _mm_add_epi64(row3l, row4l);
3162  row3h = _mm_add_epi64(row3h, row4h);
3163  row2l = _mm_xor_si128(row2l, row3l);
3164  row2h = _mm_xor_si128(row2h, row3h);
3165  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3166  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3167 
3168  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3169  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3170  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3171  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3172  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3173  row4l = t1, row4h = t0;
3174 
3175  b0 = m6;
3176  b1 = _mm_alignr_epi8(m5, m0, 8);
3177 
3178  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3179  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3180  row4l = _mm_xor_si128(row4l, row1l);
3181  row4h = _mm_xor_si128(row4h, row1h);
3182  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3183  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3184  row3l = _mm_add_epi64(row3l, row4l);
3185  row3h = _mm_add_epi64(row3h, row4h);
3186  row2l = _mm_xor_si128(row2l, row3l);
3187  row2h = _mm_xor_si128(row2h, row3h);
3188  row2l = _mm_shuffle_epi8(row2l, r24);
3189  row2h = _mm_shuffle_epi8(row2h, r24);
3190 
3191  b0 = _mm_blend_epi16(m1, m3, 0xF0);
3192  b1 = m2;
3193 
3194  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3195  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3196  row4l = _mm_xor_si128(row4l, row1l);
3197  row4h = _mm_xor_si128(row4h, row1h);
3198  row4l = _mm_shuffle_epi8(row4l, r16);
3199  row4h = _mm_shuffle_epi8(row4h, r16);
3200  row3l = _mm_add_epi64(row3l, row4l);
3201  row3h = _mm_add_epi64(row3h, row4h);
3202  row2l = _mm_xor_si128(row2l, row3l);
3203  row2h = _mm_xor_si128(row2h, row3h);
3204  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3205  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3206 
3207  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3208  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3209  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3210  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3211  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3212  row4l = t1, row4h = t0;
3213 
3214  b0 = _mm_unpacklo_epi64(m5, m4);
3215  b1 = _mm_unpackhi_epi64(m3, m0);
3216 
3217  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3218  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3219  row4l = _mm_xor_si128(row4l, row1l);
3220  row4h = _mm_xor_si128(row4h, row1h);
3221  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3222  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3223  row3l = _mm_add_epi64(row3l, row4l);
3224  row3h = _mm_add_epi64(row3h, row4h);
3225  row2l = _mm_xor_si128(row2l, row3l);
3226  row2h = _mm_xor_si128(row2h, row3h);
3227  row2l = _mm_shuffle_epi8(row2l, r24);
3228  row2h = _mm_shuffle_epi8(row2h, r24);
3229 
3230  b0 = _mm_unpacklo_epi64(m1, m2);
3231  b1 = _mm_blend_epi16(m3, m2, 0xF0);
3232 
3233  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3234  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3235  row4l = _mm_xor_si128(row4l, row1l);
3236  row4h = _mm_xor_si128(row4h, row1h);
3237  row4l = _mm_shuffle_epi8(row4l, r16);
3238  row4h = _mm_shuffle_epi8(row4h, r16);
3239  row3l = _mm_add_epi64(row3l, row4l);
3240  row3h = _mm_add_epi64(row3h, row4h);
3241  row2l = _mm_xor_si128(row2l, row3l);
3242  row2h = _mm_xor_si128(row2h, row3h);
3243  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3244  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3245 
3246  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3247  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3248  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3249  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3250  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3251  row4l = t1, row4h = t0;
3252 
3253  b0 = _mm_unpackhi_epi64(m7, m4);
3254  b1 = _mm_unpackhi_epi64(m1, m6);
3255 
3256  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3257  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3258  row4l = _mm_xor_si128(row4l, row1l);
3259  row4h = _mm_xor_si128(row4h, row1h);
3260  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3261  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3262  row3l = _mm_add_epi64(row3l, row4l);
3263  row3h = _mm_add_epi64(row3h, row4h);
3264  row2l = _mm_xor_si128(row2l, row3l);
3265  row2h = _mm_xor_si128(row2h, row3h);
3266  row2l = _mm_shuffle_epi8(row2l, r24);
3267  row2h = _mm_shuffle_epi8(row2h, r24);
3268 
3269  b0 = _mm_alignr_epi8(m7, m5, 8);
3270  b1 = _mm_unpacklo_epi64(m6, m0);
3271 
3272  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3273  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3274  row4l = _mm_xor_si128(row4l, row1l);
3275  row4h = _mm_xor_si128(row4h, row1h);
3276  row4l = _mm_shuffle_epi8(row4l, r16);
3277  row4h = _mm_shuffle_epi8(row4h, r16);
3278  row3l = _mm_add_epi64(row3l, row4l);
3279  row3h = _mm_add_epi64(row3h, row4h);
3280  row2l = _mm_xor_si128(row2l, row3l);
3281  row2h = _mm_xor_si128(row2h, row3h);
3282  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3283  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3284 
3285  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3286  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3287  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3288  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3289  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3290  row4l = t1, row4h = t0;
3291 
3292  b0 = _mm_unpacklo_epi64(m0, m1);
3293  b1 = _mm_unpacklo_epi64(m2, m3);
3294 
3295  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3296  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3297  row4l = _mm_xor_si128(row4l, row1l);
3298  row4h = _mm_xor_si128(row4h, row1h);
3299  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3300  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3301  row3l = _mm_add_epi64(row3l, row4l);
3302  row3h = _mm_add_epi64(row3h, row4h);
3303  row2l = _mm_xor_si128(row2l, row3l);
3304  row2h = _mm_xor_si128(row2h, row3h);
3305  row2l = _mm_shuffle_epi8(row2l, r24);
3306  row2h = _mm_shuffle_epi8(row2h, r24);
3307 
3308  b0 = _mm_unpackhi_epi64(m0, m1);
3309  b1 = _mm_unpackhi_epi64(m2, m3);
3310 
3311  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3312  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3313  row4l = _mm_xor_si128(row4l, row1l);
3314  row4h = _mm_xor_si128(row4h, row1h);
3315  row4l = _mm_shuffle_epi8(row4l, r16);
3316  row4h = _mm_shuffle_epi8(row4h, r16);
3317  row3l = _mm_add_epi64(row3l, row4l);
3318  row3h = _mm_add_epi64(row3h, row4h);
3319  row2l = _mm_xor_si128(row2l, row3l);
3320  row2h = _mm_xor_si128(row2h, row3h);
3321  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3322  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3323 
3324  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3325  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3326  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3327  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3328  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3329  row4l = t1, row4h = t0;
3330 
3331  b0 = _mm_unpacklo_epi64(m4, m5);
3332  b1 = _mm_unpacklo_epi64(m6, m7);
3333 
3334  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3335  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3336  row4l = _mm_xor_si128(row4l, row1l);
3337  row4h = _mm_xor_si128(row4h, row1h);
3338  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3339  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3340  row3l = _mm_add_epi64(row3l, row4l);
3341  row3h = _mm_add_epi64(row3h, row4h);
3342  row2l = _mm_xor_si128(row2l, row3l);
3343  row2h = _mm_xor_si128(row2h, row3h);
3344  row2l = _mm_shuffle_epi8(row2l, r24);
3345  row2h = _mm_shuffle_epi8(row2h, r24);
3346 
3347  b0 = _mm_unpackhi_epi64(m4, m5);
3348  b1 = _mm_unpackhi_epi64(m6, m7);
3349 
3350  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3351  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3352  row4l = _mm_xor_si128(row4l, row1l);
3353  row4h = _mm_xor_si128(row4h, row1h);
3354  row4l = _mm_shuffle_epi8(row4l, r16);
3355  row4h = _mm_shuffle_epi8(row4h, r16);
3356  row3l = _mm_add_epi64(row3l, row4l);
3357  row3h = _mm_add_epi64(row3h, row4h);
3358  row2l = _mm_xor_si128(row2l, row3l);
3359  row2h = _mm_xor_si128(row2h, row3h);
3360  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3361  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3362 
3363  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3364  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3365  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3366  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3367  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3368  row4l = t1, row4h = t0;
3369 
3370  b0 = _mm_unpacklo_epi64(m7, m2);
3371  b1 = _mm_unpackhi_epi64(m4, m6);
3372 
3373  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3374  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3375  row4l = _mm_xor_si128(row4l, row1l);
3376  row4h = _mm_xor_si128(row4h, row1h);
3377  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3378  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3379  row3l = _mm_add_epi64(row3l, row4l);
3380  row3h = _mm_add_epi64(row3h, row4h);
3381  row2l = _mm_xor_si128(row2l, row3l);
3382  row2h = _mm_xor_si128(row2h, row3h);
3383  row2l = _mm_shuffle_epi8(row2l, r24);
3384  row2h = _mm_shuffle_epi8(row2h, r24);
3385 
3386  b0 = _mm_unpacklo_epi64(m5, m4);
3387  b1 = _mm_alignr_epi8(m3, m7, 8);
3388 
3389  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3390  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3391  row4l = _mm_xor_si128(row4l, row1l);
3392  row4h = _mm_xor_si128(row4h, row1h);
3393  row4l = _mm_shuffle_epi8(row4l, r16);
3394  row4h = _mm_shuffle_epi8(row4h, r16);
3395  row3l = _mm_add_epi64(row3l, row4l);
3396  row3h = _mm_add_epi64(row3h, row4h);
3397  row2l = _mm_xor_si128(row2l, row3l);
3398  row2h = _mm_xor_si128(row2h, row3h);
3399  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3400  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3401 
3402  t0 = _mm_alignr_epi8(row2h, row2l, 8);
3403  t1 = _mm_alignr_epi8(row2l, row2h, 8);
3404  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3405  t0 = _mm_alignr_epi8(row4h, row4l, 8);
3406  t1 = _mm_alignr_epi8(row4l, row4h, 8);
3407  row4l = t1, row4h = t0;
3408 
3409  b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
3410  b1 = _mm_unpackhi_epi64(m5, m2);
3411 
3412  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3413  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3414  row4l = _mm_xor_si128(row4l, row1l);
3415  row4h = _mm_xor_si128(row4h, row1h);
3416  row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3417  row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3418  row3l = _mm_add_epi64(row3l, row4l);
3419  row3h = _mm_add_epi64(row3h, row4h);
3420  row2l = _mm_xor_si128(row2l, row3l);
3421  row2h = _mm_xor_si128(row2h, row3h);
3422  row2l = _mm_shuffle_epi8(row2l, r24);
3423  row2h = _mm_shuffle_epi8(row2h, r24);
3424 
3425  b0 = _mm_unpacklo_epi64(m6, m1);
3426  b1 = _mm_unpackhi_epi64(m3, m1);
3427 
3428  row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3429  row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3430  row4l = _mm_xor_si128(row4l, row1l);
3431  row4h = _mm_xor_si128(row4h, row1h);
3432  row4l = _mm_shuffle_epi8(row4l, r16);
3433  row4h = _mm_shuffle_epi8(row4h, r16);
3434  row3l = _mm_add_epi64(row3l, row4l);
3435  row3h = _mm_add_epi64(row3h, row4h);
3436  row2l = _mm_xor_si128(row2l, row3l);
3437  row2h = _mm_xor_si128(row2h, row3h);
3438  row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3439  row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3440 
3441  t0 = _mm_alignr_epi8(row2l, row2h, 8);
3442  t1 = _mm_alignr_epi8(row2h, row2l, 8);
3443  row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3444  t0 = _mm_alignr_epi8(row4l, row4h, 8);
3445  t1 = _mm_alignr_epi8(row4h, row4l, 8);
3446  row4l = t1, row4h = t0;
3447 
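// Feedforward: fold the working state back into the chaining value,
// h[i] ^= v[i] ^ v[i+8] for i = 0..7.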
3448  row1l = _mm_xor_si128(row3l, row1l);
3449  row1h = _mm_xor_si128(row3h, row1h);
3450  _mm_storeu_si128((__m128i *)(void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[0])), row1l));
3451  _mm_storeu_si128((__m128i *)(void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[2])), row1h));
3452 
3453  row2l = _mm_xor_si128(row4l, row2l);
3454  row2h = _mm_xor_si128(row4h, row2h);
3455  _mm_storeu_si128((__m128i *)(void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[4])), row2l));
3456  _mm_storeu_si128((__m128i *)(void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((const __m128i*)(const void*)(&state.h[6])), row2h));
3457 }
3458 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE
3459 
3460 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
3461 
3462 // Reverse words for ARM: callers pass arguments most-significant lane first,
// as they would to _mm_set_epi32, and the macro performs the reversal.
3463 #define vld1q_u32_rev(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_u32(d);
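// For example, vld1q_u32_rev(buf1, m6,m4,m2,m0) scatters m2[0], m4[0] and
// m6[0] into m0[1..3] and loads {m0[0],m2[0],m4[0],m6[0]} into buf1's lanes.
// Only element [0] of each scratch array is ever read, so clobbering
// elements [1..3] of the last argument is harmless.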
3464 
3465 // Naming convention to keep the halves straight after swaps. For a 128-bit
3466 // vector, H64 denotes the high 64-bit half and L64 the low 64-bit half,
3467 // matching the values returned by vget_high_u64 and vget_low_u64.
3468 static const int LANE_H64 = 1;
3469 static const int LANE_L64 = 0;
3470 
3471 static void BLAKE2_NEON_Compress32(const byte* input, BLAKE2_State<word32, false>& state)
3472 {
3473  //assert(IsAlignedOn(input,GetAlignmentOf<uint8_t*>()));
3474  assert(IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
3475  assert(IsAlignedOn(&state.h[4],GetAlignmentOf<uint32x4_t>()));
3476  assert(IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
3477 
3478  CRYPTOPP_ALIGN_DATA(16) uint32_t m0[4], m1[4], m2[4], m3[4], m4[4], m5[4], m6[4], m7[4];
3479  CRYPTOPP_ALIGN_DATA(16) uint32_t m8[4], m9[4], m10[4], m11[4], m12[4], m13[4], m14[4], m15[4];
3480 
3481  GetBlock<word32, LittleEndian, true> get(input);
3482  get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]);
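// GetBlock deposits the 16 little-endian message words into element [0] of
// the scratch arrays; vld1q_u32_rev below gathers them into vector lanes
// according to each round's sigma schedule.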
3483 
3484  uint32x4_t row1,row2,row3,row4;
3485  uint32x4_t buf1,buf2,buf3,buf4;
3486  uint32x4_t ff0,ff1;
3487 
3488  row1 = ff0 = vld1q_u32((const uint32_t*)&state.h[0]);
3489  row2 = ff1 = vld1q_u32((const uint32_t*)&state.h[4]);
3490  row3 = vld1q_u32((const uint32_t*)&BLAKE2S_IV(0));
3491  row4 = veorq_u32(vld1q_u32((const uint32_t*)&BLAKE2S_IV(4)), vld1q_u32((const uint32_t*)&state.t[0]));
3492 
3493  // buf1 = vld1q_u32(m6,m4,m2,m0);
3494  vld1q_u32_rev(buf1, m6,m4,m2,m0);
3495 
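// NEON has no vector rotate, so every rotr(x,n) below pairs a right shift
// with a left shift; the shifted halves do not overlap, so XOR acts as OR.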
3496  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3497  row4 = veorq_u32(row4,row1);
3498  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3499  row3 = vaddq_u32(row3,row4);
3500  row2 = veorq_u32(row2,row3);
3501  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3502 
3503  // buf2 = vld1q_u32(m7,m5,m3,m1);
3504  vld1q_u32_rev(buf2, m7,m5,m3,m1);
3505 
3506  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3507  row4 = veorq_u32(row4,row1);
3508  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3509  row3 = vaddq_u32(row3,row4);
3510  row2 = veorq_u32(row2,row3);
3511  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3512 
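// Diagonalize: vextq_u32 rotates the lanes of rows 2 and 4, and the
// vget/vcombine pair swaps the two halves of row 3.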
3513  row4 = vextq_u32(row4,row4,3);
3514  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3515  row2 = vextq_u32(row2,row2,1);
3516 
3517  // buf3 = vld1q_u32(m14,m12,m10,m8);
3518  vld1q_u32_rev(buf3, m14,m12,m10,m8);
3519 
3520  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3521  row4 = veorq_u32(row4,row1);
3522  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3523  row3 = vaddq_u32(row3,row4);
3524  row2 = veorq_u32(row2,row3);
3525  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3526 
3527  // buf4 = vld1q_u32(m15,m13,m11,m9);
3528  vld1q_u32_rev(buf4, m15,m13,m11,m9);
3529 
3530  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3531  row4 = veorq_u32(row4,row1);
3532  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3533  row3 = vaddq_u32(row3,row4);
3534  row2 = veorq_u32(row2,row3);
3535  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3536 
3537  row4 = vextq_u32(row4,row4,1);
3538  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3539  row2 = vextq_u32(row2,row2,3);
3540 
3541  // buf1 = vld1q_u32(m13,m9,m4,m14);
3542  vld1q_u32_rev(buf1, m13,m9,m4,m14);
3543 
3544  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3545  row4 = veorq_u32(row4,row1);
3546  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3547  row3 = vaddq_u32(row3,row4);
3548  row2 = veorq_u32(row2,row3);
3549  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3550 
3551  // buf2 = vld1q_u32(m6,m15,m8,m10);
3552  vld1q_u32_rev(buf2, m6,m15,m8,m10);
3553 
3554  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3555  row4 = veorq_u32(row4,row1);
3556  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3557  row3 = vaddq_u32(row3,row4);
3558  row2 = veorq_u32(row2,row3);
3559  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3560 
3561  row4 = vextq_u32(row4,row4,3);
3562  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3563  row2 = vextq_u32(row2,row2,1);
3564 
3565  // buf3 = vld1q_u32(m5,m11,m0,m1);
3566  vld1q_u32_rev(buf3, m5,m11,m0,m1);
3567 
3568  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3569  row4 = veorq_u32(row4,row1);
3570  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3571  row3 = vaddq_u32(row3,row4);
3572  row2 = veorq_u32(row2,row3);
3573  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3574 
3575  // buf4 = vld1q_u32(m3,m7,m2,m12);
3576  vld1q_u32_rev(buf4, m3,m7,m2,m12);
3577 
3578  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3579  row4 = veorq_u32(row4,row1);
3580  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3581  row3 = vaddq_u32(row3,row4);
3582  row2 = veorq_u32(row2,row3);
3583  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3584 
3585  row4 = vextq_u32(row4,row4,1);
3586  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3587  row2 = vextq_u32(row2,row2,3);
3588 
3589  // buf1 = vld1q_u32(m15,m5,m12,m11);
3590  vld1q_u32_rev(buf1, m15,m5,m12,m11);
3591 
3592  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3593  row4 = veorq_u32(row4,row1);
3594  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3595  row3 = vaddq_u32(row3,row4);
3596  row2 = veorq_u32(row2,row3);
3597  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3598 
3599  // buf2 = vld1q_u32(m13,m2,m0,m8);
3600  vld1q_u32_rev(buf2, m13,m2,m0,m8);
3601 
3602  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3603  row4 = veorq_u32(row4,row1);
3604  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3605  row3 = vaddq_u32(row3,row4);
3606  row2 = veorq_u32(row2,row3);
3607  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3608 
3609  row4 = vextq_u32(row4,row4,3);
3610  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3611  row2 = vextq_u32(row2,row2,1);
3612 
3613  // buf3 = vld1q_u32(m9,m7,m3,m10);
3614  vld1q_u32_rev(buf3, m9,m7,m3,m10);
3615 
3616  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3617  row4 = veorq_u32(row4,row1);
3618  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3619  row3 = vaddq_u32(row3,row4);
3620  row2 = veorq_u32(row2,row3);
3621  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3622 
3623  // buf4 = vld1q_u32(m4,m1,m6,m14);
3624  vld1q_u32_rev(buf4, m4,m1,m6,m14);
3625 
3626  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3627  row4 = veorq_u32(row4,row1);
3628  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3629  row3 = vaddq_u32(row3,row4);
3630  row2 = veorq_u32(row2,row3);
3631  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3632 
3633  row4 = vextq_u32(row4,row4,1);
3634  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3635  row2 = vextq_u32(row2,row2,3);
3636 
3637  // buf1 = vld1q_u32(m11,m13,m3,m7);
3638  vld1q_u32_rev(buf1, m11,m13,m3,m7);
3639 
3640  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3641  row4 = veorq_u32(row4,row1);
3642  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3643  row3 = vaddq_u32(row3,row4);
3644  row2 = veorq_u32(row2,row3);
3645  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3646 
3647  // buf2 = vld1q_u32(m14,m12,m1,m9);
3648  vld1q_u32_rev(buf2, m14,m12,m1,m9);
3649 
3650  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3651  row4 = veorq_u32(row4,row1);
3652  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3653  row3 = vaddq_u32(row3,row4);
3654  row2 = veorq_u32(row2,row3);
3655  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3656 
3657  row4 = vextq_u32(row4,row4,3);
3658  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3659  row2 = vextq_u32(row2,row2,1);
3660 
3661  // buf3 = vld1q_u32(m15,m4,m5,m2);
3662  vld1q_u32_rev(buf3, m15,m4,m5,m2);
3663 
3664  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3665  row4 = veorq_u32(row4,row1);
3666  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3667  row3 = vaddq_u32(row3,row4);
3668  row2 = veorq_u32(row2,row3);
3669  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3670 
3671  // buf4 = vld1q_u32(m8,m0,m10,m6);
3672  vld1q_u32_rev(buf4, m8,m0,m10,m6);
3673 
3674  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3675  row4 = veorq_u32(row4,row1);
3676  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3677  row3 = vaddq_u32(row3,row4);
3678  row2 = veorq_u32(row2,row3);
3679  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3680 
3681  row4 = vextq_u32(row4,row4,1);
3682  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3683  row2 = vextq_u32(row2,row2,3);
3684 
3685  // buf1 = vld1q_u32(m10,m2,m5,m9);
3686  vld1q_u32_rev(buf1, m10,m2,m5,m9);
3687 
3688  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3689  row4 = veorq_u32(row4,row1);
3690  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3691  row3 = vaddq_u32(row3,row4);
3692  row2 = veorq_u32(row2,row3);
3693  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3694 
3695  // buf2 = vld1q_u32(m15,m4,m7,m0);
3696  vld1q_u32_rev(buf2, m15,m4,m7,m0);
3697 
3698  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3699  row4 = veorq_u32(row4,row1);
3700  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3701  row3 = vaddq_u32(row3,row4);
3702  row2 = veorq_u32(row2,row3);
3703  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3704 
3705  row4 = vextq_u32(row4,row4,3);
3706  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3707  row2 = vextq_u32(row2,row2,1);
3708 
3709  // buf3 = vld1q_u32(m3,m6,m11,m14);
3710  vld1q_u32_rev(buf3, m3,m6,m11,m14);
3711 
3712  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3713  row4 = veorq_u32(row4,row1);
3714  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3715  row3 = vaddq_u32(row3,row4);
3716  row2 = veorq_u32(row2,row3);
3717  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3718 
3719  // buf4 = vld1q_u32(m13,m8,m12,m1);
3720  vld1q_u32_rev(buf4, m13,m8,m12,m1);
3721 
3722  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3723  row4 = veorq_u32(row4,row1);
3724  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3725  row3 = vaddq_u32(row3,row4);
3726  row2 = veorq_u32(row2,row3);
3727  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3728 
3729  row4 = vextq_u32(row4,row4,1);
3730  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3731  row2 = vextq_u32(row2,row2,3);
3732 
3733  // buf1 = vld1q_u32(m8,m0,m6,m2);
3734  vld1q_u32_rev(buf1, m8,m0,m6,m2);
3735 
3736  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3737  row4 = veorq_u32(row4,row1);
3738  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3739  row3 = vaddq_u32(row3,row4);
3740  row2 = veorq_u32(row2,row3);
3741  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3742 
3743  // buf2 = vld1q_u32(m3,m11,m10,m12);
3744  vld1q_u32_rev(buf2, m3,m11,m10,m12);
3745 
3746  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3747  row4 = veorq_u32(row4,row1);
3748  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3749  row3 = vaddq_u32(row3,row4);
3750  row2 = veorq_u32(row2,row3);
3751  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3752 
3753  row4 = vextq_u32(row4,row4,3);
3754  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3755  row2 = vextq_u32(row2,row2,1);
3756 
3757  // buf3 = vld1q_u32(m1,m15,m7,m4);
3758  vld1q_u32_rev(buf3, m1,m15,m7,m4);
3759 
3760  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3761  row4 = veorq_u32(row4,row1);
3762  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3763  row3 = vaddq_u32(row3,row4);
3764  row2 = veorq_u32(row2,row3);
3765  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3766 
3767  // buf4 = vld1q_u32(m9,m14,m5,m13);
3768  vld1q_u32_rev(buf4, m9,m14,m5,m13);
3769 
3770  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3771  row4 = veorq_u32(row4,row1);
3772  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3773  row3 = vaddq_u32(row3,row4);
3774  row2 = veorq_u32(row2,row3);
3775  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3776 
3777  row4 = vextq_u32(row4,row4,1);
3778  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3779  row2 = vextq_u32(row2,row2,3);
3780 
3781  // buf1 = vld1q_u32(m4,m14,m1,m12);
3782  vld1q_u32_rev(buf1, m4,m14,m1,m12);
3783 
3784  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3785  row4 = veorq_u32(row4,row1);
3786  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3787  row3 = vaddq_u32(row3,row4);
3788  row2 = veorq_u32(row2,row3);
3789  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3790 
3791  // buf2 = vld1q_u32(m10,m13,m15,m5);
3792  vld1q_u32_rev(buf2, m10,m13,m15,m5);
3793 
3794  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3795  row4 = veorq_u32(row4,row1);
3796  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3797  row3 = vaddq_u32(row3,row4);
3798  row2 = veorq_u32(row2,row3);
3799  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3800 
3801  row4 = vextq_u32(row4,row4,3);
3802  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3803  row2 = vextq_u32(row2,row2,1);
3804 
3805  // buf3 = vld1q_u32(m8,m9,m6,m0);
3806  vld1q_u32_rev(buf3, m8,m9,m6,m0);
3807 
3808  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3809  row4 = veorq_u32(row4,row1);
3810  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3811  row3 = vaddq_u32(row3,row4);
3812  row2 = veorq_u32(row2,row3);
3813  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3814 
3815  // buf4 = vld1q_u32(m11,m2,m3,m7);
3816  vld1q_u32_rev(buf4, m11,m2,m3,m7);
3817 
3818  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3819  row4 = veorq_u32(row4,row1);
3820  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3821  row3 = vaddq_u32(row3,row4);
3822  row2 = veorq_u32(row2,row3);
3823  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3824 
3825  row4 = vextq_u32(row4,row4,1);
3826  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3827  row2 = vextq_u32(row2,row2,3);
3828 
3829  // buf1 = vld1q_u32(m3,m12,m7,m13);
3830  vld1q_u32_rev(buf1, m3,m12,m7,m13);
3831 
3832  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3833  row4 = veorq_u32(row4,row1);
3834  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3835  row3 = vaddq_u32(row3,row4);
3836  row2 = veorq_u32(row2,row3);
3837  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3838 
3839  // buf2 = vld1q_u32(m9,m1,m14,m11);
3840  vld1q_u32_rev(buf2, m9,m1,m14,m11);
3841 
3842  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3843  row4 = veorq_u32(row4,row1);
3844  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3845  row3 = vaddq_u32(row3,row4);
3846  row2 = veorq_u32(row2,row3);
3847  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3848 
3849  row4 = vextq_u32(row4,row4,3);
3850  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3851  row2 = vextq_u32(row2,row2,1);
3852 
3853  // buf3 = vld1q_u32(m2,m8,m15,m5);
3854  vld1q_u32_rev(buf3, m2,m8,m15,m5);
3855 
3856  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3857  row4 = veorq_u32(row4,row1);
3858  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3859  row3 = vaddq_u32(row3,row4);
3860  row2 = veorq_u32(row2,row3);
3861  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3862 
3863  // buf4 = vld1q_u32(m10,m6,m4,m0);
3864  vld1q_u32_rev(buf4, m10,m6,m4,m0);
3865 
3866  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3867  row4 = veorq_u32(row4,row1);
3868  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3869  row3 = vaddq_u32(row3,row4);
3870  row2 = veorq_u32(row2,row3);
3871  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3872 
3873  row4 = vextq_u32(row4,row4,1);
3874  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3875  row2 = vextq_u32(row2,row2,3);
3876 
3877  // buf1 = vld1q_u32(m0,m11,m14,m6);
3878  vld1q_u32_rev(buf1, m0,m11,m14,m6);
3879 
3880  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3881  row4 = veorq_u32(row4,row1);
3882  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3883  row3 = vaddq_u32(row3,row4);
3884  row2 = veorq_u32(row2,row3);
3885  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3886 
3887  // buf2 = vld1q_u32(m8,m3,m9,m15);
3888  vld1q_u32_rev(buf2, m8,m3,m9,m15);
3889 
3890  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3891  row4 = veorq_u32(row4,row1);
3892  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3893  row3 = vaddq_u32(row3,row4);
3894  row2 = veorq_u32(row2,row3);
3895  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3896 
3897  row4 = vextq_u32(row4,row4,3);
3898  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3899  row2 = vextq_u32(row2,row2,1);
3900 
3901  // buf3 = vld1q_u32(m10,m1,m13,m12);
3902  vld1q_u32_rev(buf3, m10,m1,m13,m12);
3903 
3904  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3905  row4 = veorq_u32(row4,row1);
3906  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3907  row3 = vaddq_u32(row3,row4);
3908  row2 = veorq_u32(row2,row3);
3909  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3910 
3911  // buf4 = vld1q_u32(m5,m4,m7,m2);
3912  vld1q_u32_rev(buf4, m5,m4,m7,m2);
3913 
3914  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3915  row4 = veorq_u32(row4,row1);
3916  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3917  row3 = vaddq_u32(row3,row4);
3918  row2 = veorq_u32(row2,row3);
3919  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3920 
3921  row4 = vextq_u32(row4,row4,1);
3922  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3923  row2 = vextq_u32(row2,row2,3);
3924 
3925  // buf1 = vld1q_u32(m1,m7,m8,m10);
3926  vld1q_u32_rev(buf1, m1,m7,m8,m10);
3927 
3928  row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3929  row4 = veorq_u32(row4,row1);
3930  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3931  row3 = vaddq_u32(row3,row4);
3932  row2 = veorq_u32(row2,row3);
3933  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3934 
3935  // buf2 = vld1q_u32(m5,m6,m4,m2);
3936  vld1q_u32_rev(buf2, m5,m6,m4,m2);
3937 
3938  row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3939  row4 = veorq_u32(row4,row1);
3940  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3941  row3 = vaddq_u32(row3,row4);
3942  row2 = veorq_u32(row2,row3);
3943  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3944 
3945  row4 = vextq_u32(row4,row4,3);
3946  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3947  row2 = vextq_u32(row2,row2,1);
3948 
3949  // buf3 = vld1q_u32(m13,m3,m9,m15);
3950  vld1q_u32_rev(buf3, m13,m3,m9,m15);
3951 
3952  row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3953  row4 = veorq_u32(row4,row1);
3954  row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3955  row3 = vaddq_u32(row3,row4);
3956  row2 = veorq_u32(row2,row3);
3957  row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3958 
3959  // buf4 = vld1q_u32(m0,m12,m14,m11);
3960  vld1q_u32_rev(buf4, m0,m12,m14,m11);
3961 
3962  row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3963  row4 = veorq_u32(row4,row1);
3964  row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3965  row3 = vaddq_u32(row3,row4);
3966  row2 = veorq_u32(row2,row3);
3967  row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3968 
3969  row4 = vextq_u32(row4,row4,1);
3970  row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3971  row2 = vextq_u32(row2,row2,3);
3972 
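 // Finalization: h[i] ^= v[i] ^ v[i+8]. ff0 and ff1 hold the original
 // eight state words loaded at function entry.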
3973  vst1q_u32((uint32_t*)&state.h[0],veorq_u32(ff0,veorq_u32(row1,row3)));
3974  vst1q_u32((uint32_t*)&state.h[4],veorq_u32(ff1,veorq_u32(row2,row4)));
3975 }
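// For reference, each quarter-round in the function above implements the
// BLAKE2s G function with rotation constants 16, 12, 8 and 7. A scalar
// sketch (rotr32 is a hypothetical helper, not part of this file):
//
//   word32 rotr32(word32 x, unsigned int n) { return (x >> n) | (x << (32 - n)); }
//   void G32(word32& a, word32& b, word32& c, word32& d, word32 mx, word32 my)
//   {
//       a += b + mx; d = rotr32(d ^ a, 16);
//       c += d;      b = rotr32(b ^ c, 12);
//       a += b + my; d = rotr32(d ^ a, 8);
//       c += d;      b = rotr32(b ^ c, 7);
//   }
//
// NEON has no vector rotate, so each rotation by n is emulated as
// veorq_u32(vshrq_n_u32(x,n), vshlq_n_u32(x,32-n)).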
3976 
3977 static void BLAKE2_NEON_Compress64(const byte* input, BLAKE2_State<word64, true>& state)
3978 {
3979  //assert(IsAlignedOn(input,GetAlignmentOf<uint8_t*>()));  // input need not be 16-byte aligned; vld1q_u8 performs byte-granular loads
3980  assert(IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
3981  assert(IsAlignedOn(&state.h[4],GetAlignmentOf<uint64x2_t>()));
3982  assert(IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
3983 
3984  uint64x2_t m0m1,m2m3,m4m5,m6m7,m8m9,m10m11,m12m13,m14m15;
3985 
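 // Load the 128-byte message block as eight vectors holding two 64-bit
 // words each (m0m1 = m[0],m[1], and so on).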
3986  m0m1 = vreinterpretq_u64_u8(vld1q_u8(input+ 0));
3987  m2m3 = vreinterpretq_u64_u8(vld1q_u8(input+ 16));
3988  m4m5 = vreinterpretq_u64_u8(vld1q_u8(input+ 32));
3989  m6m7 = vreinterpretq_u64_u8(vld1q_u8(input+ 48));
3990  m8m9 = vreinterpretq_u64_u8(vld1q_u8(input+ 64));
3991  m10m11 = vreinterpretq_u64_u8(vld1q_u8(input+ 80));
3992  m12m13 = vreinterpretq_u64_u8(vld1q_u8(input+ 96));
3993  m14m15 = vreinterpretq_u64_u8(vld1q_u8(input+112));
3994 
3995  uint64x2_t row1l, row1h, row2l, row2h;
3996  uint64x2_t row3l, row3h, row4l, row4h;
3997  uint64x2_t b0 = {0,0}, b1 = {0,0}, t0, t1;
3998 
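 // The sixteen-word working state v[0..15] is kept as four 256-bit rows,
 // each split across a low/high vector pair: row1l = v0,v1; row1h = v2,v3;
 // etc. LANE_L64 and LANE_H64 (defined earlier in this file) select the
 // low and high 64-bit lane of a vector.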
3999  row1l = vld1q_u64((const uint64_t *)&state.h[0]);
4000  row1h = vld1q_u64((const uint64_t *)&state.h[2]);
4001  row2l = vld1q_u64((const uint64_t *)&state.h[4]);
4002  row2h = vld1q_u64((const uint64_t *)&state.h[6]);
4003  row3l = vld1q_u64((const uint64_t *)&BLAKE2B_IV(0));
4004  row3h = vld1q_u64((const uint64_t *)&BLAKE2B_IV(2));
4005  row4l = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(4)), vld1q_u64((const uint64_t*)&state.t[0]));
4006  row4h = veorq_u64(vld1q_u64((const uint64_t *)&BLAKE2B_IV(6)), vld1q_u64((const uint64_t*)&state.f[0]));
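 // Rows 3 and 4 above hold the IV, with the block counter t and the
 // finalization flags f folded into row 4, per the BLAKE2b spec:
 // v[8..11] = IV[0..3], v[12..15] = IV[4..7] ^ (t[0], t[1], f[0], f[1]).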
4007 
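 // Round 0, column step: b0/b1 gather the even-indexed message words
 // m0,m2,m4,m6 (sigma[0] is the identity permutation) for the
 // a += b + m additions.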
4008  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4009  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4010  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4011  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4012  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4013  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4014  row4l = veorq_u64(row4l, row1l);
4015  row4h = veorq_u64(row4h, row1h);
4016  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4017  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4018  row3l = vaddq_u64(row3l, row4l);
4019  row3h = vaddq_u64(row3h, row4h);
4020  row2l = veorq_u64(row2l, row3l);
4021  row2h = veorq_u64(row2h, row3h);
4022  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4023  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4024 
4025  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4026  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4027  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4028  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4029  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4030  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4031  row4l = veorq_u64(row4l, row1l);
4032  row4h = veorq_u64(row4h, row1h);
4033  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4034  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4035  row3l = vaddq_u64(row3l, row4l);
4036  row3h = vaddq_u64(row3h, row4h);
4037  row2l = veorq_u64(row2l, row3l);
4038  row2h = veorq_u64(row2h, row3h);
4039  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4040  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
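 // The two blocks above apply the full BLAKE2b G function to all four
 // columns in parallel; the rotations are 32, 24, 16 and 63, emulated
 // with paired shifts. A scalar sketch (rotr64 is a hypothetical helper,
 // not part of this file):
 //
 //   word64 rotr64(word64 x, unsigned int n) { return (x >> n) | (x << (64 - n)); }
 //   void G64(word64& a, word64& b, word64& c, word64& d, word64 mx, word64 my)
 //   {
 //       a += b + mx; d = rotr64(d ^ a, 32);
 //       c += d;      b = rotr64(b ^ c, 24);
 //       a += b + my; d = rotr64(d ^ a, 16);
 //       c += d;      b = rotr64(b ^ c, 63);
 //   }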
4041 
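 // Diagonalize: rotate row 2 by one 64-bit lane, row 3 by two and row 4
 // by three. Each 256-bit row spans an l/h vector pair, so row 3 rotates
 // by swapping its two vectors, while rows 2 and 4 are rebuilt lane by
 // lane with vgetq_lane/vsetq_lane.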
4042  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4043  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4044  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4045  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4046  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4047  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4048  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4049  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4050  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4051 
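 // Round 0, diagonal step: gather m8,m10,m12,m14 (and m9,m11,m13,m15 in
 // the second half below).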
4052  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4053  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4054  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4055  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4056  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4057  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4058  row4l = veorq_u64(row4l, row1l);
4059  row4h = veorq_u64(row4h, row1h);
4060  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4061  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4062  row3l = vaddq_u64(row3l, row4l);
4063  row3h = vaddq_u64(row3h, row4h);
4064  row2l = veorq_u64(row2l, row3l);
4065  row2h = veorq_u64(row2h, row3h);
4066  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4067  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4068 
4069  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4070  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4071  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4072  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4073  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4074  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4075  row4l = veorq_u64(row4l, row1l);
4076  row4h = veorq_u64(row4h, row1h);
4077  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4078  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4079  row3l = vaddq_u64(row3l, row4l);
4080  row3h = vaddq_u64(row3h, row4h);
4081  row2l = veorq_u64(row2l, row3l);
4082  row2h = veorq_u64(row2h, row3h);
4083  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4084  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4085 
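 // Undiagonalize: reverse the lane rotations, returning the state to
 // column order and completing the round.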
4086  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4087  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4088  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4089  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4090  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4091  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4092  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4093  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4094  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4095 
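 // Round 1: from here on b0/b1 select message words according to the
 // BLAKE2 sigma permutation for each round; sigma[1] begins
 // 14,10,4,8,9,15,13,6,...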
4096  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4097  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4098  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4099  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4100  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4101  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4102  row4l = veorq_u64(row4l, row1l);
4103  row4h = veorq_u64(row4h, row1h);
4104  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4105  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4106  row3l = vaddq_u64(row3l, row4l);
4107  row3h = vaddq_u64(row3h, row4h);
4108  row2l = veorq_u64(row2l, row3l);
4109  row2h = veorq_u64(row2h, row3h);
4110  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4111  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4112 
4113  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4114  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4115  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4116  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4117  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4118  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4119  row4l = veorq_u64(row4l, row1l);
4120  row4h = veorq_u64(row4h, row1h);
4121  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4122  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4123  row3l = vaddq_u64(row3l, row4l);
4124  row3h = vaddq_u64(row3h, row4h);
4125  row2l = veorq_u64(row2l, row3l);
4126  row2h = veorq_u64(row2h, row3h);
4127  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4128  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4129 
4130  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4131  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4132  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4133  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4134  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4135  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4136  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4137  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4138  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4139 
4140  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4141  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4142  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4143  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4144  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4145  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4146  row4l = veorq_u64(row4l, row1l);
4147  row4h = veorq_u64(row4h, row1h);
4148  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4149  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4150  row3l = vaddq_u64(row3l, row4l);
4151  row3h = vaddq_u64(row3h, row4h);
4152  row2l = veorq_u64(row2l, row3l);
4153  row2h = veorq_u64(row2h, row3h);
4154  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4155  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4156 
4157  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4158  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4159  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4160  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4161  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4162  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4163  row4l = veorq_u64(row4l, row1l);
4164  row4h = veorq_u64(row4h, row1h);
4165  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4166  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4167  row3l = vaddq_u64(row3l, row4l);
4168  row3h = vaddq_u64(row3h, row4h);
4169  row2l = veorq_u64(row2l, row3l);
4170  row2h = veorq_u64(row2h, row3h);
4171  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4172  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4173 
4174  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4175  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4176  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4177  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4178  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4179  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4180  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4181  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4182  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4183 
4184  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4185  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4186  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4187  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4188  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4189  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4190  row4l = veorq_u64(row4l, row1l);
4191  row4h = veorq_u64(row4h, row1h);
4192  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4193  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4194  row3l = vaddq_u64(row3l, row4l);
4195  row3h = vaddq_u64(row3h, row4h);
4196  row2l = veorq_u64(row2l, row3l);
4197  row2h = veorq_u64(row2h, row3h);
4198  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4199  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4200 
4201  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4202  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4203  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4204  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4205  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4206  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4207  row4l = veorq_u64(row4l, row1l);
4208  row4h = veorq_u64(row4h, row1h);
4209  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4210  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4211  row3l = vaddq_u64(row3l, row4l);
4212  row3h = vaddq_u64(row3h, row4h);
4213  row2l = veorq_u64(row2l, row3l);
4214  row2h = veorq_u64(row2h, row3h);
4215  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4216  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4217 
4218  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4219  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4220  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4221  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4222  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4223  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4224  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4225  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4226  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4227 
4228  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4229  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4230  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4231  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4232  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4233  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4234  row4l = veorq_u64(row4l, row1l);
4235  row4h = veorq_u64(row4h, row1h);
4236  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4237  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4238  row3l = vaddq_u64(row3l, row4l);
4239  row3h = vaddq_u64(row3h, row4h);
4240  row2l = veorq_u64(row2l, row3l);
4241  row2h = veorq_u64(row2h, row3h);
4242  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4243  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4244 
4245  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4246  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4247  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4248  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4249  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4250  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4251  row4l = veorq_u64(row4l, row1l);
4252  row4h = veorq_u64(row4h, row1h);
4253  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4254  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4255  row3l = vaddq_u64(row3l, row4l);
4256  row3h = vaddq_u64(row3h, row4h);
4257  row2l = veorq_u64(row2l, row3l);
4258  row2h = veorq_u64(row2h, row3h);
4259  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4260  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4261 
4262  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4263  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4264  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4265  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4266  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4267  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4268  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4269  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4270  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4271 
4272  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4273  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4274  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4275  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4276  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4277  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4278  row4l = veorq_u64(row4l, row1l);
4279  row4h = veorq_u64(row4h, row1h);
4280  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4281  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4282  row3l = vaddq_u64(row3l, row4l);
4283  row3h = vaddq_u64(row3h, row4h);
4284  row2l = veorq_u64(row2l, row3l);
4285  row2h = veorq_u64(row2h, row3h);
4286  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4287  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4288 
4289  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4290  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4291  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4292  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4293  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4294  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4295  row4l = veorq_u64(row4l, row1l);
4296  row4h = veorq_u64(row4h, row1h);
4297  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4298  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4299  row3l = vaddq_u64(row3l, row4l);
4300  row3h = vaddq_u64(row3h, row4h);
4301  row2l = veorq_u64(row2l, row3l);
4302  row2h = veorq_u64(row2h, row3h);
4303  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4304  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4305 
4306  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4307  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4308  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4309  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4310  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4311  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4312  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4313  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4314  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4315 
4316  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4317  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4318  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4319  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4320  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4321  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4322  row4l = veorq_u64(row4l, row1l);
4323  row4h = veorq_u64(row4h, row1h);
4324  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4325  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4326  row3l = vaddq_u64(row3l, row4l);
4327  row3h = vaddq_u64(row3h, row4h);
4328  row2l = veorq_u64(row2l, row3l);
4329  row2h = veorq_u64(row2h, row3h);
4330  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4331  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4332 
4333  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4334  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4335  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4336  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4337  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4338  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4339  row4l = veorq_u64(row4l, row1l);
4340  row4h = veorq_u64(row4h, row1h);
4341  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4342  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4343  row3l = vaddq_u64(row3l, row4l);
4344  row3h = vaddq_u64(row3h, row4h);
4345  row2l = veorq_u64(row2l, row3l);
4346  row2h = veorq_u64(row2h, row3h);
4347  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4348  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4349 
4350  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4351  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4352  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4353  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4354  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4355  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4356  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4357  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4358  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4359 
4360  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4361  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4362  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4363  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4364  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4365  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4366  row4l = veorq_u64(row4l, row1l);
4367  row4h = veorq_u64(row4h, row1h);
4368  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4369  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4370  row3l = vaddq_u64(row3l, row4l);
4371  row3h = vaddq_u64(row3h, row4h);
4372  row2l = veorq_u64(row2l, row3l);
4373  row2h = veorq_u64(row2h, row3h);
4374  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4375  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4376 
4377  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4378  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4379  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4380  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4381  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4382  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4383  row4l = veorq_u64(row4l, row1l);
4384  row4h = veorq_u64(row4h, row1h);
4385  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4386  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4387  row3l = vaddq_u64(row3l, row4l);
4388  row3h = vaddq_u64(row3h, row4h);
4389  row2l = veorq_u64(row2l, row3l);
4390  row2h = veorq_u64(row2h, row3h);
4391  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4392  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4393 
4394  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4395  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4396  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4397  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4398  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4399  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4400  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4401  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4402  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4403 
4404  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4405  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4406  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4407  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4408  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4409  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4410  row4l = veorq_u64(row4l, row1l);
4411  row4h = veorq_u64(row4h, row1h);
4412  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4413  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4414  row3l = vaddq_u64(row3l, row4l);
4415  row3h = vaddq_u64(row3h, row4h);
4416  row2l = veorq_u64(row2l, row3l);
4417  row2h = veorq_u64(row2h, row3h);
4418  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4419  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4420 
4421  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4422  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4423  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4424  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4425  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4426  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4427  row4l = veorq_u64(row4l, row1l);
4428  row4h = veorq_u64(row4h, row1h);
4429  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4430  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4431  row3l = vaddq_u64(row3l, row4l);
4432  row3h = vaddq_u64(row3h, row4h);
4433  row2l = veorq_u64(row2l, row3l);
4434  row2h = veorq_u64(row2h, row3h);
4435  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4436  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4437 
4438  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4439  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4440  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4441  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4442  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4443  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4444  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4445  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4446  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4447 
4448  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4449  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4450  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4451  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4452  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4453  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4454  row4l = veorq_u64(row4l, row1l);
4455  row4h = veorq_u64(row4h, row1h);
4456  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4457  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4458  row3l = vaddq_u64(row3l, row4l);
4459  row3h = vaddq_u64(row3h, row4h);
4460  row2l = veorq_u64(row2l, row3l);
4461  row2h = veorq_u64(row2h, row3h);
4462  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4463  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4464 
4465  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4466  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4467  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4468  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4469  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4470  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4471  row4l = veorq_u64(row4l, row1l);
4472  row4h = veorq_u64(row4h, row1h);
4473  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4474  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4475  row3l = vaddq_u64(row3l, row4l);
4476  row3h = vaddq_u64(row3h, row4h);
4477  row2l = veorq_u64(row2l, row3l);
4478  row2h = veorq_u64(row2h, row3h);
4479  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4480  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4481 
4482  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4483  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4484  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4485  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4486  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4487  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4488  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4489  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4490  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4491 
4492  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_L64);
4493  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4494  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4495  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4496  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4497  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4498  row4l = veorq_u64(row4l, row1l);
4499  row4h = veorq_u64(row4h, row1h);
4500  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4501  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4502  row3l = vaddq_u64(row3l, row4l);
4503  row3h = vaddq_u64(row3h, row4h);
4504  row2l = veorq_u64(row2l, row3l);
4505  row2h = veorq_u64(row2h, row3h);
4506  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4507  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4508 
4509  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4510  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4511  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4512  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4513  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4514  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4515  row4l = veorq_u64(row4l, row1l);
4516  row4h = veorq_u64(row4h, row1h);
4517  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4518  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4519  row3l = vaddq_u64(row3l, row4l);
4520  row3h = vaddq_u64(row3h, row4h);
4521  row2l = veorq_u64(row2l, row3l);
4522  row2h = veorq_u64(row2h, row3h);
4523  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4524  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4525 
4526  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4527  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4528  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4529  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4530  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4531  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4532  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4533  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4534  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4535 
4536  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4537  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4538  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4539  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4540  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4541  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4542  row4l = veorq_u64(row4l, row1l);
4543  row4h = veorq_u64(row4h, row1h);
4544  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4545  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4546  row3l = vaddq_u64(row3l, row4l);
4547  row3h = vaddq_u64(row3h, row4h);
4548  row2l = veorq_u64(row2l, row3l);
4549  row2h = veorq_u64(row2h, row3h);
4550  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4551  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4552 
4553  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4554  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4555  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4556  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4557  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4558  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4559  row4l = veorq_u64(row4l, row1l);
4560  row4h = veorq_u64(row4h, row1h);
4561  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4562  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4563  row3l = vaddq_u64(row3l, row4l);
4564  row3h = vaddq_u64(row3h, row4h);
4565  row2l = veorq_u64(row2l, row3l);
4566  row2h = veorq_u64(row2h, row3h);
4567  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4568  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4569 
4570  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4571  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4572  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4573  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4574  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4575  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4576  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4577  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4578  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4579 
4580  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4581  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4582  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4583  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4584  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4585  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4586  row4l = veorq_u64(row4l, row1l);
4587  row4h = veorq_u64(row4h, row1h);
4588  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4589  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4590  row3l = vaddq_u64(row3l, row4l);
4591  row3h = vaddq_u64(row3h, row4h);
4592  row2l = veorq_u64(row2l, row3l);
4593  row2h = veorq_u64(row2h, row3h);
4594  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4595  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4596 
4597  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4598  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4599  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4600  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4601  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4602  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4603  row4l = veorq_u64(row4l, row1l);
4604  row4h = veorq_u64(row4h, row1h);
4605  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4606  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4607  row3l = vaddq_u64(row3l, row4l);
4608  row3h = vaddq_u64(row3h, row4h);
4609  row2l = veorq_u64(row2l, row3l);
4610  row2h = veorq_u64(row2h, row3h);
4611  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4612  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4613 
4614  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4615  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4616  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4617  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4618  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4619  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4620  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4621  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4622  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4623 
4624  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4625  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4626  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4627  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4628  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4629  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4630  row4l = veorq_u64(row4l, row1l);
4631  row4h = veorq_u64(row4h, row1h);
4632  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4633  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4634  row3l = vaddq_u64(row3l, row4l);
4635  row3h = vaddq_u64(row3h, row4h);
4636  row2l = veorq_u64(row2l, row3l);
4637  row2h = veorq_u64(row2h, row3h);
4638  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4639  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4640 
4641  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4642  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4643  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4644  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4645  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4646  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4647  row4l = veorq_u64(row4l, row1l);
4648  row4h = veorq_u64(row4h, row1h);
4649  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4650  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4651  row3l = vaddq_u64(row3l, row4l);
4652  row3h = vaddq_u64(row3h, row4h);
4653  row2l = veorq_u64(row2l, row3l);
4654  row2h = veorq_u64(row2h, row3h);
4655  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4656  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4657 
4658  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4659  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4660  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4661  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4662  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4663  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4664  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4665  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4666  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4667 
4668  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4669  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4670  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4671  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_H64);
4672  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4673  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4674  row4l = veorq_u64(row4l, row1l);
4675  row4h = veorq_u64(row4h, row1h);
4676  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4677  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4678  row3l = vaddq_u64(row3l, row4l);
4679  row3h = vaddq_u64(row3h, row4h);
4680  row2l = veorq_u64(row2l, row3l);
4681  row2h = veorq_u64(row2h, row3h);
4682  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4683  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4684 
4685  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4686  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4687  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4688  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4689  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4690  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4691  row4l = veorq_u64(row4l, row1l);
4692  row4h = veorq_u64(row4h, row1h);
4693  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4694  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4695  row3l = vaddq_u64(row3l, row4l);
4696  row3h = vaddq_u64(row3h, row4h);
4697  row2l = veorq_u64(row2l, row3l);
4698  row2h = veorq_u64(row2h, row3h);
4699  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4700  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4701 
4702  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4703  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4704  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4705  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4706  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4707  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4708  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4709  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4710  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4711 
4712  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4713  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4714  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4715  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4716  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4717  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4718  row4l = veorq_u64(row4l, row1l);
4719  row4h = veorq_u64(row4h, row1h);
4720  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4721  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4722  row3l = vaddq_u64(row3l, row4l);
4723  row3h = vaddq_u64(row3h, row4h);
4724  row2l = veorq_u64(row2l, row3l);
4725  row2h = veorq_u64(row2h, row3h);
4726  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4727  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4728 
4729  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4730  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4731  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4732  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4733  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4734  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4735  row4l = veorq_u64(row4l, row1l);
4736  row4h = veorq_u64(row4h, row1h);
4737  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4738  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4739  row3l = vaddq_u64(row3l, row4l);
4740  row3h = vaddq_u64(row3h, row4h);
4741  row2l = veorq_u64(row2l, row3l);
4742  row2h = veorq_u64(row2h, row3h);
4743  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4744  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4745 
4746  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4747  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4748  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4749  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4750  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4751  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4752  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4753  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4754  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4755 
4756  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4757  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_H64);
4758  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4759  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4760  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4761  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4762  row4l = veorq_u64(row4l, row1l);
4763  row4h = veorq_u64(row4h, row1h);
4764  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4765  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4766  row3l = vaddq_u64(row3l, row4l);
4767  row3h = vaddq_u64(row3h, row4h);
4768  row2l = veorq_u64(row2l, row3l);
4769  row2h = veorq_u64(row2h, row3h);
4770  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4771  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4772 
4773  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4774  b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4775  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4776  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4777  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4778  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4779  row4l = veorq_u64(row4l, row1l);
4780  row4h = veorq_u64(row4h, row1h);
4781  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4782  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4783  row3l = vaddq_u64(row3l, row4l);
4784  row3h = vaddq_u64(row3h, row4h);
4785  row2l = veorq_u64(row2l, row3l);
4786  row2h = veorq_u64(row2h, row3h);
4787  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4788  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4789 
4790  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4791  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4792  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4793  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4794  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4795  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4796  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4797  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4798  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4799 
4800  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4801  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4802  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4803  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4804  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4805  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4806  row4l = veorq_u64(row4l, row1l);
4807  row4h = veorq_u64(row4h, row1h);
4808  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4809  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4810  row3l = vaddq_u64(row3l, row4l);
4811  row3h = vaddq_u64(row3h, row4h);
4812  row2l = veorq_u64(row2l, row3l);
4813  row2h = veorq_u64(row2h, row3h);
4814  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4815  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4816 
4817  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4818  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4819  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4820  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4821  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4822  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4823  row4l = veorq_u64(row4l, row1l);
4824  row4h = veorq_u64(row4h, row1h);
4825  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4826  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4827  row3l = vaddq_u64(row3l, row4l);
4828  row3h = vaddq_u64(row3h, row4h);
4829  row2l = veorq_u64(row2l, row3l);
4830  row2h = veorq_u64(row2h, row3h);
4831  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4832  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4833 
4834  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4835  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4836  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4837  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4838  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4839  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4840  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4841  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4842  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4843 
4844  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4845  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4846  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4847  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4848  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4849  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4850  row4l = veorq_u64(row4l, row1l);
4851  row4h = veorq_u64(row4h, row1h);
4852  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4853  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4854  row3l = vaddq_u64(row3l, row4l);
4855  row3h = vaddq_u64(row3h, row4h);
4856  row2l = veorq_u64(row2l, row3l);
4857  row2h = veorq_u64(row2h, row3h);
4858  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4859  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4860 
4861  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4862  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4863  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4864  b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4865  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4866  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4867  row4l = veorq_u64(row4l, row1l);
4868  row4h = veorq_u64(row4h, row1h);
4869  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4870  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4871  row3l = vaddq_u64(row3l, row4l);
4872  row3h = vaddq_u64(row3h, row4h);
4873  row2l = veorq_u64(row2l, row3l);
4874  row2h = veorq_u64(row2h, row3h);
4875  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4876  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4877 
4878  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4879  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4880  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4881  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4882  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4883  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4884  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4885  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4886  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4887 
4888  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4889  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4890  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4891  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4892  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4893  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4894  row4l = veorq_u64(row4l, row1l);
4895  row4h = veorq_u64(row4h, row1h);
4896  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4897  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4898  row3l = vaddq_u64(row3l, row4l);
4899  row3h = vaddq_u64(row3h, row4h);
4900  row2l = veorq_u64(row2l, row3l);
4901  row2h = veorq_u64(row2h, row3h);
4902  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4903  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4904 
4905  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4906  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4907  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4908  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4909  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4910  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4911  row4l = veorq_u64(row4l, row1l);
4912  row4h = veorq_u64(row4h, row1h);
4913  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4914  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4915  row3l = vaddq_u64(row3l, row4l);
4916  row3h = vaddq_u64(row3h, row4h);
4917  row2l = veorq_u64(row2l, row3l);
4918  row2h = veorq_u64(row2h, row3h);
4919  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4920  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4921 
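// Diagonalize again: rows 2, 3 and 4 rotate left by one, two and three words.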
4922  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4923  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4924  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4925  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4926  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4927  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4928  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4929  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4930  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4931 
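// Diagonal G for this round: (m8,m10,m12,m14), then (m9,m11,m13,m15).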
4932  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4933  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4934  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4935  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4936  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4937  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4938  row4l = veorq_u64(row4l, row1l);
4939  row4h = veorq_u64(row4h, row1h);
4940  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4941  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4942  row3l = vaddq_u64(row3l, row4l);
4943  row3h = vaddq_u64(row3h, row4h);
4944  row2l = veorq_u64(row2l, row3l);
4945  row2h = veorq_u64(row2h, row3h);
4946  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4947  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4948 
4949  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4950  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4951  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4952  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4953  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4954  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4955  row4l = veorq_u64(row4l, row1l);
4956  row4h = veorq_u64(row4h, row1h);
4957  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4958  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4959  row3l = vaddq_u64(row3l, row4l);
4960  row3h = vaddq_u64(row3h, row4h);
4961  row2l = veorq_u64(row2l, row3l);
4962  row2h = veorq_u64(row2h, row3h);
4963  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4964  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4965 
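// Undiagonalize, completing the round.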
4966  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4967  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4968  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4969  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4970  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4971  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4972  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4973  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4974  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4975 
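// Final round, reusing sigma[1]: column G over (m14,m4,m9,m13), then (m10,m8,m15,m6).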
4976  b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4977  b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4978  b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4979  b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4980  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4981  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4982  row4l = veorq_u64(row4l, row1l);
4983  row4h = veorq_u64(row4h, row1h);
4984  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4985  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4986  row3l = vaddq_u64(row3l, row4l);
4987  row3h = vaddq_u64(row3h, row4h);
4988  row2l = veorq_u64(row2l, row3l);
4989  row2h = veorq_u64(row2h, row3h);
4990  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4991  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4992 
4993  b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4994  b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4995  b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4996  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4997  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4998  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4999  row4l = veorq_u64(row4l, row1l);
5000  row4h = veorq_u64(row4h, row1h);
5001  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5002  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5003  row3l = vaddq_u64(row3l, row4l);
5004  row3h = vaddq_u64(row3h, row4h);
5005  row2l = veorq_u64(row2l, row3l);
5006  row2h = veorq_u64(row2h, row3h);
5007  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5008  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5009 
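// Diagonalize for the final round's diagonal step.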
5010  t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
5011  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
5012  row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
5013  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
5014  row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
5015  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
5016  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
5017  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
5018  row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
5019 
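// Diagonal G for the final round: (m1,m0,m11,m5), then (m12,m2,m7,m3).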
5020  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
5021  b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
5022  b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
5023  b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
5024  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5025  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5026  row4l = veorq_u64(row4l, row1l);
5027  row4h = veorq_u64(row4h, row1h);
5028  row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
5029  row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
5030  row3l = vaddq_u64(row3l, row4l);
5031  row3h = vaddq_u64(row3h, row4h);
5032  row2l = veorq_u64(row2l, row3l);
5033  row2h = veorq_u64(row2h, row3h);
5034  row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
5035  row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
5036 
5037  b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
5038  b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
5039  b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
5040  b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
5041  row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5042  row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5043  row4l = veorq_u64(row4l, row1l);
5044  row4h = veorq_u64(row4h, row1h);
5045  row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5046  row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5047  row3l = vaddq_u64(row3l, row4l);
5048  row3h = vaddq_u64(row3h, row4h);
5049  row2l = veorq_u64(row2l, row3l);
5050  row2h = veorq_u64(row2h, row3h);
5051  row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5052  row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5053 
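// Undiagonalize one last time before the feed-forward.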
5054  t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
5055  row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
5056  row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
5057  row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
5058  row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
5059  row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
5060  row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
5061  row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
5062  row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
5063 
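// Feed-forward: fold the working rows back into the chaining value,
// h[0..3] ^= row1 ^ row3 and h[4..7] ^= row2 ^ row4.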
5064  row1l = veorq_u64(row3l, row1l);
5065  row1h = veorq_u64(row3h, row1h);
5066  vst1q_u64((uint64_t*)&state.h[0], veorq_u64(vld1q_u64((const uint64_t*)&state.h[0]), row1l));
5067  vst1q_u64((uint64_t*)&state.h[2], veorq_u64(vld1q_u64((const uint64_t*)&state.h[2]), row1h));
5068 
5069  row2l = veorq_u64(row4l, row2l);
5070  row2h = veorq_u64(row4h, row2h);
5071  vst1q_u64((uint64_t*)&state.h[4], veorq_u64(vld1q_u64((const uint64_t*)&state.h[4]), row2l));
5072  vst1q_u64((uint64_t*)&state.h[6], veorq_u64(vld1q_u64((const uint64_t*)&state.h[6]), row2h));
5073 }
5074 #endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
5075 
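// Explicit instantiations: BLAKE2_Base<word32, false> is BLAKE2s and
// BLAKE2_Base<word64, true> is BLAKE2b.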
5076 template class BLAKE2_Base<word32, false>;
5077 template class BLAKE2_Base<word64, true>;
5078 
5079 NAMESPACE_END