// arm_simd.h - written and placed in public domain by Jeffrey Walton

/// \file arm_simd.h
/// \brief Support functions for ARM and vector operations

#ifndef CRYPTOPP_ARM_SIMD_H
#define CRYPTOPP_ARM_SIMD_H

#include "config.h"

#if (CRYPTOPP_ARM_NEON_HEADER)
# include <stdint.h>
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ARM_CRC32_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name CRC32 checksum
//@{

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32B (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32b(crc, val);
#else
    __asm__ ("crc32b %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32W (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32w(crc, val);
#else
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32 checksum
/// \param crc the starting crc value
/// \param vals the values to checksum
/// \return CRC32 value
/// \since Crypto++ 8.6
inline uint32_t CRC32Wx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32w(__crc32w(__crc32w(__crc32w(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32w %w0, %w0, %w1 \n\t"
             "crc32w %w0, %w0, %w2 \n\t"
             "crc32w %w0, %w0, %w3 \n\t"
             "crc32w %w0, %w0, %w4 \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}

//@}

/// \name CRC32-C checksum
//@{

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CB (uint32_t crc, uint8_t val)
{
#if defined(_MSC_VER)
    return __crc32cb(crc, val);
#else
    __asm__ ("crc32cb %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param val the value to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CW (uint32_t crc, uint32_t val)
{
#if defined(_MSC_VER)
    return __crc32cw(crc, val);
#else
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
            :"+r" (crc) : "r" (val) );
    return crc;
#endif
}

/// \brief CRC32-C checksum
/// \param crc the starting crc value
/// \param vals the values to checksum
/// \return CRC32-C value
/// \since Crypto++ 8.6
inline uint32_t CRC32CWx4 (uint32_t crc, const uint32_t vals[4])
{
#if defined(_MSC_VER)
    return __crc32cw(__crc32cw(__crc32cw(__crc32cw(
        crc, vals[0]), vals[1]), vals[2]), vals[3]);
#else
    __asm__ ("crc32cw %w0, %w0, %w1 \n\t"
             "crc32cw %w0, %w0, %w2 \n\t"
             "crc32cw %w0, %w0, %w3 \n\t"
             "crc32cw %w0, %w0, %w4 \n\t"
            :"+r" (crc) : "r" (vals[0]), "r" (vals[1]),
                          "r" (vals[2]), "r" (vals[3]));
    return crc;
#endif
}

//@}
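
// Editor's note: the helper below is a usage sketch, not part of the original
// header. It shows how the single-word and four-word CRC32 helpers above can
// be combined over a word-sized buffer; the name CRC32_Words is hypothetical,
// and the CRC32C* functions are used the same way for the CRC32-C polynomial.
inline uint32_t CRC32_Words(uint32_t crc, const uint32_t vals[], unsigned int count)
{
    unsigned int i = 0;
    // Checksum four words per iteration using CRC32Wx4.
    for ( ; i + 4 <= count; i += 4)
        crc = CRC32Wx4(crc, vals + i);
    // Checksum any remaining words one at a time using CRC32W.
    for ( ; i < count; ++i)
        crc = CRC32W(crc, vals[i]);
    return crc;
}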

#endif // CRYPTOPP_ARM_CRC32_AVAILABLE

#if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name Polynomial multiplication
//@{

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_00() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
/// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_01() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
/// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_10() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
/// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
/// 64-bits of <tt>b</tt> are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_11() performs polynomial multiplication and presents
/// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
/// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
/// are multiplied.
/// \note An Intel XMM register is composed of 128-bits. The leftmost bit
/// is MSB and numbered 127, while the rightmost bit is LSB and
/// numbered 0.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL() performs vmull_p64(). PMULL is provided as GCC inline
/// assembly because Clang lacks support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 0) };
    const __n64 y = { vgetq_lane_u64(b, 0) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
#endif
}

/// \brief Polynomial multiplication
/// \param a the first value
/// \param b the second value
/// \return vector product
/// \details PMULL_HIGH() performs vmull_high_p64(). PMULL_HIGH is provided as
/// GCC inline assembly because Clang lacks support for the intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t PMULL_HIGH(const uint64x2_t a, const uint64x2_t b)
{
#if defined(_MSC_VER)
    const __n64 x = { vgetq_lane_u64(a, 1) };
    const __n64 y = { vgetq_lane_u64(b, 1) };
    return vmull_p64(x, y);
#elif defined(__GNUC__)
    uint64x2_t r;
    __asm__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b) );
    return r;
#else
    return (uint64x2_t)(vmull_p64(
        vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
        vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
#endif
}

/// \brief Vector extraction
/// \param a the first value
/// \param b the second value
/// \param c the byte count
/// \return vector
/// \details VEXT_U8() extracts the final <tt>16-c</tt> bytes of vector
/// <tt>a</tt> and the first <tt>c</tt> bytes of vector <tt>b</tt>. VEXT_U8
/// is provided as GCC inline assembly because Clang lacks support for the
/// intrinsic.
/// \since Crypto++ 8.0
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
{
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c));
#else
    uint64x2_t r;
    __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c) );
    return r;
#endif
}

/// \brief Vector extraction
/// \tparam C the byte count
/// \param a the first value
/// \param b the second value
/// \return vector
/// \details VEXT_U8() extracts the final <tt>16-C</tt> bytes of vector
/// <tt>a</tt> and the first <tt>C</tt> bytes of vector <tt>b</tt>. VEXT_U8
/// is provided as GCC inline assembly because Clang lacks support for the
/// intrinsic.
/// \since Crypto++ 8.0
template <unsigned int C>
inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
{
    // https://github.com/weidai11/cryptopp/issues/366
#if defined(_MSC_VER)
    return vreinterpretq_u64_u8(vextq_u8(
        vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C));
#else
    uint64x2_t r;
    __asm__ ("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C) );
    return r;
#endif
}

//@}
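
// Editor's note: a minimal sketch, not part of the original header. It shows
// how the four PMULL_xx helpers line up with the immediate of Intel's
// _mm_clmulepi64_si128 and how they combine, together with VEXT_U8, into a
// schoolbook 128x128 -> 256-bit carryless multiply. The name CLMUL128 and the
// output parameters are hypothetical.
inline void CLMUL128(uint64x2_t a, uint64x2_t b, uint64x2_t& lo, uint64x2_t& hi)
{
    const uint64x2_t zero = vdupq_n_u64(0);
    const uint64x2_t t0 = PMULL_00(a, b);  // low(a)  * low(b),  like imm 0x00
    const uint64x2_t t2 = PMULL_11(a, b);  // high(a) * high(b), like imm 0x11
    const uint64x2_t t1 = veorq_u64(       // middle terms (imm 0x01 and 0x10)
        PMULL_01(a, b), PMULL_10(a, b));
    // Fold the middle product into the halves: lo = t0 ^ (t1 << 64) and
    // hi = t2 ^ (t1 >> 64), with VEXT_U8<8> supplying the 64-bit byte shifts.
    lo = veorq_u64(t0, VEXT_U8<8>(zero, t1));
    hi = veorq_u64(t2, VEXT_U8<8>(t1, zero));
}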

#endif // CRYPTOPP_ARM_PMULL_AVAILABLE

#if CRYPTOPP_ARM_SHA3_AVAILABLE || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \name ARMv8.2 operations
//@{

/// \brief Three-way XOR
/// \param a the first value
/// \param b the second value
/// \param c the third value
/// \return three-way exclusive OR of the values
/// \details VEOR3() performs veor3q_u64(). VEOR3 is provided as GCC inline
/// assembly because Clang lacks support for the intrinsic.
/// \details VEOR3 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VEOR3(uint64x2_t a, uint64x2_t b, uint64x2_t c)
{
#if defined(_MSC_VER)
    return veor3q_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("eor3 %0.16b, %1.16b, %2.16b, %3.16b \n\t"
            :"=w" (r) : "w" (a), "w" (b), "w" (c));
    return r;
#endif
}

/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \param c the rotate amount
/// \return two-way exclusive OR of the values, then rotated by c
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline
/// assembly because Clang lacks support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b, const int c)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, c);
#else
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (c));
    return r;
#endif
}

/// \brief XOR and rotate
/// \tparam C the rotate amount
/// \param a the first value
/// \param b the second value
/// \return two-way exclusive OR of the values, then rotated by C
/// \details VXAR() performs vxarq_u64(). VXAR is provided as GCC inline
/// assembly because Clang lacks support for the intrinsic.
/// \details VXAR requires ARMv8.2.
/// \since Crypto++ 8.6
template <unsigned int C>
inline uint64x2_t VXAR(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vxarq_u64(a, b, C);
#else
    uint64x2_t r;
    __asm__ ("xar %0.2d, %1.2d, %2.2d, %3 \n\t"
            :"=w" (r) : "w" (a), "w" (b), "I" (C));
    return r;
#endif
}

/// \brief XOR and rotate
/// \param a the first value
/// \param b the second value
/// \return exclusive OR of the first value and the second value rotated 1-bit
/// \details VRAX1() performs vrax1q_u64(). VRAX1 is provided as GCC inline
/// assembly because Clang lacks support for the intrinsic.
/// \details VRAX1 requires ARMv8.2.
/// \since Crypto++ 8.6
inline uint64x2_t VRAX1(uint64x2_t a, uint64x2_t b)
{
#if defined(_MSC_VER)
    return vrax1q_u64(a, b);
#else
    uint64x2_t r;
    __asm__ ("rax1 %0.2d, %1.2d, %2.2d \n\t"
            :"=w" (r) : "w" (a), "w" (b));
    return r;
#endif
}

//@}
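
// Editor's note: a minimal sketch, not part of the original header, of how the
// ARMv8.2 helpers above map onto the Keccak-f theta step. The helper names and
// the lane layout (one 64-bit state lane per vector half) are hypothetical;
// SHA3 code in the library may arrange the state differently.
inline uint64x2_t KeccakColumnParity(uint64x2_t a0, uint64x2_t a1, uint64x2_t a2,
                                     uint64x2_t a3, uint64x2_t a4)
{
    // parity = a0 ^ a1 ^ a2 ^ a3 ^ a4, built from two three-way XORs
    return VEOR3(VEOR3(a0, a1, a2), a3, a4);
}

// d[x] = parity[x-1] ^ rol(parity[x+1], 1) is a single RAX1.
inline uint64x2_t KeccakThetaD(uint64x2_t parity_xm1, uint64x2_t parity_xp1)
{
    return VRAX1(parity_xm1, parity_xp1);
}

// Folding d into a lane and applying the rho rotation R (0 < R < 64) is a
// single XAR (rotate right), since rol(v, R) == ror(v, 64 - R).
template <unsigned int R>
inline uint64x2_t KeccakThetaRho(uint64x2_t lane, uint64x2_t d)
{
    return VXAR<64 - R>(lane, d);
}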

#endif // CRYPTOPP_ARM_SHA3_AVAILABLE

#endif // CRYPTOPP_ARM_SIMD_H