lanczos_sse2.hpp 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. // (C) Copyright John Maddock 2006.
  2. // Use, modification and distribution are subject to the
  3. // Boost Software License, Version 1.0. (See accompanying file
  4. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #ifndef BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
  6. #define BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS_SSE2
  7. #ifdef _MSC_VER
  8. #pragma once
  9. #endif
  10. #include <emmintrin.h>
  11. #if defined(__GNUC__) || defined(__PGI)
  12. #define ALIGN16 __attribute__((__aligned__(16)))
  13. #else
  14. #define ALIGN16 __declspec(align(16))
  15. #endif
  16. namespace boost{ namespace math{ namespace lanczos{
  17. template <>
  18. inline double lanczos13m53::lanczos_sum<double>(const double& x)
  19. {
  20. static const ALIGN16 double coeff[26] = {
  21. static_cast<double>(2.506628274631000270164908177133837338626L),
  22. static_cast<double>(1u),
  23. static_cast<double>(210.8242777515793458725097339207133627117L),
  24. static_cast<double>(66u),
  25. static_cast<double>(8071.672002365816210638002902272250613822L),
  26. static_cast<double>(1925u),
  27. static_cast<double>(186056.2653952234950402949897160456992822L),
  28. static_cast<double>(32670u),
  29. static_cast<double>(2876370.628935372441225409051620849613599L),
  30. static_cast<double>(357423u),
  31. static_cast<double>(31426415.58540019438061423162831820536287L),
  32. static_cast<double>(2637558u),
  33. static_cast<double>(248874557.8620541565114603864132294232163L),
  34. static_cast<double>(13339535u),
  35. static_cast<double>(1439720407.311721673663223072794912393972L),
  36. static_cast<double>(45995730u),
  37. static_cast<double>(6039542586.35202800506429164430729792107L),
  38. static_cast<double>(105258076u),
  39. static_cast<double>(17921034426.03720969991975575445893111267L),
  40. static_cast<double>(150917976u),
  41. static_cast<double>(35711959237.35566804944018545154716670596L),
  42. static_cast<double>(120543840u),
  43. static_cast<double>(42919803642.64909876895789904700198885093L),
  44. static_cast<double>(39916800u),
  45. static_cast<double>(23531376880.41075968857200767445163675473L),
  46. static_cast<double>(0u)
  47. };
  48. register __m128d vx = _mm_load1_pd(&x);
  49. register __m128d sum_even = _mm_load_pd(coeff);
  50. register __m128d sum_odd = _mm_load_pd(coeff+2);
  51. register __m128d nc_odd, nc_even;
  52. register __m128d vx2 = _mm_mul_pd(vx, vx);
  53. sum_even = _mm_mul_pd(sum_even, vx2);
  54. nc_even = _mm_load_pd(coeff + 4);
  55. sum_odd = _mm_mul_pd(sum_odd, vx2);
  56. nc_odd = _mm_load_pd(coeff + 6);
  57. sum_even = _mm_add_pd(sum_even, nc_even);
  58. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  59. sum_even = _mm_mul_pd(sum_even, vx2);
  60. nc_even = _mm_load_pd(coeff + 8);
  61. sum_odd = _mm_mul_pd(sum_odd, vx2);
  62. nc_odd = _mm_load_pd(coeff + 10);
  63. sum_even = _mm_add_pd(sum_even, nc_even);
  64. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  65. sum_even = _mm_mul_pd(sum_even, vx2);
  66. nc_even = _mm_load_pd(coeff + 12);
  67. sum_odd = _mm_mul_pd(sum_odd, vx2);
  68. nc_odd = _mm_load_pd(coeff + 14);
  69. sum_even = _mm_add_pd(sum_even, nc_even);
  70. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  71. sum_even = _mm_mul_pd(sum_even, vx2);
  72. nc_even = _mm_load_pd(coeff + 16);
  73. sum_odd = _mm_mul_pd(sum_odd, vx2);
  74. nc_odd = _mm_load_pd(coeff + 18);
  75. sum_even = _mm_add_pd(sum_even, nc_even);
  76. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  77. sum_even = _mm_mul_pd(sum_even, vx2);
  78. nc_even = _mm_load_pd(coeff + 20);
  79. sum_odd = _mm_mul_pd(sum_odd, vx2);
  80. nc_odd = _mm_load_pd(coeff + 22);
  81. sum_even = _mm_add_pd(sum_even, nc_even);
  82. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  83. sum_even = _mm_mul_pd(sum_even, vx2);
  84. nc_even = _mm_load_pd(coeff + 24);
  85. sum_odd = _mm_mul_pd(sum_odd, vx);
  86. sum_even = _mm_add_pd(sum_even, nc_even);
  87. sum_even = _mm_add_pd(sum_even, sum_odd);
  88. double ALIGN16 t[2];
  89. _mm_store_pd(t, sum_even);
  90. return t[0] / t[1];
  91. }
  92. template <>
  93. inline double lanczos13m53::lanczos_sum_expG_scaled<double>(const double& x)
  94. {
  95. static const ALIGN16 double coeff[26] = {
  96. static_cast<double>(0.006061842346248906525783753964555936883222L),
  97. static_cast<double>(1u),
  98. static_cast<double>(0.5098416655656676188125178644804694509993L),
  99. static_cast<double>(66u),
  100. static_cast<double>(19.51992788247617482847860966235652136208L),
  101. static_cast<double>(1925u),
  102. static_cast<double>(449.9445569063168119446858607650988409623L),
  103. static_cast<double>(32670u),
  104. static_cast<double>(6955.999602515376140356310115515198987526L),
  105. static_cast<double>(357423u),
  106. static_cast<double>(75999.29304014542649875303443598909137092L),
  107. static_cast<double>(2637558u),
  108. static_cast<double>(601859.6171681098786670226533699352302507L),
  109. static_cast<double>(13339535u),
  110. static_cast<double>(3481712.15498064590882071018964774556468L),
  111. static_cast<double>(45995730u),
  112. static_cast<double>(14605578.08768506808414169982791359218571L),
  113. static_cast<double>(105258076u),
  114. static_cast<double>(43338889.32467613834773723740590533316085L),
  115. static_cast<double>(150917976u),
  116. static_cast<double>(86363131.28813859145546927288977868422342L),
  117. static_cast<double>(120543840u),
  118. static_cast<double>(103794043.1163445451906271053616070238554L),
  119. static_cast<double>(39916800u),
  120. static_cast<double>(56906521.91347156388090791033559122686859L),
  121. static_cast<double>(0u)
  122. };
  123. register __m128d vx = _mm_load1_pd(&x);
  124. register __m128d sum_even = _mm_load_pd(coeff);
  125. register __m128d sum_odd = _mm_load_pd(coeff+2);
  126. register __m128d nc_odd, nc_even;
  127. register __m128d vx2 = _mm_mul_pd(vx, vx);
  128. sum_even = _mm_mul_pd(sum_even, vx2);
  129. nc_even = _mm_load_pd(coeff + 4);
  130. sum_odd = _mm_mul_pd(sum_odd, vx2);
  131. nc_odd = _mm_load_pd(coeff + 6);
  132. sum_even = _mm_add_pd(sum_even, nc_even);
  133. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  134. sum_even = _mm_mul_pd(sum_even, vx2);
  135. nc_even = _mm_load_pd(coeff + 8);
  136. sum_odd = _mm_mul_pd(sum_odd, vx2);
  137. nc_odd = _mm_load_pd(coeff + 10);
  138. sum_even = _mm_add_pd(sum_even, nc_even);
  139. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  140. sum_even = _mm_mul_pd(sum_even, vx2);
  141. nc_even = _mm_load_pd(coeff + 12);
  142. sum_odd = _mm_mul_pd(sum_odd, vx2);
  143. nc_odd = _mm_load_pd(coeff + 14);
  144. sum_even = _mm_add_pd(sum_even, nc_even);
  145. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  146. sum_even = _mm_mul_pd(sum_even, vx2);
  147. nc_even = _mm_load_pd(coeff + 16);
  148. sum_odd = _mm_mul_pd(sum_odd, vx2);
  149. nc_odd = _mm_load_pd(coeff + 18);
  150. sum_even = _mm_add_pd(sum_even, nc_even);
  151. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  152. sum_even = _mm_mul_pd(sum_even, vx2);
  153. nc_even = _mm_load_pd(coeff + 20);
  154. sum_odd = _mm_mul_pd(sum_odd, vx2);
  155. nc_odd = _mm_load_pd(coeff + 22);
  156. sum_even = _mm_add_pd(sum_even, nc_even);
  157. sum_odd = _mm_add_pd(sum_odd, nc_odd);
  158. sum_even = _mm_mul_pd(sum_even, vx2);
  159. nc_even = _mm_load_pd(coeff + 24);
  160. sum_odd = _mm_mul_pd(sum_odd, vx);
  161. sum_even = _mm_add_pd(sum_even, nc_even);
  162. sum_even = _mm_add_pd(sum_even, sum_odd);
  163. double ALIGN16 t[2];
  164. _mm_store_pd(t, sum_even);
  165. return t[0] / t[1];
  166. }
  167. } // namespace lanczos
  168. } // namespace math
  169. } // namespace boost
  170. #undef ALIGN16
  171. #endif // BOOST_MATH_SPECIAL_FUNCTIONS_LANCZOS