1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_avx2_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_avx2_H
4 #include <volk/volk_common.h>
/*
 * Body fragment of a dot product over two float arrays (input and taps),
 * accumulated eight elements at a time with 256-bit intrinsics and finished
 * with a scalar loop for the leftover elements.
 *
 * NOTE(review): this listing is incomplete. The function signature, the
 * declarations of aVal, bVal and dotProduct, any pointer advances inside the
 * vector loop, the closing braces and the final store of the result are not
 * visible here. Presumably this is volk_32f_x2_dot_prod_32f_a_avx2 - confirm
 * against the full header.
 */
10 unsigned int number = 0;
/* Count of complete groups of eight elements; the division drops the remainder. */
11 const unsigned int eighthPoints = num_points / 8;
14 const float* aPtr = input;
15 const float* bPtr = taps;
/* Eight partial sums, all starting at zero. */
20 __m256 cVal = _mm256_setzero_ps();
22 for(;number < eighthPoints; number++){
/* _mm256_load_ps is the aligned load, so both pointers must be 32-byte aligned. */
24 aVal = _mm256_load_ps(aPtr);
25 bVal = _mm256_load_ps(bPtr);
/* Fused multiply-add: cVal = aVal * bVal + cVal in every lane. */
27 cVal = _mm256_fmadd_ps(aVal, bVal, cVal);
/* Scratch buffer for the aligned store below; the macro presumably requests
 * 32-byte alignment - TODO confirm against its definition. */
33 __VOLK_ATTR_ALIGNED(32)
float dotProductVector[8];
35 _mm256_store_ps(dotProductVector, cVal);
/* Horizontal reduction: fold the eight partial sums into one scalar. */
37 dotProduct = dotProductVector[0];
38 dotProduct += dotProductVector[1];
39 dotProduct += dotProductVector[2];
40 dotProduct += dotProductVector[3];
41 dotProduct += dotProductVector[4];
42 dotProduct += dotProductVector[5];
43 dotProduct += dotProductVector[6];
44 dotProduct += dotProductVector[7];
/* Resume counting at the first element not covered by a full group of eight. */
46 number = eighthPoints*8;
/* Scalar tail for the remaining num_points % 8 elements. It reads through
 * aPtr/bPtr as they stand, so it relies on the vector loop having advanced
 * both pointers; that advance is not visible in this listing. */
47 for(;number < num_points; number++){
48 dotProduct += ((*aPtr++) * (*bPtr++));
/*
 * Body fragment of a dot product between an input viewed as a flat float
 * array and a float tap array, where every tap multiplies two consecutive
 * input floats. Even-indexed products accumulate into *realpt and odd-indexed
 * products into *imagpt.
 *
 * NOTE(review): this listing is incomplete. The function signature, the
 * declarations of res, a0Val, a1Val, b0Val and b1Val, any pointer advances
 * inside the vector loop and the closing braces are not visible here.
 * Presumably this is volk_32fc_32f_dot_prod_32fc_a_avx2 with interleaved
 * real/imaginary input - confirm against the full header.
 */
55 unsigned int number = 0;
/* Count of complete groups of eight taps; the division drops the remainder. */
56 const unsigned int eighthPoints = num_points / 8;
/* The two accumulators are the first and second elements of res. */
59 float *realpt = &res[0], *imagpt = &res[1];
/* View the input as a plain sequence of floats. */
60 const float* aPtr = (
float*)input;
61 const float* bPtr = taps;
65 __m256 xVal, xloVal, xhiVal;
/* Eight partial sums, all starting at zero. */
67 __m256 cVal = _mm256_setzero_ps();
69 for(;number < eighthPoints; number++){
/* Sixteen input floats per iteration, read with aligned loads (32-byte
 * alignment required). */
71 a0Val = _mm256_load_ps(aPtr);
72 a1Val = _mm256_load_ps(aPtr+8);
/* Eight taps per iteration. Each tap is duplicated so it lines up with the
 * pair of input floats it multiplies: unpacklo/unpackhi double the elements
 * within each 128-bit half, then permute2f128 regroups the halves so that
 * b0Val = t0 t0 t1 t1 t2 t2 t3 t3 and b1Val = t4 t4 t5 t5 t6 t6 t7 t7. */
74 xVal = _mm256_load_ps(bPtr);
75 xloVal = _mm256_unpacklo_ps(xVal, xVal);
76 xhiVal = _mm256_unpackhi_ps(xVal, xVal);
78 b0Val = _mm256_permute2f128_ps(xloVal, xhiVal, 0x20);
79 b1Val = _mm256_permute2f128_ps(xloVal, xhiVal, 0x31);
/* Two fused multiply-adds into the same accumulator: cVal += a * b per lane. */
81 cVal = _mm256_fmadd_ps(a0Val, b0Val, cVal);
82 cVal = _mm256_fmadd_ps(a1Val, b1Val, cVal);
/* Scratch buffer for the aligned store below; the macro presumably requests
 * 32-byte alignment - TODO confirm against its definition. */
88 __VOLK_ATTR_ALIGNED(32)
float dotProductVector[8];
90 _mm256_store_ps(dotProductVector, cVal);
/* Horizontal reduction: even lanes go to *realpt, odd lanes to *imagpt. */
92 *realpt = dotProductVector[0];
93 *imagpt = dotProductVector[1];
94 *realpt += dotProductVector[2];
95 *imagpt += dotProductVector[3];
96 *realpt += dotProductVector[4];
97 *imagpt += dotProductVector[5];
98 *realpt += dotProductVector[6];
99 *imagpt += dotProductVector[7];
/* Resume counting at the first tap not covered by a full group of eight. */
101 number = eighthPoints*8;
/* Scalar tail: each remaining tap multiplies two consecutive input floats;
 * bPtr advances only after the second product. It relies on the vector loop
 * having advanced aPtr/bPtr, which is not visible in this listing. */
102 for(;number < num_points; number++){
103 *realpt += ((*aPtr++) * (*bPtr));
104 *imagpt += ((*aPtr++) * (*bPtr++));
/* Reads the two floats starting at res[0] as a single lv_32fc_t and stores
 * it through result.
 * NOTE(review): this is type punning through a pointer cast - confirm it is
 * acceptable under the project's aliasing/compiler settings. */
107 *result = *(lv_32fc_t*)(&res[0]);
static void volk_32f_x2_dot_prod_32f_a_avx2(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc_avx2.h:9
static void volk_32fc_32f_dot_prod_32fc_a_avx2(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc_avx2.h:54