GNU Radio's FILTER_AVX2 Package
volk_32fc_32f_dot_prod_32fc_avx2.h
Go to the documentation of this file.
1 #ifndef INCLUDED_volk_32fc_32f_dot_prod_32fc_avx2_H
2 #define INCLUDED_volk_32fc_32f_dot_prod_32fc_avx2_H
3 
4 #include <volk/volk_common.h>
5 #include<stdio.h>
6 #include <immintrin.h>
7 
8 
9 static inline void volk_32f_x2_dot_prod_32f_a_avx2(float* result, const float* input, const float* taps, unsigned int num_points) {
10  unsigned int number = 0;
11  const unsigned int eighthPoints = num_points / 8;
12 
13  float dotProduct = 0;
14  const float* aPtr = input;
15  const float* bPtr = taps;
16 
17  __m256 aVal;
18  __m256 bVal;
19 
20  __m256 cVal = _mm256_setzero_ps();
21 
22  for(;number < eighthPoints; number++){
23 
24  aVal = _mm256_load_ps(aPtr);
25  bVal = _mm256_load_ps(bPtr);
26 
27  cVal = _mm256_fmadd_ps(aVal, bVal, cVal);
28 
29  aPtr += 8;
30  bPtr += 8;
31  }
32 
33  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
34 
35  _mm256_store_ps(dotProductVector, cVal); // Store the results back into the dot product vector
36 
37  dotProduct = dotProductVector[0];
38  dotProduct += dotProductVector[1];
39  dotProduct += dotProductVector[2];
40  dotProduct += dotProductVector[3];
41  dotProduct += dotProductVector[4];
42  dotProduct += dotProductVector[5];
43  dotProduct += dotProductVector[6];
44  dotProduct += dotProductVector[7];
45 
46  number = eighthPoints*8;
47  for(;number < num_points; number++){
48  dotProduct += ((*aPtr++) * (*bPtr++));
49  }
50 
51  *result = dotProduct;
52 }
53 
54 static inline void volk_32fc_32f_dot_prod_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* input, const float* taps, unsigned int num_points) {
55  unsigned int number = 0;
56  const unsigned int eighthPoints = num_points / 8;
57 
58  float res[2];
59  float *realpt = &res[0], *imagpt = &res[1];
60  const float* aPtr = (float*)input;
61  const float* bPtr = taps;
62 
63  __m256 a0Val, a1Val;
64  __m256 b0Val, b1Val;
65  __m256 xVal, xloVal, xhiVal;
66 
67  __m256 cVal = _mm256_setzero_ps();
68 
69  for(;number < eighthPoints; number++){
70 
71  a0Val = _mm256_load_ps(aPtr);
72  a1Val = _mm256_load_ps(aPtr+8);
73 
74  xVal = _mm256_load_ps(bPtr); // t0|t1|t2|t3|t4|t5|t6|t7
75  xloVal = _mm256_unpacklo_ps(xVal, xVal); // t0|t0|t1|t1|t4|t4|t5|t5
76  xhiVal = _mm256_unpackhi_ps(xVal, xVal); // t2|t2|t3|t3|t6|t6|t7|t7
77 
78  b0Val = _mm256_permute2f128_ps(xloVal, xhiVal, 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
79  b1Val = _mm256_permute2f128_ps(xloVal, xhiVal, 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
80 
81  cVal = _mm256_fmadd_ps(a0Val, b0Val, cVal);
82  cVal = _mm256_fmadd_ps(a1Val, b1Val, cVal);
83 
84  aPtr += 16;
85  bPtr += 8;
86  }
87 
88  __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
89 
90  _mm256_store_ps(dotProductVector, cVal); // Store the results back into the dot product vector
91 
92  *realpt = dotProductVector[0];
93  *imagpt = dotProductVector[1];
94  *realpt += dotProductVector[2];
95  *imagpt += dotProductVector[3];
96  *realpt += dotProductVector[4];
97  *imagpt += dotProductVector[5];
98  *realpt += dotProductVector[6];
99  *imagpt += dotProductVector[7];
100 
101  number = eighthPoints*8;
102  for(;number < num_points; number++){
103  *realpt += ((*aPtr++) * (*bPtr));
104  *imagpt += ((*aPtr++) * (*bPtr++));
105  }
106 
107  *result = *(lv_32fc_t*)(&res[0]);
108 }
109 
110 #endif
111 
static void volk_32f_x2_dot_prod_32f_a_avx2(float *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc_avx2.h:9
static void volk_32fc_32f_dot_prod_32fc_a_avx2(lv_32fc_t *result, const lv_32fc_t *input, const float *taps, unsigned int num_points)
Definition: volk_32fc_32f_dot_prod_32fc_avx2.h:54