add pffft

2024-11-09 14:57:18 -06:00
parent 78a00f71cc
commit a1790b8977
69 changed files with 25719 additions and 0 deletions
--- a/pffft/bench_conv.cpp
+++ b/pffft/bench_conv.cpp
@@ -0,0 +1,345 @@
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <algorithm>
+#include <random>
+#include <cstdint>
+#include <complex>
+
+#include "papi_perf_counter.h"
+
+//#if defined(HAVE_MIPP) && !defined(NO_MIPP)
+#if defined(HAVE_MIPP)
+#include <mipp.h>
+
+#define MIPP_VECTOR  mipp::vector
+#else
+#define MIPP_VECTOR  std::vector
+#endif
+
+#include "pf_conv_dispatcher.h"
+#include "pf_conv.h"
+
+
+#define TEST_WITH_MIN_LEN     0
+
+
+MIPP_VECTOR<float> generate_rng_vec(int M, int N = -1, int seed_value = 1)
+{
+    MIPP_VECTOR<float> v(N < 0 ? M : N);
+    std::mt19937 g;
+    g.seed(seed_value);
+    constexpr float scale = 1.0F / (1.0F + float(INT_FAST32_MAX));
+    for (int k = 0; k < M; ++k)
+        v[k] = float(int_fast32_t(g())) * scale;
+    for (int k = M; k < N; ++k)
+        v[k] = 0.0F;
+    return v;
+}
+
+
+int bench_oop_core(
+        const conv_f_ptrs & conv_arch,
+        const float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y
+        )
+{
+    conv_buffer_state state;
+    const auto conv_oop = conv_arch.fp_conv_float_oop;
+    int n_out_sum = 0;
+    state.offset = 0;
+    state.size = 0;
+    papi_perf_counter perf_counter(1);
+    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
+    {
+        state.size += blockLen;
+        int n_out = conv_oop(signal, &state, filter, sz_filter, y);
+        n_out_sum += n_out;
+    }
+    return n_out_sum;
+}
+
+int bench_inplace_core(
+        const conv_f_ptrs & conv_arch,
+        float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen
+        )
+{
+    conv_buffer_state state;
+    const auto conv_inplace = conv_arch.fp_conv_float_inplace;
+    int n_out_sum = 0;
+    state.offset = 0;
+    state.size = 0;
+    papi_perf_counter perf_counter(1);
+    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
+    {
+        state.size += blockLen;
+        int n_out = conv_inplace(signal, &state, filter, sz_filter);
+        n_out_sum += n_out;
+    }
+    return n_out_sum;
+}
+
+
+int bench_oop(
+        const conv_f_ptrs & conv_arch,
+        float * buffer,
+        const float * signal, const int sz_signal,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y
+        )
+{
+    conv_buffer_state state;
+    const auto conv_oop = conv_arch.fp_conv_float_oop;
+    const auto move_rest = conv_arch.fp_conv_float_move_rest;
+    int n_out_sum = 0;
+    state.offset = 0;
+    state.size = 0;
+    papi_perf_counter perf_counter(1);
+    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
+    {
+        move_rest(buffer, &state);
+        //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
+        std::copy(&signal[off], &signal[off+blockLen], buffer+state.size);
+        state.size += blockLen;
+        int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
+        n_out_sum += n_out;
+    }
+    return n_out_sum;
+}
+
+int bench_cx_real_oop(
+        const conv_f_ptrs & conv_arch,
+        complexf * buffer,
+        const float * signal_re, const int sz_signal_re,
+        const float * filter, const int sz_filter,
+        const int blockLen,
+        float * y_re
+        )
+{
+    conv_buffer_state state;
+    const auto conv_oop = conv_arch.fp_conv_cplx_float_oop;
+    const auto move_rest = conv_arch.fp_conv_cplx_move_rest;
+    // interpret buffer, signal and output vector y  as complex data
+    complexf * y = reinterpret_cast<complexf *>(y_re);
+    const complexf * signal = reinterpret_cast<const complexf *>(signal_re);
+    const int sz_signal = sz_signal_re / 2;
+    int n_out_sum = 0;
+    state.offset = 0;
+    state.size = 0;
+    papi_perf_counter perf_counter(1);
+    for (int off = 0; off + blockLen <= sz_signal; off += blockLen)
+    {
+        move_rest(buffer, &state);
+        //memcpy(buffer+state.size, &s[off], B * sizeof(s[0]));
+        std::copy(&signal[off], &signal[off+blockLen], &buffer[state.size]);
+        state.size += blockLen;
+        int n_out = conv_oop(buffer, &state, filter, sz_filter, &y[n_out_sum]);
+        n_out_sum += n_out;
+    }
+    return n_out_sum;
+}
+
+
+int main(int argc, char *argv[])
+{
+    // cli defaults:
+    // process up to 64 MSample (512 MByte) in blocks of 1 kSamples (=64 kByte) with filterLen 128
+    int arch = 0, N = 64 * 1024 * 1024;
+    int filterLen = 128, blockLen = 1024;
+    int seed_sig = 1, seed_filter = 2;
+    bool verbose = false, exitFromUsage = false, showUsage = (argc <= 1);
+
+    for (int i = 1; i < argc; ++i)
+    {
+        if (i+1 < argc && !strcmp(argv[i], "-a"))
+            arch = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-n"))
+            N = atoi(argv[++i]) * 1024 * 1024;
+        else if (i+1 < argc && !strcmp(argv[i], "-f"))
+            filterLen = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-b"))
+            blockLen = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-ss"))
+            seed_sig = atoi(argv[++i]);
+        else if (i+1 < argc && !strcmp(argv[i], "-sf"))
+            seed_filter = atoi(argv[++i]);
+        else if (!strcmp(argv[i], "-v"))
+            verbose = true;
+        else if (!strcmp(argv[i], "-h"))
+            showUsage = exitFromUsage = true;
+        else
+            fprintf(stderr, "warning: ignoring/skipping unknown option '%s'\n", argv[i]);
+    }
+
+    int num_arch = 0;
+    const ptr_to_conv_f_ptrs * conv_arch_ptrs = get_all_conv_arch_ptrs(&num_arch);
+
+    if (verbose)
+    {
+        fprintf(stderr, "num_arch is %d\n", num_arch);
+        for (int a = 0; a < num_arch; ++a)
+            if (conv_arch_ptrs[a])
+                fprintf(stderr, " arch %d is '%s'\n", a, conv_arch_ptrs[a]->id );
+            else
+                fprintf(stderr, " arch %d is nullptr !!!\n", a );
+        fprintf(stderr, "\n");
+    }
+
+    if ( arch < 0 || arch >= num_arch || !blockLen || !N || !filterLen || showUsage )
+    {
+        fprintf(stderr, "%s [-v] [-a <arch>] [-n <total # of MSamples> [-f <filter length>] [-b <blockLength in samples>]\n", argv[0]);
+        fprintf(stderr, "    [-ss <random seed for signal>] [-sf <random seed for filter coeffs>]\n");
+        fprintf(stderr, "arch is one of:");
+        for (int a = 0; a < num_arch; ++a)
+            if (conv_arch_ptrs[a])
+                fprintf(stderr, " %d for '%s'%s", a, conv_arch_ptrs[a]->id, (a < num_arch-1 ? ",":"") );
+        fprintf(stderr, "\n");
+        if ( exitFromUsage || !blockLen || !N || !filterLen || arch < 0 || arch >= num_arch )
+            return 0;
+    }
+
+    if (verbose)
+    {
+        #ifdef HAVE_PAPI
+        fprintf(stderr, "PAPI is available\n");
+        #else
+        fprintf(stderr, "PAPI is NOT available!\n");
+        #endif
+    }
+    #if !defined(HAVE_MIPP)
+    fprintf(stderr, "MIPP is NOT available!\n");
+    #endif
+
+    //int float_simd_size[num_arch];
+    int max_simd_size = -1;
+    for (int a = 0; a < num_arch; ++a)
+    {
+        if (conv_arch_ptrs[a])
+        {
+            const int sz = conv_arch_ptrs[a]->fp_conv_float_simd_size();
+            //float_simd_size[a] = sz;
+            if (max_simd_size < sz)
+                max_simd_size = sz;
+            if (verbose)
+                fprintf(stderr, "float simd size for '%s': %d\n", conv_arch_ptrs[a]->id, sz);
+        }
+        //else
+        //    float_simd_size[a] = 0;
+    }
+    //const int max_simd_size = *std::max_element( &float_simd_size[0], &float_simd_size[num_arch] );
+    if (verbose)
+        fprintf(stderr, "max float simd size: %d\n", max_simd_size);
+
+#if TEST_WITH_MIN_LEN
+    filterLen = 2;
+#endif
+
+    // round up filter length
+    filterLen = max_simd_size * ( ( filterLen + max_simd_size -1 ) / max_simd_size );
+
+#if TEST_WITH_MIN_LEN
+    blockLen = 1;
+    N = 2 * (3 + filterLen);    // produce 3+1 samples
+#endif
+
+    if (!conv_arch_ptrs[arch])
+    {
+        fprintf(stderr, "Error: architecture %d is NOT available!\n", arch);
+        return 1;
+    }
+    const conv_f_ptrs & conv_arch =  *conv_arch_ptrs[arch];
+    if (verbose)
+        fprintf(stderr, "arch is using mipp: %d\n", conv_arch.using_mipp);
+
+    fprintf(stderr, "processing N = %d MSamples with block length of %d samples with filter length %d taps on '%s'\n",
+        N / (1024 * 1024), blockLen, filterLen, conv_arch.id );
+
+    MIPP_VECTOR<float> s = generate_rng_vec(N + 1, N + 1, seed_sig);
+    MIPP_VECTOR<float> y(N + 1, 0.0F);
+    MIPP_VECTOR<float> filter = generate_rng_vec(filterLen, filterLen, seed_filter);
+    MIPP_VECTOR<float> buffer(blockLen + filterLen + 1, 0.0F);
+    MIPP_VECTOR<complexf> buffer_cx(blockLen + filterLen + 1);
+
+#if 1 && TEST_WITH_MIN_LEN
+    for (int k = 0; k < N; ++k)
+        s[k] = (k+1);
+    for (int k = 0; k < filterLen; ++k)
+        filter[k] = (k+1);
+#endif
+
+    s[N] = 123.0F;
+    y[N] = 321.0F;
+    buffer[blockLen + filterLen] = 789.0F;
+    buffer_cx[blockLen + filterLen].i = 987.0F;
+
+    fprintf(stderr, "\nrunning out-of-place convolution core for '%s':\n", conv_arch.id);
+    int n_oop_out = bench_oop_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\nrunning out-of-place convolution for '%s':\n", conv_arch.id);
+    n_oop_out = bench_oop(conv_arch, buffer.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, y[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\nrunning out-of-place complex/real convolution for '%s':\n", conv_arch.id);
+    n_oop_out = bench_cx_real_oop(conv_arch, buffer_cx.data(), s.data(), N, filter.data(), filterLen, blockLen, y.data());
+    fprintf(stderr, "oop produced %d output samples\n", n_oop_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    fprintf(stderr, "complex output (%d complex samples):\n", n_oop_out);
+    for (int k = 0; k < n_oop_out; ++k )
+        fprintf(stderr, "y[%2d] = %g  %+g * i\n", k, y[2*k], y[2*k+1]);
+    fprintf(stderr, "\n");
+
+    const std::complex<float> * sc = reinterpret_cast< std::complex<float>* >( s.data() );
+    const int Nc = N /2;
+    fprintf(stderr, "reference with std::complex<float>:\n");
+    for (int off = 0; off +filterLen <= Nc; ++off )
+    {
+        std::complex<float> sum(0.0F, 0.0F);
+        for (int k=0; k < filterLen; ++k)
+            sum += sc[off+k] * filter[k];
+        fprintf(stderr, "yv[%2d] = %g  %+g * i\n", off, sum.real(), sum.imag() );
+    }
+#endif
+
+    fprintf(stderr, "\nrunning inplace convolution core for '%s':\n", conv_arch.id);
+    int n_inp_out = bench_inplace_core(conv_arch, s.data(), N, filter.data(), filterLen, blockLen);
+    fprintf(stderr, "inp produced %d output samples\n", n_inp_out);
+    assert(s[N] == 123.0F);
+    assert(y[N] == 321.0F);
+    assert(buffer[blockLen + filterLen] == 789.0F);
+    assert(buffer_cx[blockLen + filterLen].i == 987.0F);
+#if TEST_WITH_MIN_LEN
+    for (int k = 0; k < n_inp_out; ++k )
+        fprintf(stderr, "y[%2d] = %g\n", k, s[k]);
+    fprintf(stderr, "\n");
+#endif
+
+    fprintf(stderr, "\n");
+    return 0;
+}