Actions
|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [vsipl++] [patch] Kernel updates
This patch updates the remaining VSIPL++ kernels to use ALF 3.0 and CML where possible. This removes the dependency on libfft_example from the SDK, removes the specialized split FFT routines, and pushes twiddle computation into the kernels. This also requires a minor CML patch, which I'll post separately. Ok to apply? -- Jules Bergmann CodeSourcery jules@xxxxxxxxxxxxxxxx (650) 331-3385 x705 Index: ChangeLog
===================================================================
--- ChangeLog (revision 209510)
+++ ChangeLog (working copy)
@@ -1,3 +1,45 @@
+2008-05-28 Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
+
+ * m4/cbe.m4 (CPP_SPU_FLAGS, LD_SPU_FLAGS): Pass CML paths.
+ * src/vsip/core/signal/conv_common.hpp: Implement correct min
+ support size case for 2D conv_min.
+ * src/vsip/core/fft.hpp: Remove LLC.
+ * src/vsip/opt/cbe/ppu/fft.cpp: Add split FFT support. Remove
+ twiddle factor computation (now taken care of by kernels).
+ * src/vsip/opt/cbe/ppu/task_manager.hpp: Fix bug in argument
+ processing. Add split FFT tag.
+ * src/vsip/opt/cbe/ppu/fastconv.cpp: Remove twiddle factor computation.
+ * src/vsip/opt/cbe/ppu/fastconv.hpp: Likewise.
+ * src/vsip/opt/cbe/fconv_params.h: Likewise. Also remove padding.
+ * src/vsip/opt/cbe/ppu/alf.hpp: Make exceptions more informative.
+ * src/vsip/opt/cbe/spu/alf_pwarp_ub.cpp: Move ALF decls to
+ alf_decl.hpp. Add missing argument to kernel.
+ * src/vsip/opt/ceb/spu/alf_decls.hpp: New file, decls for functions
+ expected by ALF.
+ * src/vsip/opt/cbe/spu/alf_fconv_c.c: Refactor to use CML.
+ * src/vsip/opt/cbe/spu/alf_fft_c.c: Likewise.
+ * src/vsip/opt/cbe/spu/alf_fconv_split_c.c: Likewise.
+ * src/vsip/opt/cbe/spu/alf_fconvm_c.c: Likewise.
+ * src/vsip/opt/cbe/spu/alf_vmul_split_c.c: Likewise.
+ * src/vsip/opt/cbe/spu/GNUmakefile.inc.in: Use LD_SPU_FLAGS from
+ configure. (LIBS_SPU): Add -lcml_spu.
+ * src/vsip/opt/cbe/spu/alf_vmmul_c.c: Update to use ALF 3.0.
+ * src/vsip/opt/cbe/spu/alf_vmul_s.c: Likewise.
+ * src/vsip/opt/cbe/spu/alf_fconvm_split_c.c: Update to use CML.
+ * src/vsip/opt/cbe/fft_params.h: Remove twiddle factor addresses,
+ add parameters for split FFT.
+ * src/vsip/opt/diag/fft.hpp: Fix Wall warning.
+ * benchmarks/copy.cpp: Add more error detail.
+ * benchmarks/fastconv.cpp: Update parameter passing, make check
+ optional.
+ * benchmarks/vmul.hpp: Add diagnostics.
+ * benchmarks/fftm.cpp: Fix Wall warnings.
+ * src/vsip/opt/cbe/spu/vmul_split.h: Remove file, unused.
+ * src/vsip/opt/cbe/spu/fft_1d.h: Remove file, unused.
+ * src/vsip/opt/cbe/spu/fft_1d_r2.h: Remove file, unused.
+ * src/vsip/opt/cbe/spu/fft_1d_r2_split.h: Remove file, unused.
+ * src/vsip/opt/cbe/spu/spe_assert.h: Remove file, unused.
+
2008-05-28 Stefan Seefeld <stefan@xxxxxxxxxxxxxxxx>
* tests/GNUmakefile.inc.in: Adjust for new test database.
Index: m4/cbe.m4
===================================================================
--- m4/cbe.m4 (revision 208645)
+++ m4/cbe.m4 (working copy)
@@ -67,9 +67,10 @@
if test "$with_cml_prefix" != ""; then
CPPFLAGS="$CPPFLAGS -I$with_cml_prefix/include"
LDFLAGS="$LDFLAGS -L$with_cml_prefix/lib"
+ CPP_SPU_FLAGS="$CPP_SPU_FLAGS -I$with_cml_prefix/include"
+ LD_SPU_FLAGS="$LD_SPU_FLAGS -L$with_cml_prefix/lib"
fi
- AC_SUBST(CPP_SPU_FLAGS, "")
if test "$neutral_acconfig" = 'y'; then
CPPFLAGS="$CPPFLAGS -DVSIP_CBE_SDK_VERSION=$cbe_sdk_version"
CPP_SPU_FLAGS="$CPP_SPU_FLAGS -DVSIP_CBE_SDK_VERSION=$cbe_sdk_version"
@@ -80,6 +81,8 @@
LIBS="-lcml -lalf -lspe2 -ldl $LIBS"
+ AC_SUBST(CPP_SPU_FLAGS, $CPP_SPU_FLAGS)
+ AC_SUBST(LD_SPU_FLAGS, $LD_SPU_FLAGS)
else
AC_SUBST(VSIP_IMPL_HAVE_CBE_SDK, "")
cbe_sdk_version="none"
Index: src/vsip/core/signal/conv_common.hpp
===================================================================
--- src/vsip/core/signal/conv_common.hpp (revision 209510)
+++ src/vsip/core/signal/conv_common.hpp (working copy)
@@ -641,8 +641,29 @@
typedef typename Convolution_accum_trait<T>::sum_type sum_type;
#if VSIP_IMPL_CONV_CORRECT_MIN_SUPPORT_SIZE
- VSIP_IMPL_THROW(vsip::impl::unimplemented(
- "conv_min not implemented for Matrix CORRECT_MIN_SUPPORT_SIZE"));
+ (void)in_rows;
+ (void)in_cols;
+
+ for (index_type r=0; r<out_rows; ++r)
+ {
+ index_type ir = r*decimation + (coeff_rows-1);
+ for (index_type c=0; c<out_cols; ++c)
+ {
+ index_type ic = c*decimation + (coeff_cols-1);
+
+ sum_type sum = sum_type();
+
+ for (index_type rr=0; rr<coeff_rows; ++rr)
+ {
+ for (index_type cc=0; cc<coeff_cols; ++cc)
+ {
+ sum += coeff[rr*coeff_row_stride + cc*coeff_col_stride] *
+ in[(ir-rr) * in_row_stride + (ic-cc) * in_col_stride];
+ }
+ }
+ out[r * out_row_stride + c * out_col_stride] = sum;
+ }
+ }
#else
for (index_type r=0; r<out_rows; ++r)
{
Index: src/vsip/core/fft.hpp
===================================================================
--- src/vsip/core/fft.hpp (revision 209510)
+++ src/vsip/core/fft.hpp (working copy)
@@ -1,5 +1,4 @@
-/* Copyright (c) 2006, 2007, 2008 by CodeSourcery, LLC.
- All rights reserved. */
+/* Copyright (c) 2006, 2007, 2008 by CodeSourcery. All rights reserved. */
/** @file vsip/core/fft.hpp
@author Stefan Seefeld
Index: src/vsip/opt/cbe/ppu/fft.cpp
===================================================================
--- src/vsip/opt/cbe/ppu/fft.cpp (revision 209510)
+++ src/vsip/opt/cbe/ppu/fft.cpp (working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 by CodeSourcery. All rights reserved.
+/* Copyright (c) 2007, 2008 by CodeSourcery. All rights reserved.
This file is available for license from CodeSourcery, Inc. under the terms
of a commercial license and under the GPL. It is not part of the VSIPL++
@@ -42,14 +42,13 @@
typedef std::complex<rtype> ctype;
public:
- Fft_base(length_type size)
- : twiddle_factors_(size / 4)
- {
- compute_twiddle_factors(size);
- }
+ Fft_base(length_type)
+ {}
+
virtual ~Fft_base()
{}
+ // Interleaved-complex FFT
void
fft(std::complex<T> const* in, std::complex<T>* out,
length_type length, T scale, int exponent)
@@ -59,9 +58,8 @@
Fft_params fftp;
fftp.direction = (exponent == -1 ? fwd_fft : inv_fft);
- fftp.elements = length;
+ fftp.fft_size = length;
fftp.scale = scale;
- fftp.ea_twiddle_factors = ea_from_ptr(twiddle_factors_.get());
fftp.ea_input_buffer = ea_from_ptr(in);
fftp.ea_output_buffer = ea_from_ptr(out);
fftp.in_blk_stride = 0; // not applicable in the single FFT case
@@ -89,7 +87,51 @@
task.sync();
}
+ // Split-complex FFT
void
+ fft(T const* in_re, T const* in_im, T* out_re, T* out_im,
+ length_type length, T scale, int exponent)
+ {
+ assert(is_dma_addr_ok(in_re));
+ assert(is_dma_addr_ok(in_im));
+ assert(is_dma_addr_ok(out_re));
+ assert(is_dma_addr_ok(out_im));
+
+ Fft_split_params fftp;
+ fftp.direction = (exponent == -1 ? fwd_fft : inv_fft);
+ fftp.fft_size = length;
+ fftp.scale = scale;
+ fftp.ea_input_re = ea_from_ptr(in_re);
+ fftp.ea_input_im = ea_from_ptr(in_im);
+ fftp.ea_output_re = ea_from_ptr(out_re);
+ fftp.ea_output_im = ea_from_ptr(out_im);
+ fftp.in_blk_stride = 0; // not applicable in the single FFT case
+ fftp.out_blk_stride = 0;
+
+ Task_manager *mgr = Task_manager::instance();
+ // The stack size is determined by accounting for the *worst case*
+ // stack requirements, which, to date, are _at least_ equal
+ // to 64 KB for the 8 K FFT (using two temporary arrays of 4 bytes
+ // per point each).
+ // Note on the input buffer size: the buffer is made twice the
+ // size of the output buffer. This improves performance only at
+ // some lengths, notably 2048 and 4096, but does not hurt for
+ // other problem sizes. Warning: this may change if the underlying
+ // implementation changes and should be revised as needed.
+ Task task = mgr->reserve<Fft_tag, void(std::pair<T*,T*>,std::pair<T*,T*>)>
+ (80*1024, // max stack size
+ sizeof(Fft_split_params),
+ sizeof(complex<T>)*length*2,
+ sizeof(complex<T>)*length,
+ true);
+ Workblock block = task.create_workblock(1);
+ block.set_parameters(fftp);
+ block.enqueue();
+ task.sync();
+ }
+
+ // Interleaved-complex FFTM
+ void
fftm(std::complex<T> const* in, std::complex<T>* out,
stride_type in_r_stride, stride_type in_c_stride,
stride_type out_r_stride, stride_type out_c_stride,
@@ -104,7 +146,6 @@
Fft_params fftp;
fftp.direction = (exponent == -1 ? fwd_fft : inv_fft);
fftp.scale = scale;
- fftp.ea_twiddle_factors = ea_from_ptr(twiddle_factors_.get());
length_type num_ffts;
length_type in_stride;
length_type out_stride;
@@ -113,14 +154,14 @@
num_ffts = rows;
in_stride = in_r_stride;
out_stride = out_r_stride;
- fftp.elements = cols;
+ fftp.fft_size = cols;
}
else
{
num_ffts = cols;
in_stride = in_c_stride;
out_stride = out_c_stride;
- fftp.elements = rows;
+ fftp.fft_size = rows;
}
fftp.ea_input_buffer = ea_from_ptr(in);
fftp.ea_output_buffer = ea_from_ptr(out);
@@ -140,8 +181,8 @@
Task task = mgr->reserve<Fft_tag, void(complex<T>,complex<T>)>
(80*1024, // max stack size
sizeof(Fft_params),
- sizeof(complex<T>)*fftp.elements*2,
- sizeof(complex<T>)*fftp.elements,
+ sizeof(complex<T>)*fftp.fft_size*2,
+ sizeof(complex<T>)*fftp.fft_size,
true);
length_type spes = mgr->num_spes();
@@ -162,23 +203,90 @@
task.sync();
}
- void
- compute_twiddle_factors(length_type length)
+ // Split-complex FFTM
+ void
+ fftm(T const* in_re, T const* in_im,
+ T* out_re, T* out_im,
+ stride_type in_r_stride, stride_type in_c_stride,
+ stride_type out_r_stride, stride_type out_c_stride,
+ length_type rows, length_type cols,
+ T scale, int exponent, int axis)
{
- unsigned int i = 0;
- unsigned int n = length;
- T* W = reinterpret_cast<T*>(twiddle_factors_.get());
- W[0] = 1.0f;
- W[1] = 0.0f;
- for (i = 1; i < n / 4; ++i)
+ assert(is_dma_addr_ok(in_re));
+ assert(is_dma_addr_ok(in_im));
+ assert(is_dma_addr_ok(out_re));
+ assert(is_dma_addr_ok(out_im));
+ assert(is_dma_addr_ok(in_re + (axis != 0 ? in_r_stride : in_c_stride)));
+ assert(is_dma_addr_ok(out_re + (axis != 0 ? out_r_stride : out_c_stride)));
+ assert(is_dma_addr_ok(in_im + (axis != 0 ? in_r_stride : in_c_stride)));
+ assert(is_dma_addr_ok(out_im + (axis != 0 ? out_r_stride : out_c_stride)));
+
+ Fft_split_params fftp;
+ fftp.direction = (exponent == -1 ? fwd_fft : inv_fft);
+ fftp.scale = scale;
+ length_type num_ffts;
+ length_type in_stride;
+ length_type out_stride;
+
+ if (axis != 0)
{
- W[2*i] = cos(i * 2*M_PI / n);
- W[2*(n/4 - i)+1] = -W[2*i];
+ num_ffts = rows;
+ in_stride = in_r_stride;
+ out_stride = out_r_stride;
+ fftp.fft_size = cols;
}
+ else
+ {
+ num_ffts = cols;
+ in_stride = in_c_stride;
+ out_stride = out_c_stride;
+ fftp.fft_size = rows;
+ }
+
+ fftp.ea_input_re = ea_from_ptr(in_re);
+ fftp.ea_input_im = ea_from_ptr(in_im);
+ fftp.ea_output_re = ea_from_ptr(out_re);
+ fftp.ea_output_im = ea_from_ptr(out_im);
+ fftp.in_blk_stride = in_stride;
+ fftp.out_blk_stride = out_stride;
+
+ Task_manager *mgr = Task_manager::instance();
+ // The stack size is determined by accounting for the *worst case*
+ // stack requirements, which, to date, are _at least_ equal
+ // to 64 KB for the 8 K FFT (using two temporary arrays of 4 bytes
+ // per point each).
+ // Note on the input buffer size: the buffer is made twice the
+ // size of the output buffer. This improves performance only at
+ // some lengths, notably 2048 and 4096, but does not hurt for
+ // other problem sizes. Warning: this may change if the underlying
+ // implementation changes and should be revised as needed.
+ Task task = mgr->reserve<Fft_tag, void(std::pair<T*,T*>,std::pair<T*,T*>)>
+ (80*1024, // max stack size
+ sizeof(Fft_split_params),
+ sizeof(complex<T>)*fftp.fft_size*2,
+ sizeof(complex<T>)*fftp.fft_size,
+ true);
+
+ length_type spes = mgr->num_spes();
+ length_type ffts_per_spe = num_ffts / spes;
+
+ for (length_type i = 0; i < spes && i < num_ffts; ++i)
+ {
+ // If rows don't divide evenly, give the first SPEs one extra.
+ length_type spe_ffts = (i < num_ffts % spes) ? ffts_per_spe + 1
+ : ffts_per_spe;
+
+ Workblock block = task.create_workblock(spe_ffts);
+ block.set_parameters(fftp);
+ block.enqueue();
+
+ fftp.ea_input_re += sizeof(T) * spe_ffts * in_stride;
+ fftp.ea_input_im += sizeof(T) * spe_ffts * in_stride;
+ fftp.ea_output_re += sizeof(T) * spe_ffts * out_stride;
+ fftp.ea_output_im += sizeof(T) * spe_ffts * out_stride;
+ }
+ task.sync();
}
-
-private:
- aligned_array<ctype> twiddle_factors_;
};
@@ -215,14 +323,15 @@
{
rtl_inout.pack = stride_unit_align;
rtl_inout.order = tuple<0, 1, 2>();
- rtl_inout.complex = cmplx_inter_fmt;
+ // Both split and interleaved supported.
rtl_inout.align = 16;
}
virtual void query_layout(Rt_layout<1> &rtl_in, Rt_layout<1> &rtl_out)
{
rtl_in.pack = rtl_out.pack = stride_unit_align;
rtl_in.order = rtl_out.order = tuple<0, 1, 2>();
- rtl_in.complex = rtl_out.complex = cmplx_inter_fmt;
+ // Both split and interleaved supported, however in and out must match.
+ rtl_in.complex = rtl_out.complex;
rtl_in.align = rtl_out.align = 16;
}
virtual void in_place(ctype *inout, stride_type stride, length_type length)
@@ -230,8 +339,11 @@
assert(stride == 1);
this->fft(inout, inout, length, this->scale_, E);
}
- virtual void in_place(ztype, stride_type, length_type)
+ virtual void in_place(ztype inout, stride_type stride, length_type length)
{
+ assert(stride == 1);
+ this->fft(inout.first, inout.second, inout.first, inout.second,
+ length, this->scale_, E);
}
virtual void by_reference(ctype *in, stride_type in_stride,
ctype *out, stride_type out_stride,
@@ -241,10 +353,14 @@
assert(out_stride == 1);
this->fft(in, out, length, this->scale_, E);
}
- virtual void by_reference(ztype, stride_type,
- ztype, stride_type,
- length_type)
+ virtual void by_reference(ztype in, stride_type in_stride,
+ ztype out, stride_type out_stride,
+ length_type length)
{
+ assert(in_stride == 1);
+ assert(out_stride == 1);
+ this->fft(in.first, in.second, out.first, out.second,
+ length, this->scale_, E);
}
private:
@@ -308,9 +424,30 @@
this->scale_, E, A);
}
- virtual void in_place(ztype, stride_type, stride_type,
- length_type, length_type)
+ virtual void in_place(ztype inout,
+ stride_type r_stride, stride_type c_stride,
+ length_type rows, length_type cols)
{
+ if (A != 0)
+ {
+ if (rows == 0) return; // Handle empty local subblock.
+ assert(rows <= this->size_[0]); // OK if rows are distributed.
+ assert(cols == this->size_[1]); // Columns must be whole.
+ assert(c_stride == 1);
+ }
+ else
+ {
+ if (cols == 0) return; // Handle empty local subblock.
+ assert(rows == this->size_[0]); // Rows must be whole.
+ assert(cols <= this->size_[1]); // OK if columns are distributed.
+ assert(r_stride == 1);
+ }
+ this->fftm(inout.first, inout.second,
+ inout.first, inout.second,
+ r_stride, c_stride,
+ r_stride, c_stride,
+ rows, cols,
+ this->scale_, E, A);
}
virtual void by_reference(ctype *in,
@@ -339,10 +476,31 @@
rows, cols,
this->scale_, E, A);
}
- virtual void by_reference(ztype, stride_type, stride_type,
- ztype, stride_type, stride_type,
- length_type, length_type)
+ virtual void by_reference(ztype in,
+ stride_type in_r_stride, stride_type in_c_stride,
+ ztype out,
+ stride_type out_r_stride, stride_type out_c_stride,
+ length_type rows, length_type cols)
{
+ if (A != 0)
+ {
+ if (rows == 0) return; // Handle empty local subblock.
+ assert(rows <= this->size_[0]); // OK if rows are distributed.
+ assert(cols == this->size_[1]); // Columns must be whole.
+ assert(in_c_stride == 1 && out_c_stride == 1);
+ }
+ else
+ {
+ if (cols == 0) return; // Handle empty local subblock.
+ assert(rows == this->size_[0]); // Rows must be whole.
+ assert(cols <= this->size_[1]); // OK if columns are distributed.
+ assert(in_r_stride == 1 && out_r_stride == 1);
+ }
+ this->fftm(in.first, in.second, out.first, out.second,
+ in_r_stride, in_c_stride,
+ out_r_stride, out_c_stride,
+ rows, cols,
+ this->scale_, E, A);
}
virtual void query_layout(Rt_layout<2> &rtl_inout)
{
@@ -352,7 +510,7 @@
else
rtl_inout.order = tuple<1, 0, 2>();
rtl_inout.pack = stride_unit_align;
- rtl_inout.complex = cmplx_inter_fmt;
+ // Both split and interleaved supported.
rtl_inout.align = 16;
}
virtual void query_layout(Rt_layout<2> &rtl_in, Rt_layout<2> &rtl_out)
@@ -363,7 +521,8 @@
else
rtl_in.order = rtl_out.order = tuple<1, 0, 2>();
rtl_in.pack = rtl_out.pack = stride_unit_align;
- rtl_in.complex = rtl_out.complex = cmplx_inter_fmt;
+ // Both split and interleaved supported, however in and out must match.
+ rtl_in.complex = rtl_out.complex;
rtl_in.align = rtl_out.align = 16;
}
private:
Index: src/vsip/opt/cbe/ppu/task_manager.hpp
===================================================================
--- src/vsip/opt/cbe/ppu/task_manager.hpp (revision 209510)
+++ src/vsip/opt/cbe/ppu/task_manager.hpp (working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2007 by CodeSourcery. All rights reserved.
+/* Copyright (c) 2007, 2008 by CodeSourcery. All rights reserved.
This file is available for license from CodeSourcery, Inc. under the terms
of a commercial license and under the GPL. It is not part of the VSIPL++
@@ -65,6 +65,7 @@
{
num_spes = atoi(argv[i+1]);
shift_argv(argc, argv, i, 2);
+ i -= 1;
}
}
@@ -105,7 +106,7 @@
static Task_manager *instance_;
- ALF* alf_;
+ ALF* alf_;
Task task_;
length_type num_spes_;
};
@@ -132,10 +133,11 @@
DEFINE_TASK(Mult_tag, std::complex<float>(std::complex<float>, std::complex<float>), vmul_c)
DEFINE_TASK(Mult_tag, split_float_type(split_float_type, split_float_type), vmul_split_c)
DEFINE_TASK(Fft_tag, void(std::complex<float>, std::complex<float>), fft_c)
+DEFINE_TASK(Fft_tag, void(split_float_type, split_float_type), fft_split_c)
DEFINE_TASK(Fastconv_tag, void(std::complex<float>, std::complex<float>), fconv_c)
DEFINE_TASK(Fastconv_tag, void(split_float_type, split_float_type), fconv_split_c)
DEFINE_TASK(Fastconvm_tag, void(std::complex<float>, std::complex<float>), fconvm_c)
DEFINE_TASK(Fastconvm_tag, void(split_float_type, split_float_type), fconvm_split_c)
DEFINE_TASK(Vmmul_tag, std::complex<float>(std::complex<float>, std::complex<float>), vmmul_c)
DEFINE_TASK(Pwarp_tag, void(unsigned char, unsigned char), pwarp_ub)
-#endif
+#endif // VSIP_OPT_CBE_PPU_TASK_MANAGER_HPP
Index: src/vsip/opt/cbe/ppu/fastconv.cpp
===================================================================
--- src/vsip/opt/cbe/ppu/fastconv.cpp (revision 209510)
+++ src/vsip/opt/cbe/ppu/fastconv.cpp (working copy)
@@ -48,12 +48,10 @@
(T* in, T* kernel, T* out, length_type rows, length_type length)
{
Fastconv_params params;
- VSIP_IMPL_STATIC_ASSERT(sizeof(Fastconv_params) % 16 == 0);
params.instance_id = this->instance_id_;
params.elements = length;
params.transform_kernel = this->transform_kernel_;
- params.ea_twiddle_factors = ea_from_ptr(this->twiddle_factors_.get());
params.ea_kernel = ea_from_ptr(kernel);
params.ea_input = ea_from_ptr(in);
params.ea_output = ea_from_ptr(out);
@@ -115,12 +113,10 @@
length_type length)
{
Fastconv_split_params params;
- VSIP_IMPL_STATIC_ASSERT(sizeof(Fastconv_split_params) % 16 == 0);
params.instance_id = this->instance_id_;
params.elements = length;
params.transform_kernel = this->transform_kernel_;
- params.ea_twiddle_factors = ea_from_ptr(this->twiddle_factors_.get());
params.ea_kernel_re = ea_from_ptr(kernel.first);
params.ea_kernel_im = ea_from_ptr(kernel.second);
params.ea_input_re = ea_from_ptr(in.first);
@@ -168,28 +164,7 @@
}
-template <dimension_type D,
- typename T,
- typename ComplexFmt>
-void
-Fastconv_base<D, T, ComplexFmt>::compute_twiddle_factors(
- length_type length)
-{
- typedef typename Scalar_of<T>::type stype;
- unsigned int i = 0;
- unsigned int n = length;
- stype* W = reinterpret_cast<stype*>(this->twiddle_factors_.get());
- W[0] = 1.0f;
- W[1] = 0.0f;
- for (i = 1; i < n / 4; ++i)
- {
- W[2*i] = cos(i * 2*M_PI / n);
- W[2*(n/4 - i)+1] = -W[2*i];
- }
-}
-
-
typedef std::complex<float> ctype;
typedef std::pair<float*,float*> ztype;
@@ -197,17 +172,11 @@
template void \
Fastconv_base<COEFFS_DIM, ctype, Cmplx_inter_fmt>::fconv( \
ctype* in, ctype* kernel, ctype* out, length_type rows, length_type length); \
-template void \
-Fastconv_base<COEFFS_DIM, ctype, Cmplx_inter_fmt>::compute_twiddle_factors( \
- length_type length); \
template<> unsigned int \
Fastconv_base<COEFFS_DIM, ctype, Cmplx_inter_fmt>::instance_id_counter_ = 0; \
template void \
Fastconv_base<COEFFS_DIM, ctype, Cmplx_split_fmt>::fconv( \
ztype in, ztype kernel, ztype out, length_type rows, length_type length); \
-template void \
-Fastconv_base<COEFFS_DIM, ctype, Cmplx_split_fmt>::compute_twiddle_factors( \
- length_type length); \
template<> unsigned int \
Fastconv_base<COEFFS_DIM, ctype, Cmplx_split_fmt>::instance_id_counter_ = 0;
Index: src/vsip/opt/cbe/ppu/alf.hpp
===================================================================
--- src/vsip/opt/cbe/ppu/alf.hpp (revision 209510)
+++ src/vsip/opt/cbe/ppu/alf.hpp (working copy)
@@ -107,7 +107,7 @@
Workblock block;
int status = alf_wb_create(task_, ALF_WB_SINGLE, 1, &block.workblock_);
if (status != 0)
- VSIP_IMPL_THROW(std::runtime_error("unable to create work block!"));
+ VSIP_IMPL_THROW(std::runtime_error("Task::create_workblock(single) -- alf_wb_create failed."));
return block;
}
Workblock create_workblock(int times)
@@ -115,7 +115,7 @@
Workblock block;
int status = alf_wb_create(task_, ALF_WB_MULTI, times, &block.workblock_);
if (status != 0)
- VSIP_IMPL_THROW(std::runtime_error("unable to create work block!"));
+ VSIP_IMPL_THROW(std::runtime_error("Task::create_workblock(multi) -- alf_wb_create failed."));
return block;
}
void sync()
Index: src/vsip/opt/cbe/ppu/fastconv.hpp
===================================================================
--- src/vsip/opt/cbe/ppu/fastconv.hpp (revision 209510)
+++ src/vsip/opt/cbe/ppu/fastconv.hpp (working copy)
@@ -92,12 +92,10 @@
public:
Fastconv_base(length_type const input_size, bool transform_kernel)
: size_ (input_size),
- twiddle_factors_ (input_size / 4),
transform_kernel_(transform_kernel),
instance_id_ (++instance_id_counter_)
{
assert(rt_valid_size(this->size_));
- compute_twiddle_factors(this->size_);
}
static bool rt_valid_size(length_type size)
@@ -157,21 +155,19 @@
private:
typedef typename Scalar_of<T>::type uT;
- void compute_twiddle_factors(length_type length);
void fconv(T* in, T* kernel, T* out, length_type rows, length_type length);
void fconv(std::pair<uT*,uT*> in, std::pair<uT*,uT*> kernel,
std::pair<uT*,uT*> out, length_type rows, length_type length);
// Member data.
length_type size_;
- aligned_array<T> twiddle_factors_;
bool transform_kernel_;
unsigned int instance_id_;
// This counter is used to give each instance of this type
// a unique ID that is passed to the SPEs as part of the
// parameter block. When an SPE sees that the ID has changed,
- // it knows to reload the weights and twiddle factors.
+ // it knows to reload the weights.
static unsigned int instance_id_counter_;
};
Index: src/vsip/opt/cbe/fconv_params.h
===================================================================
--- src/vsip/opt/cbe/fconv_params.h (revision 209510)
+++ src/vsip/opt/cbe/fconv_params.h (working copy)
@@ -36,20 +36,13 @@
#endif
-// Structures used in DMAs should be sized in multiples of 128-bits
-// Definitions for 'command' bitfields:
-#define RELOAD_TWIDDLE_FACTORS (0x01)
-#define RELOAD_WEIGHTS (0x02)
-
typedef struct
{
unsigned int instance_id;
unsigned int elements;
unsigned int transform_kernel;
- unsigned int pad1;
- unsigned long long ea_twiddle_factors;
unsigned long long ea_kernel;
unsigned long long ea_input;
@@ -58,7 +51,6 @@
unsigned int kernel_stride;
unsigned int input_stride;
unsigned int output_stride;
- unsigned int pad2;
} Fastconv_params;
typedef struct
@@ -66,11 +58,7 @@
unsigned int instance_id;
unsigned int elements;
unsigned int transform_kernel;
- unsigned int pad1;
- unsigned long long ea_twiddle_factors;
- unsigned long long pad2;
-
unsigned long long ea_kernel_re;
unsigned long long ea_kernel_im;
@@ -83,7 +71,6 @@
unsigned int kernel_stride;
unsigned int input_stride;
unsigned int output_stride;
- unsigned int pad3;
} Fastconv_split_params;
Index: src/vsip/opt/cbe/spu/alf_fconv_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_fconv_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_fconv_c.c (working copy)
@@ -13,46 +13,28 @@
#include <sys/time.h>
#include <spu_mfcio.h>
#include <alf_accel.h>
-#include <vsip/opt/cbe/fconv_params.h>
-#include "fft_1d_r2.h"
#include <assert.h>
+#include <cml/spu/cml.h>
+#include <cml/spu/cml_core.h>
+#include <vsip/opt/cbe/fconv_params.h>
+
// These are sized for complex values, taking two floats each.
static float coeff[2 * VSIP_IMPL_MAX_FCONV_SIZE]
__attribute__ ((aligned (128)));
-// The twiddle factors occupy only 1/4 the space as the inputs,
-// outputs and convolution kernels.
-static float twiddle_factors[2 * VSIP_IMPL_MAX_FCONV_SIZE / 4]
- __attribute__ ((aligned (128)));
-
static unsigned int instance_id = 0;
-#define VEC_SIZE (4)
-unsigned int log2i(unsigned int size)
-{
- unsigned int log2_size = 0;
- while (!(size & 1))
- {
- size >>= 1;
- log2_size++;
- }
- return log2_size;
-}
-
-void initialize(Fastconv_params* fc, void volatile* p_kernel,
- void volatile* p_twiddles, unsigned int n,
- unsigned int log2n)
+void initialize(
+ Fastconv_params* fc,
+ void* p_kernel,
+ fft1d_f* obj,
+ void* buf)
{
unsigned int size = fc->elements*2*sizeof(float);
- // The number of twiddle factors is 1/4 the input size
- mfc_get(p_twiddles, fc->ea_twiddle_factors, size/4, 31, 0, 0);
- mfc_write_tag_mask(1<<31);
- mfc_read_tag_status_all();
-
// The kernel matches the input and output size
mfc_get(p_kernel, fc->ea_kernel, size, 31, 0, 0);
mfc_write_tag_mask(1<<31);
@@ -62,11 +44,7 @@
{
// Perform the forward FFT on the kernel, in place. This only need
// be done once -- subsequent calls will utilize the same kernel.
- vector float* inout = (vector float *)coeff;
- vector float* W = (vector float*)twiddle_factors;
- vector float re[n / VEC_SIZE], im[n / VEC_SIZE];
- _fft_1d_r2_pre(re, im, inout, W, log2n);
- _fft_1d_r2_fini(inout, re, im, W, log2n);
+ cml_ccfft1d_ip_f(obj, (float*)coeff, CML_FFT_FWD, buf);
}
}
@@ -114,71 +92,57 @@
-int kernel(void* context,
- void* params,
- void* input,
- void* output,
- unsigned int iter,
- unsigned int iter_max)
+int kernel(
+ void* context,
+ void* params,
+ void* input,
+ void* output,
+ void* inout,
+ unsigned int iter,
+ unsigned int iter_max)
{
+ static fft1d_f* obj;
+ static char* buf;
+ static size_t current_size = 0;
+
Fastconv_params* fc = (Fastconv_params *)params;
- unsigned int n = fc->elements;
- unsigned int log2n = log2i(n);
- assert(n <= VSIP_IMPL_MAX_FCONV_SIZE);
+ unsigned int fft_size = fc->elements;
+ assert(fft_size <= VSIP_IMPL_MAX_FCONV_SIZE);
// Initialization establishes the weights (kernel) for the
// convolution step and the twiddle factors for the FFTs.
// These are loaded once per task by checking a unique
// ID passed down from the caller.
- if (instance_id != fc->instance_id)
+ if (iter == 0)
{
- instance_id = fc->instance_id;
- initialize(fc, coeff, twiddle_factors, n, log2n);
+ if (fft_size != current_size)
+ {
+ if (obj)
+ {
+ free(buf);
+ cml_fft1d_destroy_f_alloc(obj);
+ }
+ int rt = cml_fft1d_setup_f_alloc(&obj, CML_FFT_CC, fft_size);
+ assert(rt && obj != NULL);
+ buf = (char*)memalign(16, cml_zzfft1d_buf_size_f(obj));
+ assert(buf != NULL);
+ current_size = fft_size;
+ }
+ if (instance_id != fc->instance_id)
+ {
+ instance_id = fc->instance_id;
+ initialize(fc, coeff, obj, buf);
+ }
}
- vector float* in = (vector float *)input;
- vector float* W = (vector float*)twiddle_factors;
- vector float* out = (vector float*)output;
+ float* in = (float*)input;
+ float* out = (float*)output;
- // Create real & imaginary working arrays
- vector float re[n / VEC_SIZE], im[n / VEC_SIZE];
+ cml_ccfft1d_ip_f(obj, in, CML_FFT_FWD, buf);
+ cml_cvmul1_f(coeff, in, out, fft_size);
+ cml_ccfft1d_ip_f(obj, out, CML_FFT_INV, buf);
+ cml_core_rcsvmul1_f(1.f / fft_size, out, out, fft_size);
-
- // Perform the forward FFT, rolling the convolution into
- // the last stage
- _fft_1d_r2_pre(re, im, in, W, log2n);
- _fft_1d_r2_fini_cvmul(out, re, im, (vector float*)coeff, W, log2n);
-
-
- // Revert back the time domain.
- _fft_1d_r2_pre(re, im, out, W, log2n);
- _fft_1d_r2_fini(out, re, im, W, log2n);
-
-
- // Code for the inverse FFT scaling is taken from the CBE
- // SDK Libraries Overview and Users Guide, sec. 8.1.
- {
- unsigned int i;
- vector float *start, *end, s0, s1, e0, e1;
- vector unsigned int mask = (vector unsigned int){-1, -1, 0, 0};
- vector float vscale = spu_splats(1 / (float)n);
- start = out;
-
- // Scale the output vector and swap the order of the outputs.
- // Note: there are two float values for each of 'n' complex values.
- end = start + 2 * n / VEC_SIZE;
- s0 = e1 = *start;
- for (i = 0; i < n / VEC_SIZE; ++i)
- {
- s1 = *(start + 1);
- e0 = *(--end);
-
- *start++ = spu_mul(spu_sel(e0, e1, mask), vscale);
- *end = spu_mul(spu_sel(s0, s1, mask), vscale);
- s0 = s1;
- e1 = e0;
- }
- }
return 0;
}
Index: src/vsip/opt/cbe/spu/alf_pwarp_ub.cpp
===================================================================
--- src/vsip/opt/cbe/spu/alf_pwarp_ub.cpp (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_pwarp_ub.cpp (working copy)
@@ -19,6 +19,7 @@
#include <vsip/core/acconfig.hpp>
#include <spu_mfcio.h>
#include <vsip/opt/cbe/pwarp_params.h>
+#include <vsip/opt/cbe/spu/alf_decls.hpp>
#include <vsip/opt/simd/simd_spu.hpp>
@@ -26,32 +27,9 @@
static unsigned char src_img[IMG_SIZE] __attribute__ ((aligned (128)));
-extern "C" {
-int input(
- void* context,
- void* params,
- void* entries,
- unsigned int current_count,
- unsigned int total_count);
-int output(
- void* context,
- void* params,
- void* entries,
- unsigned int cur_iter,
- unsigned int tot_iter);
-int kernel(
- void* context,
- void* params,
- void* input,
- void* output,
- unsigned int cur_iter,
- unsigned int tot_iter);
-}
-
-
/***********************************************************************
Definitions
***********************************************************************/
@@ -718,6 +696,7 @@
void* params,
void* input,
void* output,
+ void* inout,
unsigned int cur_iter,
unsigned int tot_iter)
{
Index: src/vsip/opt/cbe/spu/alf_fft_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_fft_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_fft_c.c (working copy)
@@ -12,45 +12,13 @@
#include <alf_accel.h>
#include <assert.h>
+#include <spu_intrinsics.h>
+#include <cml/spu/cml.h>
+#include <cml/spu/cml_core.h>
+
#include <vsip/core/acconfig.hpp>
-#if VSIP_CBE_SDK_VERSION == 210
-# include <libfft.h>
-#else
-# include <libfft_example.h>
-#endif
-#include <spu_mfcio.h>
#include <vsip/opt/cbe/fft_params.h>
-// These are sized for complex values, taking two floats each. The twiddle
-// factors occupy only 1/4 the space as the inputs, outputs and convolution
-// kernels.
-static volatile float twiddle_factors[MAX_FFT_1D_SIZE*2/4] __attribute__ ((aligned (128)));
-static unsigned int initialized = 0;
-
-#define VEC_SIZE (4)
-
-unsigned int log2i(unsigned int size)
-{
- unsigned int log2_size = 0;
- while (!(size & 1))
- {
- size >>= 1;
- log2_size++;
- }
- return log2_size;
-}
-
-void initialize(Fft_params* fft, void volatile* p_twiddles,
- unsigned int n, unsigned int log2n)
-{
- unsigned int size = fft->elements*2*sizeof(float);
-
- // The number of twiddle factors is 1/4 the input size
- mfc_get(p_twiddles, fft->ea_twiddle_factors, size/4, 31, 0, 0);
- mfc_write_tag_mask(1<<31);
- mfc_read_tag_status_all();
-}
-
#define _ALF_MAX_SINGLE_DT_SIZE 16*1024
int input(void* context,
@@ -66,7 +34,7 @@
// Transfer input.
ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_IN, 0);
- unsigned long length = fft->elements;
+ unsigned long length = fft->fft_size;
unsigned long max_length = _ALF_MAX_SINGLE_DT_SIZE / FP / sizeof(float);
ea = fft->ea_input_buffer + current_count * FP * fft->in_blk_stride * sizeof(float);
while (length > 0)
@@ -99,7 +67,7 @@
// Transfer output.
ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_OUT, 0);
- unsigned long length = fft->elements;
+ unsigned long length = fft->fft_size;
unsigned long max_length = _ALF_MAX_SINGLE_DT_SIZE / FP / sizeof(float);
ea = fft->ea_output_buffer + current_count * FP * fft->out_blk_stride * sizeof(float);
while (length > 0)
@@ -118,80 +86,69 @@
}
-int kernel(void* context,
- void* params,
- void* input,
- void* output,
- unsigned int iter,
- unsigned int iter_max)
+int kernel(
+ void* context,
+ void* params,
+ void* input,
+ void* output,
+ void* inout,
+ unsigned int iter,
+ unsigned int iter_max)
{
- Fft_params* fftp = (Fft_params *)params;
- unsigned int n = fftp->elements;
- unsigned int log2n = log2i(n);
- assert(n <= MAX_FFT_1D_SIZE);
+ static fft1d_f* obj;
+ static char* buf;
+ static size_t current_size = 0;
+ Fft_params* fp = (Fft_params *)params;
+ size_t fft_size = fp->fft_size;
- // Initialization establishes the twiddle factors for the FFTs.
- // These are reloaded any time the FFT size changes.
- if (initialized != n)
+ assert(fft_size <= MAX_FFT_1D_SIZE);
+
+ if (iter == 0 && fft_size != current_size)
{
- initialized = n;
- initialize(fftp, twiddle_factors, n, log2n);
+ if (obj)
+ {
+ free(buf);
+ cml_fft1d_destroy_f_alloc(obj);
+ }
+ int rt = cml_fft1d_setup_f_alloc(&obj, CML_FFT_CC, fft_size);
+ assert(rt && obj != NULL);
+ buf = (char*)memalign(16, cml_zzfft1d_buf_size_f(obj));
+ assert(buf != NULL);
+ current_size = fft_size;
}
- vector float* in = (vector float *)input;
- vector float* W = (vector float *)twiddle_factors;
- vector float* out = (vector float*)output;
+ // Handle inverse FFT explicitly so that shuffle and scale can happen
+ // in single step.
+ cml_ccfft1d_op_f(obj, (float*)input, (float*)output, CML_FFT_FWD, buf);
- unsigned int const vec_size = 4;
-
- // Perform the FFT,
- // -- 'in' may be the same as 'out'
- fft_1d_r2(out, in, W, log2n);
-
-
- if (fftp->direction == fwd_fft)
+ if (fp->direction == fwd_fft)
{
- // Forward FFT support scaling, but for efficiency, this step
- // can be skipped for a scale factor of one.
- if (fftp->scale != (double)1.f)
- {
- unsigned int i;
- vector float *start;
- vector float vscale = spu_splats((float)fftp->scale);
- vector float s;
- start = out;
-
- // Scale the output vector.
- // Note: there are two float values for each of 'n' complex values.
- for (i = 0; i < 2 * n / vec_size; ++i)
- {
- s = *start;
- *start++ = spu_mul(s, vscale);
- }
- }
+ if (fp->scale != (double)1.f)
+ cml_core_rcsvmul1_f(fp->scale, output, output, fft_size);
}
else
{
// Code for the inverse FFT taken from the CBE SDK Libraries
// Overview and Users Guide, sec. 8.1.
- unsigned int i;
- vector float *start, *end, s0, s1, e0, e1;
+ int const vec_size = 4;
+ vector float* start = (vector float*)output;
+ vector float* end = start + 2 * fft_size / vec_size;
+ vector float s0, s1, e0, e1;
vector unsigned int mask = (vector unsigned int){-1, -1, 0, 0};
- vector float vscale = spu_splats((float)fftp->scale);
- start = out;
+ vector float vscale = spu_splats((float)fp->scale);
+ unsigned int i;
// Scale the output vector and swap the order of the outputs.
// Note: there are two float values for each of 'n' complex values.
- end = start + 2 * n / vec_size;
s0 = e1 = *start;
- for (i = 0; i < n / vec_size; ++i)
+ for (i = 0; i < fft_size / vec_size; ++i)
{
s1 = *(start + 1);
e0 = *(--end);
*start++ = spu_mul(spu_sel(e0, e1, mask), vscale);
- *end = spu_mul(spu_sel(s0, s1, mask), vscale);
+ *end = spu_mul(spu_sel(s0, s1, mask), vscale);
s0 = s1;
e1 = e0;
}
Index: src/vsip/opt/cbe/spu/vmul_split.h
===================================================================
--- src/vsip/opt/cbe/spu/vmul_split.h (revision 209510)
+++ src/vsip/opt/cbe/spu/vmul_split.h (working copy)
@@ -1,242 +0,0 @@
-/* Copyright (c) 2007 by CodeSourcery. All rights reserved.
-
- This file is available for license from CodeSourcery, Inc. under the terms
- of a commercial license and under the GPL. It is not part of the VSIPL++
- reference implementation and is not available under the BSD license.
-*/
-/** @file vsip/opt/cbe/spu/vmul_split.c
- @author Jules Bergmann
- @date 2007-02-28
- @brief VSIPL++ Library: Split complex vector-multiply routines.
-*/
-
-#ifndef VSIP_OPT_CBE_SPU_VMUL_SPLIT_H
-#define VSIP_OPT_CBE_SPU_VMUL_SPLIT_H
-
-/***********************************************************************
- Definitions
-***********************************************************************/
-
-// Complex vector-multiply, loop unrolled by 8 version.
-
-inline void
-cvmul(
- float* r_re,
- float* r_im,
- float* a_re,
- float* a_im,
- float* b_re,
- float* b_im,
- int length)
-{
- unsigned const vec_len = 4;
- vector float const v_zero = {0, 0, 0, 0};
-
- vector float* v_r_re = (vector float*)r_re;
- vector float* v_r_im = (vector float*)r_im;
- vector float* v_a_re = (vector float*)a_re;
- vector float* v_a_im = (vector float*)a_im;
- vector float* v_b_re = (vector float*)b_re;
- vector float* v_b_im = (vector float*)b_im;
-
- while (length >= (int)(8*vec_len))
- {
- vector float ar0 = *v_a_re++;
- vector float ai0 = *v_a_im++;
- vector float br0 = *v_b_re++;
- vector float bi0 = *v_b_im++;
- vector float ar1 = *v_a_re++;
- vector float ai1 = *v_a_im++;
- vector float br1 = *v_b_re++;
- vector float bi1 = *v_b_im++;
- vector float ar2 = *v_a_re++;
- vector float ai2 = *v_a_im++;
- vector float br2 = *v_b_re++;
- vector float bi2 = *v_b_im++;
- vector float ar3 = *v_a_re++;
- vector float ai3 = *v_a_im++;
- vector float br3 = *v_b_re++;
- vector float bi3 = *v_b_im++;
- vector float ar4 = *v_a_re++;
- vector float ai4 = *v_a_im++;
- vector float br4 = *v_b_re++;
- vector float bi4 = *v_b_im++;
- vector float ar5 = *v_a_re++;
- vector float ai5 = *v_a_im++;
- vector float br5 = *v_b_re++;
- vector float bi5 = *v_b_im++;
- vector float ar6 = *v_a_re++;
- vector float ai6 = *v_a_im++;
- vector float br6 = *v_b_re++;
- vector float bi6 = *v_b_im++;
- vector float ar7 = *v_a_re++;
- vector float ai7 = *v_a_im++;
- vector float br7 = *v_b_re++;
- vector float bi7 = *v_b_im++;
-
- vector float p10 = spu_nmsub(ai0, bi0, v_zero); // -(ai*bi - 0)
- vector float p20 = spu_mul(ai0, br0); // (ai*br)
- vector float ri0 = spu_madd(ar0, bi0, p20); // ar*bi + ai*br
- vector float rr0 = spu_madd(ar0, br0, p10); // ar*br - ai*bi
-
- vector float p11 = spu_nmsub(ai1, bi1, v_zero); // -(ai*bi - 0)
- vector float p21 = spu_mul(ai1, br1); // (ai*br)
- vector float ri1 = spu_madd(ar1, bi1, p21); // ar*bi + ai*br
- vector float rr1 = spu_madd(ar1, br1, p11); // ar*br - ai*bi
-
- vector float p12 = spu_nmsub(ai2, bi2, v_zero); // -(ai*bi - 0)
- vector float p22 = spu_mul(ai2, br2); // (ai*br)
- vector float ri2 = spu_madd(ar2, bi2, p22); // ar*bi + ai*br
- vector float rr2 = spu_madd(ar2, br2, p12); // ar*br - ai*bi
-
- vector float p13 = spu_nmsub(ai3, bi3, v_zero); // -(ai*bi - 0)
- vector float p23 = spu_mul(ai3, br3); // (ai*br)
- vector float ri3 = spu_madd(ar3, bi3, p23); // ar*bi + ai*br
- vector float rr3 = spu_madd(ar3, br3, p13); // ar*br - ai*bi
-
- vector float p14 = spu_nmsub(ai4, bi4, v_zero); // -(ai*bi - 0)
- vector float p24 = spu_mul(ai4, br4); // (ai*br)
- vector float ri4 = spu_madd(ar4, bi4, p24); // ar*bi + ai*br
- vector float rr4 = spu_madd(ar4, br4, p14); // ar*br - ai*bi
-
- vector float p15 = spu_nmsub(ai5, bi5, v_zero); // -(ai*bi - 0)
- vector float p25 = spu_mul(ai5, br5); // (ai*br)
- vector float ri5 = spu_madd(ar5, bi5, p25); // ar*bi + ai*br
- vector float rr5 = spu_madd(ar5, br5, p15); // ar*br - ai*bi
-
- vector float p16 = spu_nmsub(ai6, bi6, v_zero); // -(ai*bi - 0)
- vector float p26 = spu_mul(ai6, br6); // (ai*br)
- vector float ri6 = spu_madd(ar6, bi6, p26); // ar*bi + ai*br
- vector float rr6 = spu_madd(ar6, br6, p16); // ar*br - ai*bi
-
- vector float p17 = spu_nmsub(ai7, bi7, v_zero); // -(ai*bi - 0)
- vector float p27 = spu_mul(ai7, br7); // (ai*br)
- vector float ri7 = spu_madd(ar7, bi7, p27); // ar*bi + ai*br
- vector float rr7 = spu_madd(ar7, br7, p17); // ar*br - ai*bi
-
- *v_r_re++ = rr0;
- *v_r_im++ = ri0;
- *v_r_re++ = rr1;
- *v_r_im++ = ri1;
- *v_r_re++ = rr2;
- *v_r_im++ = ri2;
- *v_r_re++ = rr3;
- *v_r_im++ = ri3;
- *v_r_re++ = rr4;
- *v_r_im++ = ri4;
- *v_r_re++ = rr5;
- *v_r_im++ = ri5;
- *v_r_re++ = rr6;
- *v_r_im++ = ri6;
- *v_r_re++ = rr7;
- *v_r_im++ = ri7;
-
- length -= 8*vec_len;
- }
-
- if (length)
- {
- a_re = (float*)v_a_re; a_im = (float*)v_a_im;
- b_re = (float*)v_b_re; b_im = (float*)v_b_im;
- r_re = (float*)v_r_re; r_im = (float*)v_r_im;
- while (length--)
- {
- float ar = *a_re++; float ai = *a_im++;
- float br = *b_re++; float bi = *b_im++;
- *r_re++ = ar*br - ai*bi;
- *r_im++ = ar*bi + ai*br;
- }
- }
-}
-
-
-
-// Complex vector-multiply, loop unrolled by 4 version.
-
-inline void
-cvmul_4(
- float* r_re,
- float* r_im,
- float* a_re,
- float* a_im,
- float* b_re,
- float* b_im,
- int length)
-{
- unsigned const vec_len = 4;
- vector float const v_zero = {0, 0, 0, 0};
-
- vector float* v_r_re = (vector float*)r_re;
- vector float* v_r_im = (vector float*)r_im;
- vector float* v_a_re = (vector float*)a_re;
- vector float* v_a_im = (vector float*)a_im;
- vector float* v_b_re = (vector float*)b_re;
- vector float* v_b_im = (vector float*)b_im;
-
- while (length >= (int)(4*vec_len))
- {
- vector float ar0 = *v_a_re++;
- vector float ai0 = *v_a_im++;
- vector float br0 = *v_b_re++;
- vector float bi0 = *v_b_im++;
- vector float ar1 = *v_a_re++;
- vector float ai1 = *v_a_im++;
- vector float br1 = *v_b_re++;
- vector float bi1 = *v_b_im++;
- vector float ar2 = *v_a_re++;
- vector float ai2 = *v_a_im++;
- vector float br2 = *v_b_re++;
- vector float bi2 = *v_b_im++;
- vector float ar3 = *v_a_re++;
- vector float ai3 = *v_a_im++;
- vector float br3 = *v_b_re++;
- vector float bi3 = *v_b_im++;
-
- vector float p10 = spu_nmsub(ai0, bi0, v_zero); // -(ai*bi - 0)
- vector float p20 = spu_madd(ai0, br0, v_zero); // (ai*br + 0)
- vector float ri0 = spu_madd(ar0, bi0, p20); // ar*bi + ai*br
- vector float rr0 = spu_madd(ar0, br0, p10); // ar*br - ai*bi
-
- vector float p11 = spu_nmsub(ai1, bi1, v_zero); // -(ai*bi - 0)
- vector float p21 = spu_madd(ai1, br1, v_zero); // (ai*br + 0)
- vector float ri1 = spu_madd(ar1, bi1, p21); // ar*bi + ai*br
- vector float rr1 = spu_madd(ar1, br1, p11); // ar*br - ai*bi
-
- vector float p12 = spu_nmsub(ai2, bi2, v_zero); // -(ai*bi - 0)
- vector float p22 = spu_madd(ai2, br2, v_zero); // (ai*br + 0)
- vector float ri2 = spu_madd(ar2, bi2, p22); // ar*bi + ai*br
- vector float rr2 = spu_madd(ar2, br2, p12); // ar*br - ai*bi
-
- vector float p13 = spu_nmsub(ai3, bi3, v_zero); // -(ai*bi - 0)
- vector float p23 = spu_madd(ai3, br3, v_zero); // (ai*br + 0)
- vector float ri3 = spu_madd(ar3, bi3, p23); // ar*bi + ai*br
- vector float rr3 = spu_madd(ar3, br3, p13); // ar*br - ai*bi
-
- *v_r_re++ = rr0;
- *v_r_im++ = ri0;
- *v_r_re++ = rr1;
- *v_r_im++ = ri1;
- *v_r_re++ = rr2;
- *v_r_im++ = ri2;
- *v_r_re++ = rr3;
- *v_r_im++ = ri3;
-
- length -= 4*vec_len;
- }
-
- if (length)
- {
- a_re = (float*)v_a_re; a_im = (float*)v_a_im;
- b_re = (float*)v_b_re; b_im = (float*)v_b_im;
- r_re = (float*)v_r_re; r_im = (float*)v_r_im;
- while (length--)
- {
- float ar = *a_re++; float ai = *a_im++;
- float br = *b_re++; float bi = *b_im++;
- *r_re++ = ar*br - ai*bi;
- *r_im++ = ar*bi + ai*br;
- }
- }
-}
-
-#endif // VSIP_OPT_CBE_SPU_VMUL_SPLIT_H
Index: src/vsip/opt/cbe/spu/fft_1d.h
===================================================================
--- src/vsip/opt/cbe/spu/fft_1d.h (revision 209510)
+++ src/vsip/opt/cbe/spu/fft_1d.h (working copy)
@@ -1,93 +0,0 @@
-/* -------------------------------------------------------------- */
-/* (C)Copyright 2001,2006, */
-/* International Business Machines Corporation, */
-/* Sony Computer Entertainment, Incorporated, */
-/* Toshiba Corporation, */
-/* */
-/* All Rights Reserved. */
-/* -------------------------------------------------------------- */
-/* PROLOG END TAG zYx */
-#ifndef _FFT_1D_H_
-#define _FFT_1D_H_ 1
-
-#include <spu_intrinsics.h>
-#include <vec_literal.h>
-
-/* BIT_SWAP - swaps up to 16 bits of the integer _i according to the
- * pattern specified by _pat.
- */
-#define BIT_SWAP(_i, _pat) spu_extract(spu_gather(spu_shuffle(spu_maskb(_i), _pat, _pat)), 0)
-
-static vector unsigned char byte_reverse[] = {
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), /* 0 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), /* 1 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,14), /* 2 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,11,12,15,14,13), /* 3 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,11,15,14,13,12), /* 4 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,10,15,14,13,12,11), /* 5 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,9,15,14,13,12,11,10), /* 6 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,8,15,14,13,12,11,10,9), /* 7 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,7,15,14,13,12,11,10,9,8), /* 8 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,6,15,14,13,12,11,10,9,8,7), /* 9 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,5,15,14,13,12,11,10,9,8,7,6), /* 10 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,4,15,14,13,12,11,10,9,8,7,6,5), /* 11 */
- VEC_LITERAL(vector unsigned char, 0,1,2,3,15,14,13,12,11,10,9,8,7,6,5,4), /* 12 */
- VEC_LITERAL(vector unsigned char, 0,1,2,15,14,13,12,11,10,9,8,7,6,5,4,3), /* 13 */
- VEC_LITERAL(vector unsigned char, 0,1,15,14,13,12,11,10,9,8,7,6,5,4,3,2), /* 14 */
- VEC_LITERAL(vector unsigned char, 0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1), /* 15 */
- VEC_LITERAL(vector unsigned char, 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0), /* 16 */
-};
-
-
-#ifndef MAX_FFT_1D_SIZE
-#define MAX_FFT_1D_SIZE 8192
-#endif
-
-#ifndef INV_SQRT_2
-#define INV_SQRT_2 0.7071067811865
-#endif
-
-
-/* The following macro, FFT_1D_BUTTERFLY, performs a 4 way SIMD basic butterfly
- * operation. The inputs are in parallel arrays (seperate real and imaginary
- * vectors).
- *
- * p --------------------------> P = p + q*Wi
- * \ /
- * \ /
- * \ /
- * \/
- * /\
- * / \
- * / \
- * ____ / \
- * q --| Wi |-----------------> Q = p - q*Wi
- * ----
- */
-
-#define FFT_1D_BUTTERFLY(_P_re, _P_im, _Q_re, _Q_im, _p_re, _p_im, _q_re, _q_im, _W_re, _W_im) { \
- vector float _qw_re, _qw_im; \
- \
- _qw_re = spu_msub(_q_re, _W_re, spu_mul(_q_im, _W_im)); \
- _qw_im = spu_madd(_q_re, _W_im, spu_mul(_q_im, _W_re)); \
- _P_re = spu_add(_p_re, _qw_re); \
- _P_im = spu_add(_p_im, _qw_im); \
- _Q_re = spu_sub(_p_re, _qw_re); \
- _Q_im = spu_sub(_p_im, _qw_im); \
-}
-
-
-/* FFT_1D_BUTTERFLY_HI is equivalent to FFT_1D_BUTTERFLY with twiddle factors (W_im, -W_re)
- */
-#define FFT_1D_BUTTERFLY_HI(_P_re, _P_im, _Q_re, _Q_im, _p_re, _p_im, _q_re, _q_im, _W_re, _W_im) { \
- vector float _qw_re, _qw_im; \
- \
- _qw_re = spu_madd(_q_re, _W_im, spu_mul(_q_im, _W_re)); \
- _qw_im = spu_msub(_q_im, _W_im, spu_mul(_q_re, _W_re)); \
- _P_re = spu_add(_p_re, _qw_re); \
- _P_im = spu_add(_p_im, _qw_im); \
- _Q_re = spu_sub(_p_re, _qw_re); \
- _Q_im = spu_sub(_p_im, _qw_im); \
-}
-
-#endif /* _FFT_1D_H_ */
Index: src/vsip/opt/cbe/spu/alf_fconv_split_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_fconv_split_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_fconv_split_c.c (working copy)
@@ -20,12 +20,12 @@
#include <sys/time.h>
#include <spu_mfcio.h>
#include <alf_accel.h>
+#include <cml/spu/cml.h>
+#include <cml/spu/cml_core.h>
#include <vsip/opt/cbe/fconv_params.h>
-#include "fft_1d_r2_split.h"
-#include "vmul_split.h"
-#include "spe_assert.h"
+#define _ALF_MAX_SINGLE_DT_SIZE 16*1024
#if PERFMON
# include "timer.h"
@@ -48,43 +48,20 @@
static float kernel_im[VSIP_IMPL_MAX_FCONV_SPLIT_SIZE]
__attribute__ ((aligned (128)));
-// Twiddle factors. For N-point convolution, N/4 twiddle factors
-// are required.
-static volatile float twiddle_factors[VSIP_IMPL_MAX_FCONV_SPLIT_SIZE*2/4]
- __attribute__ ((aligned (128)));
-
// Instance-id. Used to determine when new coefficients must be loaded.
static unsigned int instance_id = 0;
-unsigned int log2i(unsigned int size)
-{
- unsigned int log2_size = 0;
- while (!(size & 1))
- {
- size >>= 1;
- log2_size++;
- }
- return log2_size;
-}
-
-
-
void initialize(
Fastconv_split_params* fc,
- float volatile* p_kernel_re,
- float volatile* p_kernel_im,
- void volatile* p_twiddles,
- unsigned int log2n)
+ float* p_kernel_re,
+ float* p_kernel_im,
+ fft1d_f* obj,
+ void* buf)
{
unsigned int n = fc->elements;
- // The number of twiddle factors is 1/4 the input size
- mfc_get(p_twiddles, fc->ea_twiddle_factors, (n/4)*2*sizeof(float), 31, 0, 0);
- mfc_write_tag_mask(1<<31);
- mfc_read_tag_status_all();
-
// The kernel matches the input and output size
mfc_get(p_kernel_re, fc->ea_kernel_re, n*sizeof(float), 31, 0, 0);
mfc_write_tag_mask(1<<31);
@@ -97,19 +74,18 @@
{
// Perform the forward FFT on the kernel, in place. This only need
// be done once -- subsequent calls will utilize the same kernel.
- vector float* W = (vector float*)twiddle_factors;
- _fft_1d_r2_split((vector float*)kernel_re, (vector float*)kernel_im,
- (vector float*)kernel_re, (vector float*)kernel_im,
- W, log2n);
+ cml_zzfft1d_ip_f(obj,
+ (float*)kernel_re, (float*)kernel_im,
+ CML_FFT_FWD, buf);
}
}
-int alf_prepare_input_list(
+int input(
void* context,
void* params,
- void* list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
@@ -117,26 +93,28 @@
(void)total_count;
Fastconv_split_params* fc = (Fastconv_split_params *)params;
- spe_assert(fc->elements * sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
- addr64 ea;
+ assert(fc->elements * sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
+ alf_data_addr64_t ea;
// Transfer input.
- ALF_DT_LIST_CREATE(list_entries, 0);
- ea.ull = fc->ea_input_re + current_count * fc->input_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_IN, 0);
+ ea = fc->ea_input_re + current_count * fc->input_stride * sizeof(float);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, fc->elements, ALF_DATA_FLOAT, ea);
- ea.ull = fc->ea_input_im + current_count * fc->input_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ea = fc->ea_input_im + current_count * fc->input_stride * sizeof(float);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_prepare_output_list(
+int output(
void* context,
void* params,
- void* list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
@@ -144,33 +122,40 @@
(void)total_count;
Fastconv_split_params* fc = (Fastconv_split_params *)params;
- spe_assert(fc->elements * sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
- addr64 ea;
+ assert(fc->elements * sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
+ alf_data_addr64_t ea;
// Transfer output.
- ALF_DT_LIST_CREATE(list_entries, 0);
- ea.ull = fc->ea_output_re + current_count * fc->output_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_OUT, 0);
+ ea = fc->ea_output_re + current_count * fc->output_stride * sizeof(float);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, fc->elements, ALF_DATA_FLOAT, ea);
- ea.ull = fc->ea_output_im + current_count * fc->output_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ea = fc->ea_output_im + current_count * fc->output_stride * sizeof(float);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_comp_kernel(void* context,
- void* params,
- void* input,
- void* output,
- unsigned int iter,
- unsigned int iter_max)
+int kernel(
+ void* context,
+ void* params,
+ void* input,
+ void* output,
+ void* inout,
+ unsigned int iter,
+ unsigned int iter_max)
{
+ static fft1d_f* obj;
+ static char* buf;
+ static size_t current_size = 0;
+
Fastconv_split_params* fc = (Fastconv_split_params *)params;
- unsigned int n = fc->elements;
- unsigned int log2n = log2i(n);
- spe_assert(n <= VSIP_IMPL_MAX_FCONV_SPLIT_SIZE);
+ unsigned int fft_size = fc->elements;
+ assert(fft_size <= VSIP_IMPL_MAX_FCONV_SPLIT_SIZE);
(void)context;
(void)iter;
@@ -187,89 +172,62 @@
// convolution step and the twiddle factors for the FFTs.
// These are loaded once per task by checking a unique
// ID passed down from the caller.
- if (instance_id != fc->instance_id)
+ if (iter == 0)
{
- instance_id = fc->instance_id;
- initialize(fc, kernel_re, kernel_im, twiddle_factors, log2n);
+ if (fft_size != current_size)
+ {
+ if (obj)
+ {
+ free(buf);
+ cml_fft1d_destroy_f_alloc(obj);
+ }
+ int rt = cml_fft1d_setup_f_alloc(&obj, CML_FFT_CC, fft_size);
+ assert(rt && obj != NULL);
+ buf = (char*)memalign(16, cml_zzfft1d_buf_size_f(obj));
+ assert(buf != NULL);
+ current_size = fft_size;
+ }
+ if (instance_id != fc->instance_id)
+ {
+ instance_id = fc->instance_id;
+ initialize(fc, kernel_re, kernel_im, obj, buf);
#if PERFMON
- t1 = init_timer();
- t2 = init_timer();
- t3 = init_timer();
- t4 = init_timer();
+ t1 = init_timer();
+ t2 = init_timer();
+ t3 = init_timer();
+ t4 = init_timer();
#endif
+ }
}
- float* in_re = (float *)input + 0 * n;
- float* in_im = (float *)input + 1 * n;
- vector float* W = (vector float*)twiddle_factors;
- float * out_re = (float*)output + 0 * n;
- float * out_im = (float*)output + 1 * n;
+ float* in_re = (float*)input + 0 * fft_size;
+ float* in_im = (float*)input + 1 * fft_size;
+ float * out_re = (float*)output + 0 * fft_size;
+ float * out_im = (float*)output + 1 * fft_size;
// Switch to frequency space
START_TIMER(&t1);
- _fft_1d_r2_split((vector float*)in_re, (vector float*)in_im,
- (vector float*)in_re, (vector float*)in_im, W, log2n);
+ cml_zzfft1d_ip_f(obj,
+ (float*)in_re, (float*)in_im,
+ CML_FFT_FWD, buf);
STOP_TIMER(&t1);
// Perform convolution -- now a straight multiplication
START_TIMER(&t2);
- cvmul(out_re, out_im, kernel_re, kernel_im, in_re, in_im, n);
+ cml_zvmul1_f(kernel_re, kernel_im, in_re, in_im, out_re, out_im, fft_size);
STOP_TIMER(&t2);
// Revert back the time domain
START_TIMER(&t3);
- _fft_1d_r2_split((vector float*)out_im, (vector float*)out_re,
- (vector float*)out_im, (vector float*)out_re, W, log2n);
+ cml_zzfft1d_ip_f(obj,
+ (float*)out_re, (float*)out_im,
+ CML_FFT_INV, buf);
STOP_TIMER(&t3);
// Scale by 1/n.
START_TIMER(&t4);
- {
- vector float vscale = spu_splats(1 / (float)n);
- vector float* v_out_re = (vector float*)out_re;
- vector float* v_out_im = (vector float*)out_im;
-
- unsigned int i;
- for (i=0; i<n; i+=16*4)
- {
- v_out_re[0] = spu_mul(v_out_re[0], vscale);
- v_out_re[1] = spu_mul(v_out_re[1], vscale);
- v_out_re[2] = spu_mul(v_out_re[2], vscale);
- v_out_re[3] = spu_mul(v_out_re[3], vscale);
- v_out_re[4] = spu_mul(v_out_re[4], vscale);
- v_out_re[5] = spu_mul(v_out_re[5], vscale);
- v_out_re[6] = spu_mul(v_out_re[6], vscale);
- v_out_re[7] = spu_mul(v_out_re[7], vscale);
- v_out_re[8] = spu_mul(v_out_re[8], vscale);
- v_out_re[9] = spu_mul(v_out_re[9], vscale);
- v_out_re[10] = spu_mul(v_out_re[10], vscale);
- v_out_re[11] = spu_mul(v_out_re[11], vscale);
- v_out_re[12] = spu_mul(v_out_re[12], vscale);
- v_out_re[13] = spu_mul(v_out_re[13], vscale);
- v_out_re[14] = spu_mul(v_out_re[14], vscale);
- v_out_re[15] = spu_mul(v_out_re[15], vscale);
-
- v_out_im[0] = spu_mul(v_out_im[0], vscale);
- v_out_im[1] = spu_mul(v_out_im[1], vscale);
- v_out_im[2] = spu_mul(v_out_im[2], vscale);
- v_out_im[3] = spu_mul(v_out_im[3], vscale);
- v_out_im[4] = spu_mul(v_out_im[4], vscale);
- v_out_im[5] = spu_mul(v_out_im[5], vscale);
- v_out_im[6] = spu_mul(v_out_im[6], vscale);
- v_out_im[7] = spu_mul(v_out_im[7], vscale);
- v_out_im[8] = spu_mul(v_out_im[8], vscale);
- v_out_im[9] = spu_mul(v_out_im[9], vscale);
- v_out_im[10] = spu_mul(v_out_im[10], vscale);
- v_out_im[11] = spu_mul(v_out_im[11], vscale);
- v_out_im[12] = spu_mul(v_out_im[12], vscale);
- v_out_im[13] = spu_mul(v_out_im[13], vscale);
- v_out_im[14] = spu_mul(v_out_im[14], vscale);
- v_out_im[15] = spu_mul(v_out_im[15], vscale);
-
- v_out_re += 16;
- v_out_im += 16;
- }
- }
+ cml_core_rzsvmul1_f(1.f / fft_size, out_re, out_im, out_re, out_im,
+ fft_size);
STOP_TIMER(&t4);
#if PERFMON
@@ -295,3 +253,9 @@
return 0;
}
+
+ALF_ACCEL_EXPORT_API_LIST_BEGIN
+ ALF_ACCEL_EXPORT_API ("input", input);
+ ALF_ACCEL_EXPORT_API ("output", output);
+ ALF_ACCEL_EXPORT_API ("kernel", kernel);
+ALF_ACCEL_EXPORT_API_LIST_END
Index: src/vsip/opt/cbe/spu/fft_1d_r2.h
===================================================================
--- src/vsip/opt/cbe/spu/fft_1d_r2.h (revision 209510)
+++ src/vsip/opt/cbe/spu/fft_1d_r2.h (working copy)
@@ -1,751 +0,0 @@
-/* -------------------------------------------------------------- */
-/* (C)Copyright 2001,2006, */
-/* International Business Machines Corporation, */
-/* Sony Computer Entertainment, Incorporated, */
-/* Toshiba Corporation, */
-/* */
-/* All Rights Reserved. */
-/* -------------------------------------------------------------- */
-/* PROLOG END TAG zYx */
-#ifndef _FFT_1D_R2_H_
-#define _FFT_1D_R2_H_ 1
-
-#include <vec_literal.h>
-#include "fft_1d.h"
-
-/* CSL: Based on the original Cell SDK FFT library, this file has
- been modified as follows:
-
- - The function fft_1d_r2 is now split into two separate
- routines:
-
- fft_1d_r2_pre() Performs all but the last (log2(N)'th)
- stage, leaving results split into
- separate real and imaginary arrays.
-
- fft_1d_r2_fini() Performs the last stage, reordering the
- output into an interleaved arra as part
- of the final step.
-
- - A new routine has been added to fuse two of the three
- operations needed for fast convolution:
-
- fft_1d_r2_fini_cvmul() Combines the complex vector-
- matrix multiply with the last
- stage.
-
- 2007-02-25 DM
-*/
-
-
-/* fft_1d_r2
- * ---------
- * Performs a single precision, complex Fast Fourier Transform using
- * the DFT (Discrete Fourier Transform) with radix-2 decimation in time.
- * The input <in> is an array of complex numbers of length (1<<log2_size)
- * entries. The result is returned in the array of complex numbers specified
- * by <out>. Note: This routine can support an in-place transformation
- * by specifying <in> and <out> to be the same array.
- *
- * This implementation utilizes the Cooley-Tukey algorithm consisting
- * of <log2_size> stages. The basic operation is the butterfly.
- *
- * p --------------------------> P = p + q*Wi
- * \ /
- * \ /
- * \ /
- * \/
- * /\
- * / \
- * / \
- * ____ / \
- * q --| Wi |-----------------> Q = p - q*Wi
- * ----
- *
- * This routine also requires pre-computed twiddle values, W. W is an
- * array of single precision complex numbers of length 1<<(log2_size-2)
- * and is computed as follows:
- *
- * for (i=0; i<n/4; i++)
- * W[i].real = cos(i * 2*PI/n);
- * W[i].imag = -sin(i * 2*PI/n);
- * }
- *
- * This array actually only contains the first half of the twiddle
- * factors. Due for symmetry, the second half of the twiddle factors
- * are implied and equal:
- *
- * for (i=0; i<n/4; i++)
- * W[i+n/4].real = W[i].imag = sin(i * 2*PI/n);
- * W[i+n/4].imag = -W[i].real = -cos(i * 2*PI/n);
- * }
- *
- * Further symmetry allows one to generate the twiddle factor table
- * using half the number of trig computations as follows:
- *
- * W[0].real = 1.0;
- * W[0].imag = 0.0;
- * for (i=1; i<n/4; i++)
- * W[i].real = cos(i * 2*PI/n);
- * W[n/4 - i].imag = -W[i].real;
- * }
- *
- * The complex numbers are packed into quadwords as follows:
- *
- * quadword complex
- * array element array elements
- * -----------------------------------------------------
- * i | real 2*i | imag 2*i | real 2*i+1 | imag 2*i+1 |
- * -----------------------------------------------------
- *
- */
-
-
-void
-_fft_1d_r2_pre(vector float *out_re, vector float *out_im, vector float *in, vector float *W, int log2_size)
-{
- int i, j, k;
- int stage, offset;
- int i_rev;
- int n, n_2, n_4, n_8, n_16, n_3_16;
- int w_stride, w_2stride, w_3stride, w_4stride;
- int stride, stride_2, stride_4, stride_3_4;
- vector float *W0, *W1, *W2, *W3;
- vector float *re0, *re1, *re2, *re3;
- vector float *im0, *im1, *im2, *im3;
- vector float *in0, *in1, *in2, *in3, *in4, *in5, *in6, *in7;
- vector float tmp0, tmp1;
- vector float w0_re, w0_im;
- vector float w0, w1, w2, w3;
- vector float src_lo0, src_lo1, src_lo2, src_lo3;
- vector float src_hi0, src_hi1, src_hi2, src_hi3;
- vector float dst_lo0, dst_lo1, dst_lo2, dst_lo3;
- vector float dst_hi0, dst_hi1, dst_hi2, dst_hi3;
- vector float re_lo0, re_lo1, re_lo2, re_lo3;
- vector float im_lo0, im_lo1, im_lo2, im_lo3;
- vector float re_hi0, re_hi1, re_hi2, re_hi3;
- vector float im_hi0, im_hi1, im_hi2, im_hi3;
- vector float pq_lo0, pq_lo1, pq_lo2, pq_lo3;
- vector float pq_hi0, pq_hi1, pq_hi2, pq_hi3;
- vector float ppmm = VEC_LITERAL(vector float, 1.0f, 1.0f, -1.0f, -1.0f);
- vector float pmmp = VEC_LITERAL(vector float, 1.0f, -1.0f, -1.0f, 1.0f);
- vector unsigned char reverse;
- vector unsigned char shuf_lo = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 16,17,18,19, 20,21,22,23);
- vector unsigned char shuf_hi = VEC_LITERAL(vector unsigned char,
- 8, 9,10,11, 12,13,14,15,
- 24,25,26,27, 28,29,30,31);
- vector unsigned char shuf_0202 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 8, 9,10,11,
- 0, 1, 2, 3, 8, 9,10,11);
- vector unsigned char shuf_1313 = VEC_LITERAL(vector unsigned char,
- 4, 5, 6, 7, 12,13,14,15,
- 4, 5, 6, 7, 12,13,14,15);
- vector unsigned char shuf_0303 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 12,13,14,15,
- 0, 1, 2, 3, 12,13,14,15);
- vector unsigned char shuf_1212 = VEC_LITERAL(vector unsigned char,
- 4, 5, 6, 7, 8, 9,10,11,
- 4, 5, 6, 7, 8, 9,10,11);
- vector unsigned char shuf_0415 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 16,17,18,19,
- 4, 5, 6, 7, 20,21,22,23);
- vector unsigned char shuf_2637 = VEC_LITERAL(vector unsigned char,
- 8, 9,10,11, 24,25,26,27,
- 12,13,14,15,28,29,30,31);
-
- n = 1 << log2_size;
- n_2 = n >> 1;
- n_4 = n >> 2;
- n_8 = n >> 3;
- n_16 = n >> 4;
-
- n_3_16 = n_8 + n_16;
-
-
- reverse = byte_reverse[log2_size];
-
- /* Perform the first 3 stages of the FFT. These stages differs from
- * other stages in that the inputs are unscrambled and the data is
- * reformated into parallel arrays (ie, seperate real and imaginary
- * arrays). The term "unscramble" means the bit address reverse the
- * data array. In addition, the first three stages have simple twiddle
- * weighting factors.
- * stage 1: (1, 0)
- * stage 2: (1, 0) and (0, -1)
- * stage 3: (1, 0), (0.707, -0.707), (0, -1), (-0.707, -0.707)
- *
- * The arrays are processed as two halves, simultaneously. The lo (first
- * half) and hi (second half). This is done because the scramble
- * shares source value between each half of the output arrays.
- */
- i = 0;
- i_rev = 0;
-
- in0 = in;
- in1 = in + n_8;
- in2 = in + n_16;
- in3 = in + n_3_16;
-
- in4 = in + n_4;
- in5 = in1 + n_4;
- in6 = in2 + n_4;
- in7 = in3 + n_4;
-
- re0 = out_re;
- re1 = out_re + n_8;
- im0 = out_im;
- im1 = out_im + n_8;
-
- w0_re = VEC_LITERAL(vector float, 1.0f, INV_SQRT_2, 0.0f, -INV_SQRT_2);
- w0_im = VEC_LITERAL(vector float, 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2);
-
- do {
- src_lo0 = in0[i_rev];
- src_lo1 = in1[i_rev];
- src_lo2 = in2[i_rev];
- src_lo3 = in3[i_rev];
-
- src_hi0 = in4[i_rev];
- src_hi1 = in5[i_rev];
- src_hi2 = in6[i_rev];
- src_hi3 = in7[i_rev];
-
- /* Perform scramble.
- */
- dst_lo0 = spu_shuffle(src_lo0, src_hi0, shuf_lo);
- dst_hi0 = spu_shuffle(src_lo0, src_hi0, shuf_hi);
- dst_lo1 = spu_shuffle(src_lo1, src_hi1, shuf_lo);
- dst_hi1 = spu_shuffle(src_lo1, src_hi1, shuf_hi);
- dst_lo2 = spu_shuffle(src_lo2, src_hi2, shuf_lo);
- dst_hi2 = spu_shuffle(src_lo2, src_hi2, shuf_hi);
- dst_lo3 = spu_shuffle(src_lo3, src_hi3, shuf_lo);
- dst_hi3 = spu_shuffle(src_lo3, src_hi3, shuf_hi);
-
- /* Perform the stage 1 butterfly. The multiplier constant, ppmm,
- * is used to control the sign of the operands since a single
- * quadword contains both of P and Q valule of the butterfly.
- */
- pq_lo0 = spu_madd(ppmm, dst_lo0, spu_rlqwbyte(dst_lo0, 8));
- pq_hi0 = spu_madd(ppmm, dst_hi0, spu_rlqwbyte(dst_hi0, 8));
- pq_lo1 = spu_madd(ppmm, dst_lo1, spu_rlqwbyte(dst_lo1, 8));
- pq_hi1 = spu_madd(ppmm, dst_hi1, spu_rlqwbyte(dst_hi1, 8));
- pq_lo2 = spu_madd(ppmm, dst_lo2, spu_rlqwbyte(dst_lo2, 8));
- pq_hi2 = spu_madd(ppmm, dst_hi2, spu_rlqwbyte(dst_hi2, 8));
- pq_lo3 = spu_madd(ppmm, dst_lo3, spu_rlqwbyte(dst_lo3, 8));
- pq_hi3 = spu_madd(ppmm, dst_hi3, spu_rlqwbyte(dst_hi3, 8));
-
- /* Perfrom the stage 2 butterfly. For this stage, the
- * inputs pq are still interleaved (p.real, p.imag, q.real,
- * q.imag), so we must first re-order the data into
- * parallel arrays as well as perform the reorder
- * associated with the twiddle W[n/4], which equals
- * (0, -1).
- *
- * ie. (A, B) * (0, -1) => (B, -A)
- */
- re_lo0 = spu_madd(ppmm,
- spu_shuffle(pq_lo1, pq_lo1, shuf_0303),
- spu_shuffle(pq_lo0, pq_lo0, shuf_0202));
- im_lo0 = spu_madd(pmmp,
- spu_shuffle(pq_lo1, pq_lo1, shuf_1212),
- spu_shuffle(pq_lo0, pq_lo0, shuf_1313));
-
- re_lo1 = spu_madd(ppmm,
- spu_shuffle(pq_lo3, pq_lo3, shuf_0303),
- spu_shuffle(pq_lo2, pq_lo2, shuf_0202));
- im_lo1 = spu_madd(pmmp,
- spu_shuffle(pq_lo3, pq_lo3, shuf_1212),
- spu_shuffle(pq_lo2, pq_lo2, shuf_1313));
-
-
- re_hi0 = spu_madd(ppmm,
- spu_shuffle(pq_hi1, pq_hi1, shuf_0303),
- spu_shuffle(pq_hi0, pq_hi0, shuf_0202));
- im_hi0 = spu_madd(pmmp,
- spu_shuffle(pq_hi1, pq_hi1, shuf_1212),
- spu_shuffle(pq_hi0, pq_hi0, shuf_1313));
-
- re_hi1 = spu_madd(ppmm,
- spu_shuffle(pq_hi3, pq_hi3, shuf_0303),
- spu_shuffle(pq_hi2, pq_hi2, shuf_0202));
- im_hi1 = spu_madd(pmmp,
- spu_shuffle(pq_hi3, pq_hi3, shuf_1212),
- spu_shuffle(pq_hi2, pq_hi2, shuf_1313));
-
-
- /* Perform stage 3 butterfly.
- */
- FFT_1D_BUTTERFLY(re0[0], im0[0], re0[1], im0[1], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
- FFT_1D_BUTTERFLY(re1[0], im1[0], re1[1], im1[1], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
-
- re0 += 2;
- re1 += 2;
- im0 += 2;
- im1 += 2;
-
- i += 8;
- i_rev = BIT_SWAP(i, reverse) / 2;
- } while (i < n_2);
-
- /* Process stages 4 to log2_size-2
- */
- for (stage=4, stride=4; stage<log2_size-1; stage++, stride += stride) {
- w_stride = n_2 >> stage;
- w_2stride = n >> stage;
- w_3stride = w_stride + w_2stride;
- w_4stride = w_2stride + w_2stride;
-
- W0 = W;
- W1 = W + w_stride;
- W2 = W + w_2stride;
- W3 = W + w_3stride;
-
- stride_2 = stride >> 1;
- stride_4 = stride >> 2;
- stride_3_4 = stride_2 + stride_4;
-
- re0 = out_re; im0 = out_im;
- re1 = out_re + stride_2; im1 = out_im + stride_2;
- re2 = out_re + stride_4; im2 = out_im + stride_4;
- re3 = out_re + stride_3_4; im3 = out_im + stride_3_4;
-
- for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
- /* Compute the twiddle factors
- */
- w0 = W0[offset];
- w1 = W1[offset];
- w2 = W2[offset];
- w3 = W3[offset];
-
- tmp0 = spu_shuffle(w0, w2, shuf_0415);
- tmp1 = spu_shuffle(w1, w3, shuf_0415);
-
- w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
- w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
-
- j = i;
- k = i + stride;
- do {
- re_lo0 = re0[j]; im_lo0 = im0[j];
- re_lo1 = re1[j]; im_lo1 = im1[j];
-
- re_hi0 = re2[j]; im_hi0 = im2[j];
- re_hi1 = re3[j]; im_hi1 = im3[j];
-
- re_lo2 = re0[k]; im_lo2 = im0[k];
- re_lo3 = re1[k]; im_lo3 = im1[k];
-
- re_hi2 = re2[k]; im_hi2 = im2[k];
- re_hi3 = re3[k]; im_hi3 = im3[k];
-
- FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
-
- j += 2 * stride;
- k += 2 * stride;
- } while (j < n_4);
- }
- }
-
- /* Process stage log2_size-1. This is identical to the stage processing above
- * except for this stage the inner loop is only executed once so it is removed
- * entirely.
- */
- w_stride = n_2 >> stage;
- w_2stride = n >> stage;
- w_3stride = w_stride + w_2stride;
- w_4stride = w_2stride + w_2stride;
-
- stride_2 = stride >> 1;
- stride_4 = stride >> 2;
-
- stride_3_4 = stride_2 + stride_4;
-
- re0 = out_re; im0 = out_im;
- re1 = out_re + stride_2; im1 = out_im + stride_2;
- re2 = out_re + stride_4; im2 = out_im + stride_4;
- re3 = out_re + stride_3_4; im3 = out_im + stride_3_4;
-
- for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
- /* Compute the twiddle factors
- */
- w0 = W[offset];
- w1 = W[offset + w_stride];
- w2 = W[offset + w_2stride];
- w3 = W[offset + w_3stride];
-
- tmp0 = spu_shuffle(w0, w2, shuf_0415);
- tmp1 = spu_shuffle(w1, w3, shuf_0415);
-
- w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
- w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
-
- j = i;
- k = i + stride;
-
- re_lo0 = re0[j]; im_lo0 = im0[j];
- re_lo1 = re1[j]; im_lo1 = im1[j];
-
- re_hi0 = re2[j]; im_hi0 = im2[j];
- re_hi1 = re3[j]; im_hi1 = im3[j];
-
- re_lo2 = re0[k]; im_lo2 = im0[k];
- re_lo3 = re1[k]; im_lo3 = im1[k];
-
- re_hi2 = re2[k]; im_hi2 = im2[k];
- re_hi3 = re3[k]; im_hi3 = im3[k];
-
- FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2, im_lo2, re_lo3, im_lo3, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2, im_hi2, re_hi3, im_hi3, w0_re, w0_im);
- }
-}
-
-
-inline void
-_fft_1d_r2_fini(vector float *out, vector float *in_re, vector float *in_im,
- vector float *W, int log2_size)
-{
- int i;
- int n, n_2, n_4, n_8, n_16, n_3_16;
- vector float *re0, *re1, *re2, *re3;
- vector float *im0, *im1, *im2, *im3;
- vector float *out0, *out1, *out2, *out3;
- vector float w0_re, w0_im, w1_re, w1_im;
- vector float w0, w1, w2, w3;
- vector float out_re_lo0, out_re_lo1, out_re_lo2, out_re_lo3;
- vector float out_im_lo0, out_im_lo1, out_im_lo2, out_im_lo3;
- vector float out_re_hi0, out_re_hi1, out_re_hi2, out_re_hi3;
- vector float out_im_hi0, out_im_hi1, out_im_hi2, out_im_hi3;
- vector float re_lo0, re_lo1, re_lo2, re_lo3;
- vector float im_lo0, im_lo1, im_lo2, im_lo3;
- vector float re_hi0, re_hi1, re_hi2, re_hi3;
- vector float im_hi0, im_hi1, im_hi2, im_hi3;
- vector unsigned char shuf_0415 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 16,17,18,19,
- 4, 5, 6, 7, 20,21,22,23);
- vector unsigned char shuf_2637 = VEC_LITERAL(vector unsigned char,
- 8, 9,10,11, 24,25,26,27,
- 12,13,14,15,28,29,30,31);
- vector unsigned char shuf_0246 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 8, 9,10,11,
- 16,17,18,19,24,25,26,27);
- vector unsigned char shuf_1357 = VEC_LITERAL(vector unsigned char,
- 4, 5, 6, 7, 12,13,14,15,
- 20,21,22,23,28,29,30,31);
-
- n = 1 << log2_size;
- n_2 = n >> 1;
- n_4 = n >> 2;
- n_8 = n >> 3;
- n_16 = n >> 4;
-
- n_3_16 = n_8 + n_16;
-
- /* Process the final stage (stage log2_size). For this stage,
- * reformat the data from parallel arrays back into
- * interleaved arrays,storing the result into <in>.
- *
- * This loop has been manually unrolled by 2 to improve
- * dual issue rates and reduce stalls. This unrolling
- * forces a minimum FFT size of 32.
- */
- re0 = in_re;
- re1 = in_re + n_8;
- re2 = in_re + n_16;
- re3 = in_re + n_3_16;
-
- im0 = in_im;
- im1 = in_im + n_8;
- im2 = in_im + n_16;
- im3 = in_im + n_3_16;
-
- out0 = out;
- out1 = out + n_4;
- out2 = out + n_8;
- out3 = out1 + n_8;
-
- i = n_16;
-
- do {
- /* Fetch the twiddle factors
- */
- w0 = W[0];
- w1 = W[1];
- w2 = W[2];
- w3 = W[3];
-
- W += 4;
-
- w0_re = spu_shuffle(w0, w1, shuf_0246);
- w0_im = spu_shuffle(w0, w1, shuf_1357);
- w1_re = spu_shuffle(w2, w3, shuf_0246);
- w1_im = spu_shuffle(w2, w3, shuf_1357);
-
- /* Fetch the butterfly inputs, reals and imaginaries
- */
- re_lo0 = re0[0]; im_lo0 = im0[0];
- re_lo1 = re1[0]; im_lo1 = im1[0];
- re_lo2 = re0[1]; im_lo2 = im0[1];
- re_lo3 = re1[1]; im_lo3 = im1[1];
-
- re_hi0 = re2[0]; im_hi0 = im2[0];
- re_hi1 = re3[0]; im_hi1 = im3[0];
- re_hi2 = re2[1]; im_hi2 = im2[1];
- re_hi3 = re3[1]; im_hi3 = im3[1];
-
- re0 += 2; im0 += 2;
- re1 += 2; im1 += 2;
- re2 += 2; im2 += 2;
- re3 += 2; im3 += 2;
-
- /* Perform the butterflys
- */
- FFT_1D_BUTTERFLY (out_re_lo0, out_im_lo0, out_re_lo1, out_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
- FFT_1D_BUTTERFLY (out_re_lo2, out_im_lo2, out_re_lo3, out_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im);
-
- FFT_1D_BUTTERFLY_HI(out_re_hi0, out_im_hi0, out_re_hi1, out_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(out_re_hi2, out_im_hi2, out_re_hi3, out_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im);
-
- /* Interleave the results and store them into the output buffers (ie,
- * the original input buffers.
- */
- out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415);
- out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637);
- out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415);
- out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637);
-
- out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415);
- out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637);
- out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415);
- out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637);
-
- out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415);
- out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637);
- out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415);
- out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637);
-
- out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415);
- out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637);
- out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415);
- out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637);
-
- out0 += 4;
- out1 += 4;
- out2 += 4;
- out3 += 4;
-
- i -= 2;
- } while (i);
-}
-
-
-inline void
-_fft_1d_r2_fini_cvmul(vector float *out, vector float *in_re, vector float *in_im,
- vector float* kernel, vector float *W, int log2_size)
-{
- int i;
- int n, n_2, n_4, n_8, n_16, n_3_16;
- vector float *re0, *re1, *re2, *re3;
- vector float *im0, *im1, *im2, *im3;
- vector float *out0, *out1, *out2, *out3;
- vector float *k0, *k1, *k2, *k3;
- vector float w0_re, w0_im, w1_re, w1_im;
- vector float w0, w1, w2, w3;
- vector float out_re_lo0, out_re_lo1, out_re_lo2, out_re_lo3;
- vector float out_im_lo0, out_im_lo1, out_im_lo2, out_im_lo3;
- vector float out_re_hi0, out_re_hi1, out_re_hi2, out_re_hi3;
- vector float out_im_hi0, out_im_hi1, out_im_hi2, out_im_hi3;
- vector float re_lo0, re_lo1, re_lo2, re_lo3;
- vector float im_lo0, im_lo1, im_lo2, im_lo3;
- vector float re_hi0, re_hi1, re_hi2, re_hi3;
- vector float im_hi0, im_hi1, im_hi2, im_hi3;
- vector unsigned char shuf_0415 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 16,17,18,19,
- 4, 5, 6, 7, 20,21,22,23);
- vector unsigned char shuf_2637 = VEC_LITERAL(vector unsigned char,
- 8, 9,10,11, 24,25,26,27,
- 12,13,14,15,28,29,30,31);
- vector unsigned char shuf_0246 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 8, 9,10,11,
- 16,17,18,19,24,25,26,27);
- vector unsigned char shuf_1357 = VEC_LITERAL(vector unsigned char,
- 4, 5, 6, 7, 12,13,14,15,
- 20,21,22,23,28,29,30,31);
-
- n = 1 << log2_size;
- n_2 = n >> 1;
- n_4 = n >> 2;
- n_8 = n >> 3;
- n_16 = n >> 4;
-
- n_3_16 = n_8 + n_16;
-
- /* Process the final stage (stage log2_size). For this stage,
- * reformat the data from parallel arrays back into
- * interleaved arrays,storing the result into <in>.
- *
- * This loop has been manually unrolled by 2 to improve
- * dual issue rates and reduce stalls. This unrolling
- * forces a minimum FFT size of 32.
- */
- re0 = in_re;
- re1 = in_re + n_8;
- re2 = in_re + n_16;
- re3 = in_re + n_3_16;
-
- im0 = in_im;
- im1 = in_im + n_8;
- im2 = in_im + n_16;
- im3 = in_im + n_3_16;
-
- out0 = out;
- out1 = out + n_4;
- out2 = out + n_8;
- out3 = out1 + n_8;
-
- k0 = kernel;
- k1 = kernel + n_4;
- k2 = kernel + n_8;
- k3 = k1 + n_8;
-
- i = n_16;
-
- do {
- /* Fetch the twiddle factors
- */
- w0 = W[0];
- w1 = W[1];
- w2 = W[2];
- w3 = W[3];
-
- W += 4;
-
- w0_re = spu_shuffle(w0, w1, shuf_0246);
- w0_im = spu_shuffle(w0, w1, shuf_1357);
- w1_re = spu_shuffle(w2, w3, shuf_0246);
- w1_im = spu_shuffle(w2, w3, shuf_1357);
-
-
- /* Fetch the butterfly inputs, reals and imaginaries
- */
- re_lo0 = re0[0]; im_lo0 = im0[0];
- re_lo1 = re1[0]; im_lo1 = im1[0];
- re_lo2 = re0[1]; im_lo2 = im0[1];
- re_lo3 = re1[1]; im_lo3 = im1[1];
-
- re_hi0 = re2[0]; im_hi0 = im2[0];
- re_hi1 = re3[0]; im_hi1 = im3[0];
- re_hi2 = re2[1]; im_hi2 = im2[1];
- re_hi3 = re3[1]; im_hi3 = im3[1];
-
- re0 += 2; im0 += 2;
- re1 += 2; im1 += 2;
- re2 += 2; im2 += 2;
- re3 += 2; im3 += 2;
-
-
-
- /* Perform the butterflys
- */
- {
- vector float k_re_lo0, k_re_lo1, k_re_lo2, k_re_lo3;
- vector float k_im_lo0, k_im_lo1, k_im_lo2, k_im_lo3;
- vector float k_re_hi0, k_re_hi1, k_re_hi2, k_re_hi3;
- vector float k_im_hi0, k_im_hi1, k_im_hi2, k_im_hi3;
- vector float t_re_lo0, t_re_lo1, t_re_lo2, t_re_lo3;
- vector float t_im_lo0, t_im_lo1, t_im_lo2, t_im_lo3;
- vector float t_re_hi0, t_re_hi1, t_re_hi2, t_re_hi3;
- vector float t_im_hi0, t_im_hi1, t_im_hi2, t_im_hi3;
- FFT_1D_BUTTERFLY (t_re_lo0, t_im_lo0, t_re_lo1, t_im_lo1, re_lo0, im_lo0, re_lo1, im_lo1, w0_re, w0_im);
- FFT_1D_BUTTERFLY (t_re_lo2, t_im_lo2, t_re_lo3, t_im_lo3, re_lo2, im_lo2, re_lo3, im_lo3, w1_re, w1_im);
-
- FFT_1D_BUTTERFLY_HI(t_re_hi0, t_im_hi0, t_re_hi1, t_im_hi1, re_hi0, im_hi0, re_hi1, im_hi1, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(t_re_hi2, t_im_hi2, t_re_hi3, t_im_hi3, re_hi2, im_hi2, re_hi3, im_hi3, w1_re, w1_im);
-
-
- /* Fetch the kernel inputs, reals and imaginaries
- */
- k_re_lo0 = spu_shuffle(k0[0], k0[1], shuf_0246);
- k_im_lo0 = spu_shuffle(k0[0], k0[1], shuf_1357);
- k_re_lo1 = spu_shuffle(k1[0], k1[1], shuf_0246);
- k_im_lo1 = spu_shuffle(k1[0], k1[1], shuf_1357);
- k_re_lo2 = spu_shuffle(k0[2], k0[3], shuf_0246);
- k_im_lo2 = spu_shuffle(k0[2], k0[3], shuf_1357);
- k_re_lo3 = spu_shuffle(k1[2], k1[3], shuf_0246);
- k_im_lo3 = spu_shuffle(k1[2], k1[3], shuf_1357);
-
- k_re_hi0 = spu_shuffle(k2[0], k2[1], shuf_0246);
- k_im_hi0 = spu_shuffle(k2[0], k2[1], shuf_1357);
- k_re_hi1 = spu_shuffle(k3[0], k3[1], shuf_0246);
- k_im_hi1 = spu_shuffle(k3[0], k3[1], shuf_1357);
- k_re_hi2 = spu_shuffle(k2[2], k2[3], shuf_0246);
- k_im_hi2 = spu_shuffle(k2[2], k2[3], shuf_1357);
- k_re_hi3 = spu_shuffle(k3[2], k3[3], shuf_0246);
- k_im_hi3 = spu_shuffle(k3[2], k3[3], shuf_1357);
-
- k0 += 4;
- k1 += 4;
- k2 += 4;
- k3 += 4;
-
-
- out_re_lo0 = spu_msub(k_re_lo0, t_re_lo0, spu_mul(k_im_lo0, t_im_lo0));
- out_im_lo0 = spu_madd(k_im_lo0, t_re_lo0, spu_mul(k_re_lo0, t_im_lo0));
- out_re_lo1 = spu_msub(k_re_lo1, t_re_lo1, spu_mul(k_im_lo1, t_im_lo1));
- out_im_lo1 = spu_madd(k_im_lo1, t_re_lo1, spu_mul(k_re_lo1, t_im_lo1));
- out_re_lo2 = spu_msub(k_re_lo2, t_re_lo2, spu_mul(k_im_lo2, t_im_lo2));
- out_im_lo2 = spu_madd(k_im_lo2, t_re_lo2, spu_mul(k_re_lo2, t_im_lo2));
- out_re_lo3 = spu_msub(k_re_lo3, t_re_lo3, spu_mul(k_im_lo3, t_im_lo3));
- out_im_lo3 = spu_madd(k_im_lo3, t_re_lo3, spu_mul(k_re_lo3, t_im_lo3));
-
- out_re_hi0 = spu_msub(k_re_hi0, t_re_hi0, spu_mul(k_im_hi0, t_im_hi0));
- out_im_hi0 = spu_madd(k_im_hi0, t_re_hi0, spu_mul(k_re_hi0, t_im_hi0));
- out_re_hi1 = spu_msub(k_re_hi1, t_re_hi1, spu_mul(k_im_hi1, t_im_hi1));
- out_im_hi1 = spu_madd(k_im_hi1, t_re_hi1, spu_mul(k_re_hi1, t_im_hi1));
- out_re_hi2 = spu_msub(k_re_hi2, t_re_hi2, spu_mul(k_im_hi2, t_im_hi2));
- out_im_hi2 = spu_madd(k_im_hi2, t_re_hi2, spu_mul(k_re_hi2, t_im_hi2));
- out_re_hi3 = spu_msub(k_re_hi3, t_re_hi3, spu_mul(k_im_hi3, t_im_hi3));
- out_im_hi3 = spu_madd(k_im_hi3, t_re_hi3, spu_mul(k_re_hi3, t_im_hi3));
- }
-
- /* Interleave the results and store them into the output buffers (ie,
- * the original input buffers.
- */
- out0[0] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_0415);
- out0[1] = spu_shuffle(out_re_lo0, out_im_lo0, shuf_2637);
- out0[2] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_0415);
- out0[3] = spu_shuffle(out_re_lo2, out_im_lo2, shuf_2637);
-
- out1[0] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_0415);
- out1[1] = spu_shuffle(out_re_lo1, out_im_lo1, shuf_2637);
- out1[2] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_0415);
- out1[3] = spu_shuffle(out_re_lo3, out_im_lo3, shuf_2637);
-
- out2[0] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_0415);
- out2[1] = spu_shuffle(out_re_hi0, out_im_hi0, shuf_2637);
- out2[2] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_0415);
- out2[3] = spu_shuffle(out_re_hi2, out_im_hi2, shuf_2637);
-
- out3[0] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_0415);
- out3[1] = spu_shuffle(out_re_hi1, out_im_hi1, shuf_2637);
- out3[2] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_0415);
- out3[3] = spu_shuffle(out_re_hi3, out_im_hi3, shuf_2637);
-
- out0 += 4;
- out1 += 4;
- out2 += 4;
- out3 += 4;
-
- i -= 2;
- } while (i);
-}
-
-#endif /* _FFT_1D_R2_H_ */
Index: src/vsip/opt/cbe/spu/alf_fconvm_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_fconvm_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_fconvm_c.c (working copy)
@@ -14,72 +14,40 @@
#include <sys/time.h>
#include <spu_mfcio.h>
#include <alf_accel.h>
+#include <cml/spu/cml.h>
+#include <cml/spu/cml_core.h>
+
#include <vsip/opt/cbe/fconv_params.h>
-#include "fft_1d_r2.h"
-#include "spe_assert.h"
-// The twiddle factors occupy only 1/4 the space as the inputs,
-// outputs and convolution kernels.
-static float twiddle_factors[2 * VSIP_IMPL_MAX_FCONV_SIZE / 4]
- __attribute__ ((aligned (128)));
+#define _ALF_MAX_SINGLE_DT_SIZE 16*1024
-static unsigned int instance_id = 0;
-#define VEC_SIZE (4)
-unsigned int
-log2i(unsigned int size)
-{
- unsigned int log2_size = 0;
- while (!(size & 1))
- {
- size >>= 1;
- log2_size++;
- }
- return log2_size;
-}
-
-
-void
-initialize(
- unsigned long long ea_twiddles, // source address in main memory
- void volatile* p_twiddles, // destination address in local store
- unsigned int n) // number of elements
-{
- unsigned int size = n * 2 * sizeof(float);
-
- // The number of twiddle factors is 1/4 the input size
- mfc_get(p_twiddles, ea_twiddles, size/4, 31, 0, 0);
- mfc_write_tag_mask(1<<31);
- mfc_read_tag_status_all();
-}
-
-
-
int
-alf_prepare_input_list(
+input(
void* context,
void* params,
- void* list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
unsigned int const FP = 2; // Complex data: 2 floats per point.
Fastconv_params* fc = (Fastconv_params *)params;
- spe_assert(fc->elements * FP *sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
- addr64 ea;
+ assert(fc->elements * FP *sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
+ alf_data_addr64_t ea;
// Transfer input.
- ALF_DT_LIST_CREATE(list_entries, 0);
- ea.ull = fc->ea_input +
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_IN, 0);
+ ea = fc->ea_input +
current_count * FP * fc->input_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
// Transfer kernel.
- ea.ull = fc->ea_kernel +
+ ea = fc->ea_kernel +
current_count * FP * fc->kernel_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_END(entries);
return 0;
}
@@ -87,24 +55,25 @@
int
-alf_prepare_output_list(
+output(
void* context,
void* params,
- void* list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
unsigned int const FP = 2; // Complex data: 2 floats per point.
Fastconv_params* fc = (Fastconv_params *)params;
- spe_assert(fc->elements * FP *sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
- addr64 ea;
+ assert(fc->elements * FP *sizeof(float) <= _ALF_MAX_SINGLE_DT_SIZE);
+ alf_data_addr64_t ea;
// Transfer output.
- ALF_DT_LIST_CREATE(list_entries, 0);
- ea.ull = fc->ea_output +
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_OUT, 0);
+ ea = fc->ea_output +
current_count * FP * fc->output_stride * sizeof(float);
- ALF_DT_LIST_ADD_ENTRY(list_entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, FP * fc->elements, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_END(entries);
return 0;
}
@@ -112,82 +81,53 @@
int
-alf_comp_kernel(
- void* context,
- void* params,
- void* input,
- void* output,
- unsigned int current_count,
- unsigned int total_count)
+kernel(
+ void* context,
+ void* params,
+ void* input,
+ void* output,
+ void* inout,
+ unsigned int iter,
+ unsigned int iter_max)
{
+ static fft1d_f* obj;
+ static char* buf;
+ static size_t current_size = 0;
+
Fastconv_params* fc = (Fastconv_params *)params;
- unsigned int n = fc->elements;
- unsigned int log2n = log2i(n);
- spe_assert(n <= VSIP_IMPL_MAX_FCONV_SIZE);
+ unsigned int fft_size = fc->elements;
+ assert(fft_size <= VSIP_IMPL_MAX_FCONV_SIZE);
- // Initialization establishes the weights (kernel) for the
- // convolution step and the twiddle factors for the FFTs.
- // These are loaded once per task by checking a unique
- // ID passed down from the caller.
- if (instance_id != fc->instance_id)
+ // Reinitialize the FFT object if the fft size changes.
+ if (iter == 0 && fft_size != current_size)
{
- instance_id = fc->instance_id;
- initialize(fc->ea_twiddle_factors, twiddle_factors, n);
+ if (obj)
+ {
+ free(buf);
+ cml_fft1d_destroy_f_alloc(obj);
+ }
+ int rt = cml_fft1d_setup_f_alloc(&obj, CML_FFT_CC, fft_size);
+ assert(rt && obj != NULL);
+ buf = (char*)memalign(16, cml_zzfft1d_buf_size_f(obj));
+ assert(buf != NULL);
+ current_size = fft_size;
}
- vector float* in = (vector float *)input;
- vector float* k = (vector float *)input + 2 * n / VEC_SIZE;
- vector float* W = (vector float*)twiddle_factors;
- vector float* out = (vector float*)output;
+ float* in = (float*)input;
+ float* coeff = (float*)input + 2 * fft_size;
+ float* out = (float*)output;
- // Create real & imaginary working arrays
- vector float re[n / VEC_SIZE], im[n / VEC_SIZE];
-
-
// Perform the forward FFT on the kernel, in place, but
// only if requested (this step is often done in advance).
if (fc->transform_kernel)
{
- _fft_1d_r2_pre(re, im, k, W, log2n);
- _fft_1d_r2_fini(k, re, im, W, log2n);
+ cml_ccfft1d_ip_f(obj, coeff, CML_FFT_FWD, buf);
}
+ cml_ccfft1d_ip_f(obj, in, CML_FFT_FWD, buf);
+ cml_cvmul1_f(coeff, in, out, fft_size);
+ cml_ccfft1d_ip_f(obj, out, CML_FFT_INV, buf);
+ cml_core_rcsvmul1_f(1.f / fft_size, out, out, fft_size);
- // Perform the forward FFT, rolling the convolution into
- // the last stage
- _fft_1d_r2_pre(re, im, in, W, log2n);
- _fft_1d_r2_fini_cvmul(out, re, im, k, W, log2n);
-
-
- // Revert back the time domain.
- _fft_1d_r2_pre(re, im, out, W, log2n);
- _fft_1d_r2_fini(out, re, im, W, log2n);
-
-
- // Code for the inverse FFT scaling is taken from the CBE
- // SDK Libraries Overview and Users Guide, sec. 8.1.
- {
- unsigned int i;
- vector float *start, *end, s0, s1, e0, e1;
- vector unsigned int mask = (vector unsigned int){-1, -1, 0, 0};
- vector float vscale = spu_splats(1 / (float)n);
- start = out;
-
- // Scale the output vector and swap the order of the outputs.
- // Note: there are two float values for each of 'n' complex values.
- end = start + 2 * n / VEC_SIZE;
- s0 = e1 = *start;
- for (i = 0; i < n / VEC_SIZE; ++i)
- {
- s1 = *(start + 1);
- e0 = *(--end);
-
- *start++ = spu_mul(spu_sel(e0, e1, mask), vscale);
- *end = spu_mul(spu_sel(s0, s1, mask), vscale);
- s0 = s1;
- e1 = e0;
- }
- }
-
return 0;
}
Index: src/vsip/opt/cbe/spu/spe_assert.h
===================================================================
--- src/vsip/opt/cbe/spu/spe_assert.h (revision 209510)
+++ src/vsip/opt/cbe/spu/spe_assert.h (working copy)
@@ -1,57 +0,0 @@
-/* Copyright (c) 2007 by CodeSourcery. All rights reserved.
-
- This file is available for license from CodeSourcery, Inc. under the terms
- of a commercial license and under the GPL. It is not part of the VSIPL++
- reference implementation and is not available under the BSD license.
-*/
-/** @file vsip/opt/cbe/spu/spe_assert.h
- @author Jules Bergmann, Don McCoy
- @date 2007-03-23
- @brief VSIPL++ Library: Replacement function for assert(), used
- because it is broken as of 2007/03.
-*/
-
-#ifndef _SPE_ASSERT_H_
-#define _SPE_ASSERT_H_
-
-#ifndef NDEBUG
-#include <stdio.h>
-
-void inline
-spe_assert_fail(
- const char* assertion,
- const char* file,
- unsigned int line,
- const char* function)
-{
- fprintf(stderr, "ASSERTION FAILURE: %s %s %d %s\n",
- assertion, file, line, function);
- abort();
-}
-
-#if defined(__GNU__)
-# if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
-# define SPE_ASSERT_FUNCTION __func__
-# else
-# define SPE_ASSERT_FUNCTION ((__const char *) 0)
-# endif
-#else
-# define SPE_ASSERT_FUNCTION ((__const char *) 0)
-#endif
-
-#ifdef __STDC__
-# define __SPE_STRING(e) #e
-#else
-# define __SPE_STRING(e) "e"
-#endif
-
-#define spe_assert(expr) \
- ((void)((expr) ? 0 : \
- (spe_assert_fail(__SPE_STRING(expr), __FILE__, __LINE__, \
- SPE_ASSERT_FUNCTION), 0)))
-#else
-#define spe_assert(expr)
-
-#endif /* #ifndef NDEBUG */
-
-#endif /* _SPE_ASSERT_H_ */
Index: src/vsip/opt/cbe/spu/alf_decls.hpp
===================================================================
--- src/vsip/opt/cbe/spu/alf_decls.hpp (revision 0)
+++ src/vsip/opt/cbe/spu/alf_decls.hpp (revision 0)
@@ -0,0 +1,41 @@
+/* Copyright (c) 2008 by CodeSourcery. All rights reserved.
+
+ This file is available for license from CodeSourcery, Inc. under the terms
+ of a commercial license and under the GPL. It is not part of the VSIPL++
+ reference implementation and is not available under the BSD license.
+*/
+/** @file vsip/opt/cbe/spu/alf_decls.cpp
+ @author Jules Bergmann
+ @date 2008-05-22
+ @brief VSIPL++ Library: Decls for ALF input/output/kernel functions.
+*/
+
+#ifndef VSIP_OPT_CBE_SPU_ALF_DECLS_HPP
+#define VSIP_OPT_CBE_SPU_ALF_DECLS_HPP
+
+extern "C" {
+int input(
+ void* context,
+ void* params,
+ void* entries,
+ unsigned int current_count,
+ unsigned int total_count);
+
+int output(
+ void* context,
+ void* params,
+ void* entries,
+ unsigned int cur_iter,
+ unsigned int tot_iter);
+
+int kernel(
+ void* context,
+ void* params,
+ void* input,
+ void* output,
+ void* inout,
+ unsigned int cur_iter,
+ unsigned int tot_iter);
+}
+
+#endif // VSIP_OPT_CBE_SPU_ALF_DECLS_HPP
Index: src/vsip/opt/cbe/spu/alf_vmul_split_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_vmul_split_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_vmul_split_c.c (working copy)
@@ -11,78 +11,77 @@
*/
#include <alf_accel.h>
+#include <cml/spu/cml.h>
+
#include <vsip/opt/cbe/vmul_params.h>
-#include "vmul_split.h"
-int alf_prepare_input_list(
+int input(
void* p_context,
void* p_params,
- void* p_list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
Vmul_split_params* p = (Vmul_split_params*)p_params;
- addr64 ea;
- u32 base_addr;
+ alf_data_addr64_t ea;
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_IN, 0);
+
// Transfer input A real
- ALF_DT_LIST_CREATE(p_list_entries, 0);
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->a_re_ptr + current_count * p->a_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->a_re_ptr + current_count * p->a_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->a_im_ptr + current_count * p->a_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->a_im_ptr + current_count * p->a_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
// Transfer input B.
- ALF_DT_LIST_CREATE(p_list_entries, 2*p->length*sizeof(float));
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->b_re_ptr + current_count * p->b_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->b_re_ptr + current_count * p->b_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->b_im_ptr + current_count * p->b_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->b_im_ptr + current_count * p->b_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_prepare_output_list(
+int output(
void* p_context,
void* p_params,
- void* p_list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
Vmul_split_params* p = (Vmul_split_params*)p_params;
- addr64 ea;
- u32 base_addr;
+ alf_data_addr64_t ea;
// Transfer output R.
- ALF_DT_LIST_CREATE(p_list_entries, 0);
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->r_re_ptr + current_count * p->r_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_OUT, 0);
- ea.ui[0] = 0;
- ea.ui[1] = (u32)(p->r_im_ptr + current_count * p->r_blk_stride);
- ALF_DT_LIST_ADD_ENTRY(p_list_entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->r_re_ptr + current_count * p->r_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
+ ea = p->r_im_ptr + current_count * p->r_blk_stride;
+ ALF_ACCEL_DTL_ENTRY_ADD(entries, p->length, ALF_DATA_FLOAT, ea);
+
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_comp_kernel(
- void* p_context,
- void* p_params,
- void* input,
- void* output,
+int kernel(
+ void* p_context,
+ void* p_params,
+ void* input,
+ void* output,
+ void* inout,
unsigned int iter,
unsigned int n)
{
@@ -96,9 +95,13 @@
float *r_re = (float *)output + 0 * length;
float *r_im = (float *)output + 1 * length;
- cvmul(r_re, r_im, a_re, a_im, b_re, b_im, length);
+ cml_zvmul1_f(a_re, a_im, b_re, b_im, r_re, r_im, length);
return 0;
}
-
+ALF_ACCEL_EXPORT_API_LIST_BEGIN
+ ALF_ACCEL_EXPORT_API ("input", input);
+ ALF_ACCEL_EXPORT_API ("output", output);
+ ALF_ACCEL_EXPORT_API ("kernel", kernel);
+ALF_ACCEL_EXPORT_API_LIST_END
Index: src/vsip/opt/cbe/spu/GNUmakefile.inc.in
===================================================================
--- src/vsip/opt/cbe/spu/GNUmakefile.inc.in (revision 209510)
+++ src/vsip/opt/cbe/spu/GNUmakefile.inc.in (working copy)
@@ -14,10 +14,7 @@
cbe_sdk_version := @cbe_sdk_version@
-#src_vsip_opt_cbe_spu_c_src := $(wildcard $(srcdir)/src/vsip/opt/cbe/spu/alf_*.c)
-src_vsip_opt_cbe_spu_c_src := $(srcdir)/src/vsip/opt/cbe/spu/alf_fconv_c.c
-src_vsip_opt_cbe_spu_c_src += $(srcdir)/src/vsip/opt/cbe/spu/alf_fft_c.c
-src_vsip_opt_cbe_spu_c_src += $(srcdir)/src/vsip/opt/cbe/spu/alf_vmul_c.c
+src_vsip_opt_cbe_spu_c_src := $(wildcard $(srcdir)/src/vsip/opt/cbe/spu/alf_*.c)
src_vsip_opt_cbe_spu_cxx_src := $(wildcard $(srcdir)/src/vsip/opt/cbe/spu/alf_*.cpp)
src_vsip_opt_cbe_spu_src := $(src_vsip_opt_cbe_spu_c_src) $(src_vsip_opt_cbe_spu_cxx_src)
ifneq ($(VSIP_IMPL_CBE_SDK_FFT),1)
@@ -40,7 +37,8 @@
CXX_SPU := @CXX_SPU@
EMBED_SPU := @EMBED_SPU@
CPP_SPU_FLAGS := @CPP_SPU_FLAGS@
-LIBS_SPU := -lalf -lm
+LD_SPU_FLAGS := @LD_SPU_FLAGS@
+LIBS_SPU := -lcml_spu -lalf -lm
CPP_SPU_FLAGS += -I src -I $(srcdir)/src
CPP_SPU_FLAGS += -I $(srcdir)/src/vsip/opt/cbe
@@ -48,7 +46,7 @@
CPP_SPU_FLAGS += -I $(CBE_SDK_SYSROOT)/opt/cell/sdk/usr/spu/include
C_SPU_FLAGS := -O3
CXX_SPU_FLAGS := -O3
-LD_SPU_FLAGS := -Wl,-N -L$(CBE_SDK_SYSROOT)/usr/spu/lib
+LD_SPU_FLAGS += -Wl,-N -L$(CBE_SDK_SYSROOT)/usr/spu/lib
LD_SPU_FLAGS += -L$(CBE_SDK_SYSROOT)/opt/cell/sdk/usr/spu/lib
########################################################################
@@ -112,7 +110,7 @@
$(compile_cxx_spu_kernel)
mostlyclean::
- rm $(spe_kernels)
+ rm -f $(spe_kernels)
rm -f $(src_vsip_opt_cbe_spu_obj)
rm -f $(src_vsip_opt_cbe_spu_mod)
Index: src/vsip/opt/cbe/spu/alf_vmmul_c.c
===================================================================
--- src/vsip/opt/cbe/spu/alf_vmmul_c.c (revision 209510)
+++ src/vsip/opt/cbe/spu/alf_vmmul_c.c (working copy)
@@ -15,7 +15,9 @@
#include <spu_mfcio.h>
#include <vsip/opt/cbe/vmmul_params.h>
+#define _ALF_MAX_SINGLE_DT_SIZE 16*1024
+
// These are sized for complex values, taking two floats each.
static volatile float vector_in[VSIP_IMPL_MAX_VMMUL_SIZE*2]
__attribute__ ((aligned (128)));
@@ -44,10 +46,10 @@
}
-int alf_prepare_input_list(
+int input(
void* p_context,
void* p_params,
- void* p_list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
@@ -55,32 +57,34 @@
Vmmul_params* params = (Vmmul_params*)p_params;
// Transfer input
- ALF_DT_LIST_CREATE(p_list_entries, 0);
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_IN, 0);
unsigned long length = params->length;
unsigned long max_length = _ALF_MAX_SINGLE_DT_SIZE / FP / sizeof(float);
- addr64 ea;
- ea.ull = params->ea_input_matrix + current_count * FP * params->input_stride * sizeof(float);
+ alf_data_addr64_t ea;
+ ea = params->ea_input_matrix + current_count * FP * params->input_stride * sizeof(float);
while (length > 0)
{
unsigned long cur_length = (length > max_length) ? max_length : length;
- ALF_DT_LIST_ADD_ENTRY(p_list_entries,
- FP * cur_length,
- ALF_DATA_FLOAT,
- ea);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries,
+ FP * cur_length,
+ ALF_DATA_FLOAT,
+ ea);
length -= cur_length;
- ea.ull += FP * cur_length * sizeof(float);
+ ea += FP * cur_length * sizeof(float);
}
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_prepare_output_list(
+int output(
void* p_context,
void* p_params,
- void* p_list_entries,
+ void* entries,
unsigned int current_count,
unsigned int total_count)
{
@@ -88,33 +92,36 @@
Vmmul_params* params = (Vmmul_params*)p_params;
// Transfer output
- ALF_DT_LIST_CREATE(p_list_entries, 0);
+ ALF_ACCEL_DTL_BEGIN(entries, ALF_BUF_OUT, 0);
unsigned long length = params->length;
unsigned long max_length = _ALF_MAX_SINGLE_DT_SIZE / FP / sizeof(float);
- addr64 ea;
- ea.ull = params->ea_output_matrix + current_count * FP * params->output_stride * sizeof(float);
+ alf_data_addr64_t ea;
+ ea = params->ea_output_matrix + current_count * FP * params->output_stride * sizeof(float);
while (length > 0)
{
unsigned long cur_length = (length > max_length) ? max_length : length;
- ALF_DT_LIST_ADD_ENTRY(p_list_entries,
- FP * cur_length,
- ALF_DATA_FLOAT,
- ea);
+ ALF_ACCEL_DTL_ENTRY_ADD(entries,
+ FP * cur_length,
+ ALF_DATA_FLOAT,
+ ea);
length -= cur_length;
- ea.ull += FP * cur_length * sizeof(float);
+ ea += FP * cur_length * sizeof(float);
}
+ ALF_ACCEL_DTL_END(entries);
+
return 0;
}
-int alf_comp_kernel(
+int kernel(
void* p_context,
void* p_params,
void* input,
void* output,
+ void* inout,
unsigned int iter,
unsigned int n)
{
@@ -170,3 +177,9 @@
return 0;
}
+
+ALF_ACCEL_EXPORT_API_LIST_BEGIN
+ ALF_ACCEL_EXPORT_API ("input", input);
+ ALF_ACCEL_EXPORT_API ("output", output);
+ ALF_ACCEL_EXPORT_API ("kernel", kernel);
+ALF_ACCEL_EXPORT_API_LIST_END
Index: src/vsip/opt/cbe/spu/fft_1d_r2_split.h
===================================================================
--- src/vsip/opt/cbe/spu/fft_1d_r2_split.h (revision 209510)
+++ src/vsip/opt/cbe/spu/fft_1d_r2_split.h (working copy)
@@ -1,618 +0,0 @@
-/* -------------------------------------------------------------- */
-/* (C)Copyright 2001,2006, */
-/* International Business Machines Corporation, */
-/* Sony Computer Entertainment, Incorporated, */
-/* Toshiba Corporation, */
-/* */
-/* All Rights Reserved. */
-/* -------------------------------------------------------------- */
-/* PROLOG END TAG zYx */
-
-/* CSL: Based on the original Cell SDK FFT library, this file has
- been modified as follows:
-
- 2007-02-28 JPB
-
- - The function _fft_1d_r2 had been converted to operate on
- split-complex data.
- - Function renamed to _fft_1d_r2_split,
- - Input stage unrolled by 2 and changed to process data
- vertically + transpose, rather than horizontally.
- - Later stages manually unrolled by 2.
-
-*/
-#ifndef _FFT_1D_R2_SPLIT_H_
-#define _FFT_1D_R2_SPLIT_H_ 1
-
-#include <vec_literal.h>
-#include "fft_1d.h"
-
-/* fft_1d_r2
- * ---------
- * Performs a single precision, complex Fast Fourier Transform using
- * the DFT (Discrete Fourier Transform) with radix-2 decimation in time.
- * The input <in> is an array of complex numbers of length (1<<log2_size)
- * entries. The result is returned in the array of complex numbers specified
- * by <out>. Note: This routine can support an in-place transformation
- * by specifying <in> and <out> to be the same array.
- *
- * This implementation utilizes the Cooley-Tukey algorithm consisting
- * of <log2_size> stages. The basic operation is the butterfly.
- *
- * p --------------------------> P = p + q*Wi
- * \ /
- * \ /
- * \ /
- * \/
- * /\
- * / \
- * / \
- * ____ / \
- * q --| Wi |-----------------> Q = p - q*Wi
- * ----
- *
- * This routine also requires pre-computed twiddle values, W. W is an
- * array of single precision complex numbers of length 1<<(log2_size-2)
- * and is computed as follows:
- *
- * for (i=0; i<n/4; i++)
- * W[i].real = cos(i * 2*PI/n);
- * W[i].imag = -sin(i * 2*PI/n);
- * }
- *
- * This array actually only contains the first half of the twiddle
- * factors. Due for symmetry, the second half of the twiddle factors
- * are implied and equal:
- *
- * for (i=0; i<n/4; i++)
- * W[i+n/4].real = W[i].imag = sin(i * 2*PI/n);
- * W[i+n/4].imag = -W[i].real = -cos(i * 2*PI/n);
- * }
- *
- * Further symmetry allows one to generate the twiddle factor table
- * using half the number of trig computations as follows:
- *
- * W[0].real = 1.0;
- * W[0].imag = 0.0;
- * for (i=1; i<n/4; i++)
- * W[i].real = cos(i * 2*PI/n);
- * W[n/4 - i].imag = -W[i].real;
- * }
- *
- * The complex numbers are packed into quadwords as follows:
- *
- * quadword complex
- * array element array elements
- * -----------------------------------------------------
- * i | real 2*i | imag 2*i | real 2*i+1 | imag 2*i+1 |
- * -----------------------------------------------------
- *
- */
-
-#define TRANSPOSE_4X4(abcd, efgh, ijkl, mnop, aeim, bfjn, cgko, dhlp)\
-{\
- vector float aibj, ckdl, emfn, gohp;\
- aibj = spu_shuffle(abcd, ijkl, shuf_0415);\
- ckdl = spu_shuffle(abcd, ijkl, shuf_2637);\
- emfn = spu_shuffle(efgh, mnop, shuf_0415);\
- gohp = spu_shuffle(efgh, mnop, shuf_2637);\
-\
- aeim = spu_shuffle(aibj, emfn, shuf_0415);\
- bfjn = spu_shuffle(aibj, emfn, shuf_2637);\
- cgko = spu_shuffle(ckdl, gohp, shuf_0415);\
- dhlp = spu_shuffle(ckdl, gohp, shuf_2637);\
-}
-
-
-static __inline void _fft_1d_r2_split(
- vector float *out_re,
- vector float *out_im,
- vector float *in_re,
- vector float *in_im,
- vector float *W,
- int log2_size)
-{
- int i, j, k;
- int stage, offset;
- int i_rev;
- int n, n_2, n_4, n_8, n_16, n_3_16;
- int w_stride, w_2stride, w_3stride, w_4stride;
- int stride, stride_2, stride_4, stride_3_4;
- vector float *W0, *W1, *W2, *W3;
- vector float *re0, *re1, *re2, *re3;
- vector float *im0, *im1, *im2, *im3;
- vector float *re0_a, *re1_a;
- vector float *re0_b, *re1_b;
- vector float *im0_a, *im1_a;
- vector float *im0_b, *im1_b;
- vector float *in0_re, *in1_re, *in2_re, *in3_re, *in4_re, *in5_re, *in6_re, *in7_re;
- vector float *in0_im, *in1_im, *in2_im, *in3_im, *in4_im, *in5_im, *in6_im, *in7_im;
- vector float *out0_re, *out1_re, *out2_re, *out3_re;
- vector float *out0_im, *out1_im, *out2_im, *out3_im;
- vector float tmp0, tmp1;
- vector float tmp0_a, tmp1_a;
- vector float tmp0_b, tmp1_b;
- vector float w0_re, w0_im; // , w1_re, w1_im;
- vector float w0, w1, w2, w3;
-
- vector float w0_re_a, w0_im_a, w1_re_a, w1_im_a;
- vector float w0_a, w1_a, w2_a, w3_a;
- vector float w0_re_b, w0_im_b, w1_re_b, w1_im_b;
- vector float w0_b, w1_b, w2_b, w3_b;
-
- vector float src_lo0_re, src_lo1_re, src_lo2_re, src_lo3_re;
- vector float src_hi0_re, src_hi1_re, src_hi2_re, src_hi3_re;
-
- vector float src_lo0_im, src_lo1_im, src_lo2_im, src_lo3_im;
- vector float src_hi0_im, src_hi1_im, src_hi2_im, src_hi3_im;
-
- vector float re_lo0_a, re_lo1_a, re_lo2_a, re_lo3_a;
- vector float im_lo0_a, im_lo1_a, im_lo2_a, im_lo3_a;
- vector float re_hi0_a, re_hi1_a, re_hi2_a, re_hi3_a;
- vector float im_hi0_a, im_hi1_a, im_hi2_a, im_hi3_a;
-
- vector float re_lo0_b, re_lo1_b, re_lo2_b, re_lo3_b;
- vector float im_lo0_b, im_lo1_b, im_lo2_b, im_lo3_b;
- vector float re_hi0_b, re_hi1_b, re_hi2_b, re_hi3_b;
- vector float im_hi0_b, im_hi1_b, im_hi2_b, im_hi3_b;
-
- vector float re[MAX_FFT_1D_SIZE/4], im[MAX_FFT_1D_SIZE/4]; /* real & imaginary working arrays */
-
- vector unsigned char reverse;
- vector unsigned char shuf_0415 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 16,17,18,19,
- 4, 5, 6, 7, 20,21,22,23);
- vector unsigned char shuf_2637 = VEC_LITERAL(vector unsigned char,
- 8, 9,10,11, 24,25,26,27,
- 12,13,14,15,28,29,30,31);
- vector unsigned char shuf_0246 = VEC_LITERAL(vector unsigned char,
- 0, 1, 2, 3, 8, 9,10,11,
- 16,17,18,19,24,25,26,27);
- vector unsigned char shuf_1357 = VEC_LITERAL(vector unsigned char,
- 4, 5, 6, 7, 12,13,14,15,
- 20,21,22,23,28,29,30,31);
-
- n = 1 << log2_size;
- n_2 = n >> 1;
- n_4 = n >> 2;
- n_8 = n >> 3;
- n_16 = n >> 4;
-
- n_3_16 = n_8 + n_16;
-
- reverse = byte_reverse[log2_size];
-
- /* Perform the first 3 stages of the FFT. These stages differs from
- * other stages in that the inputs are unscrambled and the data is
- * reformated into parallel arrays (ie, seperate real and imaginary
- * arrays). The term "unscramble" means the bit address reverse the
- * data array. In addition, the first three stages have simple twiddle
- * weighting factors.
- * stage 1: (1, 0)
- * stage 2: (1, 0) and (0, -1)
- * stage 3: (1, 0), (0.707, -0.707), (0, -1), (-0.707, -0.707)
- *
- * The arrays are processed as two halves, simultaneously. The lo (first
- * half) and hi (second half). This is done because the scramble
- * shares source value between each half of the output arrays.
- */
- i = 0;
- i_rev = 0;
-
- in0_re = in_re;
- in1_re = in_re + n_8/2;
- in2_re = in_re + n_16/2;
- in3_re = in_re + n_3_16/2;
-
- in4_re = in_re + n_8;
- in5_re = in1_re + n_8;
- in6_re = in2_re + n_8;
- in7_re = in3_re + n_8;
-
- in0_im = in_im;
- in1_im = in_im + n_8/2;
- in2_im = in_im + n_16/2;
- in3_im = in_im + n_3_16/2;
-
- in4_im = in_im + n_8;
- in5_im = in1_im + n_8;
- in6_im = in2_im + n_8;
- in7_im = in3_im + n_8;
-
- re0_a = re;
- re1_a = re + n_8;
- im0_a = im;
- im1_a = im + n_8;
-
- re0_b = re + n_16;
- re1_b = re + n_8 + n_16;
- im0_b = im + n_16;
- im1_b = im + n_8 + n_16;
-
- w0_re = VEC_LITERAL(vector float, 1.0f, INV_SQRT_2, 0.0f, -INV_SQRT_2);
- w0_im = VEC_LITERAL(vector float, 0.0f, -INV_SQRT_2, -1.0f, -INV_SQRT_2);
-
- do {
- src_lo0_re = in0_re[i_rev];
- src_lo1_re = in1_re[i_rev];
- src_lo2_re = in2_re[i_rev];
- src_lo3_re = in3_re[i_rev];
-
- src_hi0_re = in4_re[i_rev];
- src_hi1_re = in5_re[i_rev];
- src_hi2_re = in6_re[i_rev];
- src_hi3_re = in7_re[i_rev];
-
- src_lo0_im = in0_im[i_rev];
- src_lo1_im = in1_im[i_rev];
- src_lo2_im = in2_im[i_rev];
- src_lo3_im = in3_im[i_rev];
-
- src_hi0_im = in4_im[i_rev];
- src_hi1_im = in5_im[i_rev];
- src_hi2_im = in6_im[i_rev];
- src_hi3_im = in7_im[i_rev];
-
- vector float a_0_16 = spu_add(src_lo0_re, src_hi0_re);
- vector float s_0_16 = spu_sub(src_lo0_re, src_hi0_re);
- vector float a_4_20 = spu_add(src_lo1_re, src_hi1_re);
- vector float s_4_20 = spu_sub(src_lo1_re, src_hi1_re);
- vector float a_8_24 = spu_add(src_lo2_re, src_hi2_re);
- vector float s_8_24 = spu_sub(src_lo2_re, src_hi2_re);
- vector float a_12_28 = spu_add(src_lo3_re, src_hi3_re);
- vector float s_12_28 = spu_sub(src_lo3_re, src_hi3_re);
- vector float a_32_48 = spu_add(src_lo0_im, src_hi0_im);
- vector float s_32_48 = spu_sub(src_lo0_im, src_hi0_im);
- vector float a_36_52 = spu_add(src_lo1_im, src_hi1_im);
- vector float s_36_52 = spu_sub(src_lo1_im, src_hi1_im);
- vector float a_40_56 = spu_add(src_lo2_im, src_hi2_im);
- vector float s_40_56 = spu_sub(src_lo2_im, src_hi2_im);
- vector float a_44_60 = spu_add(src_lo3_im, src_hi3_im);
- vector float s_44_60 = spu_sub(src_lo3_im, src_hi3_im);
-
-
- vector float re_hilo0_0 = spu_add(a_4_20, a_0_16);
- vector float re_hilo0_1 = spu_add(s_36_52, s_0_16);
- vector float re_hilo0_2 = spu_sub(a_0_16, a_4_20);
- vector float re_hilo0_3 = spu_sub(s_0_16, s_36_52);
-
- vector float im_hilo0_0 = spu_add(a_36_52, a_32_48);
- vector float im_hilo0_1 = spu_sub(s_32_48, s_4_20);
- vector float im_hilo0_2 = spu_sub(a_32_48, a_36_52);
- vector float im_hilo0_3 = spu_add(s_4_20, s_32_48);
-
- vector float re_hilo1_0 = spu_add(a_12_28, a_8_24);
- vector float re_hilo1_1 = spu_add(s_44_60, s_8_24);
- vector float re_hilo1_2 = spu_sub(a_8_24, a_12_28);
- vector float re_hilo1_3 = spu_sub(s_8_24, s_44_60);
-
- vector float im_hilo1_0 = spu_add(a_44_60, a_40_56);
- vector float im_hilo1_1 = spu_sub(s_40_56, s_12_28);
- vector float im_hilo1_2 = spu_sub(a_40_56, a_44_60);
- vector float im_hilo1_3 = spu_add(s_12_28, s_40_56);
-
-
- TRANSPOSE_4X4(re_hilo0_0, re_hilo0_1, re_hilo0_2, re_hilo0_3,
- re_lo0_a, re_hi0_a, re_lo0_b, re_hi0_b);
-
- TRANSPOSE_4X4(im_hilo0_0, im_hilo0_1, im_hilo0_2, im_hilo0_3,
- im_lo0_a, im_hi0_a, im_lo0_b, im_hi0_b);
-
- TRANSPOSE_4X4(re_hilo1_0, re_hilo1_1, re_hilo1_2, re_hilo1_3,
- re_lo1_a, re_hi1_a, re_lo1_b, re_hi1_b);
-
- TRANSPOSE_4X4(im_hilo1_0, im_hilo1_1, im_hilo1_2, im_hilo1_3,
- im_lo1_a, im_hi1_a, im_lo1_b, im_hi1_b);
-
-
- /* Perform stage 3 butterfly.
- */
- FFT_1D_BUTTERFLY(re0_a[0], im0_a[0], re0_a[1], im0_a[1],
- re_lo0_a, im_lo0_a, re_lo1_a, im_lo1_a, w0_re, w0_im);
- FFT_1D_BUTTERFLY(re1_a[0], im1_a[0], re1_a[1], im1_a[1],
- re_hi0_a, im_hi0_a, re_hi1_a, im_hi1_a, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY(re0_b[0], im0_b[0], re0_b[1], im0_b[1],
- re_lo0_b, im_lo0_b, re_lo1_b, im_lo1_b, w0_re, w0_im);
- FFT_1D_BUTTERFLY(re1_b[0], im1_b[0], re1_b[1], im1_b[1],
- re_hi0_b, im_hi0_b, re_hi1_b, im_hi1_b, w0_re, w0_im);
-
-
- re0_a += 2;
- re1_a += 2;
- im0_a += 2;
- im1_a += 2;
-
- re0_b += 2;
- re1_b += 2;
- im0_b += 2;
- im1_b += 2;
-
- i += 8;
- i_rev = BIT_SWAP(i, reverse) / 4;
- } while (i < n_4);
-
- /* Process stages 4 to log2_size-2
- */
- for (stage=4, stride=4; stage<log2_size-1; stage++, stride += stride) {
- w_stride = n_2 >> stage;
- w_2stride = n >> stage;
- w_3stride = w_stride + w_2stride;
- w_4stride = w_2stride + w_2stride;
-
- W0 = W;
- W1 = W + w_stride;
- W2 = W + w_2stride;
- W3 = W + w_3stride;
-
- stride_2 = stride >> 1;
- stride_4 = stride >> 2;
- stride_3_4 = stride_2 + stride_4;
- int stride_2x = 2*stride;
- int stride_4x = 4*stride;
-
- re0 = re; im0 = im;
- re1 = re + stride_2; im1 = im + stride_2;
- re2 = re + stride_4; im2 = im + stride_4;
- re3 = re + stride_3_4; im3 = im + stride_3_4;
-
- for (i=0, offset=0; i<stride_4; i++, offset += w_4stride) {
- /* Compute the twiddle factors
- */
- w0 = W0[offset];
- w1 = W1[offset];
- w2 = W2[offset];
- w3 = W3[offset];
-
- tmp0 = spu_shuffle(w0, w2, shuf_0415);
- tmp1 = spu_shuffle(w1, w3, shuf_0415);
-
- w0_re = spu_shuffle(tmp0, tmp1, shuf_0415);
- w0_im = spu_shuffle(tmp0, tmp1, shuf_2637);
-
- j = i;
- k = i + stride;
-
- do {
- re_lo0_a = re0[j]; im_lo0_a = im0[j];
- re_lo1_a = re1[j]; im_lo1_a = im1[j];
-
- re_hi0_a = re2[j]; im_hi0_a = im2[j];
- re_hi1_a = re3[j]; im_hi1_a = im3[j];
-
- re_lo2_a = re0[k]; im_lo2_a = im0[k];
- re_lo3_a = re1[k]; im_lo3_a = im1[k];
-
- re_hi2_a = re2[k]; im_hi2_a = im2[k];
- re_hi3_a = re3[k]; im_hi3_a = im3[k];
-
- re_lo0_b = re0[j+stride_2x]; im_lo0_b = im0[j+stride_2x];
- re_lo1_b = re1[j+stride_2x]; im_lo1_b = im1[j+stride_2x];
-
- re_hi0_b = re2[j+stride_2x]; im_hi0_b = im2[j+stride_2x];
- re_hi1_b = re3[j+stride_2x]; im_hi1_b = im3[j+stride_2x];
-
- re_lo2_b = re0[k+stride_2x]; im_lo2_b = im0[k+stride_2x];
- re_lo3_b = re1[k+stride_2x]; im_lo3_b = im1[k+stride_2x];
-
- re_hi2_b = re2[k+stride_2x]; im_hi2_b = im2[k+stride_2x];
- re_hi3_b = re3[k+stride_2x]; im_hi3_b = im3[k+stride_2x];
-
- FFT_1D_BUTTERFLY (re0[j], im0[j], re1[j], im1[j], re_lo0_a, im_lo0_a, re_lo1_a, im_lo1_a, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[j], im2[j], re3[j], im3[j], re_hi0_a, im_hi0_a, re_hi1_a, im_hi1_a, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY (re0[k], im0[k], re1[k], im1[k], re_lo2_a, im_lo2_a, re_lo3_a, im_lo3_a, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[k], im2[k], re3[k], im3[k], re_hi2_a, im_hi2_a, re_hi3_a, im_hi3_a, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY (re0[j+stride_2x], im0[j+stride_2x], re1[j+stride_2x], im1[j+stride_2x], re_lo0_b, im_lo0_b, re_lo1_b, im_lo1_b, w0_re, w0_im);
- FFT_1D_BUTTERFLY_HI(re2[j+stride_2x], im2[j+stride_2x], re3[j+stride_2x], im3[j+stride_2x], re_hi0_b, im_hi0_b, re_hi1_b, im_hi1_b, w0_re, w0_im);
-
- FFT_1D_BUTTERFLY (re0[k+stride_2x], im0[k+stride_2x], re1[k+stride_2x], im1[k+stride_2x], re_lo2_b, im_lo2_b, re_lo3_ | |||||||