Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

patch: support sal-fft


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: patch: support sal-fft
  • From: Stefan Seefeld <stefan@xxxxxxxxxxxxxxxx>
  • Date: Thu, 16 Feb 2006 11:22:31 -0500

The attached patch adds support for SAL as a new backend for our FFT engine.
This new backend does not support 3D ffts, and block sizes that are not powers of two.
I therefor (conditionally) masked those tests in tests/fft.cpp that SAL does not
support. This can be reverted as soon as the fft_dispatch framework is in place,
i.e. multiple fft backends are supported in parallel.

I believe the code in src/vsip/impl/sal/fft.h can be tidied up quite a bit,
though I'd prefer to do that only once the code is refactored, i.e. for now
focus on feature completeness.

Regards,
		Stefan
Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.396
diff -u -r1.396 ChangeLog
--- ChangeLog	16 Feb 2006 15:59:52 -0000	1.396
+++ ChangeLog	16 Feb 2006 16:15:44 -0000
@@ -1,5 +1,12 @@
 2006-02-16  Stefan Seefeld  <stefan@xxxxxxxxxxxxxxxx>
 
+	* configure.ac: Add support for sal-fft.
+	* src/vsip/impl/fft-core.hpp: Likewise.
+	* src/vsip/impl/signal-fft.hpp: Likewise.
+	* src/vsip/impl/sal/fft.hpp: Likewise.
+	* tests/fft.cpp: Temporarily mask all tests that sal-fft is known
+	not to support, when building with sal-fft.
+
 	* configure.ac: Emit variable PAR_SERVICE.
 	* vsipl++.pc.in: Publish it.
 	* tests/GNUmakefile.inc.in: Propagate it...
Index: configure.ac
===================================================================
RCS file: /home/cvs/Repository/vpp/configure.ac,v
retrieving revision 1.81
diff -u -r1.81 configure.ac
--- configure.ac	16 Feb 2006 15:59:52 -0000	1.81
+++ configure.ac	16 Feb 2006 16:15:44 -0000
@@ -11,7 +11,7 @@
 dnl Autoconf initialization
 dnl ------------------------------------------------------------------
 AC_PREREQ(2.56)
-AC_REVISION($Revision: 1.81 $)
+AC_REVISION($Revision: 1.80 $)
 AC_INIT(Sourcery VSIPL++, 1.0, vsipl++@xxxxxxxxxxxxxxxx, sourceryvsipl++)
 
 ######################################################################
@@ -110,7 +110,7 @@
 AC_ARG_WITH(fft,
   AS_HELP_STRING([--with-fft=LIB],
                  [Specify FFT engine: fftw3, fftw2-float, fftw2-double,
-		  fftw2-generic, ipp, or builtin.  For fftw2-generic, 
+		  fftw2-generic, ipp, sal, or builtin.  For fftw2-generic, 
 		  float support is in <fftw.h> and -lfftw, not <sfftw.h> 
 	          and -lsfftw.  (Default is fftw3 if found, otherwise 
                   builtin, meaning build and use in-tree fftw3.)]),
@@ -418,6 +418,7 @@
 enable_fftw3="no"
 enable_fftw2="no"
 enable_ipp_fft="no"
+enable_sal_fft="no"
 build_fftw3="no"
 
 if test "$enable_fft_float" = yes -o \
@@ -440,9 +441,11 @@
     enable_fftw2_float="yes"
   elif test "$with_fft" = "ipp"; then
     enable_ipp_fft="yes"
+  elif test "$with_fft" = "sal"; then
+    enable_sal_fft="yes"
   elif test "$with_fft" != "none"; then
     AC_MSG_ERROR([Argument to --with-fft= must be one of fftw3, fftw2-float,
-                  fftw2-double, fftw2-generic, ipp, builtin, or none.])
+                  fftw2-double, fftw2-generic, ipp, sal, builtin, or none.])
   fi 
 fi
 
@@ -500,6 +503,7 @@
   LIBS="$keep_LIBS"
 
   enable_ipp_fft="no"
+  enable_sal_fft="no"
   enable_fftw2="no"
 fi
 
@@ -710,6 +714,7 @@
     AC_SUBST(FFT_LIBS)
 
     enable_ipp_fft="no"
+    enable_sal_fft="no"
   fi
 fi
 
@@ -851,6 +856,14 @@
 #
 # Find the Mercury SAL library, if enabled.
 #
+if test "$enable_sal_fft" == "yes"; then
+  if test "$enable_sal" == "no"; then
+    AC_MSG_ERROR([SAL FFT requires SAL])
+  else
+    enable_sal="yes"
+  fi 
+fi
+
 if test "$enable_sal" != "no"; then
 
   if test -n "$with_sal_include"; then
@@ -863,8 +876,11 @@
   vsipl_sal_h_name="not found"
   AC_CHECK_HEADER([sal.h], [vsipl_sal_h_name='<sal.h>'],, [// no prerequisites])
   if test "$vsipl_sal_h_name" == "not found"; then
-    AC_MSG_ERROR([SAL enabled, but no sal.h detected])
-    CPPFLAGS="$save_CPPFLAGS"
+    if test "$enable_sal" != "probe" -o "$enable_sal_fft" == "yes"; then
+      AC_MSG_ERROR([SAL enabled, but no sal.h detected])
+    else
+      CPPFLAGS="$save_CPPFLAGS"
+    fi
   else
 
     # Find the library.
@@ -908,6 +924,21 @@
     AC_SUBST(VSIP_IMPL_HAVE_SAL, 1)
     AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SAL, 1,
       [Define to set whether or not to use Mercury's SAL library.])
+
+    if test "$enable_sal_fft" != "no"; then 
+      AC_SUBST(VSIP_IMPL_SAL_FFT, 1)
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_SAL_FFT, 1,
+	    [Define to use Mercury's SAL library to perform FFTs.])
+      if test "$enable_fft_float" = yes; then
+	AC_DEFINE_UNQUOTED(VSIP_IMPL_FFT_USE_FLOAT, $vsip_impl_use_float,
+	      [Define to build code with support for FFT on float types.])
+      fi
+      if test "$enable_fft_double" = yes; then
+	AC_DEFINE_UNQUOTED(VSIP_IMPL_FFT_USE_DOUBLE, $vsip_impl_use_double,
+	      [Define to build code with support for FFT on double types.])
+      fi
+    fi
+
   fi
 
 fi
Index: src/vsip/impl/fft-core.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/fft-core.hpp,v
retrieving revision 1.19
diff -u -r1.19 fft-core.hpp
--- src/vsip/impl/fft-core.hpp	22 Dec 2005 08:21:23 -0000	1.19
+++ src/vsip/impl/fft-core.hpp	16 Feb 2006 16:15:45 -0000
@@ -68,6 +68,10 @@
 # include <ippi.h>
 #endif
 
+#if defined(VSIP_IMPL_SAL_FFT)
+# include <vsip/impl/sal/fft.hpp>
+#endif // VSIP_IMPL_SAL_FFT
+
 //////////////////////////////////////////////////////////////////////////
 //
 // Local includes
@@ -1503,7 +1507,6 @@
 
 #endif // VSIP_IMPL_IPP_FFT
 
-
 //////////////////////////////////////////////////////////////////////////
 //
 // Generic entry points
Index: src/vsip/impl/signal-fft.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/signal-fft.hpp,v
retrieving revision 1.29
diff -u -r1.29 signal-fft.hpp
--- src/vsip/impl/signal-fft.hpp	10 Feb 2006 22:24:02 -0000	1.29
+++ src/vsip/impl/signal-fft.hpp	16 Feb 2006 16:15:46 -0000
@@ -82,7 +82,19 @@
   void* p_buffer_;      // temporary storage not allocated in the plan
   unsigned row_step_;    // length in bytes of 2D row.
 # endif
-
+#elif defined(VSIP_IMPL_SAL_FFT)
+  // Array sizes (in log2 units).
+  // size_[0] always corresponds to the line being processed first,
+  // and thus may depend on the sD template parameter.
+  unsigned long size_[Dim];
+  void *plan_;
+  bool is_forward_;
+
+  int  stride_; // 1 for sd_ == 0, length of row for sd_ == 1.
+  int  dist_;   // 1 for sd_ == 1, length of column for sd_ == 0.
+  // used only for Fftm
+  int  sd_;     // 0: compute FFTs of rows; 1: of columns
+  int  runs_;   // number of 1D FFTs to perform; varies by map
 #endif
 
   // if any of the above functions applies the scale itself, it must
@@ -421,6 +433,37 @@
     static const vsip::dimension_type  transpose_target =
       (sizeof(inT) != sizeof(outT)) ? Fft_imp::dim-1 : sD;
 
+#elif defined(VSIP_IMPL_SAL_FFT) 
+
+    // In some contexts, SAL destroys the input data itself, and sometimes
+    // we have to modify it to 'pack' data into the format SAL expects
+    // (see SAL Tutorial for details).
+    // Therefor, we always copy the input.
+    static const bool  force_copy = true;
+    // SAL cannot handle non-unit strides properly as 'complex' isn't
+    // a real (packed) datatype, so the stride would be applied to the real/imag
+    // offset, too.
+    static const vsip::dimension_type  transpose_target = Fft_imp::dim-1;
+
+    // FIXME: For now SAL always operates on a copy of the input / output buffers
+    //        with unit stride, so we only adjust here to effectively transpose if
+    //        required by the sD parameter (that concerns fft 2D and fftm).
+    //        If these copies can be avoided, we have to track the compound strides
+    //        here.
+    // These are already dealt with for fftm elsewhere, so we only adjust them
+    // here for fft2d
+//     if (sD)
+//     {
+//       this->core_->stride_ = 1;
+//       this->core_->dist_ = 1 << this->core_->size_[0];
+//     }
+//     else
+//     {
+//       this->core_->stride_ = 1 << this->core_->size_[0];
+//       this->core_->dist_ = 1;
+//     }
+    this->core_->stride_ = 1;
+    this->core_->dist_ = 1 << this->core_->size_[0];
 #else
 
     // ideal case: can c->r, r->c on any axis, never clobbers input.
@@ -435,7 +478,7 @@
             Fft_imp::dim,sD,transpose_target>::type>::type
         raw_in(in.block(), impl::SYNC_IN,
 	  impl::Ext_data<in_block_type>(this->in_temp_).data());
-      
+
       typename impl::Maybe_force_copy<force_copy,Block1,
 	  typename impl::Maybe_transpose<
             Fft_imp::dim,sD,transpose_target>::type>::type
@@ -744,6 +787,11 @@
       static const bool force_copy = inPlace::value;
       static const vsip::dimension_type  transpose_target = 1;
 
+#elif defined(VSIP_IMPL_SAL_FFT)
+
+      static const bool force_copy = true;
+      static const vsip::dimension_type  transpose_target = 1;
+
 #else
       static const bool force_copy = false;
       static const vsip::dimension_type  transpose_target = axis;
@@ -820,11 +868,20 @@
 #if defined(VSIP_IMPL_FFTW3)
       static const bool  must_copy = (sD == vsip::col);
       static const vsip::dimension_type  transpose_target = 1;
+#elif defined(VSIP_IMPL_SAL_FFT)
+    // In some contexts, SAL destroys the input data itself, and sometimes
+    // we have to modify it to 'pack' data into the format SAL expects
+    // (see SAL Tutorial for details).
+    // Therefor, we always copy the input.
+    static const bool  must_copy = true;
+    // SAL cannot handle non-unit strides properly as 'complex' isn't
+    // a real (packed) datatype, so the stride would be applied to the real/imag
+    // offset, too.
+    static const vsip::dimension_type  transpose_target = 1;
 #else
       static const bool  must_copy = false;
       static const vsip::dimension_type  transpose_target = axis;
 #endif
-
       typename impl::Maybe_force_copy<
 	  must_copy,typename local_type::block_type,
           typename impl::Maybe_transpose<2,axis,transpose_target>::type>::type
@@ -833,7 +890,7 @@
 	  impl::Ext_data<in_block_type>(this->in_temp_).data());
 
       const bool native_order = (axis == transpose_target);
-      
+
       this->core_->scale_ = this->scale_;
       this->core_->runs_ = local_inout.size(1-axis);
       this->core_->stride_ = 1;
Index: src/vsip/impl/sal/fft.hpp
===================================================================
RCS file: src/vsip/impl/sal/fft.hpp
diff -N src/vsip/impl/sal/fft.hpp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/vsip/impl/sal/fft.hpp	16 Feb 2006 16:15:46 -0000
@@ -0,0 +1,624 @@
+/* Copyright (c) 2006 by CodeSourcery, LLC.  All rights reserved. */
+
+/** @file    vsip/impl/sal/sal.hpp
+    @author  Stefan Seefeld
+    @date    2006-02-02
+    @brief   VSIPL++ Library: FFT wrappers and traits to bridge with Mercury's SAL.
+*/
+
+#ifndef VSIP_IMPL_SAL_FFT_HPP
+#define VSIP_IMPL_SAL_FFT_HPP
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/support.hpp>
+#include <vsip/domain.hpp>
+#include <vsip/impl/signal-fft.hpp>
+#include <sal.h>
+
+/***********************************************************************
+  Declarations
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace sal
+{
+
+// TODO: figure out what exactly this ESAL flag is and tune it.
+//       (requires SAL, not CSAL, and real h/w)
+long const ESAL = 0;
+
+inline unsigned int
+int_log2(unsigned int size)    // assume size = 2^n, != 0, return n.
+{
+  unsigned int n = 0;
+  while (size >>= 1) ++n;
+  return n;
+}
+
+template <dimension_type D> struct log2n;
+
+template <>
+struct log2n<1>
+{
+  static unsigned long translate(Domain<1> const &d, int, unsigned long *out) 
+  {
+    *out = int_log2(d.size());
+    return *out;
+  }
+};
+
+template <>
+struct log2n<2>
+{
+  static unsigned long translate(Domain<2> const &d, int sd, unsigned long *out)
+  {
+    // If sd == 1, invert size[0] and size[1], i.e. transpose.
+    *out = int_log2(d[sd].size());
+    *(out + 1) = int_log2(d[1 - sd].size());
+    return std::max(*out, *(out + 1));
+  }
+};
+
+template <>
+struct log2n<3>
+{
+  static unsigned long translate(Domain<3> const &d, int sd, unsigned long *out)
+  {
+    // If sd == 1, invert size[0] and size[1], i.e. transpose.
+    *out = int_log2(d[sd].size());
+    *(out + 1) = int_log2(d[1 - sd].size());
+    *(out + 2) = int_log2(d[2].size());
+    return std::max(*out, std::max(*(out + 1), *(out + 2)));
+  }
+};
+
+/// Helper trait used to discriminate setup / cleanup functions and options.
+template <typename inT, typename outT> struct fft_inout_trait;
+
+template <>
+struct fft_inout_trait<float, float>
+{
+  static bool const single = true;
+  static long const option = FFT_REAL_ONLY;
+};
+
+template <>
+struct fft_inout_trait<float, std::complex<float> >
+{
+  static bool const single = true;
+  static long const option = 0;
+};
+
+template <>
+struct fft_inout_trait<std::complex<float>, float>
+{
+  static bool const single = true;
+  static long const option = 0;
+};
+
+template <>
+struct fft_inout_trait<std::complex<float>, std::complex<float> >
+{
+  static bool const single = true;
+  static long const option = FFT_COMPLEX_ONLY;
+};
+
+template <>
+struct fft_inout_trait<double, double>
+{
+  static bool const single = false;
+  static long const option = FFT_REAL_ONLY;
+};
+
+template <>
+struct fft_inout_trait<double, std::complex<double> >
+{
+  static bool const single = false;
+  static long const option = 0;
+};
+
+template <>
+struct fft_inout_trait<std::complex<double>, double>
+{
+  static bool const single = false;
+  static long const option = 0;
+};
+
+template <>
+struct fft_inout_trait<std::complex<double>, std::complex<double> >
+{
+  static bool const single = false;
+  static long const option = FFT_COMPLEX_ONLY;
+};
+
+template <dimension_type D,
+	  typename inT, typename outT,
+	  bool single = fft_inout_trait<inT, outT>::single >
+struct fft_planner;
+
+template <dimension_type D,
+	  typename inT, typename outT>
+struct fft_planner<D, inT, outT, true /* single */>
+{
+  static void
+  create(void *&plan, unsigned long size) VSIP_THROW((std::bad_alloc))
+  {
+    long options = fft_inout_trait<inT, outT>::option;
+    FFT_setup setup = 0;
+    unsigned long nbytes = 0;
+    fft_setup(size, options, &setup, &nbytes);
+    plan = setup;
+  }
+  static void 
+  destroy(void *plan)
+  {
+    FFT_setup setup = reinterpret_cast<FFT_setup>(plan);
+    fft_free(&setup);
+  }
+};
+
+template <dimension_type D,
+	  typename inT, typename outT>
+struct fft_planner<D, inT, outT, false /* single */>
+{
+  static void
+  create(void *&plan, unsigned long size) VSIP_THROW((std::bad_alloc))
+  {
+    long options = fft_inout_trait<inT, outT>::option;
+    FFT_setupd setup = 0;
+    unsigned long nbytes = 0;
+    fft_setupd(size, options, &setup, &nbytes);
+    plan = setup;
+  }
+  static void 
+  destroy(void *plan)
+  {
+    FFT_setupd setup = reinterpret_cast<FFT_setupd>(plan);
+    fft_freed(&setup);
+  }
+};
+
+} // namespace vsip::impl::sal
+
+template<dimension_type D, typename inT, typename outT, bool doFftm>
+inline void
+create_plan(Fft_core<D, inT, outT, doFftm>& self, Domain<D> const& dom, 
+	    int sd, int expn, unsigned /* will_call */, inT*, outT*)
+  VSIP_THROW((std::bad_alloc))
+{
+  self.is_forward_ = (expn == -1);
+  unsigned long max = sal::log2n<D>::translate(dom, sd, self.size_);
+  sal::fft_planner<D, inT, outT>::create(self.plan_, max);
+}
+
+template<dimension_type D, typename inT, typename outT, bool doFftm>
+inline void
+destroy(Fft_core<D, inT, outT, doFftm>& self) VSIP_THROW((std::bad_alloc))
+{
+  sal::fft_planner<D, inT, outT>::destroy(self.plan_);
+}
+
+inline void
+in_place(Fft_core<1, std::complex<float>, std::complex<float>, false>& self,
+	 std::complex<float> *inout) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  COMPLEX *data = reinterpret_cast<COMPLEX *>(inout);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft_cipx(&setup, data, stride, self.size_[0], direction, sal::ESAL);
+}
+
+inline void
+in_place(Fft_core<1, std::complex<double>, std::complex<double>, false>& self,
+	 std::complex<double> *inout) VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  DOUBLE_COMPLEX *data = reinterpret_cast<DOUBLE_COMPLEX *>(inout);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft_cipdx(&setup, data, stride, self.size_[0], direction, sal::ESAL);
+}
+
+inline void
+in_place(Fft_core<2, std::complex<float>, std::complex<float>, false>& self,
+	 std::complex<float> *inout) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  COMPLEX *data = reinterpret_cast<COMPLEX *>(inout);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft2d_cipx(&setup, data, stride, 2 << self.size_[1],
+	     self.size_[1], self.size_[0], direction, sal::ESAL);
+}
+
+inline void
+in_place(Fft_core<2, std::complex<double>, std::complex<double>, false>& self,
+	 std::complex<double> *inout) VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  DOUBLE_COMPLEX *data = reinterpret_cast<DOUBLE_COMPLEX *>(inout);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft2d_cipdx(&setup, data, stride, 2 << self.size_[1],
+	      self.size_[1], self.size_[0], direction, sal::ESAL);
+}
+
+template <vsip::dimension_type D, typename inT, typename outT, bool doFftm>
+inline void
+from_to(Fft_core<D, inT, outT, doFftm>& self, inT const* in, outT* out)
+  VSIP_NOTHROW
+{
+  assert(0 && "Sorry, operation not yet supported for this type !");
+  // TBD
+}
+
+// 1D real -> complex forward fft
+
+inline void
+from_to(Fft_core<1, float, std::complex<float>, false>& self,
+	float const* in, std::complex<float>* out)
+  VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  float *out_ = reinterpret_cast<float*>(out);
+  fft_ropx(&setup, const_cast<float*>(in), 1, out_, 1,
+	   self.size_[0], FFT_FORWARD, sal::ESAL);
+  // unpack the data (see SAL reference for details).
+  int const N = (1 << self.size_[0]) + 2;
+  out_[N - 2] = out_[1];
+  out_[1] = 0.f;
+  out_[N - 1] = 0.f;
+  // forward fft_ropx is scaled up by 2.
+  float scale = 0.5f;
+  vsmulx(out_, 1, &scale, out_, 1, N, sal::ESAL);
+}
+
+// 1D complex -> real inverse fft
+
+inline void
+from_to(Fft_core<1, std::complex<float>, float, false>& self,
+	std::complex<float> const* in, float* out)
+  VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  float *in_ = 
+    reinterpret_cast<float *>(const_cast<std::complex<float>*>(in));
+  // pack the data (see SAL reference for details).
+  int const N = (1 << self.size_[0]) + 2;
+  in_[1] = in_[N - 2];
+  in_[N - 2] = in_[N - 1] = 0.f;
+
+  fft_ropx(&setup, in_, 1,
+	   out, 1, self.size_[0], FFT_INVERSE, sal::ESAL);
+}
+
+// 1D real -> complex forward fft
+
+inline void
+from_to(Fft_core<1, double, std::complex<double>, false>& self,
+	double const* in, std::complex<double>* out)
+  VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  double *out_ = reinterpret_cast<double*>(out);
+  fft_ropdx(&setup, const_cast<double*>(in), 1, out_, 1,
+	    self.size_[0], FFT_FORWARD, sal::ESAL);
+  // unpack the data (see SAL reference for details).
+  int const N = (1 << self.size_[0]) + 2;
+  out_[N - 2] = out_[1];
+  out_[1] = 0.f;
+  out_[N - 1] = 0.f;
+  // forward fft_ropx is scaled up by 2.
+  double scale = 0.5f;
+  vsmuldx(out_, 1, &scale, out_, 1, N, sal::ESAL);
+}
+
+// 1D complex -> real inverse fft
+
+inline void
+from_to(Fft_core<1, std::complex<double>, double, false>& self,
+	std::complex<double> const* in, double* out)
+  VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  double *in_ = 
+    reinterpret_cast<double *>(const_cast<std::complex<double>*>(in));
+  // pack the data (see SAL reference for details).
+  int const N = (1 << self.size_[0]) + 2;
+  in_[1] = in_[N - 2];
+  in_[N - 2] = in_[N - 1] = 0.f;
+
+  fft_ropdx(&setup, in_, 1,
+	    out, 1, self.size_[0], FFT_INVERSE, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<1, std::complex<float>, std::complex<float>, false>& self,
+	std::complex<float> const *in, std::complex<float> *out) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  COMPLEX *in_ = reinterpret_cast<COMPLEX *>(const_cast<std::complex<float>*>(in));
+  COMPLEX *out_ = reinterpret_cast<COMPLEX *>(out);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft_copx(&setup, in_, stride, out_, stride, self.size_[0],
+	   direction, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<1, std::complex<double>, std::complex<double>, false>& self,
+	std::complex<double> const *in, std::complex<double> *out) VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  DOUBLE_COMPLEX *in_ = 
+    reinterpret_cast<DOUBLE_COMPLEX *>(const_cast<std::complex<double>*>(in));
+  DOUBLE_COMPLEX *out_ = reinterpret_cast<DOUBLE_COMPLEX *>(out);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft_copdx(&setup, in_, stride, out_, stride, self.size_[0],
+	    direction, sal::ESAL);
+}
+
+// 2D real -> complex forward fft
+
+template <typename T>
+inline void 
+unpack(T *data, unsigned long N, unsigned long M, unsigned long stride)
+{
+  // unpack the data (see SAL reference, figure 3.6, for details).
+  unsigned long const M2 = M/2 + 1;
+  T t10r = data[N * stride];         // (1, 0).real()
+  T t10i = data[N * stride + 1];     // (1, 0).imag()
+  data[N * stride + 1] = 0.;         // (0, 0).imag()
+  data[N * stride - 2] = data[1];    // (0,-1).real()
+  data[N * stride - 1] = 0.;         // (0,-1).imag()
+  data[1] = 0.;
+  for (unsigned long r = 1; r != M2 - 1; ++r)
+  {
+    // set last row (r,-1)
+    data[(r + 1) * N * stride - 2] = data[2 * r * N * stride + 1];
+    data[(r + 1) * N * stride - 1] = data[(2 * r + 1) * N * stride + 1];
+    data[2 * r * N * stride + 1] = 0.;
+    data[(2 * r + 1) * N * stride + 1] = 0.;
+    // set first row (r, 0)
+    data[r * N * stride] = data[2 * r * N * stride];
+    data[r * N * stride + 1] = data[(2 * r + 1) * N * stride];
+    data[2 * r * N * stride] = 0.;
+    data[(2 * r + 1) * N * stride] = 0.;
+  }
+  data[(M2 - 1) * N * stride] = t10r;
+  data[(M2 - 1) * N * stride + 1] = 0.;
+  data[M2  * N * stride - 2] = t10i;
+  data[M2  * N * stride - 1] = 0;
+
+  // Now fill in the missing cells by symmetry.
+  for (unsigned long r = M2; r != M; ++r)
+  {
+    // first column (r, 0)
+    data[r * N * stride] = data[(M - r) * N * stride];
+    data[r * N * stride + 1] = - data[(M - r) * N * stride + 1];
+    // last column (r, -1)
+    data[(r + 1) * N * stride - 2] = data[(M - r + 1) * N * stride - 2];
+    data[(r + 1) * N * stride - 1] = - data[(M - r + 1) * N * stride - 1];
+  }
+}
+
+inline void
+from_to(Fft_core<2, float, std::complex<float>, false>& self,
+	float const* in, std::complex<float>* out)
+  VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  float *out_ = reinterpret_cast<float*>(out);
+  // The size of the output array is (N/2) x M (if measured in std::complex<float>)
+  unsigned long const N = (1 << self.size_[1]) + 2;
+  unsigned long const M = (1 << self.size_[0]);
+  fft2d_ropx(&setup, const_cast<float*>(in), self.stride_, self.dist_,
+	     out_, self.stride_, N,
+	     self.size_[1], self.size_[0], FFT_FORWARD, sal::ESAL);
+
+  // unpack the data (see SAL reference, figure 3.6, for details).
+  unpack(out_, N, M, self.stride_);
+  // forward fft_ropx is scaled up by 2.
+  float scale = 0.5f;
+  for (unsigned long i = 0; i != M; ++i, out_ += N)
+    vsmulx(out_, self.stride_, &scale, out_, self.stride_, N, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<2, double, std::complex<double>, false>& self,
+	double const* in, std::complex<double>* out)
+  VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  double *out_ = reinterpret_cast<double*>(out);
+  // The size of the output array is (N/2) x M (if measured in std::complex<float>)
+  unsigned long const N = (1 << self.size_[1]) + 2;
+  unsigned long const M = (1 << self.size_[0]);
+  fft2d_ropdx(&setup, const_cast<double*>(in), self.stride_, self.dist_,
+	      out_, self.stride_, N,
+	      self.size_[1], self.size_[0], FFT_FORWARD, sal::ESAL);
+
+  // unpack the data (see SAL reference, figure 3.6, for details).
+  unpack(out_, N, M, self.stride_);
+
+  // forward fft_ropx is scaled up by 2.
+  double scale = 0.5f;
+  for (unsigned long i = 0; i != M; ++i, out_ += N)
+    vsmuldx(out_, self.stride_, &scale, out_, self.stride_, N, sal::ESAL);
+}
+
+// 2D complex -> real inverse fft
+
+template <typename T>
+inline void 
+pack(T *data, unsigned long N, unsigned long M, unsigned long stride)
+{
+  unsigned long const M2 = M/2 + 1;
+
+  T t10i = data[M2  * N * stride - 2];
+  T t10r = data[(M2 - 1) * N * stride];
+
+  for (unsigned long r = M2 - 2; r; --r)
+  {
+    // pack first row (r, 0)
+    data[2 * r * N * stride] = data[r * N * stride];
+    data[(2 * r + 1) * N * stride] = data[r * N * stride + 1];
+    // pack last row (r,-1)
+    data[2 * r * N * stride + 1] = data[(r + 1) * N * stride - 2];
+    data[(2 * r + 1) * N * stride + 1] = data[(r + 1) * N * stride - 1];
+  }
+
+  data[N * stride] = t10r;         // (1, 0).real()
+  data[N * stride + 1] = t10i;     // (1, 0).imag()
+  data[1] = data[N * stride - 2];  // (0,-1).real()
+}
+
+inline void
+from_to(Fft_core<2, std::complex<float>, float, false>& self,
+	std::complex<float> const* in, float* out)
+  VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  float *in_ = 
+    reinterpret_cast<float *>(const_cast<std::complex<float>*>(in));
+  // The size of the output array is (N/2) x M (if measured in std::complex<float>)
+  // pack the data (see SAL reference for details).
+  int const N = (1 << self.size_[1]) + 2;
+  unsigned long const M = (1 << self.size_[0]);
+  pack(in_, N, M, self.stride_);
+  fft2d_ropx(&setup, in_, 1, N,
+	     out, 1, 1 << self.size_[1],
+	     self.size_[1], self.size_[0], FFT_INVERSE, sal::ESAL);
+  // inverse fft_ropx is scaled up by N.
+//   float N = 1 << self.size_[1] + 2;
+//   vsmul(out, 1, &N, out, 1, (int)N);
+}
+
+inline void
+from_to(Fft_core<2, std::complex<double>, double, false>& self,
+	std::complex<double> const* in, double* out)
+  VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  double *in_ = 
+    reinterpret_cast<double *>(const_cast<std::complex<double>*>(in));
+  // pack the data (see SAL reference for details).
+  int const N = (1 << self.size_[1]) + 2;
+  in_[1] = in_[N - 2];
+  in_[N - 2] = in_[N - 1] = 0.f;
+
+  fft2d_ropdx(&setup, in_, 1, 1 << self.size_[1],
+	      out, 1, 1 << self.size_[1],
+	      self.size_[1], self.size_[0], FFT_INVERSE, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<2, std::complex<float>, std::complex<float>, false>& self,
+	std::complex<float> const *in, std::complex<float> *out) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  COMPLEX *in_ = reinterpret_cast<COMPLEX *>(const_cast<std::complex<float>*>(in));
+  COMPLEX *out_ = reinterpret_cast<COMPLEX *>(out);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft2d_copx(&setup, in_, stride, 2 << self.size_[1],
+	     out_, stride, 2 << self.size_[1],
+	     self.size_[1], self.size_[0],
+	     direction, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<2, std::complex<double>, std::complex<double>, false>& self,
+	std::complex<double> const *in, std::complex<double> *out) VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  DOUBLE_COMPLEX *in_ = 
+    reinterpret_cast<DOUBLE_COMPLEX *>(const_cast<std::complex<double>*>(in));
+  DOUBLE_COMPLEX *out_ = reinterpret_cast<DOUBLE_COMPLEX *>(out);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fft2d_copdx(&setup, in_, stride, stride << self.size_[1],
+	      out_, stride, stride << self.size_[1],
+	      self.size_[1], self.size_[0],
+	      direction, sal::ESAL);
+}
+
+// FFTM 
+
+inline void
+in_place(Fft_core<2, std::complex<float>, std::complex<float>, true>& self,
+	 std::complex<float> *inout) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fftm_cipx(&setup, reinterpret_cast<COMPLEX *>(inout),
+	    2, 2 << self.size_[1], self.size_[1], self.runs_,
+	    direction, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<2, std::complex<float>, std::complex<float>, true>& self,
+	std::complex<float> const *in, std::complex<float> *out) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  COMPLEX *in_ = 
+    reinterpret_cast<COMPLEX *>(const_cast<std::complex<float>*>(in));
+  COMPLEX *out_ = reinterpret_cast<COMPLEX *>(out);
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fftm_copx(&setup, in_, self.stride_, self.dist_,
+	    out_, 2, 2 << self.size_[1], self.size_[1], self.runs_,
+	    direction, sal::ESAL);
+}
+
+inline void
+from_to(Fft_core<2, float, std::complex<float>, true>& self,
+	float const *in, std::complex<float> *out) VSIP_NOTHROW
+{
+  FFT_setup setup = reinterpret_cast<FFT_setup>(self.plan_);
+  float *out_ = reinterpret_cast<float*>(out);
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fftm_ropx(&setup, const_cast<float *>(in), self.stride_, self.dist_,
+ 	    out_, self.stride_, self.dist_ + 2,
+ 	    self.size_[1], self.runs_,
+ 	    direction, sal::ESAL);
+  // Unpack the data (see SAL reference for details), and scale back by 1/2.
+  int const N = (1 << self.size_[1]) + 2;
+  float scale = 0.5f;
+  for (unsigned int i = 0; i != self.runs_; ++i, out_ += self.dist_ + 2)
+  {
+    out_[(N - 2) * self.stride_] = out_[self.stride_];
+    out_[self.stride_] = 0.f;
+    out_[(N - 1) * self.stride_] = 0.f;
+    vsmulx(out_, self.stride_, &scale, out_, self.stride_, N, sal::ESAL);
+  }
+}
+
+inline void
+from_to(Fft_core<2, std::complex<double>, std::complex<double>, true>& self,
+	std::complex<double> const *in, std::complex<double> *out) VSIP_NOTHROW
+{
+  FFT_setupd setup = reinterpret_cast<FFT_setupd>(self.plan_);
+  DOUBLE_COMPLEX *in_ = 
+    reinterpret_cast<DOUBLE_COMPLEX *>(const_cast<std::complex<double>*>(in));
+  DOUBLE_COMPLEX *out_ = reinterpret_cast<DOUBLE_COMPLEX *>(out);
+  long stride = 2;
+  long direction = self.is_forward_ ? FFT_FORWARD : FFT_INVERSE;
+  fftm_copdx(&setup, in_, stride, stride << self.size_[1],
+	     out_, stride, stride << self.size_[1],
+	     self.size_[1], 1 << self.size_[0],
+	     direction, sal::ESAL);
+}
+
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif
+
Index: tests/fft.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/tests/fft.cpp,v
retrieving revision 1.10
diff -u -r1.10 fft.cpp
--- tests/fft.cpp	22 Dec 2005 08:21:23 -0000	1.10
+++ tests/fft.cpp	16 Feb 2006 16:15:46 -0000
@@ -762,14 +762,20 @@
 
 unsigned  sizes[][3] =
 {
+#if !defined(VSIP_IMPL_SAL_FFT)
   { 2, 2, 2 },
+#endif
   { 8, 8, 8 },
+#if !defined(VSIP_IMPL_SAL_FFT)
   { 1, 1, 1 },
   { 2, 2, 1 },
   { 2, 8, 128 },
+#endif
   { 3, 5, 7 },
+#if !defined(VSIP_IMPL_SAL_FFT)
   { 2, 24, 48 },
   { 24, 1, 5 },
+#endif
 };
 
 //   the generic test
@@ -924,17 +930,23 @@
 #if defined(VSIP_IMPL_FFT_USE_FLOAT)
 
   test_by_ref<complex<float> >(2, 64);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<float> >(1, 68);
+#endif
   test_by_ref<complex<float> >(2, 256);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<float> >(2, 252);
   test_by_ref<complex<float> >(3, 17);
+#endif
 
   test_by_val<complex<float> >(1, 128);
   test_by_val<complex<float> >(2, 256);
   test_by_val<complex<float> >(3, 512);
 
   test_real<float>(1, 128);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_real<float>(2, 242);
+#endif
   test_real<float>(3, 16);
 
 #endif 
@@ -942,17 +954,23 @@
 #if defined(VSIP_IMPL_FFT_USE_DOUBLE)
 
   test_by_ref<complex<double> >(2, 64);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<double> >(1, 68);
+#endif
   test_by_ref<complex<double> >(2, 256);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<double> >(2, 252);
   test_by_ref<complex<double> >(3, 17);
+#endif
 
   test_by_val<complex<double> >(1, 128);
   test_by_val<complex<double> >(2, 256);
   test_by_val<complex<double> >(3, 512);
 
   test_real<double>(1, 128);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_real<double>(2, 242);
+#endif
   test_real<double>(3, 16);
 
 #endif 
@@ -960,18 +978,24 @@
 #if defined(VSIP_IMPL_FFT_USE_LONG_DOUBLE)
 
 #if ! defined(VSIP_IMPL_IPP_FFT)
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<long double> >(2, 64);
+#endif 
   test_by_ref<complex<long double> >(1, 68);
   test_by_ref<complex<long double> >(2, 256);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_by_ref<complex<long double> >(2, 252);
   test_by_ref<complex<long double> >(3, 17);
+#endif 
 
   test_by_val<complex<long double> >(1, 128);
   test_by_val<complex<long double> >(2, 256);
   test_by_val<complex<long double> >(3, 512);
 
   test_real<long double>(1, 128);
+#if !defined(VSIP_IMPL_SAL_FFT)
   test_real<long double>(2, 242);
+#endif 
   test_real<long double>(3, 16);
 #endif
 
@@ -988,14 +1012,17 @@
   test_fft<0,0,float,false,2,vsip::fft_fwd>();
 
 #if ! defined(VSIP_IMPL_IPP_FFT)
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,float,false,3,vsip::fft_fwd>();
-
+#endif
   test_fft<0,0,float,true,2,1>();
   test_fft<0,0,float,true,2,0>();
 
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,float,true,3,2>();
   test_fft<0,0,float,true,3,1>();
   test_fft<0,0,float,true,3,0>();
+#endif
 #endif   /* VSIP_IMPL_IPP_FFT */
 
 #endif 
@@ -1004,14 +1031,17 @@
 
 #if ! defined(VSIP_IMPL_IPP_FFT)
   test_fft<0,0,double,false,2,vsip::fft_fwd>();
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,double,false,3,vsip::fft_fwd>();
-
+#endif
   test_fft<0,0,double,true,2,1>();
   test_fft<0,0,double,true,2,0>();
 
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,double,true,3,2>();
   test_fft<0,0,double,true,3,1>();
   test_fft<0,0,double,true,3,0>();
+#endif
 #endif  /* VSIP_IMPL_IPP_FFT */
 
 #endif
@@ -1020,14 +1050,17 @@
 
 #if ! defined(VSIP_IMPL_IPP_FFT)
   test_fft<0,0,double,false,2,vsip::fft_fwd>();
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,double,false,3,vsip::fft_fwd>();
-
+#endif
   test_fft<0,0,double,true,2,1>();
   test_fft<0,0,double,true,2,0>();
 
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,0,double,true,3,2>();
   test_fft<0,0,double,true,3,1>();
   test_fft<0,0,double,true,3,0>();
+#endif
 #endif  /* VSIP_IMPL_IPP_FFT */
 
 #endif
@@ -1075,7 +1108,7 @@
   test_fft<2,2,SCALAR,true,2,1>();
   test_fft<2,2,SCALAR,true,2,0>();
 
-
+#if ! defined(VSIP_IMPL_SAL_FFT)
   test_fft<0,1,SCALAR,false,3,vsip::fft_fwd>();
   test_fft<0,2,SCALAR,false,3,vsip::fft_fwd>();
   test_fft<1,0,SCALAR,false,3,vsip::fft_fwd>();
@@ -1111,7 +1144,7 @@
   test_fft<2,2,SCALAR,true,3,2>();
   test_fft<2,2,SCALAR,true,3,1>();
   test_fft<2,2,SCALAR,true,3,0>();
-
+#endif
 #endif  /* VSIP_IMPL_IPP_FFT */
 
 #endif