Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[patch] Misc fixes


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [patch] Misc fixes
  • From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
  • Date: Tue, 31 Jul 2007 09:11:27 -0400

This patch
 - Disables FFTW split-complex altogether.  Out-of-place complex split FFT appears broken on belgarath.
 - Fixes ifdef logic bug that prevented optimized convolution BEs from being used.
 - Adds a 2D conv benchmark.

Patch applied.

				-- Jules

--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 177792)
+++ ChangeLog	(working copy)
@@ -1,3 +1,17 @@
+2007-07-31  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
+
+	* src/vsip/core/signal/conv.hpp: Fix ifdef logic bug.  Optimized
+	  BEs were not being included.
+	* src/vsip/opt/ipp/fir.hpp: Use aligned_array for temporary storage
+	  (using Vector doesn't work when dense format is split).
+	* src/vsip/opt/fftw3/fft_impl.cpp (fftw3_complex_type): Centralize
+	  choice of complex_type.
+	  (USE_FFTW_SPLIT): New macro, avoid use of split-complex altogether.
+	* src/vsip/opt/diag/eval.hpp (Dispatch_name): Add missing impl tags.
+	* tests/fft.cpp: Fix Wall warning.
+	* benchmarks/conv.cpp: Add diag output.
+	* benchmarks/conv2d.cpp: New file, benchmark for 2D convolution.
+
 2007-07-30  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
 
 	* src/vsip/core/layout.hpp: Fix typo.
Index: src/vsip/core/signal/conv.hpp
===================================================================
--- src/vsip/core/signal/conv.hpp	(revision 176624)
+++ src/vsip/core/signal/conv.hpp	(working copy)
@@ -22,9 +22,8 @@
 #include <vsip/core/signal/types.hpp>
 #include <vsip/core/profile.hpp>
 #include <vsip/core/signal/conv_common.hpp>
-#ifndef VSIP_IMPL_REF_IMPL
+#if !VSIP_IMPL_REF_IMPL
 # include <vsip/opt/signal/conv_ext.hpp>
-#else
 # if VSIP_IMPL_HAVE_IPP
 #  include <vsip/opt/ipp/conv.hpp>
 # endif
Index: src/vsip/opt/ipp/fir.hpp
===================================================================
--- src/vsip/opt/ipp/fir.hpp	(revision 176624)
+++ src/vsip/opt/ipp/fir.hpp	(working copy)
@@ -128,11 +128,9 @@
     // Make data be available with unit-stride.
     typedef Layout<1, tuple<0,1,2>, Stride_unit, Cmplx_inter_fmt> layout_type;
     Ext_data<sub_block_type, layout_type> ext_in
-      (sub_in_block, SYNC_IN,
-       Ext_data<block_type>(this->temp_in_.block()).data());
+      (sub_in_block, SYNC_IN, temp_in_.get());
     Ext_data<sub_block_type, layout_type>  ext_out
-      (sub_out_block, SYNC_OUT,
-       Ext_data<block_type>(this->temp_out_.block()).data());
+      (sub_out_block, SYNC_OUT, temp_out_.get());
     Ext_data<block_type, layout_type>  ext_kernel(this->kernel_.block());
     Ext_data<block_type, layout_type>  ext_state(this->state_.block());
     length_type const d = this->decimation();
@@ -158,8 +156,8 @@
   Vector<T, block_type> kernel_; 
   Vector<T, block_type> state_;
   length_type state_saved_;   // number of elements saved
-  Vector<T, block_type> temp_in_;
-  Vector<T, block_type> temp_out_;
+  aligned_array<T> temp_in_;
+  aligned_array<T> temp_out_;
 };
 } // namespace vsip::impl::ipp
 
Index: src/vsip/opt/fftw3/fft_impl.cpp
===================================================================
--- src/vsip/opt/fftw3/fft_impl.cpp	(revision 177792)
+++ src/vsip/opt/fftw3/fft_impl.cpp	(working copy)
@@ -29,18 +29,28 @@
 
 
 // 070729: FFTW 3.1.2's split in-place complex-to-complex FFT is
-// broken.  The plan captures the gap between the real and imaginary
-// components.
+// broken on PowerPC and x86.  The plan captures the gap between the
+// real and imaginary components.
 //
+// 070730: FFTW 3.1.2 split out-of-place complex FFT is broken on
+// PowerPC.
+//
 // 070730: FFTW 3.1.2's split real-complex and complex-real FFT
-// also appear broken.
+// also appear broken on x86.
 //
-// As a work-around: copy, then perform transform out-of-place.
+// Brave souls may set
+//   USE_FFTW_SPLIT 1
+//   USE_BROKEN_FFTW_SPLIT 0
+// to attempt a work-around: copy, then perform transform out-of-place.
 
+// Control whether FFTW split-complex transforms are performed at all.
+#define USE_FFTW_SPLIT 0
+
+// Control whether a subset of broken FFTW split-complex transforms
+// are worked around.
 #define USE_BROKEN_FFTW_SPLIT 0
 
 
-
 /***********************************************************************
   Declarations
 ***********************************************************************/
@@ -52,6 +62,12 @@
 namespace fftw3
 {
 
+#if USE_FFTW_SPLIT
+typedef vsip::impl::dense_complex_type fftw3_complex_type;
+#else
+typedef Cmplx_inter_fmt fftw3_complex_type;
+#endif
+
 template <dimension_type D>
 struct Fft_base<D, std::complex<SCALAR_TYPE>, std::complex<SCALAR_TYPE> >
 {
@@ -68,13 +84,13 @@
     
     for(index_type i=0;i<D;i++) size_[i] = dom[i].size();
     plan_in_place_ =
-      Create_plan<vsip::impl::dense_complex_type>
+      Create_plan<fftw3_complex_type>
         ::create<FFTW(plan), FFTW(iodim)>
         (in_buffer_.ptr(), in_buffer_.ptr(), exp, flags, dom);
     
     if (!plan_in_place_) VSIP_IMPL_THROW(std::bad_alloc());
 
-    plan_by_reference_ = Create_plan<vsip::impl::dense_complex_type>
+    plan_by_reference_ = Create_plan<fftw3_complex_type>
       ::create<FFTW(plan), FFTW(iodim)>
       (in_buffer_.ptr(), out_buffer_.ptr(), exp, flags, dom);
 
@@ -90,8 +106,8 @@
     if (plan_by_reference_) FFTW(destroy_plan)(plan_by_reference_);
   }
 
-  Cmplx_buffer<dense_complex_type, SCALAR_TYPE> in_buffer_;
-  Cmplx_buffer<dense_complex_type, SCALAR_TYPE> out_buffer_;
+  Cmplx_buffer<fftw3_complex_type, SCALAR_TYPE> in_buffer_;
+  Cmplx_buffer<fftw3_complex_type, SCALAR_TYPE> out_buffer_;
   FFTW(plan) plan_in_place_;
   FFTW(plan) plan_by_reference_;
   int size_[D];
@@ -112,7 +128,7 @@
     // FFTW3 assumes A == D - 1.
     // See also query_layout().
     if (A != D - 1) std::swap(size_[A], size_[D - 1]);
-    plan_by_reference_ = Create_plan<dense_complex_type>::
+    plan_by_reference_ = Create_plan<fftw3_complex_type>::
       create<FFTW(plan), FFTW(iodim)>
       (in_buffer_.get(), out_buffer_.ptr(), A, flags, dom);
     if (!plan_by_reference_) VSIP_IMPL_THROW(std::bad_alloc());
@@ -123,7 +139,7 @@
   }
 
   aligned_array<SCALAR_TYPE> in_buffer_;
-  Cmplx_buffer<dense_complex_type, SCALAR_TYPE> out_buffer_;
+  Cmplx_buffer<fftw3_complex_type, SCALAR_TYPE> out_buffer_;
   FFTW(plan) plan_by_reference_;
   int size_[D];
   bool aligned_;
@@ -143,7 +159,7 @@
     // FFTW3 assumes A == D - 1.
     // See also query_layout().
     if (A != D - 1) std::swap(size_[A], size_[D - 1]);
-    plan_by_reference_ = Create_plan<dense_complex_type>::
+    plan_by_reference_ = Create_plan<fftw3_complex_type>::
       create<FFTW(plan), FFTW(iodim)>
       (in_buffer_.ptr(), out_buffer_.get(), A, flags, dom);
 
@@ -154,7 +170,7 @@
     if (plan_by_reference_) FFTW(destroy_plan)(plan_by_reference_);
   }
 
-  Cmplx_buffer<dense_complex_type, SCALAR_TYPE> in_buffer_;
+  Cmplx_buffer<fftw3_complex_type, SCALAR_TYPE> in_buffer_;
   aligned_array<SCALAR_TYPE>              out_buffer_;
   FFTW(plan) plan_by_reference_;
   int size_[D];
@@ -186,14 +202,14 @@
     rtl_inout.align = VSIP_IMPL_ALLOC_ALIGNMENT;
     rtl_inout.order = tuple<0, 1, 2>();
     // make default based on library
-    rtl_inout.complex = Create_plan<dense_complex_type>::format;
+    rtl_inout.complex = Create_plan<fftw3_complex_type>::format;
   }
   virtual void query_layout(Rt_layout<1> &rtl_in, Rt_layout<1> &rtl_out)
   {
     rtl_in.pack = this->aligned_ ? stride_unit_align : stride_unit_dense;
     rtl_in.align = VSIP_IMPL_ALLOC_ALIGNMENT;
     rtl_in.order = tuple<0, 1, 2>();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual void in_place(ctype *inout, stride_type s, length_type l)
@@ -217,7 +233,7 @@
 			      inout.second, inout.first,
 			      inout.second, inout.first);
 #else
-    typedef Storage<dense_complex_type, ctype> storage_type;
+    typedef Storage<fftw3_complex_type, ctype> storage_type;
     rtype* real = storage_type::get_real_ptr(in_buffer_.ptr());
     rtype* imag = storage_type::get_imag_ptr(in_buffer_.ptr());
     memcpy(real, inout.first, l*sizeof(rtype));
@@ -282,7 +298,7 @@
     rtl_in.pack = this->aligned_ ? stride_unit_align : stride_unit_dense;
     rtl_in.align = VSIP_IMPL_ALLOC_ALIGNMENT;
     rtl_in.order = tuple<0, 1, 2>();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual void by_reference(rtype *in, stride_type,
@@ -302,7 +318,7 @@
     FFTW(execute_split_dft_r2c)(plan_by_reference_, 
 			  in, out.first, out.second);
 #else
-    typedef Storage<dense_complex_type, ctype> storage_type;
+    typedef Storage<fftw3_complex_type, ctype> storage_type;
     rtype* out_r = storage_type::get_real_ptr(out_buffer_.ptr());
     rtype* out_i = storage_type::get_imag_ptr(out_buffer_.ptr());
     FFTW(execute_split_dft_r2c)(plan_by_reference_, 
@@ -336,7 +352,7 @@
     rtl_in.pack = this->aligned_ ? stride_unit_align : stride_unit_dense;
     rtl_in.align = VSIP_IMPL_ALLOC_ALIGNMENT;
     rtl_in.order = tuple<0, 1, 2>();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
 
@@ -359,7 +375,7 @@
     FFTW(execute_split_dft_c2r)(plan_by_reference_,
 			  in.first, in.second, out);
 #else
-    typedef Storage<dense_complex_type, ctype> storage_type;
+    typedef Storage<fftw3_complex_type, ctype> storage_type;
     rtype* in_r = storage_type::get_real_ptr(in_buffer_.ptr());
     rtype* in_i = storage_type::get_imag_ptr(in_buffer_.ptr());
     memcpy(in_r, in.first, (length/2+1)*sizeof(rtype));
@@ -392,7 +408,7 @@
   {
     rtl_in.pack = stride_unit_dense;
     rtl_in.order = row2_type();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual void in_place(ctype *inout,
@@ -477,7 +493,7 @@
     // FFTW3 assumes A is the last dimension.
     if (A == 0) rtl_in.order = tuple<1, 0, 2>();
     else rtl_in.order = tuple<0, 1, 2>();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual bool requires_copy(Rt_layout<2> &) { return true;}
@@ -527,7 +543,7 @@
     // FFTW3 assumes A is the last dimension.
     if (A == 0) rtl_in.order = tuple<1, 0, 2>();
     else rtl_in.order = tuple<0, 1, 2>();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual bool requires_copy(Rt_layout<2> &) { return true;}
@@ -575,7 +591,7 @@
   {
     rtl_in.pack = stride_unit_dense;
     rtl_in.order = row3_type();
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual void in_place(ctype *inout,
@@ -706,7 +722,7 @@
       case 1: rtl_in.order = tuple<0, 2, 1>(); break;
       default: rtl_in.order = tuple<0, 1, 2>(); break;
     }
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual bool requires_copy(Rt_layout<3> &) { return true;}
@@ -772,7 +788,7 @@
       case 1: rtl_in.order = tuple<0, 2, 1>(); break;
       default: rtl_in.order = tuple<0, 1, 2>(); break;
     }
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual bool requires_copy(Rt_layout<3> &) { return true;}
@@ -837,7 +853,7 @@
     if (A == 0) rtl_in.order = tuple<1, 0, 2>();
     else  rtl_in.order = tuple<0, 1, 2>();
     // make default based on library
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual void by_reference(rtype *in,
@@ -882,7 +898,7 @@
     rtype* out_i = out.second;
 
 #if !USE_BROKEN_FFTW_SPLIT
-    typedef Storage<dense_complex_type, ctype> storage_type;
+    typedef Storage<fftw3_complex_type, ctype> storage_type;
     rtype* tmp_out_r = storage_type::get_real_ptr(out_buffer_.ptr());
     rtype* tmp_out_i = storage_type::get_imag_ptr(out_buffer_.ptr());
 #endif
@@ -937,7 +953,7 @@
     if (A == 0) rtl_in.order = tuple<1, 0, 2>();
     else  rtl_in.order = tuple<0, 1, 2>();
     // make default based on library
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
   virtual bool requires_copy(Rt_layout<2> &) { return true;}
@@ -984,7 +1000,7 @@
     rtype* in_i = in.second;
 
 #if !USE_BROKEN_FFTW_SPLIT
-    typedef Storage<dense_complex_type, ctype> storage_type;
+    typedef Storage<fftw3_complex_type, ctype> storage_type;
     rtype* tmp_in_r = storage_type::get_real_ptr(in_buffer_.ptr());
     rtype* tmp_in_i = storage_type::get_imag_ptr(in_buffer_.ptr());
 #endif
@@ -1040,7 +1056,7 @@
     if (A == 0) rtl_inout.order = tuple<1, 0, 2>();
     else rtl_inout.order = tuple<0, 1, 2>();
     // make default based on library
-    rtl_inout.complex = Create_plan<dense_complex_type>::format;
+    rtl_inout.complex = Create_plan<fftw3_complex_type>::format;
   }
 
   virtual void query_layout(Rt_layout<2> &rtl_in, Rt_layout<2> &rtl_out)
@@ -1050,7 +1066,7 @@
     if (A == 0) rtl_in.order = tuple<1, 0, 2>();
     else  rtl_in.order = tuple<0, 1, 2>();
     // make default based on library
-    rtl_in.complex = Create_plan<dense_complex_type>::format;
+    rtl_in.complex = Create_plan<fftw3_complex_type>::format;
     rtl_out = rtl_in;
   }
 
@@ -1058,7 +1074,7 @@
 			stride_type str_0, stride_type str_1,
 			length_type rows, length_type cols)
   {
-    assert((Type_equal<dense_complex_type, Cmplx_inter_fmt>::value));
+    assert((Type_equal<fftw3_complex_type, Cmplx_inter_fmt>::value));
 
     length_type const n_fft       = (A == 1) ? rows : cols;
     stride_type const fft_stride  = (A == 1) ? str_0 : str_1;
@@ -1083,7 +1099,7 @@
     length_type rows,
     length_type cols)
   {
-    assert((Type_equal<dense_complex_type, Cmplx_split_fmt>::value));
+    assert((Type_equal<fftw3_complex_type, Cmplx_split_fmt>::value));
 
     length_type const n_fft       = (A == 1) ? rows : cols;
     stride_type const fft_stride  = (A == 1) ? str_0 : str_1;
@@ -1107,7 +1123,7 @@
 				imag_ptr, real_ptr,
 				imag_ptr, real_ptr);
 #else
-      typedef Storage<dense_complex_type, ctype> storage_type;
+      typedef Storage<fftw3_complex_type, ctype> storage_type;
       rtype* tmp_in_r = storage_type::get_real_ptr(in_buffer_.ptr());
       rtype* tmp_in_i = storage_type::get_imag_ptr(in_buffer_.ptr());
       memcpy(tmp_in_r, real_ptr, size_[0]*sizeof(rtype));
Index: src/vsip/opt/diag/eval.hpp
===================================================================
--- src/vsip/opt/diag/eval.hpp	(revision 176624)
+++ src/vsip/opt/diag/eval.hpp	(working copy)
@@ -66,6 +66,8 @@
 VSIP_IMPL_DISPATCH_NAME(Fc_expr_tag)
 VSIP_IMPL_DISPATCH_NAME(Rbo_expr_tag)
 VSIP_IMPL_DISPATCH_NAME(Loop_fusion_tag)
+VSIP_IMPL_DISPATCH_NAME(Cvsip_tag)
+VSIP_IMPL_DISPATCH_NAME(Generic_tag)
 
 VSIP_IMPL_DISPATCH_NAME(Tag_illegal_mix_of_local_and_global_in_assign)
 VSIP_IMPL_DISPATCH_NAME(Tag_serial_expr)
Index: tests/fft.cpp
===================================================================
--- tests/fft.cpp	(revision 177792)
+++ tests/fft.cpp	(working copy)
@@ -72,6 +72,8 @@
     std::cout << "  v1 =\n" << v1;
     std::cout << "  v2 =\n" << v2;
   }
+#else
+  (void)where;
 #endif
   test_assert(error < epsilon);
 }
Index: benchmarks/conv.cpp
===================================================================
--- benchmarks/conv.cpp	(revision 173215)
+++ benchmarks/conv.cpp	(working copy)
@@ -7,7 +7,7 @@
 /** @file    benchmarks/conv.cpp
     @author  Jules Bergmann
     @date    2005-07-11
-    @brief   VSIPL++ Library: Benchmark for Convolution.
+    @brief   VSIPL++ Library: Benchmark for 1D Convolution.
 
 */
 
@@ -22,7 +22,7 @@
 #include <vsip/math.hpp>
 #include <vsip/signal.hpp>
 
-#include <vsip/core/profile.hpp>
+#include <vsip/opt/diag/eval.hpp>
 
 #include <vsip_csl/test.hpp>
 #include "loop.hpp"
@@ -99,6 +99,13 @@
 
   t_conv1(length_type coeff_size) : coeff_size_(coeff_size) {}
 
+  void diag()
+  {
+    using impl::diag_detail::Dispatch_name;
+    typedef typename impl::Choose_conv_impl<1, T>::type impl_tag;
+    std::cout << "BE: " << Dispatch_name<impl_tag>::name() << std::endl;
+  }
+
   length_type coeff_size_;
 };
 
Index: benchmarks/conv2d.cpp
===================================================================
--- benchmarks/conv2d.cpp	(revision 0)
+++ benchmarks/conv2d.cpp	(revision 0)
@@ -0,0 +1,172 @@
+/* Copyright (c) 2005, 2006 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    benchmarks/conv2d.cpp
+    @author  Jules Bergmann
+    @date    2005-07-11
+    @brief   VSIPL++ Library: Benchmark for 2D Convolution.
+
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <iostream>
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/math.hpp>
+#include <vsip/signal.hpp>
+
+#include <vsip/opt/diag/eval.hpp>
+
+#include <vsip_csl/test.hpp>
+#include "loop.hpp"
+
+using namespace vsip;
+
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+template <support_region_type Supp,
+	  typename            T>
+struct t_conv2d : Benchmark_base  // Times a 2D Convolution for support region Supp and value type T.
+{
+  static length_type const rdec = 1;  // decimation used for the row dimension
+  static length_type const cdec = 1;  // decimation used for the column dimension
+
+  char* what() { return "t_conv2d"; }
+
+  void output_size(  // Compute output extent (o_rows x o_cols) for a rows x cols input and m_ x n_ kernel.
+    length_type  rows,   length_type  cols,
+    length_type& o_rows, length_type& o_cols)
+  {
+    if      (Supp == support_full)
+    {
+      o_rows = ((rows + m_ - 2)/rdec) + 1;
+      o_cols = ((cols + n_ - 2)/cdec) + 1;
+    }
+    else if (Supp == support_same)
+    {
+      o_rows = ((rows - 1)/rdec) + 1;
+      o_cols = ((cols - 1)/cdec) + 1;
+    }
+    else /* (Supp == support_min) */
+    {
+      o_rows = ((rows-1)/rdec) - ((m_-1)/rdec) + 1;
+      o_cols = ((cols-1)/cdec) - ((n_-1)/cdec) + 1;  // column terms use cdec, not rdec
+    }
+  }
+  // One multiply and one add per kernel tap per output point, divided by cols.
+  float ops_per_point(length_type cols)
+  {
+    length_type o_rows, o_cols;
+
+    output_size(rows_, cols, o_rows, o_cols);
+
+    float ops = m_ * n_ * o_rows * o_cols *
+      (vsip::impl::Ops_info<T>::mul + vsip::impl::Ops_info<T>::add);
+
+    return ops / cols;
+  }
+
+  int riob_per_point(length_type) { return -1; }
+  int wiob_per_point(length_type) { return -1; }
+  int mem_per_point(length_type)  { return 2*sizeof(T); }
+  // Build input/output/kernel, construct the convolution once, then time `loop` applications.
+  void operator()(length_type cols, length_type loop, float& time)
+  {
+    length_type o_rows, o_cols;
+
+    output_size(rows_, cols, o_rows, o_cols);
+
+    Matrix<T>   in (rows_, cols, T());
+    Matrix<T>   out(o_rows, o_cols);
+    Matrix<T>   coeff(m_, n_, T());
+
+    coeff = T(1);
+
+    symmetry_type const       symmetry = nonsym;
+
+    typedef Convolution<const_Matrix, symmetry, Supp, T> conv_type;
+
+    conv_type conv(coeff, Domain<2>(rows_, cols), rdec);
+
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      conv(in, out);
+    t1.stop();
+    
+    time = t1.delta();
+  }
+
+  t_conv2d(length_type rows, length_type m, length_type n)
+    : rows_(rows), m_(m), n_(n)
+  {}
+  // Print the name of the implementation tag chosen by Choose_conv_impl<2, T>.
+  void diag()
+  {
+    using impl::diag_detail::Dispatch_name;
+    typedef typename impl::Choose_conv_impl<2, T>::type impl_tag;
+    std::cout << "BE: " << Dispatch_name<impl_tag>::name() << std::endl;
+  }
+
+  length_type rows_;  // number of input rows
+  length_type m_;     // kernel rows
+  length_type n_;     // kernel columns
+};
+
+
+
+void
+defaults(Loop1P& loop)  // Set default loop settings and benchmark parameters.
+{
+  loop.loop_start_ = 5000;
+  loop.start_ = 4;
+
+  loop.param_["rows"] = "16";  // input row count, read by test()
+  loop.param_["mn"]   = "0";   // non-zero overrides both "m" and "n" in test()
+  loop.param_["m"]    = "3";   // kernel rows
+  loop.param_["n"]    = "3";   // kernel columns
+}
+
+
+
+int
+test(Loop1P& loop, int what)  // Run the benchmark case selected by `what`; returns 1 if one ran, else 0.
+{
+  typedef complex<float> cf_type;  // only referenced by the commented-out cases below
+
+  length_type rows = atoi(loop.param_["rows"].c_str());
+  length_type MN   = atoi(loop.param_["mn"].c_str());
+  length_type M    = atoi(loop.param_["m"].c_str());
+  length_type N    = atoi(loop.param_["n"].c_str());
+
+  std::cout << "MN: " << MN << std::endl;
+
+  if (MN != 0)  // "mn" requests a square kernel and takes precedence over "m"/"n"
+    M = N = MN;
+
+  switch (what)
+  {
+  case  1: loop(t_conv2d<support_full, float>(rows, M, N)); break;
+  case  2: loop(t_conv2d<support_same, float>(rows, M, N)); break;
+  case  3: loop(t_conv2d<support_min, float> (rows, M, N)); break;
+
+  // case  4: loop(t_conv1<support_full, cf_type>(loop.user_param_)); break;
+  // case  5: loop(t_conv1<support_same, cf_type>(loop.user_param_)); break;
+  // case  6: loop(t_conv1<support_min,  cf_type>(loop.user_param_)); break;
+
+  default: return 0;  // unknown case number
+  }
+  return 1;
+}