Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[patch] Mercury bits, performance bits


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [patch] Mercury bits, performance bits
  • From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
  • Date: Tue, 28 Feb 2006 17:47:29 -0500

This patch covers fixes and improvements for running on the mercury. It also includes some of the performance work done for the LM site visit.

				-- Jules
--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: ChangeLog
===================================================================
RCS file: /home/cvs/Repository/vpp/ChangeLog,v
retrieving revision 1.399
diff -u -r1.399 ChangeLog
--- ChangeLog	27 Feb 2006 15:07:12 -0000	1.399
+++ ChangeLog	28 Feb 2006 22:17:48 -0000
@@ -1,3 +1,71 @@
+2006-02-14  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
+
+	* autogen.sh: Doctor generated configure file to ignore extra
+	  files generated by GreenHills compiler when detecting object
+	  and executable exentions.
+	* configure.ac: New '--with-simd=TYPE' option to enable use of
+	  SIMD extensions.  Search for ippm library.
+	
+	* apps/sarsim/histcmp.c: Define min() macro.
+	* apps/sarsim/util_io.c: Fix signed/unsigned warnings.
+	
+	* benchmarks/conv.cpp: Remove unnecessary code.  Add mem_per_point
+	  function.
+	* benchmarks/conv_ipp.cpp:  Add mem_per_point() function.
+	* benchmarks/copy.cpp: Likewise.
+	* benchmarks/fft_ipp.cpp: Likewise.
+	* benchmarks/qrd.cpp: Likewise.
+	* benchmarks/vmul_c.cpp: Likewise.
+	* benchmarks/vmul_ipp.cpp: Likewise.
+	* benchmarks/fastconv.cpp: Add in-place phase fast convolution
+	  variants.  Add mem_per_point function.  Make std::log() calls
+	  with float argument.
+	* benchmarks/fft.cpp: Add variants with scale != 1.0 to stress
+	  scaling performance.  Add mem_per_point function.
+	* benchmarks/loop.hpp: Add "left-hand side" metrics (either
+	  points or memory size in bytes).  Option to show time.  Add
+	  sweep mode.
+	* benchmarks/main.cpp: Likewise.
+	* benchmarks/mcopy.cpp: Extend benchmark functionality.
+	* benchmarks/mcopy_ipp.cpp: New benchmark for IPP matrix copy
+	  and transpose.
+	* benchmarks/mpi_alltoall.cpp: New benchmark for MPI alltoall
+	  functionality.
+	* benchmarks/vmul.cpp: Add cases for vector subviews, real-complex
+	  vector multiply, and real-complex scalar-view multiply.
+	
+	* doc/csl-docbook/GNUmakefile.inc: Robustify installation of HTML.
+	* src/vsip/GNUmakefile.inc.in: Add simd sources.
+	* src/vsip/support.hpp: Don't define noinline attribute for Greenhills.
+	* src/vsip/vector.hpp: Send scalar op= through dispatch.
+	* src/vsip/impl/distributed-block.hpp: Fix GCC 4.1 Wall warnings.
+	* src/vsip/impl/eval-sal.hpp: Fix Op_list_3 usage, fix Wall warnings.
+	* src/vsip/impl/expr_serial_dispatch.hpp: Hooks for dispatch to SIMD.
+	* src/vsip/impl/expr_serial_evaluator.hpp: Hold view extents in const
+	  variable.
+	* src/vsip/impl/lvalue-proxy.hpp: Remove unnecessary implicit
+	  conversion operator when deriving from std::complex<>.
+	* src/vsip/impl/par-chain-assign.hpp: Add profile events.
+	* src/vsip/impl/par-foreach.hpp: Fix submatrix typenames.
+	* src/vsip/impl/sal.cpp: Add wrappers for real*complex cases.
+	* src/vsip/impl/sal.hpp: Extend coverage of vector multiply
+	  to mixed types and scalar-view cases.
+	* src/vsip/impl/signal-conv-sal.hpp: Fixed Wall warnings.
+	
+	* src/vsip/impl/simd/eval-generic.hpp: New file, dispatch 
+	  evaluators for generic SIMD routines.
+	* src/vsip/impl/simd/simd.hpp: New file, SIMD definitions.
+	* src/vsip/impl/simd/vmul.cpp: New file, SIMD vector-multiply impl.
+	* src/vsip/impl/simd/vmul.hpp: New file, SIMD vector-multiply decls.
+
+	* tests/test.hpp: Define VSIP_IMPL_TEST_LEVEL.
+	* tests/convolution.cpp: Reduce compilation time when
+	  VSIP_IMPL_TEST_LEVEL is 0.
+	* tests/extdata-subviews.cpp: Likewise.
+	* tests/scalar-view.cpp: Likewise.
+	* tests/regr_transpose.cpp: New file, regression test for
+	  greenhills tail-recursion optimization defect.
+
 2006-02-27  Stefan Seefeld  <stefan@xxxxxxxxxxxxxxxx>
 
 	* tests/fft.cpp: Add tests for complex split format.
Index: autogen.sh
===================================================================
RCS file: /home/cvs/Repository/vpp/autogen.sh,v
retrieving revision 1.3
diff -u -r1.3 autogen.sh
--- autogen.sh	5 Dec 2005 15:16:15 -0000	1.3
+++ autogen.sh	28 Feb 2006 22:17:49 -0000
@@ -6,6 +6,12 @@
 # Generate 'configure' from 'configure.ac'
 autoconf
 
+# Tell configure to ignore non-executable/object files generated by
+# greenhills compiler
+cat configure | sed -e "s,\*.xcoff,\*.xcoff | *.dla | *.dnm | *.dbo | *.map," > configure.tmp
+mv configure.tmp configure
+chmod +x configure
+
 if test -f "vendor/atlas/autogen.sh"; then
   cd vendor/atlas
   ./autogen.sh
Index: configure.ac
===================================================================
RCS file: /home/cvs/Repository/vpp/configure.ac,v
retrieving revision 1.82
diff -u -r1.82 configure.ac
--- configure.ac	17 Feb 2006 20:23:44 -0000	1.82
+++ configure.ac	28 Feb 2006 22:17:49 -0000
@@ -240,6 +240,11 @@
                  [set CPU speed in MHz.  Only necessary for TSC and if /proc/cpuinfo does not exist or is wrong]),,
   [enable_cpu_mhz=none])
 
+AC_ARG_WITH([simd],
+  AS_HELP_STRING([--with-simd=WHAT],
+                 [set SIMD extensions]),,
+  [with_simd=none])
+
 
 #
 # Files to generate.
@@ -981,10 +986,12 @@
       ippcore_search="ippcore ippcoreem64t"
       ipps_search="ipps ippsem64t"
       ippi_search="ippi ippiem64t"
+      ippm_search="ippm ippmem64t"
     else
       ippcore_search="ippcore$with_ipp_suffix"
       ipps_search="ipps$with_ipp_suffix"
       ippi_search="ippi$with_ipp_suffix"
+      ippm_search="ippm$with_ipp_suffix"
     fi
     # Find the library.
     save_LDFLAGS="$LDFLAGS"
@@ -1032,6 +1039,12 @@
 	[have_ippi="no"
          LD_FLAGS="$save_LDFLAGS"])
 
+    AC_SEARCH_LIBS(
+	[ippmCopy_ma_32f_SS], [$ippm_search],
+	[have_ippm="yes"],
+	[have_ippm="no"
+         LD_FLAGS="$save_LDFLAGS"])
+
     if test "$enable_ipp_fft" != "no"; then 
       AC_SUBST(VSIP_IMPL_IPP_FFT, 1)
       AC_DEFINE_UNQUOTED(VSIP_IMPL_IPP_FFT, 1,
@@ -1487,6 +1500,67 @@
 fi
 
 #
+# Configure SIMD extensions
+#
+if test "$with_simd" != "none"; then
+  keep_IFS=$IFS
+  IFS=","
+
+  taglist=""
+
+  for simd_type in $with_simd; do
+    AC_MSG_CHECKING([SIMD Tag $simd_type])
+    if test "$simd_type" == "3dnowext-32"; then
+      taglist="${taglist}Simd_3dnowext_tag,"
+      AC_SUBST(USE_SIMD_3DNOWEXT_32, 1)
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_3DNOWEXT, 1,
+          [Define whether to use 3DNow!-ext ISA routines in expr dispatch.])
+      AC_MSG_RESULT([ok])
+
+    elif test "$simd_type" == "3dnowext-64"; then
+      taglist="${taglist}Simd_3dnowext_tag,"
+      AC_SUBST(USE_SIMD_3DNOWEXT_64, 1)
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_3DNOWEXT, 1,
+          [Define whether to use 3DNow!-ext ISA routines in expr dispatch.])
+      AC_MSG_RESULT([ok])
+
+    elif test "$simd_type" == "sse2-32"; then
+      taglist="${taglist}Simd_sse2_tag,"
+      AC_SUBST(USE_SIMD_SSE2_32, 1)
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_SSE2, 1,
+          [Define whether to use SSE2 ISA routines in expr dispatch.])
+      AC_MSG_RESULT([ok])
+
+    elif test "$simd_type" == "sse2-64"; then
+      taglist="${taglist}Simd_sse2_tag,"
+      AC_SUBST(USE_SIMD_SSE2_64, 1)
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_SSE2, 1,
+          [Define whether to use SSE2 ISA routines in expr dispatch.])
+      AC_MSG_RESULT([ok])
+
+    elif test "$simd_type" == "generic"; then
+      taglist="${taglist}Simd_tag,"
+      AC_DEFINE_UNQUOTED(VSIP_IMPL_HAVE_SIMD_GENERIC, 1,
+          [Define whether to use Generic SIMD routines in expr dispatch.])
+      AC_MSG_RESULT([ok])
+
+    else
+      AC_MSG_ERROR([Invalid tag to --with-simd=SIMD: $simd_type.])
+    fi
+  done
+
+  IFS=$keep_IFS
+else
+  taglist=""
+fi
+
+AC_DEFINE_UNQUOTED(VSIP_IMPL_SIMD_TAG_LIST, $taglist,
+          [Define to set whether or not to use Intel's IPP library.])
+
+mkdir -p src/vsip/impl/simd
+
+
+#
 # library
 #
 ARFLAGS="r"
Index: apps/sarsim/histcmp.c
===================================================================
RCS file: /home/cvs/Repository/vpp/apps/sarsim/histcmp.c,v
retrieving revision 1.3
diff -u -r1.3 histcmp.c
--- apps/sarsim/histcmp.c	10 Sep 2005 17:59:24 -0000	1.3
+++ apps/sarsim/histcmp.c	28 Feb 2006 22:17:50 -0000
@@ -28,6 +28,8 @@
 #include "sarx.h"
 #include "util_io.h"
 
+#define min(a,b) ((a)<(b)?(a):(b))
+
 main(argc,argv)
 int	argc;
 char	**argv;
Index: apps/sarsim/util_io.c
===================================================================
RCS file: /home/cvs/Repository/vpp/apps/sarsim/util_io.c,v
retrieving revision 1.1
diff -u -r1.1 util_io.c
--- apps/sarsim/util_io.c	16 Jun 2005 18:01:20 -0000	1.1
+++ apps/sarsim/util_io.c	28 Feb 2006 22:17:51 -0000
@@ -27,7 +27,7 @@
    size_t	nmemb,
    FILE		*stream)
 {
-   int i;
+   size_t i;
 
    assert(size*nmemb < BUF_SIZE);
 
@@ -79,9 +79,9 @@
    FILE		*stream,
    int		align)
 {
-   int i;
+   size_t i;
 
-   int n_align = size*nmemb/align;
+   size_t n_align = size*nmemb/align;
 
    assert(n_align*align == size*nmemb);
 
@@ -131,7 +131,7 @@
    size_t	nmemb,
    FILE		*stream)
 {
-   int i;
+   size_t i;
 
    if (size == 1) {
       return fread(ptr, size, nmemb, stream);
@@ -168,9 +168,9 @@
    FILE		*stream,
    int		align)
 {
-   int i;
+   size_t i;
 
-   int n_align = size*nmemb/align;
+   size_t n_align = size*nmemb/align;
    assert(n_align*align == size*nmemb);
 
    if (align == 1) {
Index: benchmarks/conv.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/conv.cpp,v
retrieving revision 1.3
diff -u -r1.3 conv.cpp
--- benchmarks/conv.cpp	26 Sep 2005 20:11:05 -0000	1.3
+++ benchmarks/conv.cpp	28 Feb 2006 22:17:51 -0000
@@ -27,38 +27,10 @@
 using namespace vsip;
 
 
-/// Return a random value between -0.5 and +0.5
-
-template <typename T>
-struct Random
-{
-  static T value() { return T(1.f * rand()/(RAND_MAX+1.0)) - T(0.5); }
-};
-
-/// Specialization for random complex value.
-
-template <typename T>
-struct Random<complex<T> >
-{
-  static complex<T> value() {
-    return complex<T>(Random<T>::value(), Random<T>::value());
-  }
-};
-
-
-
-/// Fill a matrix with random values.
-
-template <typename T,
-	  typename Block>
-void
-randm(Matrix<T, Block> m)
-{
-  for (index_type r=0; r<m.size(0); ++r)
-    for (index_type c=0; c<m.size(1); ++c)
-      m(r, c) = Random<T>::value();
-}
 
+/***********************************************************************
+  Definitions
+***********************************************************************/
 
 template <support_region_type Supp,
 	  typename            T>
@@ -86,6 +58,7 @@
 
   int riob_per_point(length_type) { return -1*sizeof(T); }
   int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int mem_per_point(length_type)  { return 2*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
Index: benchmarks/conv_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/conv_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 conv_ipp.cpp
--- benchmarks/conv_ipp.cpp	7 Sep 2005 12:19:30 -0000	1.2
+++ benchmarks/conv_ipp.cpp	28 Feb 2006 22:17:51 -0000
@@ -73,6 +73,7 @@
   }
   int riob_per_point(size_t) { return 1*sizeof(Ipp32f); }
   int wiob_per_point(size_t) { return 1*sizeof(Ipp32f); }
+  int mem_per_point(size_t)  { return 2*sizeof(Ipp32f); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
Index: benchmarks/copy.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/copy.cpp,v
retrieving revision 1.1
diff -u -r1.1 copy.cpp
--- benchmarks/copy.cpp	7 Sep 2005 12:19:30 -0000	1.1
+++ benchmarks/copy.cpp	28 Feb 2006 22:17:51 -0000
@@ -41,6 +41,7 @@
   int ops_per_point(length_type)  { return 1; }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 1*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -106,6 +107,7 @@
   int ops_per_point(length_type)  { return 1; }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 1*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -169,6 +171,7 @@
   int ops_per_point(length_type)  { return 1; }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 1*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
Index: benchmarks/fastconv.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fastconv.cpp,v
retrieving revision 1.2
diff -u -r1.2 fastconv.cpp
--- benchmarks/fastconv.cpp	19 Dec 2005 16:08:55 -0000	1.2
+++ benchmarks/fastconv.cpp	28 Feb 2006 22:17:51 -0000
@@ -34,7 +34,7 @@
 int
 fft_ops(length_type len)
 {
-  return int(5 * std::log(len) / std::log(2));
+  return int(5 * std::log((float)len) / std::log(2.f));
 }
 
 template <typename T,
@@ -45,7 +45,9 @@
 struct Impl1ip;		// in-place, phased fast-convolution
 struct Impl1pip1;	// psuedo in-place (using in-place Fft), phased
 struct Impl1pip2;	// psuedo in-place (using out-of-place Fft), phased
-struct Impl2;		// out-of-place (tmp), interleaved fast-convolution
+struct Impl2op;		// out-of-place (tmp), interleaved fast-convolution
+struct Impl2ip;		// in-place, interleaved fast-convolution
+struct Impl2ip_tmp;	// in-place (w/tmp), interleaved fast-convolution
 struct Impl2fv;		// foreach_vector, interleaved fast-convolution
 
 struct Impl1pip2_nopar;
@@ -54,7 +56,7 @@
 {
   float ops(length_type npulse, length_type nrange) 
   {
-    float fft_ops = 5 * nrange * std::log(nrange) / std::log(2);
+    float fft_ops = 5 * nrange * std::log((float)nrange) / std::log(2.f);
     float tot_ops = 2 * npulse * fft_ops + 6 * npulse * nrange;
     return tot_ops;
   }
@@ -99,7 +101,7 @@
 
     // Create the FFT objects.
     for_fftm_type for_fftm(Domain<2>(npulse, nrange), 1.0);
-    inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/(nrange*npulse));
+    inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/nrange);
 
     // Initialize
     data    = T();
@@ -155,7 +157,7 @@
 
     // Create the FFT objects.
     for_fftm_type for_fftm(Domain<2>(npulse, nrange), 1.0);
-    inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/(nrange*npulse));
+    inv_fftm_type inv_fftm(Domain<2>(npulse, nrange), 1.0/nrange);
 
     // Initialize
     data    = T();
@@ -384,11 +386,11 @@
 
 
 /***********************************************************************
-  Impl2: out-of-place (tmp), interleaved fast-convolution
+  Impl2op: out-of-place (tmp), interleaved fast-convolution
 ***********************************************************************/
 
 template <typename T>
-struct t_fastconv_base<T, Impl2> : fastconv_ops
+struct t_fastconv_base<T, Impl2op> : fastconv_ops
 {
   void fastconv(length_type npulse, length_type nrange,
 		length_type loop, float& time)
@@ -456,6 +458,152 @@
 
 
 /***********************************************************************
+  Impl2ip: in-place, interleaved fast-convolution
+***********************************************************************/
+
+template <typename T>
+struct t_fastconv_base<T, Impl2ip> : fastconv_ops
+{
+  void fastconv(length_type npulse, length_type nrange,
+		length_type loop, float& time)
+  {
+    typedef Map<Block_dist, Whole_dist>      map_type;
+    typedef Dense<2, T, row2_type, map_type> block_type;
+    typedef Matrix<T, block_type>            view_type;
+
+    typedef Dense<1, T, row1_type, Global_map<1> > replica_block_type;
+    typedef Vector<T, replica_block_type>          replica_view_type;
+
+    processor_type np = num_processors();
+    map_type map = map_type(Block_dist(np), Whole_dist());
+
+    // Create the data cube.
+    view_type data(npulse, nrange, map);
+    // Vector<T> tmp(nrange);
+    
+    // Create the pulse replica
+    replica_view_type replica(nrange);
+
+    // int const no_times = 0; // FFTW_PATIENT
+    int const no_times = 15; // not > 12 = FFT_MEASURE
+    
+    typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times>
+	  	for_fft_type;
+    typedef Fft<const_Vector, T, T, fft_inv, by_reference, no_times>
+	  	inv_fft_type;
+
+    // Create the FFT objects.
+    for_fft_type for_fft(Domain<1>(nrange), 1.0);
+    inv_fft_type inv_fft(Domain<1>(nrange), 1.0/(nrange));
+
+    // Initialize
+    data    = T();
+    replica = T();
+
+
+    // Before fast convolution, convert the replica into the
+    // frequency domain
+    // for_fft(replica);
+    
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+    {
+      typename view_type::local_type         l_data    = data.local();
+      typename replica_view_type::local_type l_replica = replica.local();
+      length_type                            l_npulse  = l_data.size(0);
+      for (index_type p=0; p<l_npulse; ++p)
+      {
+	for_fft(l_data.row(p));
+	l_data.row(p) *= l_replica;
+	inv_fft(l_data.row(p));
+      }
+    }
+    t1.stop();
+
+    // CHECK RESULT
+    time = t1.delta();
+  }
+};
+
+
+
+/***********************************************************************
+  Impl2ip_tmp: in-place, interleaved fast-convolution
+***********************************************************************/
+
+template <typename T>
+struct t_fastconv_base<T, Impl2ip_tmp> : fastconv_ops
+{
+  void fastconv(length_type npulse, length_type nrange,
+		length_type loop, float& time)
+  {
+    typedef Map<Block_dist, Whole_dist>      map_type;
+    typedef Dense<2, T, row2_type, map_type> block_type;
+    typedef Matrix<T, block_type>            view_type;
+
+    typedef Dense<1, T, row1_type, Global_map<1> > replica_block_type;
+    typedef Vector<T, replica_block_type>          replica_view_type;
+
+    processor_type np = num_processors();
+    map_type map = map_type(Block_dist(np), Whole_dist());
+
+    // Create the data cube.
+    view_type data(npulse, nrange, map);
+    Vector<T> tmp(nrange);
+    
+    // Create the pulse replica
+    replica_view_type replica(nrange);
+
+    // int const no_times = 0; // FFTW_PATIENT
+    int const no_times = 15; // not > 12 = FFT_MEASURE
+    
+    typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times>
+	  	for_fft_type;
+    typedef Fft<const_Vector, T, T, fft_inv, by_reference, no_times>
+	  	inv_fft_type;
+
+    // Create the FFT objects.
+    for_fft_type for_fft(Domain<1>(nrange), 1.0);
+    inv_fft_type inv_fft(Domain<1>(nrange), 1.0/(nrange));
+
+    // Initialize
+    data    = T();
+    replica = T();
+
+
+    // Before fast convolution, convert the replica into the
+    // frequency domain
+    // for_fft(replica);
+    
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+    {
+      typename view_type::local_type         l_data    = data.local();
+      typename replica_view_type::local_type l_replica = replica.local();
+      length_type                            l_npulse  = l_data.size(0);
+      for (index_type p=0; p<l_npulse; ++p)
+      {
+	tmp = l_data.row(p);
+	for_fft(tmp);
+	tmp *= l_replica;
+	inv_fft(tmp);
+	l_data.row(p) = tmp;
+      }
+    }
+    t1.stop();
+
+    // CHECK RESULT
+    time = t1.delta();
+  }
+};
+
+
+
+/***********************************************************************
   Impl2fv: foreach_vector, interleaved fast-convolution
 ***********************************************************************/
 
@@ -555,8 +703,9 @@
   char* what() { return "t_fastconv_pf"; }
   int ops_per_point(length_type size)
     { return (int)(this->ops(npulse_, size) / size); }
-  int riob_per_point(length_type) { return -1*sizeof(T); }
-  int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int mem_per_point(length_type size) { return npulse_*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -581,8 +730,9 @@
   char* what() { return "t_fastconv_rf"; }
   int ops_per_point(length_type size)
     { return (int)(this->ops(size, nrange_) / size); }
-  int riob_per_point(length_type) { return -1*sizeof(T); }
-  int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int mem_per_point(length_type size) { return nrange_*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -619,8 +769,10 @@
   case  2: loop(t_fastconv_pf<complex<float>, Impl1ip>(param1)); break;
   case  3: loop(t_fastconv_pf<complex<float>, Impl1pip1>(param1)); break;
   case  4: loop(t_fastconv_pf<complex<float>, Impl1pip2>(param1)); break;
-  case  5: loop(t_fastconv_pf<complex<float>, Impl2>(param1)); break;
-  case  6: loop(t_fastconv_pf<complex<float>, Impl2fv>(param1)); break;
+  case  5: loop(t_fastconv_pf<complex<float>, Impl2op>(param1)); break;
+  case  6: loop(t_fastconv_pf<complex<float>, Impl2ip>(param1)); break;
+  case  7: loop(t_fastconv_pf<complex<float>, Impl2ip_tmp>(param1)); break;
+  case  8: loop(t_fastconv_pf<complex<float>, Impl2fv>(param1)); break;
 
   case  9: loop(t_fastconv_pf<complex<float>, Impl1pip2_nopar>(param1)); break;
 
@@ -628,8 +780,10 @@
   case 12: loop(t_fastconv_rf<complex<float>, Impl1ip>(param1)); break;
   case 13: loop(t_fastconv_rf<complex<float>, Impl1pip1>(param1)); break;
   case 14: loop(t_fastconv_rf<complex<float>, Impl1pip2>(param1)); break;
-  case 15: loop(t_fastconv_rf<complex<float>, Impl2>(param1)); break;
-  case 16: loop(t_fastconv_rf<complex<float>, Impl2fv>(param1)); break;
+  case 15: loop(t_fastconv_rf<complex<float>, Impl2op>(param1)); break;
+  case 16: loop(t_fastconv_rf<complex<float>, Impl2ip>(param1)); break;
+  case 17: loop(t_fastconv_rf<complex<float>, Impl2ip_tmp>(param1)); break;
+  case 18: loop(t_fastconv_rf<complex<float>, Impl2fv>(param1)); break;
 
   default: return 0;
   }
Index: benchmarks/fft.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fft.cpp,v
retrieving revision 1.3
diff -u -r1.3 fft.cpp
--- benchmarks/fft.cpp	26 Sep 2005 20:11:05 -0000	1.3
+++ benchmarks/fft.cpp	28 Feb 2006 22:17:51 -0000
@@ -28,7 +28,7 @@
 int
 fft_ops(length_type len)
 {
-  return int(5 * std::log(len) / std::log(2));
+  return int(5 * std::log((float)len) / std::log(2.f));
 }
 
 
@@ -37,8 +37,9 @@
 {
   char* what() { return "t_fft_op"; }
   int ops_per_point(length_type len)  { return fft_ops(len); }
-  int riob_per_point(length_type) { return -1*sizeof(T); }
-  int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int mem_per_point(length_type)  { return 1*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -51,7 +52,7 @@
     typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times, alg_time>
       fft_type;
 
-    fft_type fft(Domain<1>(size), 1.f);
+    fft_type fft(Domain<1>(size), scale_ ? (1.f/size) : 1.f);
 
     A = T(1);
     
@@ -62,7 +63,7 @@
       fft(A, Z);
     t1.stop();
     
-    if (!equal(Z(0), T(size)))
+    if (!equal(Z(0), T(scale_ ? 1 : size)))
     {
       std::cout << "t_fft_op: ERROR" << std::endl;
       abort();
@@ -70,6 +71,11 @@
     
     time = t1.delta();
   }
+
+  t_fft_op(bool scale) : scale_(scale) {}
+
+  // Member data
+  bool scale_;
 };
 
 
@@ -79,8 +85,9 @@
 {
   char* what() { return "t_fft_ip"; }
   int ops_per_point(length_type len)  { return fft_ops(len); }
-  int riob_per_point(length_type) { return -1*sizeof(T); }
-  int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int riob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int wiob_per_point(length_type) { return -1*(int)sizeof(T); }
+  int mem_per_point(length_type)  { return 1*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -92,7 +99,7 @@
     typedef Fft<const_Vector, T, T, fft_fwd, by_reference, no_times, alg_time>
       fft_type;
 
-    fft_type fft(Domain<1>(size), 1.f);
+    fft_type fft(Domain<1>(size), scale_ ? (1.f/size) : 1.f);
 
     vsip::impl::profile::Timer t1;
     
@@ -109,6 +116,11 @@
     
     time = t1.delta();
   }
+
+  t_fft_ip(bool scale) : scale_(scale) {}
+
+  // Member data
+  bool scale_;
 };
 
 
@@ -126,8 +138,11 @@
 {
   switch (what)
   {
-  case  1: loop(t_fft_op<complex<float> >()); break;
-  case  2: loop(t_fft_ip<complex<float> >()); break;
+  case  1: loop(t_fft_op<complex<float> >(false)); break;
+  case  2: loop(t_fft_ip<complex<float> >(false)); break;
+
+  case  5: loop(t_fft_op<complex<float> >(true)); break;
+  case  6: loop(t_fft_ip<complex<float> >(true)); break;
   default: return 0;
   }
   return 1;
Index: benchmarks/fft_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/fft_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 fft_ipp.cpp
--- benchmarks/fft_ipp.cpp	7 Sep 2005 12:19:30 -0000	1.2
+++ benchmarks/fft_ipp.cpp	28 Feb 2006 22:17:51 -0000
@@ -50,6 +50,7 @@
   int ops_per_point(size_t len)  { return fft_ops(len); }
   int riob_per_point(size_t) { return -1*sizeof(Ipp32fc); }
   int wiob_per_point(size_t) { return -1*sizeof(Ipp32fc); }
+  int mem_per_point(size_t)  { return  2*sizeof(Ipp32fc); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
Index: benchmarks/loop.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/loop.hpp,v
retrieving revision 1.10
diff -u -r1.10 loop.hpp
--- benchmarks/loop.hpp	14 Feb 2006 13:42:51 -0000	1.10
+++ benchmarks/loop.hpp	28 Feb 2006 22:17:51 -0000
@@ -34,10 +34,24 @@
   pts_per_sec,
   ops_per_sec,
   iob_per_sec,
+  wiob_per_sec,
   all_per_sec
 };
 
 
+enum lhs_metric
+{
+  lhs_pts,
+  lhs_mem
+};
+
+enum bench_mode
+{
+  steady_mode,
+  sweep_mode
+};
+
+
 // 1 Parameter Loop
 class Loop1P
 {
@@ -50,17 +64,26 @@
     start_	 (2),
     stop_	 (21),
     cal_	 (4),
-    loop_start_	 (20),
+    loop_start_	 (10),
     samples_	 (1),
     goal_sec_	 (1.0),
     metric_      (pts_per_sec),
+    lhs_         (lhs_pts),
     note_        (0),
     do_prof_     (false),
     what_        (0),
-    show_loop_   (false)
+    show_loop_   (false),
+    show_time_   (false),
+    mode_        (sweep_mode)
   {}
 
   template <typename Functor>
+  void sweep(Functor func);
+
+  template <typename Functor>
+  void steady(Functor func);
+
+  template <typename Functor>
   void operator()(Functor func);
 
   template <typename Functor>
@@ -76,11 +99,14 @@
   unsigned	samples_;
   double        goal_sec_;	// measurement goal (in seconds)
   output_metric metric_;
+  lhs_metric    lhs_;
   char*         note_;
   int           user_param_;
   bool          do_prof_;
   int           what_;
   bool          show_loop_;
+  bool          show_time_;
+  bench_mode    mode_;
 };
 
 
@@ -114,6 +140,11 @@
                            * loop;
     return ops / (time * 1e6);
   }
+  else if (m == wiob_per_sec)
+  {
+    double ops = (double)M * fcn.wiob_per_point(M) * loop;
+    return ops / (time * 1e6);
+  }
   else
     return 0.f;
 }
@@ -122,8 +153,7 @@
 
 template <typename Functor>
 inline void
-Loop1P::operator()(
-  Functor fcn)
+Loop1P::sweep(Functor fcn)
 {
   using vsip::Index;
   using vsip::Vector;
@@ -154,9 +184,20 @@
   // calibrate --------------------------------------------------------
   while (1)
   {
+    // printf("%d: calib %5d\n", rank, loop);
+    comm.barrier();
     fcn(M, loop, time);
+    comm.barrier();
+
+    dist_time.local().put(0, time);
+    glob_time = dist_time;
+
+    Index<1> idx;
+
+    time = maxval(glob_time.local(), idx);
+
     if (time <= 0.01) time = 0.01;
-    // if (!as_data) printf("c: %5d - %5d\n", loop, time);
+    // printf("%d: time %f\n", rank, time);
 
     float factor = goal_sec_ / time;
     if (factor < 1.0) factor += 0.1 * (1.0 - factor);
@@ -173,10 +214,12 @@
     printf("# ops_per_point(1) : %d\n", (int)fcn.ops_per_point(1));
     printf("# riob_per_point(1): %d\n", fcn.riob_per_point(1));
     printf("# wiob_per_point(1): %d\n", fcn.wiob_per_point(1));
-    printf("# metric           : %s\n", metric_ == pts_per_sec ? "pts_per_sec" :
-	                              metric_ == ops_per_sec ? "ops_per_sec" :
-	                              metric_ == iob_per_sec ? "iob_per_sec" :
-	                                                       "*unknown*");
+    printf("# metric           : %s\n",
+	   metric_ == pts_per_sec  ? "pts_per_sec" :
+	   metric_ == ops_per_sec  ? "ops_per_sec" :
+	   metric_ == iob_per_sec  ? "iob_per_sec" :
+	   metric_ == wiob_per_sec ? "wiob_per_sec" :
+	                             "*unknown*");
     if (this->note_)
       printf("# note: %s\n", this->note_);
     printf("# start_loop       : %lu\n", (unsigned long) loop);
@@ -215,23 +258,33 @@
 
     if (rank == 0)
     {
+      size_t L;
+      
+      if (this->lhs_ == lhs_mem)
+	L = M * fcn.mem_per_point(M);
+      else // (this->lhs_ == lhs_pts)
+	L = M;
+
       if (this->metric_ == all_per_sec)
-	printf("%7ld %f %f %f", (unsigned long) M,
+	printf("%7ld %f %f %f", (unsigned long) L,
 	       this->metric(fcn, M, loop, mtime[(n_time-1)/2], pts_per_sec),
 	       this->metric(fcn, M, loop, mtime[(n_time-1)/2], ops_per_sec),
 	       this->metric(fcn, M, loop, mtime[(n_time-1)/2], iob_per_sec));
       else if (n_time > 1)
 	// Note: max time is min op/s, and min time is max op/s
-	printf("%7lu %f %f %f", (unsigned long) M,
+	printf("%7lu %f %f %f", (unsigned long) L,
 	       this->metric(fcn, M, loop, mtime[(n_time-1)/2], metric_),
 	       this->metric(fcn, M, loop, mtime[n_time-1],     metric_),
 	       this->metric(fcn, M, loop, mtime[0],            metric_));
       else
-	printf("%7lu %f", (unsigned long) M,
+	printf("%7lu %f", (unsigned long) L,
 	       this->metric(fcn, M, loop, mtime[0], metric_));
       if (this->show_loop_)
 	printf("  %8lu", (unsigned long)loop);
+      if (this->show_time_)
+	printf("  %f", mtime[(n_time-1)/2]);
       printf("\n");
+      fflush(stdout);
     }
 
     time = mtime[(n_time-1)/2];
@@ -247,4 +300,115 @@
   }
 }
 
+
+
+template <typename Functor>
+void
+Loop1P::steady(Functor fcn)
+{
+  using vsip::Index;
+  using vsip::Vector;
+  using vsip::Dense;
+  using vsip::Map;
+  using vsip::Global_map;
+  using vsip::row1_type;
+
+  size_t   loop, M;
+  float    time;
+
+  vsip::impl::Communicator comm  = vsip::impl::default_communicator();
+  vsip::processor_type     rank  = comm.rank();
+  vsip::processor_type     nproc = vsip::num_processors();
+
+  Vector<float, Dense<1, float, row1_type, Map<> > >
+    dist_time(nproc, Map<>(nproc));
+  Vector<float, Dense<1, float, row1_type, Global_map<1> > > glob_time(nproc);
+
+  loop = (1 << loop_start_);
+
+  if (rank == 0)
+  {
+    printf("# what             : %s (%d)\n", fcn.what(), what_);
+    printf("# nproc            : %d\n", nproc);
+    printf("# ops_per_point(1) : %d\n", (int)fcn.ops_per_point(1));
+    printf("# riob_per_point(1): %d\n", fcn.riob_per_point(1));
+    printf("# wiob_per_point(1): %d\n", fcn.wiob_per_point(1));
+    printf("# metric           : %s\n",
+	   metric_ == pts_per_sec  ? "pts_per_sec" :
+	   metric_ == ops_per_sec  ? "ops_per_sec" :
+	   metric_ == iob_per_sec  ? "iob_per_sec" :
+	   metric_ == wiob_per_sec ? "wiob_per_sec" :
+	                             "*unknown*");
+    if (this->note_)
+      printf("# note: %s\n", this->note_);
+    printf("# start_loop       : %lu\n", (unsigned long) loop);
+  }
+
+  if (this->do_prof_)
+    vsip::impl::profile::prof->set_mode(vsip::impl::profile::pm_accum);
+
+  // for real ---------------------------------------------------------
+  while (1)
+  {
+    M = (1 << start_);
+
+    comm.barrier();
+    fcn(M, loop, time);
+    comm.barrier();
+
+    dist_time.local().put(0, time);
+    glob_time = dist_time;
+
+    Index<1> idx;
+
+    time = maxval(glob_time.local(), idx);
+
+#if 0
+    if (this->do_prof_)
+    {
+      char     filename[256];
+      sprintf(filename, "vprof.%lu.out", (unsigned long) M);
+      vsip::impl::profile::prof->dump(filename);
+    }
+#endif
+
+    if (rank == 0)
+    {
+      if (this->metric_ == all_per_sec)
+	printf("%7ld %f %f %f", (unsigned long) M,
+	       this->metric(fcn, M, loop, time, pts_per_sec),
+	       this->metric(fcn, M, loop, time, ops_per_sec),
+	       this->metric(fcn, M, loop, time, iob_per_sec));
+      else
+	printf("%7lu %f", (unsigned long) M,
+	       this->metric(fcn, M, loop, time, metric_));
+      if (this->show_loop_)
+	printf("  %8lu", (unsigned long)loop);
+      if (this->show_time_)
+	printf("  %f", time);
+      printf("\n");
+      fflush(stdout);
+    }
+
+    float factor = goal_sec_ / time;
+    if (factor < 1.0) factor += 0.1 * (1.0 - factor);
+    loop = (int)(factor * loop);
+
+    if (loop < 1) loop = 1;
+  }
+}
+
+
+
+template <typename Functor>
+inline void
+Loop1P::operator()(
+  Functor fcn)
+{
+  if (mode_ == steady_mode)
+    this->steady(fcn);
+  else
+    this->sweep(fcn);
+}
+
 #endif // CSL_LOOP_HPP
Index: benchmarks/main.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/main.cpp,v
retrieving revision 1.6
diff -u -r1.6 main.cpp
--- benchmarks/main.cpp	27 Jan 2006 13:13:23 -0000	1.6
+++ benchmarks/main.cpp	28 Feb 2006 22:17:51 -0000
@@ -53,7 +53,10 @@
     if (sscanf(argv[i], "-%d", &what))
       ;
     else if (!strcmp(argv[i], "-start"))
+    {
       loop.start_ = atoi(argv[++i]);
+      loop.cal_   = loop.start_;
+    }
     else if (!strcmp(argv[i], "-stop"))
       loop.stop_ = atoi(argv[++i]);
     else if (!strcmp(argv[i], "-cal"))
@@ -74,12 +77,22 @@
       loop.metric_ = pts_per_sec;
     else if (!strcmp(argv[i], "-iob"))
       loop.metric_ = iob_per_sec;
+    else if (!strcmp(argv[i], "-wiob"))
+      loop.metric_ = wiob_per_sec;
     else if (!strcmp(argv[i], "-all"))
       loop.metric_ = all_per_sec;
+    else if (!strcmp(argv[i], "-mem"))
+      loop.lhs_ = lhs_mem;
     else if (!strcmp(argv[i], "-prof"))
       loop.do_prof_  = true;
     else if (!strcmp(argv[i], "-show_loop"))
       loop.show_loop_ = true;
+    else if (!strcmp(argv[i], "-show_time"))
+      loop.show_time_ = true;
+    else if (!strcmp(argv[i], "-steady"))
+      loop.mode_ = steady_mode;
+    else
+      std::cerr << "ERROR: Unknown argument: " << argv[i] << std::endl;
   }
 
   if (verbose)
Index: benchmarks/mcopy.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/mcopy.cpp,v
retrieving revision 1.2
diff -u -r1.2 mcopy.cpp
--- benchmarks/mcopy.cpp	22 Dec 2005 01:29:25 -0000	1.2
+++ benchmarks/mcopy.cpp	28 Feb 2006 22:17:51 -0000
@@ -20,6 +20,7 @@
 #include <vsip/impl/par-chain-assign.hpp>
 #include <vsip/map.hpp>
 #include <vsip/impl/profile.hpp>
+#include <vsip/impl/metaprogramming.hpp>
 
 #include "test.hpp"
 #include "loop.hpp"
@@ -29,6 +30,36 @@
 
 using namespace vsip;
 
+using vsip::impl::ITE_Type;
+using vsip::impl::As_type;
+
+
+
+/***********************************************************************
+  Matrix copy - normal assignment
+***********************************************************************/
+
+template <typename T,
+	  typename DstBlock,
+	  typename SrcBlock>
+void
+matrix_copy(
+  Matrix<T, DstBlock> dst,
+  Matrix<T, SrcBlock> src)
+{
+  dst = src;
+}
+
+struct Impl_assign;
+struct Impl_sa;
+struct Impl_memcpy;
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock,
+	  typename ImplTag>
+struct t_mcopy;
+
 
 
 /***********************************************************************
@@ -38,17 +69,19 @@
 template <typename T,
 	  typename SrcBlock,
 	  typename DstBlock>
-struct t_mcopy
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_assign>
 {
   typedef typename SrcBlock::map_type src_map_type;
   typedef typename DstBlock::map_type dst_map_type;
 
-  char* what() { return "t_mcopy"; }
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_assign>"; }
   int ops_per_point(length_type size)  { return size; }
   int riob_per_point(length_type size) { return size*sizeof(T); }
   int wiob_per_point(length_type size) { return size*sizeof(T); }
+  int mem_per_point(length_type size)  { return size*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
   {
     length_type const M = size;
     length_type const N = size;
@@ -95,16 +128,160 @@
 
 
 
+/***********************************************************************
+  Matrix copy - setup assignment
+***********************************************************************/
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_sa>
+{
+  typedef typename SrcBlock::map_type src_map_type;
+  typedef typename DstBlock::map_type dst_map_type;
+
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_sa>"; }
+  int ops_per_point(length_type size)  { return size; }
+  int riob_per_point(length_type size) { return size*sizeof(T); }
+  int wiob_per_point(length_type size) { return size*sizeof(T); }
+  int mem_per_point(length_type size)  { return size*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    length_type const M = size;
+    length_type const N = size;
+
+    Matrix<T, SrcBlock>   A(M, N, T(), src_map_);
+    Matrix<T, DstBlock>   Z(M, N,      dst_map_);
+
+    Setup_assign expr(Z, A);
+
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	A.put(m, n, T(m*N + n));
+      }
+    
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      expr();
+    t1.stop();
+    
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	if (!equal(Z.get(m, n), T(m*N+n)))
+	{
+	  std::cout << "t_mcopy: ERROR" << std::endl;
+	  abort();
+	}
+      }
+    
+    time = t1.delta();
+  }
+
+  t_mcopy(src_map_type src_map, dst_map_type dst_map)
+    : src_map_(src_map),
+      dst_map_(dst_map)
+    {}
+
+  // Member data.
+  src_map_type	src_map_;
+  dst_map_type	dst_map_;
+};
+
+
+
+/***********************************************************************
+  Matrix copy - using memcpy 
+***********************************************************************/
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_memcpy>
+{
+  typedef typename SrcBlock::map_type src_map_type;
+  typedef typename DstBlock::map_type dst_map_type;
+
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_memcpy>"; }
+  int ops_per_point(length_type size)  { return size; }
+  int riob_per_point(length_type size) { return size*sizeof(T); }
+  int wiob_per_point(length_type size) { return size*sizeof(T); }
+  int mem_per_point(length_type size)  { return size*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    length_type const M = size;
+    length_type const N = size;
+
+    Matrix<T, SrcBlock>   A(M, N, T(), src_map_);
+    Matrix<T, DstBlock>   Z(M, N,      dst_map_);
+
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	A.put(m, n, T(m*N + n));
+      }
+    
+    vsip::impl::profile::Timer t1;
+
+    {
+      impl::Ext_data<SrcBlock> src_ext(A.block());
+      impl::Ext_data<DstBlock> dst_ext(Z.block());
+    
+      t1.start();
+      for (index_type l=0; l<loop; ++l)
+	memcpy(dst_ext.data(), src_ext.data(), M*N*sizeof(T));
+      t1.stop();
+    }
+    
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	if (!equal(Z.get(m, n), T(m*N+n)))
+	{
+	  std::cout << "t_mcopy: ERROR" << std::endl;
+	  abort();
+	}
+      }
+    
+    time = t1.delta();
+  }
+
+  t_mcopy(src_map_type src_map, dst_map_type dst_map)
+    : src_map_(src_map),
+      dst_map_(dst_map)
+    {}
+
+  // Member data.
+  src_map_type	src_map_;
+  dst_map_type	dst_map_;
+};
+
+
+
+/***********************************************************************
+  Benchmark driver for local copy
+***********************************************************************/
+
 template <typename T,
 	  typename SrcOrder,
-	  typename DstOrder>
+	  typename DstOrder,
+	  typename ImplTag>
 struct t_mcopy_local : t_mcopy<T,
 			       Dense<2, T, SrcOrder, Local_map>,
-			       Dense<2, T, DstOrder, Local_map> >
+			       Dense<2, T, DstOrder, Local_map>,
+			       ImplTag>
 {
   typedef t_mcopy<T,
 		  Dense<2, T, SrcOrder, Local_map>,
-		  Dense<2, T, DstOrder, Local_map> > base_type;
+		  Dense<2, T, DstOrder, Local_map>,
+		  ImplTag> base_type;
   t_mcopy_local()
     : base_type(Local_map(), Local_map()) 
   {}
@@ -113,41 +290,73 @@
 
 template <typename T,
 	  typename SrcOrder,
-	  typename DstOrder>
+	  typename DstOrder,
+	  typename ImplTag>
 struct t_mcopy_pb : t_mcopy<T,
 			    Plain_block<2, T, SrcOrder, Local_map>,
-			    Plain_block<2, T, DstOrder, Local_map> >
+			    Plain_block<2, T, DstOrder, Local_map>,
+			    ImplTag>
 {
   typedef t_mcopy<T,
 		  Plain_block<2, T, SrcOrder, Local_map>,
-		  Plain_block<2, T, DstOrder, Local_map> > base_type;
+		  Plain_block<2, T, DstOrder, Local_map>,
+		  ImplTag > base_type;
   t_mcopy_pb()
     : base_type(Local_map(), Local_map()) 
   {}
 };
 
-#if 0
+
+
+/***********************************************************************
+  Benchmark driver for distributed copy
+***********************************************************************/
+
 template <typename T,
-	  typename SrcOrder,
-	  typename DstOrder>
-struct t_mcopy_root : t_mcopy<T,
-			       Dense<2, T, SrcOrder, MapT>,
-			       Dense<2, T, DstOrder, MapT>,
-			       Local_map>
+	  int      SrcDO,
+	  int      DstDO,
+	  typename ImplTag>
+struct t_mcopy_par_helper
 {
-  typedef t_mcopy<T, SrcOrder, DstOrder, Map<> > base_type;
-  typedef t_mcopy<T,
-		  Dense<2, T, SrcOrder, MapT>,
-		  Dense<2, T, DstOrder, MapT>,
-		  Map<> > base_type;
-  t_mcopy_root()
-    : base_type(Map<>()) 
+  typedef Map<Block_dist, Block_dist> map_type;
+
+  typedef typename ITE_Type<SrcDO == 0,
+		   As_type<row2_type>, As_type<col2_type> >::type
+          src_order_type;
+  typedef Dense<2, T, src_order_type, map_type> src_block_type;
+
+  typedef typename ITE_Type<DstDO == 0,
+		   As_type<row2_type>, As_type<col2_type> >::type
+          dst_order_type;
+  typedef Dense<2, T, dst_order_type, map_type> dst_block_type;
+
+  typedef t_mcopy<T, src_block_type, dst_block_type, ImplTag> base_type;
+};
+
+
+template <typename T,
+	  int      SrcDO,
+	  int      DstDO,
+	  typename ImplTag>
+struct t_mcopy_par
+  : t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag>::base_type 
+{
+  typedef t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag> helper_type;
+  typedef typename helper_type::map_type  map_type;
+  typedef typename helper_type::base_type base_type;
+
+  t_mcopy_par(processor_type src_np, processor_type dst_np)
+    : base_type(SrcDO == 0 ? map_type(src_np, 1) : map_type(1, src_np),
+		DstDO == 0 ? map_type(dst_np, 1) : map_type(1, dst_np))
   {}
 };
-#endif
 
 
 
+/***********************************************************************
+  Main functions
+***********************************************************************/
+
 void
 defaults(Loop1P& loop)
 {
@@ -159,23 +368,38 @@
 int
 test(Loop1P& loop, int what)
 {
+  processor_type np = num_processors();
+
+  typedef row2_type rt;
+  typedef col2_type ct;
+
   switch (what)
   {
-  case  1: loop(t_mcopy_local<float, row2_type, row2_type>()); break;
-  case  2: loop(t_mcopy_local<float, row2_type, col2_type>()); break;
-  case  3: loop(t_mcopy_local<float, col2_type, row2_type>()); break;
-  case  4: loop(t_mcopy_local<float, col2_type, col2_type>()); break;
-
-  case  5: loop(t_mcopy_local<complex<float>, row2_type, row2_type>()); break;
-  case  6: loop(t_mcopy_local<complex<float>, row2_type, col2_type>()); break;
-  case  7: loop(t_mcopy_local<complex<float>, col2_type, row2_type>()); break;
-  case  8: loop(t_mcopy_local<complex<float>, col2_type, col2_type>()); break;
-
+  case  1: loop(t_mcopy_local<float, rt, rt, Impl_assign>()); break;
+  case  2: loop(t_mcopy_local<float, rt, ct, Impl_assign>()); break;
+  case  3: loop(t_mcopy_local<float, ct, rt, Impl_assign>()); break;
+  case  4: loop(t_mcopy_local<float, ct, ct, Impl_assign>()); break;
+
+  case  5: loop(t_mcopy_local<complex<float>, rt, rt, Impl_assign>()); break;
+  case  6: loop(t_mcopy_local<complex<float>, rt, ct, Impl_assign>()); break;
+  case  7: loop(t_mcopy_local<complex<float>, ct, rt, Impl_assign>()); break;
+  case  8: loop(t_mcopy_local<complex<float>, ct, ct, Impl_assign>()); break;
+
+  case  11: loop(t_mcopy_local<float, rt, rt, Impl_memcpy>()); break;
+  // case  12: loop(t_mcopy_local<float, rt, ct, Impl_memcpy>()); break;
+  // case  13: loop(t_mcopy_local<float, ct, rt, Impl_memcpy>()); break;
+  case  14: loop(t_mcopy_local<float, ct, ct, Impl_memcpy>()); break;
     
-  case  20: loop(t_mcopy_pb<float, row2_type, row2_type>()); break;
-  case  21: loop(t_mcopy_pb<float, row2_type, col2_type>()); break;
-  case  22: loop(t_mcopy_pb<float, col2_type, row2_type>()); break;
-  case  23: loop(t_mcopy_pb<float, col2_type, col2_type>()); break;
+  case  21: loop(t_mcopy_pb<float, rt, rt, Impl_assign>()); break;
+  case  22: loop(t_mcopy_pb<float, rt, ct, Impl_assign>()); break;
+  case  23: loop(t_mcopy_pb<float, ct, rt, Impl_assign>()); break;
+  case  24: loop(t_mcopy_pb<float, ct, ct, Impl_assign>()); break;
+
+  case  31: loop(t_mcopy_par<float, 0, 0, Impl_assign>(np, np)); break;
+  case  32: loop(t_mcopy_par<float, 0, 1, Impl_assign>(np, np)); break;
+
+  case  41: loop(t_mcopy_par<float, 0, 0, Impl_sa>(np, np)); break;
+  case  42: loop(t_mcopy_par<float, 0, 1, Impl_sa>(np, np)); break;
 
   default:
     return 0;
Index: benchmarks/mcopy_ipp.cpp
===================================================================
RCS file: benchmarks/mcopy_ipp.cpp
diff -N benchmarks/mcopy_ipp.cpp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ benchmarks/mcopy_ipp.cpp	28 Feb 2006 22:17:51 -0000
@@ -0,0 +1,486 @@
+/* Copyright (c) 2006 by CodeSourcery, LLC.  All rights reserved. */
+
+/** @file    benchmarks/mcopy_ipp.cpp
+    @author  Jules Bergmann
+    @date    2005-02-06
+    @brief   VSIPL++ Library: Benchmark for matrix copy using IPP
+
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <iostream>
+
+#include <ippm.h>
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/math.hpp>
+#include <vsip/impl/setup-assign.hpp>
+#include <vsip/impl/par-chain-assign.hpp>
+#include <vsip/map.hpp>
+#include <vsip/impl/profile.hpp>
+#include <vsip/impl/metaprogramming.hpp>
+
+#include "test.hpp"
+#include "output.hpp"
+#include "loop.hpp"
+#include "ops_info.hpp"
+
+#include "plainblock.hpp"
+
+using namespace vsip;
+
+using vsip::impl::ITE_Type;
+using vsip::impl::As_type;
+
+
+void
+ipp_copy(
+  float*      src,
+  stride_type src_row_stride,
+  stride_type src_col_stride,
+  float*      dst,
+  stride_type dst_row_stride,
+  stride_type dst_col_stride,
+  length_type rows,
+  length_type cols)
+{
+  ippmCopy_ma_32f_SS(
+		src, rows*cols,
+		src_row_stride*sizeof(float), src_col_stride*sizeof(float),
+		dst, rows*cols,
+		dst_row_stride*sizeof(float), dst_col_stride*sizeof(float),
+		cols, rows,  1);
+}
+
+
+
+void
+ipp_transpose(
+  float*      src,
+  stride_type src_row_stride,
+  stride_type src_col_stride,
+  float*      dst,
+  stride_type dst_row_stride,
+  stride_type dst_col_stride,
+  length_type rows,
+  length_type cols)
+{
+  assert(src_col_stride == 1);
+  assert(dst_col_stride == 1);
+  ippmTranspose_m_32f(
+		src, src_row_stride*sizeof(float),
+		cols, rows,
+		dst, dst_row_stride*sizeof(float)
+		);
+}
+
+
+
+/***********************************************************************
+  Matrix copy - normal assignment
+***********************************************************************/
+
+template <typename T,
+	  typename DstBlock,
+	  typename SrcBlock>
+void
+matrix_copy(
+  Matrix<T, DstBlock> dst,
+  Matrix<T, SrcBlock> src)
+{
+  dst = src;
+}
+
+struct Impl_copy;
+struct Impl_transpose;
+struct Impl_select;
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock,
+	  typename ImplTag>
+struct t_mcopy;
+
+
+
+/***********************************************************************
+  Matrix copy - using Copy 
+***********************************************************************/
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_copy>
+{
+  typedef typename SrcBlock::map_type src_map_type;
+  typedef typename DstBlock::map_type dst_map_type;
+
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_copy>"; }
+  int ops_per_point(length_type size)  { return size; }
+  int riob_per_point(length_type size) { return size*sizeof(T); }
+  int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    length_type const M = size;
+    length_type const N = size;
+
+    Matrix<T, SrcBlock>   A(M, N, T(), src_map_);
+    Matrix<T, DstBlock>   Z(M, N,      dst_map_);
+
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	A.put(m, n, T(m*N + n));
+      }
+    
+    vsip::impl::profile::Timer t1;
+
+    {
+      impl::Ext_data<SrcBlock> src_ext(A.block());
+      impl::Ext_data<DstBlock> dst_ext(Z.block());
+    
+      t1.start();
+      for (index_type l=0; l<loop; ++l)
+	ipp_copy(src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+		 dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+		 M, N);
+      t1.stop();
+    }
+    
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	if (!equal(Z.get(m, n), T(m*N+n)))
+	{
+	  std::cout << "t_mcopy: ERROR" << std::endl;
+	  std::cout << "A = " << A << std::endl;
+	  std::cout << "Z = " << Z << std::endl;
+	  abort();
+	}
+      }
+    
+    time = t1.delta();
+  }
+
+  t_mcopy(src_map_type src_map, dst_map_type dst_map)
+    : src_map_(src_map),
+      dst_map_(dst_map)
+    {}
+
+  // Member data.
+  src_map_type	src_map_;
+  dst_map_type	dst_map_;
+};
+
+
+
+/***********************************************************************
+  Matrix copy - using Transpose 
+***********************************************************************/
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_transpose>
+{
+  typedef typename SrcBlock::map_type src_map_type;
+  typedef typename DstBlock::map_type dst_map_type;
+
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_transpose>"; }
+  int ops_per_point(length_type size)  { return size; }
+  int riob_per_point(length_type size) { return size*sizeof(T); }
+  int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    length_type const M = size;
+    length_type const N = size;
+
+    Matrix<T, SrcBlock>   A(M, N, T(), src_map_);
+    Matrix<T, DstBlock>   Z(M, N,      dst_map_);
+
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	A.put(m, n, T(m*N + n));
+      }
+    
+    vsip::impl::profile::Timer t1;
+
+    {
+      impl::Ext_data<SrcBlock> src_ext(A.block());
+      impl::Ext_data<DstBlock> dst_ext(Z.block());
+    
+      if (src_ext.stride(1) == 1 && dst_ext.stride(0) == 1)
+      {
+	t1.start();
+	for (index_type l=0; l<loop; ++l)
+	  ipp_transpose(
+		src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+		dst_ext.data(), dst_ext.stride(1), dst_ext.stride(0),
+		M, N);
+	t1.stop();
+      }
+      else if (src_ext.stride(0) == 1 && dst_ext.stride(1) == 1)
+      {
+	t1.start();
+	for (index_type l=0; l<loop; ++l)
+	  ipp_transpose(
+		src_ext.data(), src_ext.stride(1), src_ext.stride(0),
+		dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+		N, M);
+	t1.stop();
+      }
+      else assert(0);
+    }
+    
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	if (!equal(Z.get(m, n), T(m*N+n)))
+	{
+	  std::cout << "t_mcopy: ERROR" << std::endl;
+	  std::cout << "A = " << A << std::endl;
+	  std::cout << "Z = " << Z << std::endl;
+	  abort();
+	}
+      }
+    
+    time = t1.delta();
+  }
+
+  t_mcopy(src_map_type src_map, dst_map_type dst_map)
+    : src_map_(src_map),
+      dst_map_(dst_map)
+    {}
+
+  // Member data.
+  src_map_type	src_map_;
+  dst_map_type	dst_map_;
+};
+
+
+
+/***********************************************************************
+  Matrix copy - using select 
+***********************************************************************/
+
+template <typename T,
+	  typename SrcBlock,
+	  typename DstBlock>
+struct t_mcopy<T, SrcBlock, DstBlock, Impl_select>
+{
+  typedef typename SrcBlock::map_type src_map_type;
+  typedef typename DstBlock::map_type dst_map_type;
+
+  char* what() { return "t_mcopy<T, SrcBlock, DstBlock, Impl_copy>"; }
+  int ops_per_point(length_type size)  { return size; }
+  int riob_per_point(length_type size) { return size*sizeof(T); }
+  int wiob_per_point(length_type size) { return size*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    length_type const M = size;
+    length_type const N = size;
+
+    Matrix<T, SrcBlock>   A(M, N, T(), src_map_);
+    Matrix<T, DstBlock>   Z(M, N,      dst_map_);
+
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	A.put(m, n, T(m*N + n));
+      }
+    
+    vsip::impl::profile::Timer t1;
+
+    {
+      impl::Ext_data<SrcBlock> src_ext(A.block());
+      impl::Ext_data<DstBlock> dst_ext(Z.block());
+    
+      if (src_ext.stride(1) == 1 && dst_ext.stride(0) == 1)
+      {
+	t1.start();
+	for (index_type l=0; l<loop; ++l)
+	  ipp_transpose(
+		src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+		dst_ext.data(), dst_ext.stride(1), dst_ext.stride(0),
+		M, N);
+	t1.stop();
+      }
+      else
+      {
+	t1.start();
+	for (index_type l=0; l<loop; ++l)
+	  ipp_copy(src_ext.data(), src_ext.stride(0), src_ext.stride(1),
+		   dst_ext.data(), dst_ext.stride(0), dst_ext.stride(1),
+		   M, N);
+	t1.stop();
+      }
+    }
+    
+    for (index_type m=0; m<M; ++m)
+      for (index_type n=0; n<N; ++n)
+      {
+	if (!equal(Z.get(m, n), T(m*N+n)))
+	{
+	  std::cout << "t_mcopy: ERROR" << std::endl;
+	  std::cout << "A = " << A << std::endl;
+	  std::cout << "Z = " << Z << std::endl;
+	  abort();
+	}
+      }
+    
+    time = t1.delta();
+  }
+
+  t_mcopy(src_map_type src_map, dst_map_type dst_map)
+    : src_map_(src_map),
+      dst_map_(dst_map)
+    {}
+
+  // Member data.
+  src_map_type	src_map_;
+  dst_map_type	dst_map_;
+};
+
+
+
+/***********************************************************************
+  Benchmark driver for local copy
+***********************************************************************/
+
+template <typename T,
+	  typename SrcOrder,
+	  typename DstOrder,
+	  typename ImplTag>
+struct t_mcopy_local : t_mcopy<T,
+			       Dense<2, T, SrcOrder, Local_map>,
+			       Dense<2, T, DstOrder, Local_map>,
+			       ImplTag>
+{
+  typedef t_mcopy<T,
+		  Dense<2, T, SrcOrder, Local_map>,
+		  Dense<2, T, DstOrder, Local_map>,
+		  ImplTag> base_type;
+  t_mcopy_local()
+    : base_type(Local_map(), Local_map()) 
+  {}
+};
+
+
+template <typename T,
+	  typename SrcOrder,
+	  typename DstOrder,
+	  typename ImplTag>
+struct t_mcopy_pb : t_mcopy<T,
+			    Plain_block<2, T, SrcOrder, Local_map>,
+			    Plain_block<2, T, DstOrder, Local_map>,
+			    ImplTag>
+{
+  typedef t_mcopy<T,
+		  Plain_block<2, T, SrcOrder, Local_map>,
+		  Plain_block<2, T, DstOrder, Local_map>,
+		  ImplTag > base_type;
+  t_mcopy_pb()
+    : base_type(Local_map(), Local_map()) 
+  {}
+};
+
+
+
+/***********************************************************************
+  Benchmark driver for distributed copy
+***********************************************************************/
+
+template <typename T,
+	  int      SrcDO,
+	  int      DstDO,
+	  typename ImplTag>
+struct t_mcopy_par_helper
+{
+  typedef Map<Block_dist, Block_dist> map_type;
+
+  typedef typename ITE_Type<SrcDO == 0,
+		   As_type<row2_type>, As_type<col2_type> >::type
+          src_order_type;
+  typedef Dense<2, T, src_order_type, map_type> src_block_type;
+
+  typedef typename ITE_Type<DstDO == 0,
+		   As_type<row2_type>, As_type<col2_type> >::type
+          dst_order_type;
+  typedef Dense<2, T, dst_order_type, map_type> dst_block_type;
+
+  typedef t_mcopy<T, src_block_type, dst_block_type, ImplTag> base_type;
+};
+
+
+template <typename T,
+	  int      SrcDO,
+	  int      DstDO,
+	  typename ImplTag>
+struct t_mcopy_par
+  : t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag>::base_type 
+{
+  typedef t_mcopy_par_helper<T, SrcDO, DstDO, ImplTag> helper_type;
+  typedef typename helper_type::map_type  map_type;
+  typedef typename helper_type::base_type base_type;
+
+  t_mcopy_par(processor_type src_np, processor_type dst_np)
+    : base_type(SrcDO == 0 ? map_type(src_np, 1) : map_type(1, src_np),
+		DstDO == 0 ? map_type(dst_np, 1) : map_type(1, dst_np))
+  {}
+};
+
+
+
+/***********************************************************************
+  Main functions
+***********************************************************************/
+
+void
+defaults(Loop1P& loop)
+{
+  loop.stop_ = 12;
+}
+
+
+
+int
+test(Loop1P& loop, int what)
+{
+  processor_type np = num_processors();
+
+  typedef row2_type rt;
+  typedef col2_type ct;
+
+  switch (what)
+  {
+  case  1: loop(t_mcopy_local<float, rt, rt, Impl_select>()); break;
+  case  2: loop(t_mcopy_local<float, rt, ct, Impl_select>()); break;
+  case  3: loop(t_mcopy_local<float, ct, rt, Impl_select>()); break;
+  case  4: loop(t_mcopy_local<float, ct, ct, Impl_select>()); break;
+
+  case 11: loop(t_mcopy_local<float, rt, rt, Impl_copy>()); break;
+  case 12: loop(t_mcopy_local<float, rt, ct, Impl_copy>()); break;
+  case 13: loop(t_mcopy_local<float, ct, rt, Impl_copy>()); break;
+  case 14: loop(t_mcopy_local<float, ct, ct, Impl_copy>()); break;
+
+  // case 21: loop(t_mcopy_local<float, rt, rt, Impl_transpose>()); break;
+  case 22: loop(t_mcopy_local<float, rt, ct, Impl_transpose>()); break;
+  case 23: loop(t_mcopy_local<float, ct, rt, Impl_transpose>()); break;
+  // case 24: loop(t_mcopy_local<float, ct, ct, Impl_transpose>()); break;
+
+  default:
+    return 0;
+  }
+  return 1;
+}
Index: benchmarks/mpi_alltoall.cpp
===================================================================
RCS file: benchmarks/mpi_alltoall.cpp
diff -N benchmarks/mpi_alltoall.cpp
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ benchmarks/mpi_alltoall.cpp	28 Feb 2006 22:17:51 -0000
@@ -0,0 +1,1164 @@
+/* Copyright (c) 2005 by CodeSourcery, LLC.  All rights reserved. */
+
+/** @file    benchmarks/mpi_alltoall.cpp
+    @author  Jules Bergmann
+    @date    2005-11-06
+    @brief   VSIPL++ Library: Benchmark for MPI alltoall.
+
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <iostream>
+
+#include <vsip/initfin.hpp>
+#include <vsip/support.hpp>
+#include <vsip/math.hpp>
+#include <vsip/map.hpp>
+#include <vsip/impl/profile.hpp>
+
+#include "test.hpp"
+#include "loop.hpp"
+#include "ops_info.hpp"
+
+using namespace vsip;
+
+
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock,
+	  typename ImplTag>
+class Send;
+
+struct Impl_alltoall;
+struct Impl_isend;
+struct Impl_isend_x;
+struct Impl_persistent;
+struct Impl_persistent_x;
+
+
+
+/***********************************************************************
+  Send class using MPI_alltoall.
+***********************************************************************/
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock>
+class Send<T, SBlock, DBlock, Impl_alltoall>
+{
+  typedef typename SBlock::map_type src_map_type;
+  typedef typename DBlock::map_type dst_map_type;
+
+  typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+  typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+  typedef typename impl::Distributed_local_block<DBlock>::type
+		dst_local_block_t;
+  typedef typename impl::Distributed_local_block<SBlock>::type
+		src_local_block_t;
+
+  // Constructor.
+public:
+
+  Send(
+    Matrix<T, SBlock> src,
+    Matrix<T, DBlock> dst)
+  : src_    (src),
+    dst_    (dst),
+    src_ext_(src_.local().block()),
+    dst_ext_(dst_.local().block())
+  {
+    using vsip::impl::Type_equal;
+
+    src_map_type src_map_ = src_.block().map();
+    dst_map_type dst_map_ = dst_.block().map();
+
+    // Check assumptions ----------------------------------------------
+    
+    // This benchmarking code assumes a particular data layout,
+    // check that inputs match layout before proceeding.
+
+    // Source is row-major, block distributed by row.
+    assert((Type_equal<row2_type, src_order_type>::value));
+    assert(src_map_.num_subblocks(0) >= 1);
+    assert(src_map_.num_subblocks(1) == 1);
+    assert(src_map_.distribution(0) == block);
+    assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+    // Destination is row- or column-major, block distributed by column.
+//    assert((Type_equal<col2_type, dst_order_type>::value));
+    assert(dst_map_.num_subblocks(0) == 1);
+    assert(dst_map_.num_subblocks(1) >= 1);
+    assert(dst_map_.distribution(1) == block);
+    assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+    // Create send-side datatype.
+    length_type nrows_per_send = src_.size(0) / src_map_.num_subblocks(0);
+    length_type ncols_per_recv = dst_.size(1) / dst_map_.num_subblocks(1);
+
+//    printf("(%d): nrows_per_send = %d,  ncols_per_recv = %d\n",
+//	   rank, nrows_per_send, ncols_per_recv);
+
+    // Setup source datatype. -----------------------------------------
+
+    assert(src_ext_.stride(1) == 1);
+
+    MPI_Type_vector(nrows_per_send,	// number of blocks
+		    ncols_per_recv,	// elements per block
+		    src_.size(1),	// stride between block starts
+		    MPI_FLOAT,
+		    &tmp0_datatype_);
+    MPI_Type_commit(&tmp0_datatype_);
+
+    int		 lena[2]   = { 1, 1 };
+    MPI_Aint	 loca[2]   = { 0, ncols_per_recv * sizeof(T) };
+    MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+    MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+    MPI_Type_commit(&src_datatype_);
+
+
+
+    // Setup destination datatype -------------------------------------
+
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      assert(dst_ext.stride(0) == 1);
+
+      MPI_Type_vector (ncols_per_recv,	// number of blocks
+		       1,
+		       dst_.size(0),
+		       MPI_FLOAT,
+		       &tmp1_datatype_);
+      MPI_Type_commit(&tmp1_datatype_);
+
+      int		 len[3]   = { 1, 1, 1 };
+      MPI_Aint	 loc[3]   = { 0, 0, sizeof(T) };
+      MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+      MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+      MPI_Type_commit(&tmp2_datatype_);
+      
+      
+      MPI_Type_hvector(nrows_per_send, 1, sizeof(float), tmp2_datatype_,
+		       &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+    else
+    {
+      assert(dst_ext.stride(1) == 1);
+
+      MPI_Type_vector(nrows_per_send,	// number of blocks
+		      ncols_per_recv,	// elements per block
+		      ncols_per_recv,   // stride between block starts
+		      MPI_FLOAT,
+		      &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+
+  }
+
+  ~Send()
+  {
+    using vsip::impl::Type_equal;
+
+    MPI_Type_free(&tmp0_datatype_);
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      MPI_Type_free(&tmp1_datatype_);
+      MPI_Type_free(&tmp2_datatype_);
+    }
+    MPI_Type_free(&src_datatype_);
+    MPI_Type_free(&dst_datatype_);
+  }
+
+  void operator()()
+  {
+    MPI_Alltoall(src_ext_.data(), 1, src_datatype_,
+		 dst_ext_.data(), 1, dst_datatype_,
+		 MPI_COMM_WORLD);
+  }
+
+  // Member data.
+private:
+  Matrix<T, SBlock>           src_;
+  Matrix<T, DBlock>           dst_;
+
+  impl::Ext_data<dst_local_block_t> dst_ext_;
+  impl::Ext_data<src_local_block_t> src_ext_;
+
+  MPI_Datatype      src_datatype_;
+  MPI_Datatype      dst_datatype_;
+  MPI_Datatype      tmp0_datatype_;
+  MPI_Datatype      tmp1_datatype_;
+  MPI_Datatype      tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+  Send class using MPI_isend / MPI_recv
+***********************************************************************/
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock>
+class Send<T, SBlock, DBlock, Impl_isend>
+{
+  typedef typename SBlock::map_type src_map_type;
+  typedef typename DBlock::map_type dst_map_type;
+
+  typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+  typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+  typedef typename impl::Distributed_local_block<DBlock>::type
+		dst_local_block_t;
+  typedef typename impl::Distributed_local_block<SBlock>::type
+		src_local_block_t;
+
+  // Constructor.
+public:
+
+  Send(
+    Matrix<T, SBlock> src,
+    Matrix<T, DBlock> dst)
+  : src_    (src),
+    dst_    (dst),
+    src_ext_(src_.local().block()),
+    dst_ext_(dst_.local().block()),
+    nproc_  (src_.block().map().num_subblocks(0)),
+    req_    (new MPI_Request[nproc_])
+  {
+    using vsip::impl::Type_equal;
+
+    src_map_type src_map_ = src_.block().map();
+    dst_map_type dst_map_ = dst_.block().map();
+
+    // Check assumptions ----------------------------------------------
+    
+    // This benchmarking code assumes a particular data layout,
+    // check that inputs match layout before proceeding.
+
+    // Source is row-major, block distributed by row.
+    assert((Type_equal<row2_type, src_order_type>::value));
+    assert(src_map_.num_subblocks(0) >= 1);
+    assert(src_map_.num_subblocks(1) == 1);
+    assert(src_map_.distribution(0) == block);
+    assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+    // Destination is row- or column-major, block distributed by column.
+    assert(dst_map_.num_subblocks(0) == 1);
+    assert(dst_map_.num_subblocks(1) >= 1);
+    assert(dst_map_.distribution(1) == block);
+    assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+    // Number of senders == number of receivers
+    assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+    // Create send-side datatype.
+    nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+    ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+//    printf("(%d): nrows_per_send = %d,  ncols_per_recv = %d\n",
+//	   rank, nrows_per_send_, ncols_per_recv_);
+
+    // Setup source datatype. -----------------------------------------
+
+    assert(src_ext_.stride(1) == 1);
+
+    MPI_Type_vector(nrows_per_send_,	// number of blocks
+		    ncols_per_recv_,	// elements per block
+		    src_.size(1),	// stride between block starts
+		    MPI_FLOAT,
+		    &tmp0_datatype_);
+    MPI_Type_commit(&tmp0_datatype_);
+
+    int		 lena[2]   = { 1, 1 };
+    MPI_Aint	 loca[2]   = { 0, ncols_per_recv_ * sizeof(T) };
+    MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+    MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+    MPI_Type_commit(&src_datatype_);
+
+
+
+    // Setup destination datatype -------------------------------------
+
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      assert(dst_ext.stride(0) == 1);
+
+      MPI_Type_vector (ncols_per_recv_,	// number of blocks
+		       1,
+		       dst_.size(0),
+		       MPI_FLOAT,
+		       &tmp1_datatype_);
+      MPI_Type_commit(&tmp1_datatype_);
+
+      int		 len[3]   = { 1, 1, 1 };
+      MPI_Aint	 loc[3]   = { 0, 0, sizeof(T) };
+      MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+      MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+      MPI_Type_commit(&tmp2_datatype_);
+      
+      
+      MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+		       &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+    else
+    {
+      assert(dst_ext.stride(1) == 1);
+
+      MPI_Type_vector(nrows_per_send_,	// number of blocks
+		      ncols_per_recv_,	// elements per block
+		      ncols_per_recv_,   // stride between block starts
+		      MPI_FLOAT,
+		      &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+
+  }
+
+  ~Send()
+  {
+    using vsip::impl::Type_equal;
+
+    MPI_Type_free(&tmp0_datatype_);
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      MPI_Type_free(&tmp1_datatype_);
+      MPI_Type_free(&tmp2_datatype_);
+    }
+    MPI_Type_free(&src_datatype_);
+    MPI_Type_free(&dst_datatype_);
+
+    delete[] req_;
+  }
+
+  void operator()()
+  {
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      MPI_Isend(src_ext_.data() + i*ncols_per_recv_,
+		1, src_datatype_,
+		i, 0, MPI_COMM_WORLD, &(req_[i]));
+    }
+
+    if (impl::Type_equal<dst_order_type, col2_type>::value)
+    {
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	MPI_Status status;
+	int ierr = MPI_Recv(dst_ext_.data() + i*nrows_per_send_,
+			    1, dst_datatype_, i, 0,
+			    MPI_COMM_WORLD, &status);
+      }
+    }
+    else
+    {
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	MPI_Status status;
+	int ierr = MPI_Recv(dst_ext_.data()
+			       + i*nrows_per_send_*ncols_per_recv_,
+			    1, dst_datatype_, i, 0,
+			    MPI_COMM_WORLD, &status);
+      }
+    }
+
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      MPI_Status status;
+      MPI_Wait(&(req_[i]), &status);
+    }
+  }
+
+  // Member data.
+private:
+  Matrix<T, SBlock>           src_;
+  Matrix<T, DBlock>           dst_;
+
+  impl::Ext_data<dst_local_block_t> dst_ext_;
+  impl::Ext_data<src_local_block_t> src_ext_;
+
+  length_type       nproc_;
+  length_type       nrows_per_send_;
+  length_type       ncols_per_recv_;
+
+  MPI_Request*      req_;
+
+  MPI_Datatype      src_datatype_;
+  MPI_Datatype      dst_datatype_;
+  MPI_Datatype      tmp0_datatype_;
+  MPI_Datatype      tmp1_datatype_;
+  MPI_Datatype      tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+  Send class using MPI_isend / MPI_recv + local copy
+***********************************************************************/
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock>
+class Send<T, SBlock, DBlock, Impl_isend_x>
+{
+  typedef typename SBlock::map_type src_map_type;
+  typedef typename DBlock::map_type dst_map_type;
+
+  typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+  typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+  typedef typename impl::Distributed_local_block<DBlock>::type
+		dst_local_block_t;
+  typedef typename impl::Distributed_local_block<SBlock>::type
+		src_local_block_t;
+
+  // Constructor.
+public:
+
+  Send(
+    Matrix<T, SBlock> src,
+    Matrix<T, DBlock> dst)
+  : src_    (src),
+    dst_    (dst),
+    src_ext_(src_.local().block()),
+    dst_ext_(dst_.local().block()),
+    nproc_  (src_.block().map().num_subblocks(0)),
+    rank_   (src_.block().map().impl_rank()),
+    req_    (new MPI_Request[nproc_])
+  {
+    using vsip::impl::Type_equal;
+
+    src_map_type src_map_ = src_.block().map();
+    dst_map_type dst_map_ = dst_.block().map();
+
+    // Check assumptions ----------------------------------------------
+    
+    // This benchmarking code assumes a particular data layout,
+    // check that inputs match layout before proceeding.
+
+    // Source is row-major, block distributed by row.
+    assert((Type_equal<row2_type, src_order_type>::value));
+    assert(src_map_.num_subblocks(0) >= 1);
+    assert(src_map_.num_subblocks(1) == 1);
+    assert(src_map_.distribution(0) == block);
+    assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+    // Destination is row- or column-major, block distributed by column.
+    assert(dst_map_.num_subblocks(0) == 1);
+    assert(dst_map_.num_subblocks(1) >= 1);
+    assert(dst_map_.distribution(1) == block);
+    assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+    // Number of senders == number of receivers
+    assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+    // Create send-side datatype.
+    nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+    ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+//    printf("(%d): nrows_per_send = %d,  ncols_per_recv = %d\n",
+//	   rank, nrows_per_send_, ncols_per_recv_);
+
+    // Setup source datatype. -----------------------------------------
+
+    assert(src_ext_.stride(1) == 1);
+
+    MPI_Type_vector(nrows_per_send_,	// number of blocks
+		    ncols_per_recv_,	// elements per block
+		    src_.size(1),	// stride between block starts
+		    MPI_FLOAT,
+		    &tmp0_datatype_);
+    MPI_Type_commit(&tmp0_datatype_);
+
+    int		 lena[2]   = { 1, 1 };
+    MPI_Aint	 loca[2]   = { 0, ncols_per_recv_ * sizeof(T) };
+    MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+    MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+    MPI_Type_commit(&src_datatype_);
+
+
+
+    // Setup destination datatype -------------------------------------
+
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      assert(dst_ext.stride(0) == 1);
+
+      MPI_Type_vector (ncols_per_recv_,	// number of blocks
+		       1,
+		       dst_.size(0),
+		       MPI_FLOAT,
+		       &tmp1_datatype_);
+      MPI_Type_commit(&tmp1_datatype_);
+
+      int		 len[3]   = { 1, 1, 1 };
+      MPI_Aint	 loc[3]   = { 0, 0, sizeof(T) };
+      MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+      MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+      MPI_Type_commit(&tmp2_datatype_);
+      
+      
+      MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+		       &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+    else
+    {
+      assert(dst_ext.stride(1) == 1);
+
+      MPI_Type_vector(nrows_per_send_,	// number of blocks
+		      ncols_per_recv_,	// elements per block
+		      ncols_per_recv_,   // stride between block starts
+		      MPI_FLOAT,
+		      &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+    }
+
+  }
+
+  ~Send()
+  {
+    using vsip::impl::Type_equal;
+
+    MPI_Type_free(&tmp0_datatype_);
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      MPI_Type_free(&tmp1_datatype_);
+      MPI_Type_free(&tmp2_datatype_);
+    }
+    MPI_Type_free(&src_datatype_);
+    MPI_Type_free(&dst_datatype_);
+
+    delete[] req_;
+  }
+
+  void operator()()
+  {
+    // Send -----------------------------------------------------------
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      if (i != rank_)
+	MPI_Isend(src_ext_.data() + i*ncols_per_recv_,
+		  1, src_datatype_,
+		  i, 0, MPI_COMM_WORLD, &(req_[i]));
+    }
+
+    // Copy -----------------------------------------------------------
+    if (impl::Type_equal<dst_order_type, col2_type>::value)
+    {
+      T* dst = dst_ext_.data() + rank_*nrows_per_send_;
+      T* src = src_ext_.data() + rank_*ncols_per_recv_;
+      impl::transpose_unit(dst, src,
+			   nrows_per_send_, ncols_per_recv_,
+			   dst_ext_.stride(1),
+			   src_ext_.stride(0));
+    }
+    else
+    {
+      T* dst = dst_ext_.data() + rank_*ncols_per_recv_*nrows_per_send_;
+      T* src = src_ext_.data() + rank_*ncols_per_recv_;
+      for (index_type i=0; i<nrows_per_send_; ++i)
+	memcpy(dst + i*ncols_per_recv_,
+	       src + i*src_.size(1),
+	       ncols_per_recv_ * sizeof(T));
+    }
+
+    // Recv -----------------------------------------------------------
+    if (impl::Type_equal<dst_order_type, col2_type>::value)
+    {
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	MPI_Status status;
+	if (i != rank_)
+	  int ierr = MPI_Recv(dst_ext_.data() + i*nrows_per_send_,
+			      1, dst_datatype_, i, 0,
+			      MPI_COMM_WORLD, &status);
+      }
+    }
+    else
+    {
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	MPI_Status status;
+	if (i != rank_)
+	  int ierr = MPI_Recv(dst_ext_.data()
+			      + i*nrows_per_send_*ncols_per_recv_,
+			      1, dst_datatype_, i, 0,
+			      MPI_COMM_WORLD, &status);
+      }
+    }
+
+    // Wait for sends to finish ---------------------------------------
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      MPI_Status status;
+      if (i != rank_)
+	MPI_Wait(&(req_[i]), &status);
+    }
+  }
+
+  // Member data.
+private:
+  Matrix<T, SBlock>           src_;
+  Matrix<T, DBlock>           dst_;
+
+  impl::Ext_data<dst_local_block_t> dst_ext_;
+  impl::Ext_data<src_local_block_t> src_ext_;
+
+  length_type       nproc_;
+  index_type        rank_;
+  length_type       nrows_per_send_;
+  length_type       ncols_per_recv_;
+
+  MPI_Request*      req_;
+
+  MPI_Datatype      src_datatype_;
+  MPI_Datatype      dst_datatype_;
+  MPI_Datatype      tmp0_datatype_;
+  MPI_Datatype      tmp1_datatype_;
+  MPI_Datatype      tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+  Send class using Persistent communications.
+***********************************************************************/
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock>
+class Send<T, SBlock, DBlock, Impl_persistent>
+{
+  typedef typename SBlock::map_type src_map_type;
+  typedef typename DBlock::map_type dst_map_type;
+
+  typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+  typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+  typedef typename impl::Distributed_local_block<DBlock>::type
+		dst_local_block_t;
+  typedef typename impl::Distributed_local_block<SBlock>::type
+		src_local_block_t;
+
+  // Constructor.
+public:
+
+  Send(
+    Matrix<T, SBlock> src,
+    Matrix<T, DBlock> dst)
+  : src_    (src),
+    dst_    (dst),
+    src_ext_(src_.local().block()),
+    dst_ext_(dst_.local().block()),
+    nproc_  (src_.block().map().num_subblocks(0)),
+    req_    (new MPI_Request[2*nproc_]),
+    status_ (new MPI_Status[2*nproc_])
+  {
+    using vsip::impl::Type_equal;
+
+    src_map_type src_map_ = src_.block().map();
+    dst_map_type dst_map_ = dst_.block().map();
+
+    // Check assumptions ----------------------------------------------
+    
+    // This benchmarking code assumes a particular data layout,
+    // check that inputs match layout before proceeding.
+
+    // Source is row-major, block distributed by row.
+    assert((Type_equal<row2_type, src_order_type>::value));
+    assert(src_map_.num_subblocks(0) >= 1);
+    assert(src_map_.num_subblocks(1) == 1);
+    assert(src_map_.distribution(0) == block);
+    assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+    // Destination is row- or column-major, block distributed by column.
+    assert(dst_map_.num_subblocks(0) == 1);
+    assert(dst_map_.num_subblocks(1) >= 1);
+    assert(dst_map_.distribution(1) == block);
+    assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+    // Number of senders == number of receivers
+    assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+    // Create send-side datatype.
+    nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+    ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+//    printf("(%d): nrows_per_send = %d,  ncols_per_recv = %d\n",
+//	   rank, nrows_per_send_, ncols_per_recv_);
+
+    // Setup source datatype. -----------------------------------------
+
+    assert(src_ext_.stride(1) == 1);
+
+    MPI_Type_vector(nrows_per_send_,	// number of blocks
+		    ncols_per_recv_,	// elements per block
+		    src_.size(1),	// stride between block starts
+		    MPI_FLOAT,
+		    &tmp0_datatype_);
+    MPI_Type_commit(&tmp0_datatype_);
+
+    int		 lena[2]   = { 1, 1 };
+    MPI_Aint	 loca[2]   = { 0, ncols_per_recv_ * sizeof(T) };
+    MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+    MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+    MPI_Type_commit(&src_datatype_);
+
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      MPI_Send_init(src_ext_.data() + i*ncols_per_recv_,
+		    1, src_datatype_,
+		    i, 0, MPI_COMM_WORLD, &(req_[i]));
+    }
+
+
+
+    // Setup destination datatype -------------------------------------
+
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      assert(dst_ext.stride(0) == 1);
+
+      MPI_Type_vector (ncols_per_recv_,	// number of blocks
+		       1,
+		       dst_.size(0),
+		       MPI_FLOAT,
+		       &tmp1_datatype_);
+      MPI_Type_commit(&tmp1_datatype_);
+
+      int		 len[3]   = { 1, 1, 1 };
+      MPI_Aint	 loc[3]   = { 0, 0, sizeof(T) };
+      MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+      MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+      MPI_Type_commit(&tmp2_datatype_);
+      
+      
+      MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+		       &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	int ierr = MPI_Recv_init(dst_ext_.data() + i*nrows_per_send_,
+				 1, dst_datatype_, i, 0,
+				 MPI_COMM_WORLD, &(req_[nproc_+i]));
+      }
+    }
+    else
+    {
+      assert(dst_ext.stride(1) == 1);
+
+      MPI_Type_vector(nrows_per_send_,	// number of blocks
+		      ncols_per_recv_,	// elements per block
+		      ncols_per_recv_,   // stride between block starts
+		      MPI_FLOAT,
+		      &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	int ierr = MPI_Recv_init(dst_ext_.data()
+				   + i*nrows_per_send_*ncols_per_recv_,
+				 1, dst_datatype_, i, 0,
+				 MPI_COMM_WORLD, &(req_[nproc_+i]));
+      }
+    }
+
+  }
+
+  ~Send()
+  {
+    using vsip::impl::Type_equal;
+
+    MPI_Type_free(&tmp0_datatype_);
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      MPI_Type_free(&tmp1_datatype_);
+      MPI_Type_free(&tmp2_datatype_);
+    }
+    MPI_Type_free(&src_datatype_);
+    MPI_Type_free(&dst_datatype_);
+
+    for (index_type i=0; i<2*nproc_; ++i)
+      MPI_Request_free(&(req_[i]));
+
+    delete[] req_;
+    delete[] status_;
+  }
+
+  void operator()()
+  {
+    MPI_Startall(2*nproc_, req_);
+    MPI_Waitall (2*nproc_, req_, status_);
+  }
+
+  // Member data.
+private:
+  Matrix<T, SBlock>           src_;
+  Matrix<T, DBlock>           dst_;
+
+  impl::Ext_data<dst_local_block_t> dst_ext_;
+  impl::Ext_data<src_local_block_t> src_ext_;
+
+  length_type       nproc_;
+  length_type       nrows_per_send_;
+  length_type       ncols_per_recv_;
+
+  MPI_Request*      req_;
+  MPI_Status*       status_;
+
+  MPI_Datatype      src_datatype_;
+  MPI_Datatype      dst_datatype_;
+  MPI_Datatype      tmp0_datatype_;
+  MPI_Datatype      tmp1_datatype_;
+  MPI_Datatype      tmp2_datatype_;
+};
+
+
+
+/***********************************************************************
+  Send class using Persistent communications + local copy.
+***********************************************************************/
+
+template <typename T,
+	  typename SBlock,
+	  typename DBlock>
+class Send<T, SBlock, DBlock, Impl_persistent_x>
+{
+  typedef typename SBlock::map_type src_map_type;
+  typedef typename DBlock::map_type dst_map_type;
+
+  typedef typename impl::Block_layout<SBlock>::order_type src_order_type;
+  typedef typename impl::Block_layout<DBlock>::order_type dst_order_type;
+
+  typedef typename impl::Distributed_local_block<DBlock>::type
+		dst_local_block_t;
+  typedef typename impl::Distributed_local_block<SBlock>::type
+		src_local_block_t;
+
+  // Constructor.
+public:
+
+  Send(
+    Matrix<T, SBlock> src,
+    Matrix<T, DBlock> dst)
+  : src_    (src),
+    dst_    (dst),
+    src_ext_(src_.local().block()),
+    dst_ext_(dst_.local().block()),
+    nproc_  (src_.block().map().num_subblocks(0)),
+    rank_   (src_.block().map().impl_rank()),
+    req_    (new MPI_Request[2*nproc_]),
+    status_ (new MPI_Status[2*nproc_])
+  {
+    using vsip::impl::Type_equal;
+
+    src_map_type src_map_ = src_.block().map();
+    dst_map_type dst_map_ = dst_.block().map();
+
+    // Check assumptions ----------------------------------------------
+    
+    // This benchmarking code assumes a particular data layout,
+    // check that inputs match layout before proceeding.
+
+    // Source is row-major, block distributed by row.
+    assert((Type_equal<row2_type, src_order_type>::value));
+    assert(src_map_.num_subblocks(0) >= 1);
+    assert(src_map_.num_subblocks(1) == 1);
+    assert(src_map_.distribution(0) == block);
+    assert(src_.size(0) % src_map_.num_subblocks(0) == 0);
+
+    // Destination is row- or column-major, block distributed by column.
+    assert(dst_map_.num_subblocks(0) == 1);
+    assert(dst_map_.num_subblocks(1) >= 1);
+    assert(dst_map_.distribution(1) == block);
+    assert(dst_.size(1) % dst_map_.num_subblocks(1) == 0);
+
+    // Number of senders == number of receivers
+    assert(src_map_.num_subblocks(0) == dst_map_.num_subblocks(1));
+
+    // Create send-side datatype.
+    nrows_per_send_ = src_.size(0) / src_map_.num_subblocks(0);
+    ncols_per_recv_ = dst_.size(1) / dst_map_.num_subblocks(1);
+
+//    printf("(%d): nrows_per_send = %d,  ncols_per_recv = %d\n",
+//	   rank, nrows_per_send_, ncols_per_recv_);
+
+    // Setup source datatype. -----------------------------------------
+
+    assert(src_ext_.stride(1) == 1);
+
+    MPI_Type_vector(nrows_per_send_,	// number of blocks
+		    ncols_per_recv_,	// elements per block
+		    src_.size(1),	// stride between block starts
+		    MPI_FLOAT,
+		    &tmp0_datatype_);
+    MPI_Type_commit(&tmp0_datatype_);
+
+    int		 lena[2]   = { 1, 1 };
+    MPI_Aint	 loca[2]   = { 0, ncols_per_recv_ * sizeof(T) };
+    MPI_Datatype typesa[2] = { tmp0_datatype_, MPI_UB };
+    MPI_Type_struct(2, lena, loca, typesa, &src_datatype_);
+    MPI_Type_commit(&src_datatype_);
+
+    int n_req = 0;
+    for (index_type i=0; i<nproc_; ++i)
+    {
+      if (i != rank_)
+	MPI_Send_init(src_ext_.data() + i*ncols_per_recv_,
+		      1, src_datatype_,
+		      i, 0, MPI_COMM_WORLD, &(req_[n_req++]));
+    }
+
+
+
+    // Setup destination datatype -------------------------------------
+
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      assert(dst_ext.stride(0) == 1);
+
+      MPI_Type_vector (ncols_per_recv_,	// number of blocks
+		       1,
+		       dst_.size(0),
+		       MPI_FLOAT,
+		       &tmp1_datatype_);
+      MPI_Type_commit(&tmp1_datatype_);
+
+      int		 len[3]   = { 1, 1, 1 };
+      MPI_Aint	 loc[3]   = { 0, 0, sizeof(T) };
+      MPI_Datatype types[3] = { tmp1_datatype_, MPI_LB, MPI_UB };
+      MPI_Type_struct(3, len, loc, types, &tmp2_datatype_);
+      MPI_Type_commit(&tmp2_datatype_);
+      
+      
+      MPI_Type_hvector(nrows_per_send_, 1, sizeof(float), tmp2_datatype_,
+		       &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	if (i != rank_)
+	  int ierr = MPI_Recv_init(dst_ext_.data() + i*nrows_per_send_,
+				   1, dst_datatype_, i, 0,
+				   MPI_COMM_WORLD, &(req_[n_req++]));
+      }
+    }
+    else
+    {
+      assert(dst_ext.stride(1) == 1);
+
+      MPI_Type_vector(nrows_per_send_,	// number of blocks
+		      ncols_per_recv_,	// elements per block
+		      ncols_per_recv_,   // stride between block starts
+		      MPI_FLOAT,
+		      &dst_datatype_);
+      MPI_Type_commit(&dst_datatype_);
+
+      for (index_type i=0; i<nproc_; ++i)
+      {
+	if (i != rank_)
+	  int ierr = MPI_Recv_init(dst_ext_.data()
+				   + i*nrows_per_send_*ncols_per_recv_,
+				   1, dst_datatype_, i, 0,
+				   MPI_COMM_WORLD, &(req_[n_req++]));
+      }
+    }
+
+  }
+
+  ~Send()
+  {
+    using vsip::impl::Type_equal;
+
+    MPI_Type_free(&tmp0_datatype_);
+    if (Type_equal<dst_order_type, col2_type>::value)
+    {
+      MPI_Type_free(&tmp1_datatype_);
+      MPI_Type_free(&tmp2_datatype_);
+    }
+    MPI_Type_free(&src_datatype_);
+    MPI_Type_free(&dst_datatype_);
+
+    for (index_type i=0; i<2*(nproc_-1); ++i)
+      MPI_Request_free(&(req_[i]));
+
+    delete[] req_;
+    delete[] status_;
+  }
+
+  void operator()()
+  {
+    if (nproc_ > 1)
+      MPI_Startall(2*(nproc_-1), req_);
+    if (impl::Type_equal<dst_order_type, col2_type>::value)
+    {
+      T* dst = dst_ext_.data() + rank_*nrows_per_send_;
+      T* src = src_ext_.data() + rank_*ncols_per_recv_;
+      impl::transpose_unit(dst, src,
+			   nrows_per_send_, ncols_per_recv_,
+			   dst_ext_.stride(1),
+			   src_ext_.stride(0));
+    }
+    else
+    {
+      T* dst = dst_ext_.data() + rank_*ncols_per_recv_*nrows_per_send_;
+      T* src = src_ext_.data() + rank_*ncols_per_recv_;
+      for (index_type i=0; i<nrows_per_send_; ++i)
+	memcpy(dst + i*ncols_per_recv_,
+	       src + i*src_.size(1),
+	       ncols_per_recv_ * sizeof(T));
+    }
+    if (nproc_ > 1)
+      MPI_Waitall (2*(nproc_-1), req_, status_);
+  }
+
+  // Member data.
+private:
+  Matrix<T, SBlock>           src_;
+  Matrix<T, DBlock>           dst_;
+
+  impl::Ext_data<dst_local_block_t> dst_ext_;
+  impl::Ext_data<src_local_block_t> src_ext_;
+
+  length_type       nproc_;
+  index_type        rank_;
+  length_type       nrows_per_send_;
+  length_type       ncols_per_recv_;
+
+  MPI_Request*      req_;
+  MPI_Status*       status_;
+
+  MPI_Datatype      src_datatype_;
+  MPI_Datatype      dst_datatype_;
+  MPI_Datatype      tmp0_datatype_;
+  MPI_Datatype      tmp1_datatype_;
+  MPI_Datatype      tmp2_datatype_;
+};
+
+
+
+template <typename T,
+	  typename SrcOrder,
+	  typename DstOrder,
+	  typename ImplTag>
+struct t_alltoall
+{
+  char* what() { return "t_alltoall"; }
+  int ops_per_point(length_type rows)  { return rows*ratio_; } 
+  int riob_per_point(length_type rows) { return rows*ratio_*sizeof(T); }
+  int wiob_per_point(length_type rows) { return rows*ratio_*sizeof(T); }
+  int mem_per_point(length_type rows)  { return rows*ratio_*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+  {
+    typedef Dense<2, T, SrcOrder, Map<> > src_block_t;
+    typedef Dense<2, T, DstOrder, Map<> > dst_block_t;
+
+    processor_type np   = vsip::num_processors();
+    index_type     rank = vsip::local_processor();
+
+    length_type nrows = size;
+    length_type ncols = ratio_*size;
+
+    Map<> root_map(1, 1);
+    Map<> src_map(np, 1);
+    Map<> dst_map(1, np);
+
+    Matrix<float, src_block_t> chk(nrows, ncols, root_map);
+    Matrix<float, src_block_t> src(nrows, ncols, src_map);
+    Matrix<float, dst_block_t> dst(nrows, ncols, dst_map);
+
+    Send<T, src_block_t, dst_block_t, ImplTag> send(src, dst);
+
+    // Initialize chk
+    if (root_map.subblock() != no_subblock)
+    {
+      for (index_type r=0; r<nrows; ++r)
+	for (index_type c=0; c<ncols; ++c)
+	  chk.local()(r, c) = T(rank*ncols*nrows+r*ncols+c);
+    }
+
+    // Scatter chk to src
+    src = chk;
+
+    // Clear chk
+    chk = T();
+
+    // Assign src to dst
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      send();
+    t1.stop();
+    
+    // Gather dst to chk
+    chk = dst;
+    
+    // Check chk.
+    if (root_map.subblock() != no_subblock)
+    {
+      for (index_type r=0; r<nrows; ++r)
+	for (index_type c=0; c<ncols; ++c)
+	  test_assert(chk.local().get(r, c) == T(rank*ncols*nrows+r*ncols+c));
+    }
+    
+    time = t1.delta();
+  }
+
+  t_alltoall(length_type ratio) : ratio_(ratio) {};
+
+// Member data:
+  length_type ratio_;
+};
+
+
+
+void
+defaults(Loop1P& loop)
+{
+  loop.user_param_ = 1;
+  loop.stop_ = 11;
+}
+
+
+
+int
+test(Loop1P& loop, int what)
+{
+  // Ratio is the number of columns per row.
+  length_type ratio = loop.user_param_;
+
+  typedef row2_type Rt;
+  typedef col2_type Ct;
+
+  switch (what)
+  {
+  case  1: loop(t_alltoall<float, Rt, Rt, Impl_alltoall>(ratio)); break;
+  case  2: loop(t_alltoall<float, Rt, Ct, Impl_alltoall>(ratio)); break;
+
+  case 11: loop(t_alltoall<float, Rt, Rt, Impl_isend>(ratio)); break;
+  case 12: loop(t_alltoall<float, Rt, Ct, Impl_isend>(ratio)); break;
+
+  case 21: loop(t_alltoall<float, Rt, Rt, Impl_isend_x>(ratio)); break;
+  case 22: loop(t_alltoall<float, Rt, Ct, Impl_isend_x>(ratio)); break;
+
+  case 31: loop(t_alltoall<float, Rt, Rt, Impl_persistent>(ratio)); break;
+  case 32: loop(t_alltoall<float, Rt, Ct, Impl_persistent>(ratio)); break;
+
+  case 41: loop(t_alltoall<float, Rt, Rt, Impl_persistent_x>(ratio)); break;
+  case 42: loop(t_alltoall<float, Rt, Ct, Impl_persistent_x>(ratio)); break;
+
+  default:
+    return 0;
+  }
+  return 1;
+}
Index: benchmarks/qrd.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/qrd.cpp,v
retrieving revision 1.2
diff -u -r1.2 qrd.cpp
--- benchmarks/qrd.cpp	7 Sep 2005 12:19:30 -0000	1.2
+++ benchmarks/qrd.cpp	28 Feb 2006 22:17:51 -0000
@@ -72,6 +72,7 @@
 
   int riob_per_point(length_type) { return -1*sizeof(T); }
   int wiob_per_point(length_type) { return -1*sizeof(T); }
+  int mem_per_point(length_type size) { return size*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
Index: benchmarks/vmul.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul.cpp,v
retrieving revision 1.6
diff -u -r1.6 vmul.cpp
--- benchmarks/vmul.cpp	27 Jan 2006 13:13:23 -0000	1.6
+++ benchmarks/vmul.cpp	28 Feb 2006 22:17:51 -0000
@@ -16,6 +16,7 @@
 #include <vsip/initfin.hpp>
 #include <vsip/support.hpp>
 #include <vsip/math.hpp>
+#include <vsip/random.hpp>
 #include <vsip/impl/profile.hpp>
 
 #include "test.hpp"
@@ -26,6 +27,10 @@
 
 
 
+/***********************************************************************
+  Definitions - vector element-wise multiply
+***********************************************************************/
+
 template <typename T>
 struct t_vmul1
 {
@@ -33,6 +38,7 @@
   int ops_per_point(length_type)  { return Ops_info<T>::mul; }
   int riob_per_point(length_type) { return 2*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
     VSIP_IMPL_NOINLINE
@@ -41,6 +47,10 @@
     Vector<T>   B(size, T());
     Vector<T>   C(size);
 
+    Rand<T> gen(0, 0);
+    A = gen.randu(size);
+    B = gen.randu(size);
+
     A(0) = T(3);
     B(0) = T(4);
     
@@ -56,6 +66,92 @@
       std::cout << "t_vmul1: ERROR" << std::endl;
       abort();
     }
+
+    for (index_type i=0; i<size; ++i)
+      test_assert(equal(C(i), A(i) * B(i)));
+    
+    time = t1.delta();
+  }
+};
+
+
+
+template <typename T>
+struct t_vmul_ip1
+{
+  char* what() { return "t_vmul_ip1"; }
+  int ops_per_point(length_type)  { return Ops_info<T>::mul; }
+  int riob_per_point(length_type) { return 2*sizeof(T); }
+  int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    Vector<T>   A(size, T(1));
+    Vector<T>   C(size);
+    Vector<T>   chk(size);
+
+    Rand<T> gen(0, 0);
+    chk = gen.randu(size);
+    C = chk;
+
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      C *= A;
+    t1.stop();
+    
+    for (index_type i=0; i<size; ++i)
+      test_assert(equal(chk(i), C(i)));
+    
+    time = t1.delta();
+  }
+};
+
+
+
+template <typename T>
+struct t_vmul_dom1
+{
+  char* what() { return "t_vmul_dom1"; }
+  int ops_per_point(length_type)  { return Ops_info<T>::mul; }
+  int riob_per_point(length_type) { return 2*sizeof(T); }
+  int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    Vector<T>   A(size, T());
+    Vector<T>   B(size, T());
+    Vector<T>   C(size);
+
+    Rand<T> gen(0, 0);
+    A = gen.randu(size);
+    B = gen.randu(size);
+
+    A(0) = T(3);
+    B(0) = T(4);
+
+    Domain<1> dom(size);
+    
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      C(dom) = A(dom) * B(dom);
+    t1.stop();
+    
+    if (!equal(C(0), T(12)))
+    {
+      std::cout << "t_vmul1: ERROR" << std::endl;
+      abort();
+    }
+
+    for (index_type i=0; i<size; ++i)
+      test_assert(equal(C(i), A(i) * B(i)));
     
     time = t1.delta();
   }
@@ -70,6 +166,7 @@
   int ops_per_point(length_type)  { return Ops_info<T>::mul; }
   int riob_per_point(length_type) { return 2*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
     VSIP_IMPL_NOINLINE
@@ -104,23 +201,74 @@
 
 
 
-// Benchmark scalar-view vector multiply (Scalar * View)
+/***********************************************************************
+  Definitions - real * complex vector element-wise multiply
+***********************************************************************/
 
 template <typename T>
+struct t_rcvmul1
+{
+  char* what() { return "t_rcvmul1"; }
+  int ops_per_point(length_type)  { return 2*Ops_info<T>::mul; }
+  int riob_per_point(length_type) { return sizeof(T) + sizeof(complex<T>); }
+  int wiob_per_point(length_type) { return 1*sizeof(complex<T>); }
+  int mem_per_point(length_type)  { return 1*sizeof(T)+2*sizeof(complex<T>); }
+
+  void operator()(length_type size, length_type loop, float& time)
+    VSIP_IMPL_NOINLINE
+  {
+    Vector<complex<T> > A(size);
+    Vector<T>           B(size);
+    Vector<complex<T> > C(size);
+
+    Rand<complex<T> > cgen(0, 0);
+    Rand<T>           sgen(0, 0);
+
+    A = cgen.randu(size);
+    B = sgen.randu(size);
+
+    vsip::impl::profile::Timer t1;
+    
+    t1.start();
+    for (index_type l=0; l<loop; ++l)
+      C = A * B;
+    t1.stop();
+    
+    for (index_type i=0; i<size; ++i)
+      test_assert(equal(C(i), A(i) * B(i)));
+    
+    time = t1.delta();
+  }
+};
+
+
+
+// Benchmark scalar-view vector multiply (Scalar * View)
+
+template <typename ScalarT,
+	  typename T>
 struct t_svmul1
 {
   char* what() { return "t_svmul1"; }
-  int ops_per_point(length_type)  { return Ops_info<T>::mul; }
+  int ops_per_point(length_type)
+  { if (sizeof(ScalarT) == sizeof(T))
+      return Ops_info<T>::mul;
+    else
+      return 2*Ops_info<ScalarT>::mul;
+  }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 2*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
     Vector<T>   A(size, T());
     Vector<T>   C(size);
 
-    T alpha = T(3);
+    ScalarT alpha = ScalarT(3);
 
+    Rand<T>     gen(0, 0);
+    A = gen.randu(size);
     A(0) = T(4);
     
     vsip::impl::profile::Timer t1;
@@ -130,7 +278,8 @@
       C = alpha * A;
     t1.stop();
     
-    test_assert(equal(C(0), T(12)));
+    for (index_type i=0; i<size; ++i)
+      test_assert(equal(C(i), alpha * A(i)));
     
     time = t1.delta();
   }
@@ -147,6 +296,7 @@
   int ops_per_point(length_type)  { return Ops_info<T>::mul; }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 2*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -188,11 +338,21 @@
   case  2: loop(t_vmul1<complex<float> >()); break;
   case  3: loop(t_vmul2<complex<float>, impl::Cmplx_inter_fmt>()); break;
   case  4: loop(t_vmul2<complex<float>, impl::Cmplx_split_fmt>()); break;
+  case  5: loop(t_rcvmul1<float>()); break;
+
+  case 11: loop(t_svmul1<float,          float>()); break;
+  case 12: loop(t_svmul1<float,          complex<float> >()); break;
+  case 13: loop(t_svmul1<complex<float>, complex<float> >()); break;
+
+  case 14: loop(t_svmul2<float>()); break;
+  case 15: loop(t_svmul2<complex<float> >()); break;
+
+  case 21: loop(t_vmul_dom1<float>()); break;
+  case 22: loop(t_vmul_dom1<complex<float> >()); break;
+
+  case 31: loop(t_vmul_ip1<float>()); break;
+  case 32: loop(t_vmul_ip1<complex<float> >()); break;
 
-  case 11: loop(t_svmul1<float>()); break;
-  case 12: loop(t_svmul1<complex<float> >()); break;
-  case 13: loop(t_svmul2<float>()); break;
-  case 14: loop(t_svmul2<complex<float> >()); break;
   default:
     return 0;
   }
Index: benchmarks/vmul_c.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul_c.cpp,v
retrieving revision 1.1
diff -u -r1.1 vmul_c.cpp
--- benchmarks/vmul_c.cpp	27 Jan 2006 13:13:23 -0000	1.1
+++ benchmarks/vmul_c.cpp	28 Feb 2006 22:17:51 -0000
@@ -21,13 +21,16 @@
 #include "test.hpp"
 #include "loop.hpp"
 #include "ops_info.hpp"
-#define USE_SSE 1
-#include "sse.h"
 
 using namespace vsip;
 
 
 
+/***********************************************************************
+  Definitions -- generic vector-multiply
+***********************************************************************/
+
+
 template <typename T>
 inline void
 vmul(
@@ -64,6 +67,10 @@
 
 
 
+/***********************************************************************
+  t_vmul1 -- Benchmark view-view vector multiply
+***********************************************************************/
+
 template <typename T>
 struct t_vmul1
 {
@@ -71,6 +78,7 @@
   int ops_per_point(length_type)  { return Ops_info<T>::mul; }
   int riob_per_point(length_type) { return 2*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
@@ -113,7 +121,9 @@
 
 
 
-// Benchmark scalar-view vector multiply (Scalar * View)
+/***********************************************************************
+  t_svmul1 -- Benchmark scalar-view vector multiply
+***********************************************************************/
 
 template <typename T>
 struct t_svmul1
@@ -122,6 +132,7 @@
   int ops_per_point(length_type)  { return Ops_info<T>::mul; }
   int riob_per_point(length_type) { return 1*sizeof(T); }
   int wiob_per_point(length_type) { return 1*sizeof(T); }
+  int mem_per_point(length_type)  { return 3*sizeof(T); }
 
   void operator()(length_type size, length_type loop, float& time)
   {
Index: benchmarks/vmul_ipp.cpp
===================================================================
RCS file: /home/cvs/Repository/vpp/benchmarks/vmul_ipp.cpp,v
retrieving revision 1.2
diff -u -r1.2 vmul_ipp.cpp
--- benchmarks/vmul_ipp.cpp	19 Dec 2005 16:08:55 -0000	1.2
+++ benchmarks/vmul_ipp.cpp	28 Feb 2006 22:17:51 -0000
@@ -33,10 +33,12 @@
 template <>
 struct t_vmul_ipp<float>
 {
+  typedef float T;
   char* what() { return "t_vmul_ipp"; }
-  int ops_per_point(size_t)  { return Ops_info<float>::mul; }
-  int riob_per_point(size_t) { return 2*sizeof(float); }
-  int wiob_per_point(size_t) { return 1*sizeof(float); }
+  int ops_per_point(size_t)  { return Ops_info<T>::mul; }
+  int riob_per_point(size_t) { return 2*sizeof(T); }
+  int wiob_per_point(size_t) { return 1*sizeof(T); }
+  int mem_per_point(size_t)  { return 3*sizeof(T); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
@@ -87,6 +89,7 @@
   int ops_per_point(size_t)  { return Ops_info<T>::mul; }
   int riob_per_point(size_t) { return 2*sizeof(T); }
   int wiob_per_point(size_t) { return 1*sizeof(T); }
+  int mem_per_point(size_t)  { return 3*sizeof(T); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
@@ -139,6 +142,7 @@
   int ops_per_point(size_t)  { return Ops_info<T>::mul; }
   int riob_per_point(size_t) { return 2*sizeof(T); }
   int wiob_per_point(size_t) { return 1*sizeof(T); }
+  int mem_per_point(size_t)  { return 2*sizeof(T); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
@@ -189,6 +193,7 @@
   int ops_per_point(size_t)  { return Ops_info<T>::mul; }
   int riob_per_point(size_t) { return 2*sizeof(T); }
   int wiob_per_point(size_t) { return 1*sizeof(T); }
+  int mem_per_point(size_t)  { return 2*sizeof(T); }
 
   void operator()(size_t size, size_t loop, float& time)
   {
Index: doc/csl-docbook/GNUmakefile.inc
===================================================================
RCS file: /home/cvs/Repository/csl-docbook/GNUmakefile.inc,v
retrieving revision 1.11
diff -u -r1.11 GNUmakefile.inc
--- doc/csl-docbook/GNUmakefile.inc	24 Jan 2006 20:08:30 -0000	1.11
+++ doc/csl-docbook/GNUmakefile.inc	28 Feb 2006 22:17:53 -0000
@@ -93,9 +93,16 @@
 install-html-$(notdir $(1)): $(1) install-htmldir
 	if test -r $(1); then \
 		$(INSTALL) -d $(DESTDIR)$(htmldir)/$(notdir $(1)); \
-		$(INSTALL_DATA) $(1).html $(DESTDIR)$(htmldir)/; \
-		$(INSTALL_DATA) $(1)/*.html $(DESTDIR)$(htmldir)/$(notdir $(1)); \
-		$(INSTALL_DATA) $(1)/*.css $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+		if test -r $(1).html; then \
+			$(INSTALL_DATA) $(1).html $(DESTDIR)$(htmldir)/; \
+		fi; \
+		for file in $(wildcard $(1)/*.html); do		\
+			$(INSTALL_DATA) $$file $(DESTDIR)$(libdir);	\
+			$(INSTALL_DATA) $$file $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+		done; \
+		for file in $(wildcard $(1)/*.css); do		\
+			$(INSTALL_DATA) $$file $(DESTDIR)$(htmldir)/$(notdir $(1)); \
+		done; \
 	fi
 	if test -d $(1)/images; then \
 		$(INSTALL) -d $(DESTDIR)$(htmldir)/$(notdir $(1))/images; \
Index: src/vsip/GNUmakefile.inc.in
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/GNUmakefile.inc.in,v
retrieving revision 1.13
diff -u -r1.13 GNUmakefile.inc.in
--- src/vsip/GNUmakefile.inc.in	4 Jan 2006 19:10:03 -0000	1.13
+++ src/vsip/GNUmakefile.inc.in	28 Feb 2006 22:17:54 -0000
@@ -22,6 +22,7 @@
 ifdef VSIP_IMPL_HAVE_SAL
 src_vsip_cxx_sources += $(srcdir)/src/vsip/impl/sal.cpp
 endif
+src_vsip_cxx_sources += $(srcdir)/src/vsip/impl/simd/vmul.cpp
 src_vsip_cxx_objects := $(patsubst $(srcdir)/%.cpp, %.$(OBJEXT), $(src_vsip_cxx_sources))
 cxx_sources += $(src_vsip_cxx_sources)
 
@@ -46,6 +47,7 @@
 	$(INSTALL) -d $(DESTDIR)$(libdir)
 	$(INSTALL_DATA) src/vsip/libvsip.a $(DESTDIR)$(libdir)/libvsip$(suffix).a
 	$(INSTALL) -d $(DESTDIR)$(includedir)/vsip/impl
+	$(INSTALL) -d $(DESTDIR)$(includedir)/vsip/impl/simd
 	$(INSTALL_DATA) src/vsip/impl/acconfig.hpp $(DESTDIR)$(includedir)/vsip/impl
 	for header in $(hdr); do \
           $(INSTALL_DATA) $(srcdir)/src/$$header \
Index: src/vsip/support.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/support.hpp,v
retrieving revision 1.26
diff -u -r1.26 support.hpp
--- src/vsip/support.hpp	27 Jan 2006 13:13:23 -0000	1.26
+++ src/vsip/support.hpp	28 Feb 2006 22:17:54 -0000
@@ -52,7 +52,9 @@
 /// indication that they should not be inlined, then VSIP_IMPL_NOINLINE
 /// is defined to that annotation, otherwise it is defined to
 /// nothing. 
-#if __GNUC__ >= 2 || defined(__ghs__)
+///
+/// Greenhills does not use this attribute.
+#if __GNUC__ >= 2
 #  define VSIP_IMPL_NOINLINE __attribute__ ((__noinline__))
 #else
 #  define VSIP_IMPL_NOINLINE
Index: src/vsip/vector.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/vector.hpp,v
retrieving revision 1.36
diff -u -r1.36 vector.hpp
--- src/vsip/vector.hpp	11 Jan 2006 16:22:44 -0000	1.36
+++ src/vsip/vector.hpp	28 Feb 2006 22:17:54 -0000
@@ -30,6 +30,7 @@
 #include <vsip/impl/par-chain-assign.hpp>
 #include <vsip/impl/dispatch-assign.hpp>
 #include <vsip/impl/lvalue-proxy.hpp>
+#include <vsip/math.hpp>
 
 /***********************************************************************
   Declarations
@@ -115,8 +116,11 @@
 
 
 
-#define VSIP_IMPL_ELEMENTWISE_SCALAR(op)        \
-  for (index_type i = 0; i < this->size(); i++) \
+#define VSIP_IMPL_ELEMENTWISE_SCALAR(op)        		   \
+  *this = *this op val
+
+#define VSIP_IMPL_ELEMENTWISE_SCALAR_NOFWD(op)			   \
+  for (index_type i = 0; i < this->size(); i++)			   \
     this->put(i, this->get(i) op val)
 
 #define VSIP_IMPL_ELEMENTWISE_VECTOR(op)	                   \
@@ -141,7 +145,7 @@
 #define VSIP_IMPL_ASSIGN_OP_NOFWD(asop, op)			   \
   template <typename T0>                                           \
   Vector& operator asop(T0 const& val) VSIP_NOTHROW                \
-  { VSIP_IMPL_ELEMENTWISE_SCALAR(op); return *this;}               \
+  { VSIP_IMPL_ELEMENTWISE_SCALAR_NOFWD(op); return *this;}	   \
   template <typename T0, typename Block0>                          \
   Vector& operator asop(const_Vector<T0, Block0> v) VSIP_NOTHROW   \
   { VSIP_IMPL_ELEMENTWISE_VECTOR_NOFWD(op); return *this;}	   \
Index: src/vsip/impl/distributed-block.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/distributed-block.hpp,v
retrieving revision 1.17
diff -u -r1.17 distributed-block.hpp
--- src/vsip/impl/distributed-block.hpp	11 Jan 2006 16:22:44 -0000	1.17
+++ src/vsip/impl/distributed-block.hpp	28 Feb 2006 22:17:54 -0000
@@ -133,7 +133,7 @@
   {
     index_type     sb = map_.impl_subblock_from_global_index(Index<1>(idx));
     processor_type pr = *(map_.processor_begin(sb));
-    value_type     val;
+    value_type     val = value_type(); // avoid -Wall 'may not be initialized'
 
     if (pr == proc_)
     {
@@ -152,7 +152,7 @@
     index_type     sb = map_.impl_subblock_from_global_index(
 				Index<2>(idx0, idx1));
     processor_type pr = *(map_.processor_begin(sb));
-    value_type     val;
+    value_type     val = value_type(); // avoid -Wall 'may not be initialized'
 
     if (pr == proc_)
     {
@@ -173,7 +173,7 @@
     index_type     sb = map_.impl_subblock_from_global_index(
 				Index<3>(idx0, idx1, idx2));
     processor_type pr = *(map_.processor_begin(sb));
-    value_type     val;
+    value_type     val = value_type(); // avoid -Wall 'may not be initialized'
 
     if (pr == proc_)
     {
Index: src/vsip/impl/eval-sal.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/eval-sal.hpp,v
retrieving revision 1.1
diff -u -r1.1 eval-sal.hpp
--- src/vsip/impl/eval-sal.hpp	15 Nov 2005 21:01:27 -0000	1.1
+++ src/vsip/impl/eval-sal.hpp	28 Feb 2006 22:17:54 -0000
@@ -47,7 +47,8 @@
           typename Block0,
           typename Block1,
           typename Block2>
-struct Evaluator<Op_prod_vv_outer, Block0, Op_list_3<T1, Block1, Block2>,
+struct Evaluator<Op_prod_vv_outer, Block0,
+		 Op_list_3<T1, Block1 const&, Block2 const&>,
                  Mercury_sal_tag>
 {
   static bool const ct_valid = 
@@ -59,7 +60,7 @@
     Ext_data_cost<Block1>::value == 0 &&
     Ext_data_cost<Block2>::value == 0;
 
-  static bool rt_valid(Block0& r, T1 alpha, Block1 const& a, Block2 const& b)
+  static bool rt_valid(Block0& r, T1, Block1 const& a, Block2 const& b)
   {
     typedef typename Block_layout<Block1>::order_type order0_type;
     dimension_type const r_dim1 = order0_type::impl_dim1;
@@ -125,7 +126,7 @@
           typename Block1,
           typename Block2>
 struct Evaluator<Op_prod_vv_outer, Block0, 
-                 Op_list_3<std::complex<T1>, Block1, Block2>,
+                 Op_list_3<std::complex<T1>, Block1 const&, Block2 const&>,
                  Mercury_sal_tag>
 {
   typedef typename Block_layout<Block0>::complex_type complex0_type;
@@ -144,7 +145,7 @@
     Type_equal<complex0_type, complex1_type>::value &&
     Type_equal<complex1_type, complex2_type>::value;
 
-  static bool rt_valid(Block0& r, std::complex<T1> alpha, 
+  static bool rt_valid(Block0& r, std::complex<T1>, 
     Block1 const& a, Block2 const& b)
   {
     typedef typename Block_layout<Block1>::order_type order0_type;
@@ -576,7 +577,7 @@
     Type_equal<complex0_type, complex1_type>::value &&
     Type_equal<complex1_type, complex2_type>::value;
 
-  static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+  static bool rt_valid(Block0&, Block1 const& a, Block2 const&)
   {
     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
 
@@ -584,7 +585,8 @@
     dimension_type const a_dim1 = order1_type::impl_dim1;
 
     bool dense_major_dim = 
-      (ext_a.stride(a_dim0) == ext_a.size(a_dim1) * ext_a.stride(a_dim1));
+      (ext_a.stride(a_dim0) == (stride_type)ext_a.size(a_dim1) *
+                                            ext_a.stride(a_dim1));
 
     return dense_major_dim;
   }
@@ -652,7 +654,7 @@
     Type_equal<complex0_type, complex1_type>::value &&
     Type_equal<complex1_type, complex2_type>::value;
 
-  static bool rt_valid(Block0& r, Block1 const& a, Block2 const& b)
+  static bool rt_valid(Block0&, Block1 const&, Block2 const& b)
   {
     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
 
@@ -660,7 +662,8 @@
     dimension_type const b_dim1 = order2_type::impl_dim1;
 
     bool dense_major_dim = 
-      (ext_b.stride(b_dim0) == ext_b.size(b_dim1) * ext_b.stride(b_dim1));
+      (ext_b.stride(b_dim0) == (stride_type)ext_b.size(b_dim1) *
+					    ext_b.stride(b_dim1));
 
     return dense_major_dim;
   }
@@ -746,9 +749,12 @@
     dimension_type const b_dim1 = order2_type::impl_dim1;
 
     bool dense_major_dim = 
-      (ext_r.stride(r_dim0) == ext_r.size(r_dim1) * ext_r.stride(r_dim1)) &&
-      (ext_a.stride(a_dim0) == ext_a.size(a_dim1) * ext_a.stride(a_dim1)) &&
-      (ext_b.stride(b_dim0) == ext_b.size(b_dim1) * ext_b.stride(b_dim1));
+      (ext_r.stride(r_dim0) == (stride_type)ext_r.size(r_dim1) *
+                                            ext_r.stride(r_dim1)) &&
+      (ext_a.stride(a_dim0) == (stride_type)ext_a.size(a_dim1) *
+                                            ext_a.stride(a_dim1)) &&
+      (ext_b.stride(b_dim0) == (stride_type)ext_b.size(b_dim1) *
+                                            ext_b.stride(b_dim1));
 
     return dense_major_dim;
   }
@@ -861,18 +867,18 @@
     Type_equal<complex0_type, complex1_type>::value &&
     Type_equal<complex1_type, complex2_type>::value;
 
-  static bool rt_valid(Block0& r, T1 alpha, Block1 const& a, 
-    Block2 const& b, T2 beta)
+  static bool rt_valid(Block0& r, T1, Block1 const& a, 
+    Block2 const& b, T2)
   {
     Ext_data<Block0> ext_r(const_cast<Block0&>(r));
     Ext_data<Block1> ext_a(const_cast<Block1&>(a));
     Ext_data<Block2> ext_b(const_cast<Block2&>(b));
 
-    dimension_type const r_dim0 = order0_type::impl_dim0;
+    // dimension_type const r_dim0 = order0_type::impl_dim0;
     dimension_type const r_dim1 = order0_type::impl_dim1;
-    dimension_type const a_dim0 = order1_type::impl_dim0;
+    // dimension_type const a_dim0 = order1_type::impl_dim0;
     dimension_type const a_dim1 = order1_type::impl_dim1;
-    dimension_type const b_dim0 = order2_type::impl_dim0;
+    // dimension_type const b_dim0 = order2_type::impl_dim0;
     dimension_type const b_dim1 = order2_type::impl_dim1;
 
     bool unit_minor_stride =
Index: src/vsip/impl/expr_serial_dispatch.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/expr_serial_dispatch.hpp,v
retrieving revision 1.3
diff -u -r1.3 expr_serial_dispatch.hpp
--- src/vsip/impl/expr_serial_dispatch.hpp	22 Dec 2005 01:29:25 -0000	1.3
+++ src/vsip/impl/expr_serial_dispatch.hpp	28 Feb 2006 22:17:54 -0000
@@ -23,6 +23,14 @@
 #ifdef VSIP_IMPL_HAVE_SAL
 #include <vsip/impl/sal.hpp>
 #endif
+
+#ifdef VSIP_IMPL_HAVE_SIMD_GENERIC
+#  include <vsip/impl/simd/eval-generic.hpp>
+#endif
+#ifdef VSIP_IMPL_HAVE_SIMD_3DNOWEXT
+#  include <vsip/impl/simd/eval-simd-3dnowext.hpp>
+#endif
+
 #include <iostream>
 
 /***********************************************************************
@@ -35,7 +43,8 @@
 {
 
 /// The list of evaluators to be tried, in that specific order.
-typedef Make_type_list<Intel_ipp_tag,
+typedef Make_type_list<VSIP_IMPL_SIMD_TAG_LIST
+		       Intel_ipp_tag,
 		       Transpose_tag,
                        Mercury_sal_tag,
 		       Loop_fusion_tag>::type LibraryTagList;
@@ -70,6 +79,7 @@
 {
 
   static void exec(DstBlock& dst, SrcBlock const& src)
+    VSIP_NOTHROW
   {
     if (EvalExpr::rt_valid(dst, src)) EvalExpr::exec(dst, src);
     else Serial_dispatch_helper<Dim, DstBlock, SrcBlock, Rest>::exec(dst, src);
@@ -104,6 +114,7 @@
 			      Tag, None_type, EvalExpr, true>
 {
   static void exec(DstBlock& dst, SrcBlock const& src)
+    VSIP_NOTHROW
   {
     if (EvalExpr::rt_valid(dst, src)) EvalExpr::exec(dst, src);
     else assert(0);
Index: src/vsip/impl/expr_serial_evaluator.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/expr_serial_evaluator.hpp,v
retrieving revision 1.4
diff -u -r1.4 expr_serial_evaluator.hpp
--- src/vsip/impl/expr_serial_evaluator.hpp	22 Dec 2005 01:29:25 -0000	1.4
+++ src/vsip/impl/expr_serial_evaluator.hpp	28 Feb 2006 22:17:54 -0000
@@ -32,6 +32,7 @@
 struct Loop_fusion_tag;
 struct Intel_ipp_tag;
 struct Mercury_sal_tag;
+struct Simd_tag;
 struct Transpose_tag;
 
 /// Serial_expr_evaluator template.
@@ -55,7 +56,8 @@
   
   static void exec(DstBlock& dst, SrcBlock const& src)
   {
-    for (index_type i=0; i<dst.size(1, 0); ++i)
+    length_type const size = dst.size(1, 0);
+    for (index_type i=0; i<size; ++i)
       dst.put(i, src.get(i));
   }
 };
@@ -156,15 +158,19 @@
 
   static void exec(DstBlock& dst, SrcBlock const& src, row2_type)
   {
-    for (index_type i=0; i<dst.size(2, 0); ++i)
-      for (index_type j=0; j<dst.size(2, 1); ++j)
+    length_type const rows = dst.size(2, 0);
+    length_type const cols = dst.size(2, 1);
+    for (index_type i=0; i<rows; ++i)
+      for (index_type j=0; j<cols; ++j)
 	dst.put(i, j, src.get(i, j));
   }
 
   static void exec(DstBlock& dst, SrcBlock const& src, col2_type)
   {
-    for (index_type j=0; j<dst.size(2, 1); ++j)
-      for (index_type i=0; i<dst.size(2, 0); ++i)
+    length_type const rows = dst.size(2, 0);
+    length_type const cols = dst.size(2, 1);
+    for (index_type j=0; j<cols; ++j)
+      for (index_type i=0; i<rows; ++i)
 	dst.put(i, j, src.get(i, j));
   }
 
Index: src/vsip/impl/lvalue-proxy.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/lvalue-proxy.hpp,v
retrieving revision 1.6
diff -u -r1.6 lvalue-proxy.hpp
--- src/vsip/impl/lvalue-proxy.hpp	3 Feb 2006 16:01:36 -0000	1.6
+++ src/vsip/impl/lvalue-proxy.hpp	28 Feb 2006 22:17:54 -0000
@@ -177,9 +177,8 @@
       block_(b), coord_(i)
   {};
 
-  /// Read access, by implicit conversion to the value type.
-  operator value_type() const
-    { return lvalue_detail::get(block_, coord_); }
+  /// Since proxy derives from value type, implicit conversion
+  /// is not necessary.
 
   /// Write access, by assignment from the value type.
   Lvalue_proxy& operator= (value_type v)
Index: src/vsip/impl/par-chain-assign.hpp
===================================================================
RCS file: /home/cvs/Repository/vpp/src/vsip/impl/par-chain-assign.hpp,v
retri