Actions

icon Post
text/html Subscribe
text/html Unsubscribe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[patch] SAL 2D Conv, Misc VSIPL++ bits


  • To: VSIPL++ Developers List <vsipl++@xxxxxxxxxxxxxxxx>
  • Subject: [patch] SAL 2D Conv, Misc VSIPL++ bits
  • From: Jules Bergmann <jules@xxxxxxxxxxxxxxxx>
  • Date: Wed, 08 Aug 2007 08:43:23 -0400

This patch adds support for SAL's 2D convolution, adds SIMD loop-fusion support for min, max, and mag, and includes various benchmark and test updates.

Patch applied.

			-- Jules

--
Jules Bergmann
CodeSourcery
jules@xxxxxxxxxxxxxxxx
(650) 331-3385 x705
Index: ChangeLog
===================================================================
--- ChangeLog	(revision 177867)
+++ ChangeLog	(working copy)
@@ -1,3 +1,40 @@
+2007-08-08  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
+
+	* scripts/char.pl: Capture -diag output for each run.
+	* scripts/fmt-profile.pl: Fix typo in options description.
+	* scripts/char.db: Add vmin set.
+	* scripts/datasheet.pl: Add additional cases.
+	* src/vsip/core/signal/conv.hpp: Bump copyright.
+	* src/vsip/opt/sal/bindings.hpp: Add 2D convolution bindings.
+	* src/vsip/opt/sal/conv.hpp: Use 2D SAL conv bindings for 2D V++ conv.
+	* src/vsip/opt/sal/eval_elementwise.hpp: Fix assert to handle scalar
+	  blocks with 0 size.
+	* src/vsip/opt/fftw3/fft_impl.cpp: Force row alignment for FFTM.
+	* src/vsip/opt/simd/simd.hpp: Add min, max, mag functions.
+	* src/vsip/opt/simd/vma_ip_csc.cpp: New file, SIMD optimized
+	  R += cmplx-a * real-B
+	* src/vsip/opt/simd/vma_ip_csc.hpp: New file, likewise.
+	* src/vsip/opt/simd/vaxpy.cpp: New file, SIMD optimized
+	  R += cmplx-a * real-B + cmplx-C
+	* src/vsip/opt/simd/vaxpy.hpp: New file, likewise.
+	* src/vsip/opt/simd/eval_generic.hpp: Fix typo in rt_valid alignment
+	  check.
+	* src/vsip/opt/simd/expr_iterator.hpp: Recognize simd traits that
+	  don't support div.  Add min, max, and mag support.  Fix unaligned
+	  loader.
+	* src/vsip/opt/diag/eval.hpp: Add support for 'A = scalar' exprs.
+	* src/vsip/GNUmakefile.inc.in: Build vaxpy files.
+	* tests/coverage_common.hpp: Add VERBOSE output for TEST_UNARY.
+	* tests/regressions/simd_alignment.cpp: Add coverage for exprs
+	  with different alignment (that require unaligned SIMD load).
+	* tests/coverage_unary.cpp: Split neg tests into ...
+	* tests/coverage_unary_neg.cpp: .. here, new file.
+	* tests/conv-2d.cpp: Add more output on failure.
+	* benchmarks/vdiv.cpp: Add diag output.
+	* benchmarks/conv2d.cpp: Remove debug output.
+	* benchmarks/sal/vthresh.cpp: Add coverage for vthrx.
+	* examples/mercury/mcoe-setup.sh: Suppress GHC warning 175.
+	
 2007-07-31  Jules Bergmann  <jules@xxxxxxxxxxxxxxxx>
 
 	* src/vsip/core/fft/factory.hpp (create): Throw exception when
Index: scripts/char.pl
===================================================================
--- scripts/char.pl	(revision 177867)
+++ scripts/char.pl	(working copy)
@@ -177,7 +177,12 @@
    
    my $cmd = "$runcmd $full_pgm -$x $opt $runopt > $outfile";
    print "CMD $cmd\n";
-   system($cmd) if !$dry;
+   if (!$dry) {
+      system($cmd);
+      system("echo '---' >> $outfile");
+      $cmd = "$runcmd $full_pgm -$x $opt $runopt -diag >> $outfile";
+      system($cmd);
+      }
    }
 
 
Index: scripts/fmt-profile.pl
===================================================================
--- scripts/fmt-profile.pl	(revision 173072)
+++ scripts/fmt-profile.pl	(working copy)
@@ -12,7 +12,7 @@
 #									#
 # Options:								#
 #   -sec	-- convert ticks into seconds				#
-#   -sec	-- sum nested operations for events with 0 ops		#
+#   -sum	-- sum nested operations for events with 0 ops		#
 #   -extra <event>							#
 #		-- create pseudo event for unaccounted-for time		#
 #		   in nested events under <event>			#
Index: scripts/char.db
===================================================================
--- scripts/char.db	(revision 177867)
+++ scripts/char.db	(working copy)
@@ -33,7 +33,7 @@
 # Core characterization
 #########################################################################
 
-macro: core vmul vma fft-est fft-measure vendor-fft maxval sumval copy fastconv conv vthresh vgt_ite vendor-vthresh
+macro: core vmul vma vmin fft-est fft-measure vendor-fft maxval sumval copy fastconv conv vthresh vgt_ite vendor-vthresh
 
 
 
@@ -125,13 +125,17 @@
   pgm: vthresh
   cases: 1 2 3 11
 
+set: vmin
+  pgm: vmin
+  cases: 1
+
 set: vgt_ite
   pgm: vgt_ite
   cases: 1 2 5
 
 set: sal-vthresh
   pgm: sal/vthresh
-  cases: 1 11
+  cases: 1 2 11
   req:   sal
 
 set: sal-lvgt
Index: scripts/datasheet.pl
===================================================================
--- scripts/datasheet.pl	(revision 177867)
+++ scripts/datasheet.pl	(working copy)
@@ -96,7 +96,7 @@
    $x_name = "MB/s"    if ($metric eq 'r_mb_s');
    $x_name = "Mpt/s"   if ($metric eq 'mpts_s');
 
-   printf OUT "  %5s  %7s  %7s  %7s\n", "Size", "usec", "usec/pt", $x_name;
+   printf OUT "  %5s  %7s  %7s  %7s\n", "Size", "us/call", "us/pt", $x_name;
    # printf OUT "  %5s  %7s  %7s  %7s\n", "", "(usec)", "(usec)", "";
 
    foreach my $size (@sizes) {
@@ -118,7 +118,7 @@
       $x = $r_mb_s  if ($metric eq 'r_mb_s');
       $x = $mpts_s  if ($metric eq 'mpts_s');
 
-      printf OUT "  %5d  %7.2f  %7.2f  %7.2f\n", $size, $s, $us_pt, $x;
+      printf OUT "  %5d  %7.1f  %7.5f  %7.2f\n", $size, $s, $us_pt, $x;
       }
    print OUT "\n";
    }
@@ -152,11 +152,20 @@
 read_dat($db, "fft-17-1.dat");
 
 read_dat($db, "ipp-fft-1-1.dat");
+
 read_dat($db, "fftw3-fft-11-1.dat");
 read_dat($db, "fftw3-fft-12-1.dat");
 read_dat($db, "fftw3-fft-51-1.dat");
 read_dat($db, "fftw3-fft-52-1.dat");
 
+read_dat($db, "sal-fft-1-1.dat");
+read_dat($db, "sal-fft-2-1.dat");
+read_dat($db, "sal-fft-5-1.dat");
+read_dat($db, "sal-fft-6-1.dat");
+read_dat($db, "sal-fft-11-1.dat");
+read_dat($db, "sal-fft-12-1.dat");
+read_dat($db, "sal-fft-15-1.dat");
+read_dat($db, "sal-fft-16-1.dat");
 
 read_dat($db, "vma-1-1.dat");
 read_dat($db, "vma-2-1.dat");
@@ -164,6 +173,9 @@
 read_dat($db, "vma-11-1.dat");
 read_dat($db, "vma-12-1.dat");
 read_dat($db, "vma-13-1.dat");
+
+read_dat($db, "vmin-1-1.dat");
+
 read_dat($db, "vmul-1-1.dat");
 read_dat($db, "vmul-2-1.dat");
 
@@ -171,6 +183,12 @@
 read_dat($db, "vthresh-2-1.dat");
 read_dat($db, "vthresh-3-1.dat");
 read_dat($db, "vthresh-11-1.dat");
+
+read_dat($db, "sal-vthresh-1-1.dat");
+read_dat($db, "sal-vthresh-2-1.dat");
+read_dat($db, "sal-lvgt-1-1.dat");
+read_dat($db, "sal-lvgt-2-1.dat");
+
 read_dat($db, "vgt_ite-1-1.dat");
 read_dat($db, "vgt_ite-2-1.dat");
 read_dat($db, "vgt_ite-5-1.dat");
@@ -214,6 +232,15 @@
 
 report_func($db, "ipp-fft-1-1", "fft (vendor-IPP): Out-of-place CC Fwd FFT (inter complex)", optional => 1);
 
+report_func($db, "sal-fft-1-1", "fft (vendor-SAL): Out-of-place CC Fwd FFT (split complex)", optional => 1);
+report_func($db, "sal-fft-2-1", "fft (vendor-SAL): In-place CC Fwd FFT (split complex)", optional => 1);
+report_func($db, "sal-fft-5-1", "fft (vendor-SAL): Out-of-place CC Inv FFT (split complex)", optional => 1);
+report_func($db, "sal-fft-6-1", "fft (vendor-SAL): In-place CC Inv FFT (split complex)", optional => 1);
+report_func($db, "sal-fft-11-1", "fft (vendor-SAL): Out-of-place CC Fwd FFT (inter complex)", optional => 1);
+report_func($db, "sal-fft-12-1", "fft (vendor-SAL): In-place CC Fwd FFT (inter complex)", optional => 1);
+report_func($db, "sal-fft-15-1", "fft (vendor-SAL): Out-of-place CC Inv FFT (inter complex)", optional => 1);
+report_func($db, "sal-fft-16-1", "fft (vendor-SAL): In-place CC Inv FFT (inter complex)", optional => 1);
+
 # Maxval
 header();
 
@@ -232,6 +259,11 @@
 report_func($db, "vma-12-1", "vma: vector fused multiply-add (Z = a * B + C) (complex)");
 report_func($db, "vma-13-1", "vma: vector fused multiply-add (Z = a * B + c) (complex) (aka caxpy)");
 
+# VMIN
+header();
+
+report_func($db, "vmin-1-1", "vmin: vector minima (Z = min(A, B)) (float)");
+
 # VMUL
 header();
 
@@ -246,8 +278,18 @@
 report_func($db, "vthresh-3-1", "vthresh: vector threshold (Z = ite(A >= b, A, c)) (float)");
 report_func($db, "vthresh-11-1", "vthresh: vector threshold (Z = ite(A > B, 1, 0)) (float) lvgt");
 
+
+
 report_func($db, "vgt_ite-1-1", "vthresh: vector threshold (Z = ite(A > B, A, 0)) (float) lvgt/vmul");
 report_func($db, "vgt_ite-2-1", "vthresh: vector threshold (Z = ite(A > B, A, 1)) (float)");
 report_func($db, "vgt_ite-5-1", "vthresh: vector threshold (Z = ite(A > B, A, 0)) (float) multi-expr");
 
+header();
+
+report_func($db, "sal-vthresh-1-1", "vthresh (vendor-SAL): vthreshx (Z = ite(A >= b, A, 0)) (float)");
+report_func($db, "sal-vthresh-2-1", "vthresh (vendor-SAL): vthrx (Z = ite(A >= b, A, b)) (float)");
+
+report_func($db, "sal-lvgt-1-1", "vthresh (vendor-SAL): lvgtx (Z = ite(A > B, 1, 0)) (float)");
+report_func($db, "sal-lvgt-2-1", "vthresh (vendor-SAL): lvgtx/vmul (Z = ite(A > B, A, 0)) (float)");
+
 close(OUT);
Index: src/vsip/core/signal/conv.hpp
===================================================================
--- src/vsip/core/signal/conv.hpp	(revision 177858)
+++ src/vsip/core/signal/conv.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005 by CodeSourcery, LLC.  All rights reserved. */
+/* Copyright (c) 2005, 2006, 2007 by CodeSourcery.  All rights reserved. */
 
 /** @file    vsip/core/signal/conv.hpp
     @author  Jules Bergmann
Index: src/vsip/opt/sal/bindings.hpp
===================================================================
--- src/vsip/opt/sal/bindings.hpp	(revision 176624)
+++ src/vsip/opt/sal/bindings.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005, 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2005, 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -22,7 +22,6 @@
   Included Files
 ***********************************************************************/
 
-#include <iostream>
 #include <complex>
 #include <sal.h>
 
@@ -223,22 +222,22 @@
 
 // convolution functions
 
-#define VSIP_IMPL_SAL_CONV( T, SAL_T, SALFCN, STRIDE_X ) \
-inline void                            \
-conv( T *filter, int f_as, int M,      \
-      T *input,  int i_as, int N,      \
-      T *output, int o_as )            \
-{                                      \
-  SALFCN(                              \
-    (SAL_T *) &input[0],      /* input vector, length of A >= N+p-1 */ \
-    i_as * STRIDE_X,          /* address stride for A               */ \
-    (SAL_T *) &filter[M - 1], /* input filter                       */ \
-    -1 * f_as * STRIDE_X,     /* address stride for B               */ \
-    (SAL_T *) &output[0],     /* output vector                      */ \
-    o_as * STRIDE_X,          /* address stride for C               */ \
-    N,                        /* real output count                  */ \
-    M,                        /* filter length (vector B)           */ \
-    0                         /* ESAL flag                          */ \
+#define VSIP_IMPL_SAL_CONV( T, SAL_T, SALFCN, STRIDE_X )		\
+inline void								\
+conv( T *filter, int f_as, int M,					\
+      T *input,  int i_as, int N,					\
+      T *output, int o_as )						\
+{									\
+  SALFCN(								\
+    (SAL_T *) &input[0],      /* input vector, length of A >= N+p-1 */	\
+    i_as * STRIDE_X,          /* address stride for A               */	\
+    (SAL_T *) &filter[M - 1], /* input filter                       */	\
+    -1 * f_as * STRIDE_X,     /* address stride for B               */	\
+    (SAL_T *) &output[0],     /* output vector                      */	\
+    o_as * STRIDE_X,          /* address stride for C               */	\
+    N,                        /* real output count                  */	\
+    M,                        /* filter length (vector B)           */	\
+    0                         /* ESAL flag                          */	\
   );                                   \
 }
 
@@ -267,6 +266,71 @@
 
 VSIP_IMPL_SAL_CONV_SPLIT( float, COMPLEX_SPLIT, zconvx, 1 );
 
+
+
+#define VSIP_IMPL_SAL_CONV_2D(T, SAL_T, SALFCN, STRIDE_X)		\
+inline void								\
+conv2d(									\
+  T*  filter,								\
+  int filter_rows, int filter_cols,					\
+  int filter_row_stride,						\
+  T*  input,								\
+  int input_rows, int input_cols,					\
+  int input_row_stride,							\
+  T*  output,								\
+  int output_rows, int output_cols,					\
+  int output_row_stride,						\
+  int row_dec,								\
+  int col_dec)								\
+{									\
+  assert(filter_row_stride == filter_cols);				\
+  assert(input_rows == (output_rows-1)*row_dec + filter_rows);		\
+  assert(input_cols == (output_cols-1)*col_dec + filter_cols);		\
+  SALFCN(								\
+    (SAL_T*)&input[0],        /* input matrix                       */	\
+    input_row_stride,         /* total columns in input matrix      */	\
+    (SAL_T *)&filter[0],      /* kernel matrix                      */	\
+    (SAL_T *)&output[0],      /* output matrix                      */	\
+    output_row_stride,        /* total columns in output matrix     */	\
+    row_dec,                  /* row decimation factor              */	\
+    col_dec,                  /* column decimation factor           */	\
+    filter_cols,              /* number of filter columns           */	\
+    filter_rows,              /* number of filter rows              */	\
+    output_cols,              /* number of output columns           */	\
+    output_rows,              /* number of output rows              */	\
+    0,                        /* reserved                           */	\
+    0                         /* ESAL flag                          */	\
+    );									\
+}
+
+VSIP_IMPL_SAL_CONV_2D(float, float, conv2dx, 1);
+
+#undef VSIP_IMPL_SAL_CONV_2D
+
+
+
+#define VSIP_IMPL_SAL_CONV_2D_3X3(T, SAL_T, SALFCN, STRIDE_X)		\
+inline void								\
+conv2d_3x3(								\
+  T*  filter,								\
+  T*  input,								\
+  T*  output,								\
+  int rows, int cols)							\
+{									\
+  SALFCN(								\
+    (SAL_T*)&input[0],        /* input matrix                       */	\
+    cols,                     /* number of in/out columns           */	\
+    rows,                     /* number of in/out rows              */	\
+    (SAL_T *)&filter[0],      /* 3x3 filter matrix                  */	\
+    (SAL_T *)&output[0],      /* output matrix                      */	\
+    0                         /* ESAL flag                          */	\
+    );									\
+}
+
+VSIP_IMPL_SAL_CONV_2D_3X3(float, float, f3x3x, 1);
+
+#undef VSIP_IMPL_SAL_CONV_2D_3X3
+
 } // namespace vsip::impl::sal
 } // namespace vsip::impl
 } // namespace vsip
Index: src/vsip/opt/sal/conv.hpp
===================================================================
--- src/vsip/opt/sal/conv.hpp	(revision 177867)
+++ src/vsip/opt/sal/conv.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005, 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2005, 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -54,7 +54,13 @@
   static bool const value = true;
 };
 
+template <>
+struct Is_conv_impl_avail<Mercury_sal_tag, 2, float>
+{
+  static bool const value = true;
+};
 
+
 // These help enforce limits on the length of the kernel
 // when using SAL, which differ for complex values
 template <typename T>
@@ -145,10 +151,14 @@
 	   Matrix<T, Block1>)
     VSIP_NOTHROW;
 
-  typedef vsip::impl::Layout<1, row1_type, vsip::impl::Stride_unit,
-			     complex_type> layout_type;
-  typedef Vector<T> coeff_view_type;
-  typedef impl::Ext_data<typename coeff_view_type::block_type, layout_type> c_ext_type;
+  typedef vsip::impl::Layout<dim,
+			     typename Row_major<dim>::type,
+			     vsip::impl::Stride_unit,
+			     complex_type>
+		layout_type;
+  typedef typename View_of_dim<dim, T, Dense<dim, T> >::type coeff_view_type;
+  typedef impl::Ext_data<typename coeff_view_type::block_type, layout_type>
+		c_ext_type;
 
   // Member data.
 private:
@@ -156,6 +166,10 @@
   c_ext_type      coeff_ext_;
   ptr_type        pcoeff_;
 
+  coeff_view_type sal_coeff_;
+  c_ext_type      sal_coeff_ext_;
+  ptr_type        sal_pcoeff_;
+
   Domain<dim>     kernel_size_;
   Domain<dim>     input_size_;
   Domain<dim>     output_size_;
@@ -174,6 +188,31 @@
   Definitions
 ***********************************************************************/
 
+template <typename T,
+	  typename Block1,
+	  typename Block2>
+inline void
+mirror(
+  const_Vector<T, Block1> src,
+  Vector<T, Block2>       dst)
+{
+  dst(Domain<1>(dst.size()-1, -1, dst.size())) = src;
+}
+
+template <typename T,
+	  typename Block1,
+	  typename Block2>
+inline void
+mirror(
+  const_Matrix<T, Block1> src,
+  Matrix<T, Block2>       dst)
+{
+  dst(Domain<2>(Domain<1>(dst.size(0)-1, -1, dst.size(0)),
+		Domain<1>(dst.size(1)-1, -1, dst.size(1)))) = src;
+}
+
+
+
 /// Construct a convolution object.
 
 template <template <typename, typename> class ConstViewT,
@@ -192,6 +231,9 @@
   : coeff_      (conv_kernel<coeff_view_type>(Symm, filter_coeffs)),
     coeff_ext_  (coeff_.block(), impl::SYNC_IN),
     pcoeff_     (coeff_ext_.data()),
+    sal_coeff_     (conv_kernel<coeff_view_type>(Symm, filter_coeffs)),
+    sal_coeff_ext_ (sal_coeff_.block(), impl::SYNC_IN),
+    sal_pcoeff_    (sal_coeff_ext_.data()),
     kernel_size_(impl::view_domain(coeff_)),
     input_size_ (input_size),
     output_size_(impl::conv_output_size(Supp, kernel_size_, input_size,
@@ -199,6 +241,7 @@
     decimation_ (decimation),
     pm_non_opt_calls_ (0)
 {
+  mirror(coeff_, sal_coeff_);
   in_buffer_  = storage_type::allocate(input_size_.size());
   if (storage_type::is_null(in_buffer_))
     VSIP_IMPL_THROW(std::bad_alloc());
@@ -382,8 +425,124 @@
   Matrix<T, Block1>       out)
 VSIP_NOTHROW
 {
-  VSIP_IMPL_THROW(vsip::impl::unimplemented(
-		    "Convolution_impl<... Mercury_sal_tag> does not support Matrix"));
+  length_type const Mr = this->coeff_.size(0);
+  length_type const Mc = this->coeff_.size(1);
+
+  length_type const Nr = this->input_size_[0].size();
+  length_type const Nc = this->input_size_[1].size();
+
+  length_type const Pr = this->output_size_[0].size();
+  length_type const Pc = this->output_size_[1].size();
+
+  assert(Pr == out.size(0) && Pc == out.size(1));
+
+  typedef vsip::impl::Ext_data<Block0> in_ext_type;
+  typedef vsip::impl::Ext_data<Block1> out_ext_type;
+
+  in_ext_type  in_ext (in.block(),  vsip::impl::SYNC_IN,  in_buffer_);
+  out_ext_type out_ext(out.block(), vsip::impl::SYNC_OUT, out_buffer_);
+
+  VSIP_IMPL_PROFILE(pm_in_ext_cost_  += in_ext.cost());
+  VSIP_IMPL_PROFILE(pm_out_ext_cost_ += out_ext.cost());
+
+  T* pin    = in_ext.data();
+  T* pout   = out_ext.data();
+
+  stride_type coeff_row_stride = coeff_ext_.stride(0);
+  stride_type coeff_col_stride = coeff_ext_.stride(1);
+  stride_type in_row_stride    = in_ext.stride(0);
+  stride_type in_col_stride    = in_ext.stride(1);
+  stride_type out_row_stride   = out_ext.stride(0);
+  stride_type out_col_stride   = out_ext.stride(1);
+
+  if (Supp == support_full)
+  {
+#if 0
+    // Full support not implemented yet.
+    if (decimation_ == 1 && coeff_col_stride == 1 &&
+	in_ext.stride(1) == 1 && out_ext.stride(1) == 1)
+    {
+      impl::sal::conv2d_full(
+	pcoeff_, Mr, Mc, coeff_row_stride,
+	pin, Nr, Nc, in_row_stride,
+	pout, out_row_stride);
+    }
+    else
+#endif
+    {
+      conv_full<T>(pcoeff_, Mr, Mc, coeff_row_stride, coeff_col_stride,
+		   pin, Nr, Nc, in_row_stride, in_col_stride,
+		   pout, Pr, Pc, out_row_stride, out_col_stride,
+		   decimation_);
+    }
+  }
+  else if (Supp == support_same)
+  {
+    if (decimation_ == 1 && coeff_col_stride == 1 &&
+	in_ext.stride(1) == 1 && out_ext.stride(1) == 1)
+    {
+      if (Mr == 3 && Mc == 3 &&
+	  coeff_row_stride == (stride_type)Mc &&
+	  in_row_stride    == (stride_type)Nc &&
+	  out_row_stride   == (stride_type)Nc)
+      {
+	impl::sal::conv2d_3x3(sal_pcoeff_, pin, pout, Nr, Nc);
+      }
+      else
+      {
+	index_type n0_r = (Mr - 1) - (Mr/2);
+	index_type n0_c = (Mc - 1) - (Mc/2);
+	index_type n1_r = Nr - (Mr/2);
+	index_type n1_c = Nc - (Mc/2);
+
+	T* pout_adj = pout + (n0_r)*out_row_stride
+			   + (n0_c)*out_col_stride;
+
+	if (n1_r > n0_r && n1_c > n0_c)
+	  impl::sal::conv2d(
+	    sal_pcoeff_, Mr, Mc, coeff_row_stride,
+	    pin,         Nr, Nc, in_row_stride,
+	    pout_adj,    Pr - (Mr-1), Pc - (Mc-1), out_row_stride,
+	    decimation_, decimation_);
+      }
+
+      // SAL conv2d (conv2dx) is min-support, while SAL conv2d_3x3
+      // (f3x3x) leaves a strip of 0 around the edge of the output.
+      //
+      // Implement same-support by filling out the edges.
+
+      conv_same_edge<T>(pcoeff_, Mr, Mc, coeff_row_stride, coeff_col_stride,
+			pin, Nr, Nc, in_row_stride, in_col_stride,
+			pout, Pr, Pc, out_row_stride, out_col_stride,
+			decimation_);
+    }
+    else
+    {
+      conv_same<T>(pcoeff_, Mr, Mc, coeff_row_stride, coeff_col_stride,
+		   pin, Nr, Nc, in_row_stride, in_col_stride,
+		   pout, Pr, Pc, out_row_stride, out_col_stride,
+		   decimation_);
+    }
+  }
+  else // (Supp == support_min)
+  {
+    if (decimation_ == 1 && coeff_col_stride == 1 &&
+	in_ext.stride(1) == 1 && out_ext.stride(1) == 1)
+    {
+      impl::sal::conv2d(
+	sal_pcoeff_, Mr, Mc, coeff_row_stride,
+	pin,         Nr, Nc, in_row_stride,
+	pout,        Pr, Pc, out_row_stride,
+	decimation_, decimation_);
+    }
+    else
+    {
+      conv_min<T>(pcoeff_, Mr, Mc, coeff_row_stride, coeff_col_stride,
+		  pin, Nr, Nc, in_row_stride, in_col_stride,
+		  pout, Pr, Pc, out_row_stride, out_col_stride,
+		  decimation_);
+    }
+  }
 }
 
 } // namespace vsip::impl
Index: src/vsip/opt/sal/eval_elementwise.hpp
===================================================================
--- src/vsip/opt/sal/eval_elementwise.hpp	(revision 176624)
+++ src/vsip/opt/sal/eval_elementwise.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -273,8 +273,8 @@
     sal::Ext_wrapper<LBlock, lblock_lp> ext_l(src.left(),  SYNC_IN);	\
     sal::Ext_wrapper<RBlock, rblock_lp> ext_r(src.right(), SYNC_IN);	\
 									\
-    assert(dst.size() <= src.left().size());				\
-    assert(dst.size() <= src.right().size());				\
+    assert(dst.size() <= src.left().size()  || src.left().size() == 0);	\
+    assert(dst.size() <= src.right().size() || src.right().size() == 0); \
 									\
     VSIP_IMPL_COVER_BLK("SAL_VV", SrcBlock);				\
     FUN(								\
Index: src/vsip/opt/fftw3/fft_impl.cpp
===================================================================
--- src/vsip/opt/fftw3/fft_impl.cpp	(revision 177858)
+++ src/vsip/opt/fftw3/fft_impl.cpp	(working copy)
@@ -1051,12 +1051,13 @@
   virtual void query_layout(Rt_layout<2> &rtl_inout)
   {
     // By default use unit_stride,
-    rtl_inout.pack = stride_unit;
+    rtl_inout.pack = stride_unit_align;
     // an ordering that gives unit strides on the axis perpendicular to A,
     if (A == 0) rtl_inout.order = tuple<1, 0, 2>();
     else rtl_inout.order = tuple<0, 1, 2>();
     // make default based on library
     rtl_inout.complex = Create_plan<fftw3_complex_type>::format;
+    rtl_inout.align = VSIP_IMPL_ALLOC_ALIGNMENT;
   }
 
   virtual void query_layout(Rt_layout<2> &rtl_in, Rt_layout<2> &rtl_out)
Index: src/vsip/opt/simd/simd.hpp
===================================================================
--- src/vsip/opt/simd/simd.hpp	(revision 176624)
+++ src/vsip/opt/simd/simd.hpp	(working copy)
@@ -90,6 +90,8 @@
 //  - add             - add two SIMD vectors together
 //  - sub             - subtract two SIMD vectors
 //  - mul             - multiply two SIMD vectors together
+//  - fma             - fused multiply-add (v1 * v2 + v3)
+//  - mag	      - magnitude (aka absolute value) of a SIMD vector
 //
 // Logic Operations:
 //  - band            - bitwise-and two SIMD vectors
@@ -129,6 +131,7 @@
   static int const  vec_size   = 1;
   static bool const is_accel   = false;
   static bool const has_perm   = false;
+  static bool const has_div    = true;
   static int  const alignment  = 1;
   static unsigned int const scalar_pos = 0;
 
@@ -162,10 +165,19 @@
   static simd_type div(simd_type const& v1, simd_type const& v2)
   { return v1 / v2; }
 
+  static simd_type mag(simd_type const& v1)
+  { return mag(v1); }
+
   static simd_type fma(simd_type const& v1, simd_type const& v2,
 		       simd_type const& v3)
   { return v1 * v2 + v3; }
 
+  static simd_type min(simd_type const& v1, simd_type const& v2)
+  { return (v1 < v2) ? v1 : v2; }
+
+  static simd_type max(simd_type const& v1, simd_type const& v2)
+  { return (v1 > v2) ? v1 : v2; }
+
   // These functions return ints and operate on ints
   static simd_itype band(simd_itype const& v1, simd_itype const& v2)
   { return v1 & v2; }
@@ -240,6 +252,7 @@
   static int  const vec_size   = 16;
   static bool const is_accel   = true;
   static bool const has_perm   = true;
+  static bool const has_div    = false;
   static int  const alignment  = 16;
 
   static unsigned int  const scalar_pos = VSIP_IMPL_SCALAR_POS(vec_size);
@@ -265,7 +278,7 @@
     return vec_perm(x0, x1, sh);
   }
   
-  static perm_simd_type shift_for_addr(value_type* addr)
+  static perm_simd_type shift_for_addr(value_type const* addr)
   { return vec_lvsl(0, addr); }
 
   static simd_type perm(simd_type x0, simd_type x1, perm_simd_type sh)
@@ -335,9 +348,10 @@
   typedef __vector VSIP_IMPL_AV_BOOL short bool_simd_type;
   typedef __vector signed char        pack_simd_type;
    
-  static int const  vec_size = 8;
-  static bool const is_accel = true;
-  static bool const has_perm = true;
+  static int const  vec_size  = 8;
+  static bool const is_accel  = true;
+  static bool const has_perm  = true;
+  static bool const has_div   = false;
   static int  const alignment = 16;
 
   static unsigned int  const scalar_pos = VSIP_IMPL_SCALAR_POS(vec_size);
@@ -361,7 +375,7 @@
     return vec_perm(x0, x1, sh);
   }
 
-  static perm_simd_type shift_for_addr(value_type* addr)
+  static perm_simd_type shift_for_addr(value_type const* addr)
   { return vec_lvsl(0, addr); }
 
   static simd_type perm(simd_type x0, simd_type x1, perm_simd_type sh)
@@ -434,9 +448,10 @@
   typedef __vector VSIP_IMPL_AV_BOOL int bool_simd_type;
   typedef __vector signed short          pack_simd_type;
    
-  static int const  vec_size = 4;
-  static bool const is_accel = true;
-  static bool const has_perm = true;
+  static int const  vec_size  = 4;
+  static bool const is_accel  = true;
+  static bool const has_perm  = true;
+  static bool const has_div   = false;
   static int  const alignment = 16;
 
   static unsigned int  const scalar_pos = VSIP_IMPL_SCALAR_POS(vec_size);
@@ -460,7 +475,7 @@
     return vec_perm(x0, x1, sh);
   }
 
-  static perm_simd_type shift_for_addr(value_type* addr)
+  static perm_simd_type shift_for_addr(value_type const* addr)
   { return vec_lvsl(0, addr); }
 
   static simd_type perm(simd_type x0, simd_type x1, perm_simd_type sh)
@@ -532,9 +547,10 @@
   typedef __vector unsigned char         perm_simd_type;
   typedef __vector VSIP_IMPL_AV_BOOL int bool_simd_type;
    
-  static int  const vec_size = 4;
-  static bool const is_accel = true;
-  static bool const has_perm = true;
+  static int  const vec_size  = 4;
+  static bool const is_accel  = true;
+  static bool const has_perm  = true;
+  static bool const has_div   = false;
   static int  const alignment = 16;
 
   static unsigned int  const scalar_pos = VSIP_IMPL_SCALAR_POS(vec_size);
@@ -558,7 +574,7 @@
     return vec_perm(x0, x1, sh);
   }
 
-  static perm_simd_type shift_for_addr(value_type* addr)
+  static perm_simd_type shift_for_addr(value_type const* addr)
   { return vec_lvsl(0, addr); }
 
   static simd_type perm(simd_type x0, simd_type x1, perm_simd_type sh)
@@ -602,6 +618,15 @@
 		       simd_type const& v3)
   { return vec_madd(v1, v2, v3); }
 
+  static simd_type mag(simd_type const& v1)
+  { return vec_abs(v1); }
+
+  static simd_type min(simd_type const& v1, simd_type const& v2)
+  { return vec_min(v1, v2); }
+
+  static simd_type max(simd_type const& v1, simd_type const& v2)
+  { return vec_max(v1, v2); }
+
   static bool_simd_type gt(simd_type const& v1, simd_type const& v2)
   { return vec_cmpgt(v1, v2); }
 
@@ -614,7 +639,7 @@
   // 070505: ppu-g++ 4.1.1 confused by return type for vec_cmple
   //         (but regular g++ 4.1.1 OK).  Use vec_cmpgt instead.
   static bool_simd_type le(simd_type const& v1, simd_type const& v2)
-  { return vec_cmpgt(v2, v1); }
+  { return vec_cmpge(v2, v1); }
 
   static simd_type real_from_interleaved(simd_type const& v1,
 					 simd_type const& v2)
@@ -667,6 +692,7 @@
   static int const  vec_size   = 16;
   static bool const is_accel   = true;
   static bool const has_perm   = false;
+  static bool const has_div    = false;
   static int  const alignment  = 16;
   static unsigned int  const scalar_pos = 0;
 
@@ -728,6 +754,7 @@
   static int const  vec_size   = 8;
   static bool const is_accel   = true;
   static bool const has_perm   = false;
+  static bool const has_div    = false;
   static int  const alignment  = 16;
   static unsigned int  const scalar_pos = 0;
 
@@ -830,6 +857,7 @@
   static int const  vec_size   = 4;
   static bool const is_accel   = true;
   static bool const has_perm   = false;
+  static bool const has_div    = false;
   static int  const alignment  = 16;
   static unsigned int  const scalar_pos = 0;
 
@@ -918,6 +946,7 @@
   static int const  vec_size   = 4;
   static bool const is_accel   = true;
   static bool const has_perm   = false;
+  static bool const has_div    = true;
   static int  const alignment  = 16;
   static unsigned int  const scalar_pos = 0;
 
@@ -968,6 +997,18 @@
 		       simd_type const& v3)
   { return add(mul(v1,v2),v3); }
 
+  static simd_type mag(simd_type const& v1)
+  {
+    simd_type mask = (simd_type)Simd_traits<int>::load_scalar_all(0x7fffffff);
+    return _mm_and_ps(mask, v1);
+  }
+
+  static simd_type min(simd_type const& v1, simd_type const& v2)
+  { return _mm_min_ps(v1, v2); }
+
+  static simd_type max(simd_type const& v1, simd_type const& v2)
+  { return _mm_max_ps(v1, v2); }
+
   static simd_type gt(simd_type const& v1, simd_type const& v2)
   { return _mm_cmpgt_ps(v1, v2); }
 
@@ -1017,6 +1058,7 @@
   static int const  vec_size   = 2;
   static bool const is_accel   = true;
   static bool const has_perm   = false;
+  static bool const has_div    = true;
   static int  const alignment  = 16;
   static unsigned int  const scalar_pos = 0;
 
@@ -1067,6 +1109,19 @@
 		       simd_type const& v3)
   { return add(mul(v1,v2),v3); }
 
+  static simd_type mag(simd_type const& v1)
+  {
+    simd_type mask = (simd_type)_mm_set_epi32(0x7ffffff, 0xfffffff,
+					      0x7ffffff, 0xfffffff);
+    return _mm_and_pd((simd_type)mask, v1);
+  }
+
+  static simd_type min(simd_type const& v1, simd_type const& v2)
+  { return _mm_min_pd(v1, v2); }
+
+  static simd_type max(simd_type const& v1, simd_type const& v2)
+  { return _mm_max_pd(v1, v2); }
+
   static simd_type gt(simd_type const& v1, simd_type const& v2)
   { return _mm_cmpgt_pd(v1, v2); }
 
@@ -1125,7 +1180,8 @@
    
   static int const  vec_size  = Simd_traits<T>::vec_size;
   static bool const is_accel  = Simd_traits<T>::is_accel;
-  static bool const has_perm  = Simd_traits<T>::has_perm;
+  static bool const has_perm  = false;
+  static bool const has_div   = Simd_traits<T>::has_div;
   static int  const alignment = Simd_traits<T>::alignment;
 
   static intptr_t alignment_of(value_type const* addr)
@@ -1332,6 +1388,8 @@
 struct Alg_vbxor;
 struct Alg_vbnot;
 struct Alg_threshold;
+struct Alg_vma_cSC;
+struct Alg_vma_ip_cSC;
 
 template <typename T,
 	  bool     IsSplit,
Index: src/vsip/opt/simd/vma_ip_csc.cpp
===================================================================
--- src/vsip/opt/simd/vma_ip_csc.cpp	(revision 0)
+++ src/vsip/opt/simd/vma_ip_csc.cpp	(revision 0)
@@ -0,0 +1,53 @@
+/* Copyright (c) 2006 by CodeSourcery.  All rights reserved. */
+
+/** @file    vsip/opt/simd/vma_ip_csc.cpp
+    @author  Jules Bergmann
+    @date    2006-11-17
+    @brief   VSIPL++ Library: SIMD in-place multiply-add, R += a * B.
+
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/opt/simd/vma_ip_csc.hpp>
+
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+#if !VSIP_IMPL_INLINE_LIBSIMD
+
+// Out-of-line R[i] += a * B[i]; picks the kernel at compile time.
+template <typename T>
+void
+vma_ip_cSC(std::complex<T> const& a,
+           T const*               B,
+           std::complex<T>*       R,
+           int                    n)
+{
+  typedef std::complex<T> complex_type;
+  typedef Is_algorithm_supported<complex_type, false, Alg_vma_ip_cSC> is_simd;
+  Simd_vma_ip_cSC<complex_type, is_simd::value>::exec(a, B, R, n);
+}
+
+template void vma_ip_cSC(std::complex<float> const&, float const*,
+			 std::complex<float>*, int);
+template void vma_ip_cSC(std::complex<double> const&, double const*,
+			 std::complex<double>*, int);
+
+#endif
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
Index: src/vsip/opt/simd/vaxpy.cpp
===================================================================
--- src/vsip/opt/simd/vaxpy.cpp	(revision 0)
+++ src/vsip/opt/simd/vaxpy.cpp	(revision 0)
@@ -0,0 +1,54 @@
+/* Copyright (c) 2006 by CodeSourcery.  All rights reserved. */
+
+/** @file    vsip/opt/simd/vaxpy.cpp
+    @author  Jules Bergmann
+    @date    2006-11-17
+    @brief   VSIPL++ Library: SIMD multiply-add, R = a * B + C.
+
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <vsip/opt/simd/vaxpy.hpp>
+
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+#if !VSIP_IMPL_INLINE_LIBSIMD
+
+// Out-of-line R[i] = a * B[i] + C[i]; picks the kernel at compile time.
+template <typename T>
+void
+vma_cSC(std::complex<T> const& a,
+        T const*               B,
+        std::complex<T> const* C,
+        std::complex<T>*       R,
+        int                    n)
+{
+  typedef std::complex<T> complex_type;
+  typedef Is_algorithm_supported<complex_type, false, Alg_vma_cSC> is_simd;
+  Simd_vma_cSC<complex_type, is_simd::value>::exec(a, B, C, R, n);
+}
+
+template void vma_cSC(std::complex<float> const&, float const*,
+		      std::complex<float> const*, std::complex<float>*, int);
+template void vma_cSC(std::complex<double> const&, double const*,
+		   std::complex<double> const*, std::complex<double>*, int);
+
+#endif
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
Index: src/vsip/opt/simd/vma_ip_csc.hpp
===================================================================
--- src/vsip/opt/simd/vma_ip_csc.hpp	(revision 0)
+++ src/vsip/opt/simd/vma_ip_csc.hpp	(revision 0)
@@ -0,0 +1,272 @@
+/* Copyright (c) 2006 by CodeSourcery.  All rights reserved. */
+
+/** @file    vsip/opt/simd/vma_ip_csc.hpp
+    @author  Jules Bergmann
+    @date    2006-11-17
+    @brief   VSIPL++ Library: SIMD element-wise AXPY.
+
+*/
+
+#ifndef VSIP_OPT_SIMD_VMA_IP_CSC_HPP
+#define VSIP_OPT_SIMD_VMA_IP_CSC_HPP
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <complex>
+
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/core/metaprogramming.hpp>
+
+#define VSIP_IMPL_INLINE_LIBSIMD 0
+
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+// vma_ip_cSC is SIMD-accelerated only when the scalar type underlying T
+// is float or double and Simd_traits reports is_accel for that scalar.
+// IsSplit does not affect the result.
+
+template <typename T,
+	  bool     IsSplit>
+struct Is_algorithm_supported<T, IsSplit, Alg_vma_ip_cSC>
+{
+  typedef typename Scalar_of<T>::type scalar_type;
+  static bool const value =
+    Simd_traits<scalar_type>::is_accel &&
+    (Type_equal<scalar_type, float>::value ||
+     Type_equal<scalar_type, double>::value);
+};
+
+
+
+// Class for vma_ip_cSC - in-place multiply-add, R += a * B (complex a, real B).
+
+template <typename T,
+	  bool     Is_vectorized>
+struct Simd_vma_ip_cSC;
+
+
+
+// Generic, non-vectorized in-place multiply-add:
+//   R[i] += a * B[i]   for 0 <= i < n
+// (complex scalar a, real vector B, complex vector R).
+
+template <typename T>
+struct Simd_vma_ip_cSC<std::complex<T>, false>
+{
+  // a: complex scale factor; B: n real inputs; R: n complex accumulators,
+  // updated in place.  A non-positive n is a no-op.
+  static void exec(
+    std::complex<T> const& a,
+    T const*               B,
+    std::complex<T>*       R,
+    int                    n)
+  {
+    // 'i < n' (rather than counting n down to zero) keeps a negative
+    // length from running off the ends of B and R.
+    for (int i = 0; i < n; ++i)
+      R[i] += a * B[i];
+  }
+};
+
+
+
+// Vectorized in-place multiply-add, R[i] += a * B[i] for 0 <= i < n, with
+// complex scalar a, real vector B and interleaved-complex vector R.
+// Elements the SIMD loop cannot take are handled by scalar loops.
+template <typename T>
+struct Simd_vma_ip_cSC<std::complex<T>, true>
+{
+  static void exec(
+    std::complex<T> const& a,
+    T const*               B,
+    std::complex<T>*       R,
+    int                    n)
+  {
+    typedef Simd_traits<T> simd;
+
+    typedef typename simd::simd_type simd_type;
+
+    // handle mis-aligned vectors
+    if (simd::alignment_of((T*)R) != simd::alignment_of(B))
+    {
+      // PROFILE
+      while (n > 0)
+      {
+	*R += a * *B;
+	R++; B++;
+	n--;
+      }
+      return;
+    }
+
+    // clean up initial unaligned values (bounded by n so we never overrun)
+    while (n > 0 && simd::alignment_of((T*)R) != 0)
+    {
+      *R += a * *B;
+      R++; B++;
+      n--;
+    }
+  
+    if (n <= 0) return;
+    // B advanced half as many bytes as R above; re-check before simd::load.
+    bool const B_aligned = simd::alignment_of(B) == 0;
+
+    simd::enter();
+#if 0
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+
+    while (B_aligned && n >= 1*simd::vec_size)
+    {
+      simd_type reg_B = simd::load((T*)B);
+
+      simd_type reg_C0 = simd::load((T*)R);
+      simd_type reg_C1 = simd::load((T*)R + simd::vec_size);
+
+      simd_type reg_ABr = simd::mul(reg_Ar, reg_B);
+      simd_type reg_ABi = simd::mul(reg_Ai, reg_B);
+
+      simd_type reg_Cr = simd::real_from_interleaved(reg_C0, reg_C1);
+      simd_type reg_Ci = simd::imag_from_interleaved(reg_C0, reg_C1);
+
+      simd_type reg_Rr = simd::add(reg_Cr, reg_ABr);
+      simd_type reg_Ri = simd::add(reg_Ci, reg_ABi);
+
+      simd_type reg_R1 = simd::interleaved_lo_from_split(reg_Rr, reg_Ri);
+      simd_type reg_R2 = simd::interleaved_hi_from_split(reg_Rr, reg_Ri);
+      
+      simd::store((T*)R,                  reg_R1);
+      simd::store((T*)R + simd::vec_size, reg_R2);
+      
+      B += simd::vec_size; R += simd::vec_size;
+      n -= 1*simd::vec_size;
+    }
+#elif 0
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+    simd_type reg_A  = simd::interleaved_lo_from_split(reg_Ar, reg_Ai);
+
+    while (B_aligned && n >= 1*simd::vec_size)
+    {
+      simd_type reg_B = simd::load((T*)B);
+
+      simd_type reg_C1 = simd::load((T*)R);
+      simd_type reg_C2 = simd::load((T*)R + simd::vec_size);
+
+      simd_type reg_B1 = simd::interleaved_lo_from_split(reg_B, reg_B);
+      simd_type reg_B2 = simd::interleaved_hi_from_split(reg_B, reg_B);
+
+      simd_type reg_AB1 = simd::mul(reg_A, reg_B1);
+      simd_type reg_AB2 = simd::mul(reg_A, reg_B2);
+
+      simd_type reg_R1 = simd::add(reg_C1, reg_AB1);
+      simd_type reg_R2 = simd::add(reg_C2, reg_AB2);
+
+      simd::store((T*)R,                  reg_R1);
+      simd::store((T*)R + simd::vec_size, reg_R2);
+      
+      B += simd::vec_size; R += simd::vec_size;
+      n -= 1*simd::vec_size;
+    }
+#else
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+    simd_type reg_A  = simd::interleaved_lo_from_split(reg_Ar, reg_Ai);
+
+    while (B_aligned && n >= 2*simd::vec_size)
+    {
+      simd_type reg_B01 = simd::load((T*)B);
+      simd_type reg_B23 = simd::load((T*)B + simd::vec_size);
+
+      simd_type reg_C0 = simd::load((T*)R);
+      simd_type reg_C1 = simd::load((T*)R + simd::vec_size);
+      simd_type reg_C2 = simd::load((T*)R + 2*simd::vec_size);
+      simd_type reg_C3 = simd::load((T*)R + 3*simd::vec_size);
+
+      simd_type reg_B0 = simd::interleaved_lo_from_split(reg_B01, reg_B01);
+      simd_type reg_B1 = simd::interleaved_hi_from_split(reg_B01, reg_B01);
+      simd_type reg_B2 = simd::interleaved_lo_from_split(reg_B23, reg_B23);
+      simd_type reg_B3 = simd::interleaved_hi_from_split(reg_B23, reg_B23);
+
+      simd_type reg_AB0 = simd::mul(reg_A, reg_B0);
+      simd_type reg_AB1 = simd::mul(reg_A, reg_B1);
+      simd_type reg_AB2 = simd::mul(reg_A, reg_B2);
+      simd_type reg_AB3 = simd::mul(reg_A, reg_B3);
+
+      simd_type reg_R0 = simd::add(reg_C0, reg_AB0);
+      simd_type reg_R1 = simd::add(reg_C1, reg_AB1);
+      simd_type reg_R2 = simd::add(reg_C2, reg_AB2);
+      simd_type reg_R3 = simd::add(reg_C3, reg_AB3);
+
+      simd::store((T*)R,                    reg_R0);
+      simd::store((T*)R + 1*simd::vec_size, reg_R1);
+      simd::store((T*)R + 2*simd::vec_size, reg_R2);
+      simd::store((T*)R + 3*simd::vec_size, reg_R3);
+      
+      B += 2*simd::vec_size; R += 2*simd::vec_size;
+      n -= 2*simd::vec_size;
+    }
+#endif
+    
+    simd::exit();
+
+    while (n > 0)
+    {
+      *R += a * *B;
+      R++; B++;
+      n--;
+    }
+  }
+};
+
+
+
+// Depending on the VSIP_IMPL_INLINE_LIBSIMD macro, either define
+// vma_ip_cSC inline here, or only declare it (defined out of line).
+
+#if VSIP_IMPL_INLINE_LIBSIMD
+
+template <typename T>
+inline void
+vma_ip_cSC(
+  std::complex<T> const& a,
+  T const*               B,
+  std::complex<T>*       R,
+  int                    n)
+{
+  static bool const Is_vectorized =
+    Is_algorithm_supported<std::complex<T>, false, Alg_vma_ip_cSC>::value;
+  Simd_vma_ip_cSC<std::complex<T>, Is_vectorized>::exec(a, B, R, n);
+}
+
+#else
+
+template <typename T>
+void
+vma_ip_cSC(
+  std::complex<T> const& a,
+  T const*               B,
+  std::complex<T>*       R,
+  int                    n);
+
+#endif // VSIP_IMPL_INLINE_LIBSIMD
+
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_OPT_SIMD_VMA_IP_CSC_HPP
Index: src/vsip/opt/simd/vaxpy.hpp
===================================================================
--- src/vsip/opt/simd/vaxpy.hpp	(revision 0)
+++ src/vsip/opt/simd/vaxpy.hpp	(revision 0)
@@ -0,0 +1,277 @@
+/* Copyright (c) 2006 by CodeSourcery.  All rights reserved. */
+
+/** @file    vsip/opt/simd/vaxpy.hpp
+    @author  Jules Bergmann
+    @date    2006-11-17
+    @brief   VSIPL++ Library: SIMD element-wise AXPY.
+
+*/
+
+#ifndef VSIP_OPT_SIMD_VMA_CSC_HPP
+#define VSIP_OPT_SIMD_VMA_CSC_HPP
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#include <complex>
+
+#include <vsip/opt/simd/simd.hpp>
+#include <vsip/core/metaprogramming.hpp>
+
+#define VSIP_IMPL_INLINE_LIBSIMD 0
+
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+namespace vsip
+{
+namespace impl
+{
+namespace simd
+{
+
+// vma_cSC is SIMD-accelerated only when the scalar type underlying T
+// is float or double and Simd_traits reports is_accel for that scalar.
+// IsSplit does not affect the result.
+
+template <typename T,
+	  bool     IsSplit>
+struct Is_algorithm_supported<T, IsSplit, Alg_vma_cSC>
+{
+  typedef typename Scalar_of<T>::type scalar_type;
+  static bool const value =
+    Simd_traits<scalar_type>::is_accel &&
+    (Type_equal<scalar_type, float>::value ||
+     Type_equal<scalar_type, double>::value);
+};
+
+
+
+// Class for vma_cSC - multiply-add, R = a * B + C (complex a, real B).
+
+template <typename T,
+	  bool     Is_vectorized>
+struct Simd_vma_cSC;
+
+
+
+// Generic, non-vectorized multiply-add:
+//   R[i] = a * B[i] + C[i]   for 0 <= i < n
+// (complex scalar a, real vector B, complex vectors C and R).
+
+template <typename T>
+struct Simd_vma_cSC<std::complex<T>, false>
+{
+  // a: complex scale factor; B: n real inputs; C: n complex addends;
+  // R: n complex outputs.  A non-positive n is a no-op.
+  static void exec(
+    std::complex<T> const& a,
+    T const*               B,
+    std::complex<T> const* C,
+    std::complex<T>*       R,
+    int                    n)
+  {
+    // 'i < n' (rather than counting n down to zero) keeps a negative
+    // length from running off the ends of the arrays.
+    for (int i = 0; i < n; ++i)
+      R[i] = a * B[i] + C[i];
+  }
+};
+
+
+
+// Vectorized multiply-add, R[i] = a * B[i] + C[i] for 0 <= i < n, with
+// complex scalar a, real vector B and interleaved-complex vectors C, R.
+// Elements the SIMD loop cannot take are handled by scalar loops.
+template <typename T>
+struct Simd_vma_cSC<std::complex<T>, true>
+{
+  static void exec(
+    std::complex<T> const& a,
+    T const*               B,
+    std::complex<T> const* C,
+    std::complex<T>*       R,
+    int                    n)
+  {
+    typedef Simd_traits<T> simd;
+
+    typedef typename simd::simd_type simd_type;
+
+    // handle mis-aligned vectors
+    if (simd::alignment_of((T*)R) != simd::alignment_of(B) ||
+	simd::alignment_of((T*)R) != simd::alignment_of((T*)C))
+    {
+      // PROFILE
+      while (n > 0)
+      {
+	*R = a * *B + *C;
+	R++; B++; C++;
+	n--;
+      }
+      return;
+    }
+
+    // clean up initial unaligned values (bounded by n so we never overrun)
+    while (n > 0 && simd::alignment_of((T*)R) != 0)
+    {
+      *R = a * *B + *C;
+      R++; B++; C++;
+      n--;
+    }
+  
+    if (n <= 0) return;
+    // B advanced half as many bytes as R above; re-check before simd::load.
+    bool const B_aligned = simd::alignment_of(B) == 0;
+
+    simd::enter();
+#if 0
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+
+    while (B_aligned && n >= 1*simd::vec_size)
+    {
+      simd_type reg_B = simd::load((T*)B);
+
+      simd_type reg_C0 = simd::load((T*)C);
+      simd_type reg_C1 = simd::load((T*)C + simd::vec_size);
+
+      simd_type reg_ABr = simd::mul(reg_Ar, reg_B);
+      simd_type reg_ABi = simd::mul(reg_Ai, reg_B);
+
+      simd_type reg_Cr = simd::real_from_interleaved(reg_C0, reg_C1);
+      simd_type reg_Ci = simd::imag_from_interleaved(reg_C0, reg_C1);
+
+      simd_type reg_Rr = simd::add(reg_Cr, reg_ABr);
+      simd_type reg_Ri = simd::add(reg_Ci, reg_ABi);
+
+      simd_type reg_R1 = simd::interleaved_lo_from_split(reg_Rr, reg_Ri);
+      simd_type reg_R2 = simd::interleaved_hi_from_split(reg_Rr, reg_Ri);
+      
+      simd::store((T*)R,                  reg_R1);
+      simd::store((T*)R + simd::vec_size, reg_R2);
+      
+      B += simd::vec_size; C += simd::vec_size; R += simd::vec_size;
+      n -= 1*simd::vec_size;
+    }
+#elif 0
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+    simd_type reg_A  = simd::interleaved_lo_from_split(reg_Ar, reg_Ai);
+
+    while (B_aligned && n >= 1*simd::vec_size)
+    {
+      simd_type reg_B = simd::load((T*)B);
+
+      simd_type reg_C1 = simd::load((T*)C);
+      simd_type reg_C2 = simd::load((T*)C + simd::vec_size);
+
+      simd_type reg_B1 = simd::interleaved_lo_from_split(reg_B, reg_B);
+      simd_type reg_B2 = simd::interleaved_hi_from_split(reg_B, reg_B);
+
+      simd_type reg_AB1 = simd::mul(reg_A, reg_B1);
+      simd_type reg_AB2 = simd::mul(reg_A, reg_B2);
+
+      simd_type reg_R1 = simd::add(reg_C1, reg_AB1);
+      simd_type reg_R2 = simd::add(reg_C2, reg_AB2);
+
+      simd::store((T*)R,                  reg_R1);
+      simd::store((T*)R + simd::vec_size, reg_R2);
+      
+      B += simd::vec_size; C += simd::vec_size; R += simd::vec_size;
+      n -= 1*simd::vec_size;
+    }
+#else
+    simd_type reg_Ar = simd::load_scalar_all(a.real());
+    simd_type reg_Ai = simd::load_scalar_all(a.imag());
+    simd_type reg_A  = simd::interleaved_lo_from_split(reg_Ar, reg_Ai);
+
+    while (B_aligned && n >= 2*simd::vec_size)
+    {
+      simd_type reg_B01 = simd::load((T*)B);
+      simd_type reg_B23 = simd::load((T*)B + simd::vec_size);
+
+      simd_type reg_C0 = simd::load((T*)C);
+      simd_type reg_C1 = simd::load((T*)C + simd::vec_size);
+      simd_type reg_C2 = simd::load((T*)C + 2*simd::vec_size);
+      simd_type reg_C3 = simd::load((T*)C + 3*simd::vec_size);
+
+      simd_type reg_B0 = simd::interleaved_lo_from_split(reg_B01, reg_B01);
+      simd_type reg_B1 = simd::interleaved_hi_from_split(reg_B01, reg_B01);
+      simd_type reg_B2 = simd::interleaved_lo_from_split(reg_B23, reg_B23);
+      simd_type reg_B3 = simd::interleaved_hi_from_split(reg_B23, reg_B23);
+
+      simd_type reg_AB0 = simd::mul(reg_A, reg_B0);
+      simd_type reg_AB1 = simd::mul(reg_A, reg_B1);
+      simd_type reg_AB2 = simd::mul(reg_A, reg_B2);
+      simd_type reg_AB3 = simd::mul(reg_A, reg_B3);
+
+      simd_type reg_R0 = simd::add(reg_C0, reg_AB0);
+      simd_type reg_R1 = simd::add(reg_C1, reg_AB1);
+      simd_type reg_R2 = simd::add(reg_C2, reg_AB2);
+      simd_type reg_R3 = simd::add(reg_C3, reg_AB3);
+
+      simd::store((T*)R,                    reg_R0);
+      simd::store((T*)R + 1*simd::vec_size, reg_R1);
+      simd::store((T*)R + 2*simd::vec_size, reg_R2);
+      simd::store((T*)R + 3*simd::vec_size, reg_R3);
+      
+      B += 2*simd::vec_size; C += 2*simd::vec_size; R += 2*simd::vec_size;
+      n -= 2*simd::vec_size;
+    }
+#endif
+    
+    simd::exit();
+
+    while (n > 0)
+    {
+      *R = a * *B + *C;
+      R++; B++; C++;
+      n--;
+    }
+  }
+};
+
+
+
+// Depending on the VSIP_IMPL_INLINE_LIBSIMD macro, either define
+// vma_cSC inline here, or only declare it (defined out of line).
+
+#if VSIP_IMPL_INLINE_LIBSIMD
+// exec() takes (a, B, C, R, n): the addend C precedes the destination R.
+template <typename T>
+inline void
+vma_cSC(
+  std::complex<T> const& a,
+  T const*               B,
+  std::complex<T> const* C,
+  std::complex<T>*       R,
+  int                    n)
+{
+  static bool const Is_vectorized =
+    Is_algorithm_supported<std::complex<T>, false, Alg_vma_cSC>::value;
+  Simd_vma_cSC<std::complex<T>, Is_vectorized>::exec(a, B, C, R, n);
+}
+
+#else
+
+template <typename T>
+void
+vma_cSC(
+  std::complex<T> const& a,
+  T const*               B,
+  std::complex<T> const* C,
+  std::complex<T>*       R,
+  int                    n);
+
+#endif // VSIP_IMPL_INLINE_LIBSIMD
+
+
+} // namespace vsip::impl::simd
+} // namespace vsip::impl
+} // namespace vsip
+
+#endif // VSIP_OPT_SIMD_VMA_CSC_HPP
Index: src/vsip/opt/simd/eval_generic.hpp
===================================================================
--- src/vsip/opt/simd/eval_generic.hpp	(revision 176624)
+++ src/vsip/opt/simd/eval_generic.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -677,7 +677,8 @@
 	   (&(src.first().left()) == &(src.second())) &&
 	   // make sure everyting has same alignment
            (simd::alignment_of(ext_dst.data()) ==
-	    simd::alignment_of(ext_a.data()) ==
+	    simd::alignment_of(ext_a.data())) &&
+           (simd::alignment_of(ext_dst.data()) ==
 	    simd::alignment_of(ext_b.data())));
   }
 
Index: src/vsip/opt/simd/expr_iterator.hpp
===================================================================
--- src/vsip/opt/simd/expr_iterator.hpp	(revision 177792)
+++ src/vsip/opt/simd/expr_iterator.hpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -128,14 +128,59 @@
   typedef T value_type;
   typedef T return_type;
   typedef typename Simd_traits<T>::simd_type simd_type;
-  static bool const is_supported = true;
+  static bool const is_supported = Simd_traits<T>::has_div;
   static simd_type 
   apply(simd_type const &left, simd_type const &right)
   { return Simd_traits<T>::div(left, right);}
 };
 
+// Map unary functor OP onto Simd_traits<T>::FCN for every T (always
+// marked supported); see the complex<T> opt-out macro below.
+#define VSIP_OPT_DECL_UNARY_OP(FCN, OP)					\
+template <typename T>							\
+struct Unary_operator_map<T, OP>					\
+{									\
+  typedef typename Simd_traits<T>::simd_type simd_type;			\
+  typedef T                             return_type;			\
+  typedef T                             value_type;			\
+  									\
+  static bool const is_supported = true;				\
+  static inline simd_type						\
+  apply(simd_type const &arg)						\
+  {									\
+    return Simd_traits<T>::FCN(arg);					\
+  }									\
+};
+// Opt out: mark unary functor OP as unsupported for complex<T> operands.
+#define VSIP_OPT_DECL_UNARY_OP_COMPLEX_BLOCK(OP)			\
+template <typename T>							\
+struct Unary_operator_map<complex<T>, OP>				\
+{									\
+  static bool const is_supported = false;				\
+};
+
+// Map binary functor OP onto Simd_traits<T>::FCN for every T; operands
+// and result share the same type T (always marked supported).
+#define VSIP_OPT_DECL_BINARY_OP(FCN,OP)					\
+template <typename T>							\
+struct Binary_operator_map<T, OP>					\
+{									\
+  typedef typename Simd_traits<T>::simd_type simd_type;			\
+  typedef T                             return_type;			\
+  typedef T                             value_type;			\
+  									\
+  static bool const is_supported = true;				\
+  static inline simd_type						\
+  apply(simd_type const &left, simd_type const &right)			\
+  {									\
+    return Simd_traits<T>::FCN(left, right);				\
+  }									\
+};
+
+
+
 // Binary operators that return different type than type of operands
-#define VSIP_OPT_DECL_BINARY_OP(FCN,O) \
+#define VSIP_OPT_DECL_BINARY_CMP_OP(FCN,O) \
 template <typename T> \
 struct Binary_operator_map<T, O> \
 { \
@@ -153,12 +198,24 @@
   }                                                        \
 };
 
-VSIP_OPT_DECL_BINARY_OP(gt,  gt_functor)
-VSIP_OPT_DECL_BINARY_OP(lt,  lt_functor)
-VSIP_OPT_DECL_BINARY_OP(ge,  ge_functor)
-VSIP_OPT_DECL_BINARY_OP(le,  le_functor)
 
+
+
+VSIP_OPT_DECL_BINARY_CMP_OP(gt,  gt_functor)
+VSIP_OPT_DECL_BINARY_CMP_OP(lt,  lt_functor)
+VSIP_OPT_DECL_BINARY_CMP_OP(ge,  ge_functor)
+VSIP_OPT_DECL_BINARY_CMP_OP(le,  le_functor)
+
+VSIP_OPT_DECL_BINARY_OP(max, max_functor)
+VSIP_OPT_DECL_BINARY_OP(min, min_functor)
+
+VSIP_OPT_DECL_UNARY_OP(mag, mag_functor)
+VSIP_OPT_DECL_UNARY_OP_COMPLEX_BLOCK(mag_functor)
+
+#undef VSIP_OPT_DECL_BINARY_CMP_OP
 #undef VSIP_OPT_DECL_BINARY_OP
+#undef VSIP_OPT_DECL_UNARY_OP
+#undef VSIP_OPT_DECL_UNARY_OP_COMPLEX_BLOCK
 
 // Support for ternary maps
 template <typename T>
@@ -244,6 +301,20 @@
   typedef typename P1::value_type value_type;
 };
 
+
+
+// Helper class for loading SIMD values.
+//
+// Two specializations are provided:
+//   - has_perm=true is for SIMD instruction sets that have a permute
+//                   instruction (and no unaligned load), such as
+//                   AltiVec.
+//   - has_perm=false is for SIMD instruction sets that do not have
+//                   a permute instruction, but allow unaligned loads,
+//                   such as SSE.
+//
+// The SIMD traits class has a 'has_perm' trait.
+
 template <typename T,
           bool has_perm = Simd_traits<T>::has_perm>
 struct Simd_unaligned_loader;
@@ -258,7 +329,7 @@
 
   Simd_unaligned_loader(value_type const* ptr) : ptr_unaligned_(ptr) 
   {
-    ptr_aligned_    = (simd_type*)((intptr_t)ptr & ~(simd::alignment-1));
+    ptr_aligned_    = (value_type*)((intptr_t)ptr & ~(simd::alignment-1));
 
     x0_  = simd::load((value_type*)ptr_aligned_);
     x1_  = simd::load((value_type*)(ptr_aligned_+simd::vec_size));
@@ -270,8 +341,8 @@
 
   void increment(length_type n = 1)
   {
-    ptr_unaligned_ += n * Simd_traits<value_type>::vec_size;
-    ptr_aligned_   += n;
+    ptr_unaligned_ += n * simd::vec_size;
+    ptr_aligned_   += n * simd::vec_size;
   
     // update x0
     x0_ = (n == 1) ? x1_ : simd::load((value_type*)ptr_aligned_);
@@ -281,7 +352,7 @@
   }
 
   value_type const*            ptr_unaligned_;
-  simd_type const*             ptr_aligned_;
+  value_type const*            ptr_aligned_;
   simd_type                    x0_;
   simd_type                    x1_;
   perm_simd_type               sh_;
Index: src/vsip/opt/diag/eval.hpp
===================================================================
--- src/vsip/opt/diag/eval.hpp	(revision 177858)
+++ src/vsip/opt/diag/eval.hpp	(working copy)
@@ -37,6 +37,7 @@
 
 
 
+
 namespace diag_detail
 {
 
@@ -48,12 +49,18 @@
   static std::string name() { return "unknown"; }
 };
 
-#define VSIP_IMPL_DISPATCH_NAME(TYPE)		\
+#define VSIP_IMPL_DISPATCH_NAME(TYPE)			\
   template <>						\
   struct Dispatch_name<TYPE> {				\
     static std::string name() { return "" # TYPE; }	\
   };
 
+#define VSIP_IMPL_DISPATCH_NAME_AS(TYPE, ASTYPE)	\
+  template <>						\
+  struct Dispatch_name<TYPE> {				\
+    static std::string name() { return "" # ASTYPE; }	\
+  };
+
 VSIP_IMPL_DISPATCH_NAME(Intel_ipp_tag)
 VSIP_IMPL_DISPATCH_NAME(Transpose_tag)
 VSIP_IMPL_DISPATCH_NAME(Mercury_sal_tag)
@@ -63,6 +70,7 @@
 VSIP_IMPL_DISPATCH_NAME(Op_expr_tag)
 VSIP_IMPL_DISPATCH_NAME(Simd_builtin_tag)
 VSIP_IMPL_DISPATCH_NAME(Simd_loop_fusion_tag)
+VSIP_IMPL_DISPATCH_NAME_AS(Simd_unaligned_loop_fusion_tag, Simd_ulf_tag)
 VSIP_IMPL_DISPATCH_NAME(Fc_expr_tag)
 VSIP_IMPL_DISPATCH_NAME(Rbo_expr_tag)
 VSIP_IMPL_DISPATCH_NAME(Loop_fusion_tag)
@@ -77,6 +85,34 @@
 
 
 
+// Represent an unknown type.  Used when a non-view is the RHS of an
+// assignment.
+
+struct Unknown_type {};
+
+
+
+// Primary template: for any ViewT not matching the View<T, BlockT>
+// specialization (e.g. the scalar in A = 5), yields an Unknown_type block.
+
+template <typename ViewT>
+struct Block_of
+{
+  typedef Unknown_type type;
+  static type block(ViewT) { return type(); }
+};
+// View<T, BlockT> case: expose BlockT and a reference to the view's block.
+template <template <typename, typename> class View,
+	  typename T,
+	  typename BlockT>
+struct Block_of<View<T, BlockT> >
+{ // NOTE(review): ref comes from a by-value view copy -- confirm lifetime.
+  typedef BlockT type;
+  static type& block(View<T, BlockT> view) { return view.block(); }
+};
+
+
+
 // Helper class to conditionally call to Serial_expr_evaluator's
 // rt_valid() method, when ct_valid is true.  If ct_valid is false,
 // the call may not be valid.
@@ -254,9 +290,29 @@
   }
 };
 
+// Specialization for Unknown_type RHS.
 
 template <dimension_type Dim,
 	  typename       DstBlock,
+	  typename       TagList,
+	  typename       Tag,
+	  typename       Rest>
+struct Diag_eval_list_helper<Dim, DstBlock, Unknown_type, TagList, Tag, Rest>
+{
+  static void exec(
+    DstBlock&           /*dst*/,
+    Unknown_type const& /*src*/)
+  {
+    std::cout << "Delh: unknown type\n";
+  }
+};
+
+
+
+// Helper class for diag_eval_dispatch.
+
+template <dimension_type Dim,
+	  typename       DstBlock,
 	  typename       SrcBlock,
 	  typename       DaTag>
 struct Diag_eval_dispatch_helper
@@ -466,8 +522,8 @@
   DstViewT dst,
   SrcViewT src)
 {
-  typedef typename DstViewT::block_type dst_block_type;
-  typedef typename SrcViewT::block_type src_block_type;
+  typedef typename diag_detail::Block_of<DstViewT>::type dst_block_type;
+  typedef typename diag_detail::Block_of<SrcViewT>::type src_block_type;
   using vsip::impl::diag_detail::Diag_eval_list_helper;
 
   dimension_type const dim = DstViewT::dim;
@@ -476,7 +532,8 @@
 	    << "  dst expr: " << typeid(dst_block_type).name() << std::endl
 	    << "  src expr: " << typeid(src_block_type).name() << std::endl;
   Diag_eval_list_helper<dim, dst_block_type, src_block_type, TagList>
-    ::exec(dst.block(), src.block());
+    ::exec(diag_detail::Block_of<DstViewT>::block(dst),
+	   diag_detail::Block_of<SrcViewT>::block(src));
 }
 
 
Index: src/vsip/GNUmakefile.inc.in
===================================================================
--- src/vsip/GNUmakefile.inc.in	(revision 177479)
+++ src/vsip/GNUmakefile.inc.in	(working copy)
@@ -56,7 +56,9 @@
 			$(srcdir)/src/vsip/opt/simd/vadd.cpp \
 			$(srcdir)/src/vsip/opt/simd/vgt.cpp \
 			$(srcdir)/src/vsip/opt/simd/vlogic.cpp \
-			$(srcdir)/src/vsip/opt/simd/threshold.cpp
+			$(srcdir)/src/vsip/opt/simd/threshold.cpp \
+			$(srcdir)/src/vsip/opt/simd/vaxpy.cpp \
+			$(srcdir)/src/vsip/opt/simd/vma_ip_csc.cpp
 endif # VSIP_IMPL_REF_IMPL
 
 src_vsip_cxx_objects := $(patsubst $(srcdir)/%.cpp, %.$(OBJEXT), $(src_vsip_cxx_sources))
Index: tests/coverage_common.hpp
===================================================================
--- tests/coverage_common.hpp	(revision 174331)
+++ tests/coverage_common.hpp	(working copy)
@@ -93,6 +93,21 @@
   Unary Operator Tests
 ***********************************************************************/
 
+#if VERBOSE
+#  define DEBUG_UNARY(NAME, OP)						\
+  {									\
+    std::cout << "Test"#NAME << std::endl				\
+	      << "  at pos  : " << i << std::endl			\
+	      << "  expected: " << expected << std::endl		\
+	      << "  got     : " << get_nth(view2, i) << std::endl	\
+      ;									\
+    vsip::impl::diagnose_eval_list_std(view2, OP (view1));		\
+  }
+#else
+#  define DEBUG_UNARY(NAME, OP)						\
+  {}
+#endif
+
 // Test structure for Unary operator
 //
 // Where
@@ -126,6 +141,8 @@
     {									\
       T2 expected = 							\
         CHKOP (Get_value<T1>::at(0, Is_scalar<View1>::value ? 0 : i, RT));\
+      if (!(equal(get_nth(view2, i), expected)))			\
+	DEBUG_UNARY(NAME, OP)						\
       test_assert(equal(get_nth(view2, i), expected));			\
     }									\
   }									\
Index: tests/regressions/simd_alignment.cpp
===================================================================
--- tests/regressions/simd_alignment.cpp	(revision 177792)
+++ tests/regressions/simd_alignment.cpp	(working copy)
@@ -30,8 +30,6 @@
 #include <vsip_csl/test.hpp>
 #include <vsip_csl/output.hpp>
 
-#include "test_common.hpp"
-
 using namespace std;
 using namespace vsip;
 using namespace vsip_csl;
@@ -40,12 +38,19 @@
 
 
 /***********************************************************************
-  Definitions
+  Definitions - Misaligned
 ***********************************************************************/
 
+// Test an unaligned negation, where all views have same relative
+// alignment.
+//
+// Source and destination are not SIMD aligned, but they both have the
+// same relative alignment.  Hence they can be processed with aligned
+// SIMD code after the initial misalignment is cleaned up.
+
 template <typename T>
 void
-do_negate(length_type size, length_type align, length_type cleanup)
+do_negate_same(length_type size, length_type align, length_type cleanup)
 {
   Vector<T> src(size);
   Vector<T> dst(size);
@@ -73,30 +78,85 @@
 
 template <typename T>
 void
-test()
+test_unaligned_same()
 {
-  do_negate<T>(16, 0, 0);
+  do_negate_same<T>(16, 0, 0);
 
-  do_negate<T>(16, 1, 0);
-  do_negate<T>(16, 2, 0);
-  do_negate<T>(16, 3, 0);
+  do_negate_same<T>(16, 1, 0);
+  do_negate_same<T>(16, 2, 0);
+  do_negate_same<T>(16, 3, 0);
 
-  do_negate<T>(16, 1, 1);
-  do_negate<T>(16, 2, 1);
-  do_negate<T>(16, 3, 1);
+  do_negate_same<T>(16, 1, 1);
+  do_negate_same<T>(16, 2, 1);
+  do_negate_same<T>(16, 3, 1);
 
-  do_negate<T>(16, 1, 2);
-  do_negate<T>(16, 2, 2);
-  do_negate<T>(16, 3, 2);
+  do_negate_same<T>(16, 1, 2);
+  do_negate_same<T>(16, 2, 2);
+  do_negate_same<T>(16, 3, 2);
 
-  do_negate<T>(16, 1, 3);
-  do_negate<T>(16, 2, 3);
-  do_negate<T>(16, 3, 3);
+  do_negate_same<T>(16, 1, 3);
+  do_negate_same<T>(16, 2, 3);
+  do_negate_same<T>(16, 3, 3);
 }
 
 
 
 /***********************************************************************
+  Unaligned test
+***********************************************************************/
+
+// Test an "unaligned" negation, with different relative alignment.
+//
+// Source and destination are not SIMD aligned, and they have
+// different relative alignment.  They must be processed with
+// unaligned SIMD code.
+
+template <typename T>
+void
+do_negate_diff(
+  length_type total_size,    // length of the src and dst vectors
+  length_type process_size,  // number of elements negated and checked
+  index_type  offset1,       // first argument of the dst subdomain
+  index_type  offset2)       // first argument of the src subdomain
+{
+  Vector<T> src(total_size);
+  Vector<T> dst(total_size);
+
+  Domain<1> dom1(offset1, 1, process_size);  // subdomain applied to dst
+  Domain<1> dom2(offset2, 1, process_size);  // subdomain applied to src
+
+  src = ramp(T(0), T(1), total_size);
+
+  dst(dom1) = -src(dom2);  // operation under test
+
+  for (index_type i=0; i<process_size; ++i)
+  {
+#if VERBOSE
+    if (!(dst(dom1)(i) == -src(dom2)(i)))  // on mismatch, dump both full vectors
+    {
+      cout << "src:\n" << src << endl;
+      cout << "dst:\n" << dst << endl;
+    }
+#endif
+    test_assert(dst(dom1)(i) == -src(dom2)(i));
+  }
+}
+
+
+
+template <typename T>
+void
+test_unaligned_diff()  // runs do_negate_diff over several dst/src offset pairs
+{
+  do_negate_diff<T>(32, 16, 0, 0); // dst offset 0, src offset 0
+  do_negate_diff<T>(32, 16, 1, 1); // dst offset 1, src offset 1 (equal offsets)
+  do_negate_diff<T>(32, 16, 0, 1); // dst offset 0, src offset 1 (offsets differ)
+  do_negate_diff<T>(32, 16, 1, 2); // doesn't dispatch to SIMD UALF
+}
+
+
+
+/***********************************************************************
   Main
 ***********************************************************************/
 
@@ -105,7 +165,8 @@
 {
   vsipl init(argc, argv);
 
-  test<float>();
+  test_unaligned_same<float>();
+  test_unaligned_diff<float>();
 
   return 0;
 }
Index: tests/coverage_unary_neg.cpp
===================================================================
--- tests/coverage_unary_neg.cpp	(revision 0)
+++ tests/coverage_unary_neg.cpp	(revision 0)
@@ -0,0 +1,67 @@
+/* Copyright (c) 2005, 2006, 2007 by CodeSourcery.  All rights reserved.
+
+   This file is available for license from CodeSourcery, Inc. under the terms
+   of a commercial license and under the GPL.  It is not part of the VSIPL++
+   reference implementation and is not available under the BSD license.
+*/
+/** @file    tests/coverage_unary_neg.cpp
+    @author  Jules Bergmann
+    @date    2005-09-13
+    @brief   VSIPL++ Library: Coverage tests for neg unary expressions.
+*/
+
+/***********************************************************************
+  Included Files
+***********************************************************************/
+
+#define VERBOSE 1
+
+#include <iostream>
+
+#include <vsip/support.hpp>
+#include <vsip/initfin.hpp>
+#include <vsip/vector.hpp>
+#include <vsip/math.hpp>
+#include <vsip/random.hpp>
+
+#include <vsip_csl/test.hpp>
+#include <vsip_csl/test-storage.hpp>
+#include "coverage_common.hpp"
+
+using namespace std;
+using namespace vsip;
+using namespace vsip_csl;
+
+
+/***********************************************************************
+  Definitions
+***********************************************************************/
+
+TEST_UNARY(neg,   -,     -,     anyval)
+
+
+
+/***********************************************************************
+  Main
+***********************************************************************/
+
+int
+main(int argc, char** argv)
+{
+  vsipl init(argc, argv);
+
+#if VSIP_IMPL_TEST_LEVEL == 0  // Test level 0: float case only.
+
+  vector_cases2<Test_neg, float>();
+
+#else  // Other test levels: int, float, double and both complex types.
+
+  // Unary negation over each value type
+  vector_cases2<Test_neg, int>();
+  vector_cases2<Test_neg, float>();
+  vector_cases2<Test_neg, double>();
+  vector_cases2<Test_neg, complex<float> >();
+  vector_cases2<Test_neg, complex<double> >();
+
+#endif // VSIP_IMPL_TEST_LEVEL == 0
+}
Index: tests/conv-2d.cpp
===================================================================
--- tests/conv-2d.cpp	(revision 173072)
+++ tests/conv-2d.cpp	(working copy)
@@ -344,6 +344,8 @@
   if (!good)
   {
 #if VERBOSE
+    cout << "in =\n" << in << endl;
+    cout << "coeff =\n" << coeff << endl;
     cout << "out =\n" << out << endl;
     cout << "ex =\n" << ex << endl;
 #endif
Index: tests/coverage_unary.cpp
===================================================================
--- tests/coverage_unary.cpp	(revision 177533)
+++ tests/coverage_unary.cpp	(working copy)
@@ -36,7 +36,6 @@
 ***********************************************************************/
 
 TEST_UNARY(copy,   ,      ,     anyval)
-TEST_UNARY(neg,   -,     -,     anyval)
 TEST_UNARY(mag,   mag,   mag,   anyval)
 TEST_UNARY(sqrt,  sqrt,  sqrt,  posval)
 TEST_UNARY(rsqrt, rsqrt, rsqrt, posval)
@@ -61,7 +60,6 @@
 
 #if VSIP_IMPL_TEST_LEVEL == 0
 
-  vector_cases2<Test_neg, float>();
   vector_cases2<Test_mag, float>();
   vector_cases2_rt<Test_mag, complex<float>,  float>();
   vector_cases2<Test_sqrt, float>();
@@ -74,12 +72,6 @@
 #else
 
   // Unary operators
-  vector_cases2<Test_neg, int>();
-  vector_cases2<Test_neg, float>();
-  vector_cases2<Test_neg, double>();
-  vector_cases2<Test_neg, complex<float> >();
-  vector_cases2<Test_neg, complex<double> >();
-
   vector_cases2<Test_mag, int>();
   vector_cases2<Test_mag, float>();
   vector_cases2<Test_mag, double>();
Index: benchmarks/vdiv.cpp
===================================================================
--- benchmarks/vdiv.cpp	(revision 173215)
+++ benchmarks/vdiv.cpp	(working copy)
@@ -1,4 +1,4 @@
-/* Copyright (c) 2005, 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2005, 2006, 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
@@ -21,6 +21,8 @@
 #include <vsip/support.hpp>
 #include <vsip/math.hpp>
 #include <vsip/random.hpp>
+#include <vsip/opt/diag/eval.hpp>
+
 #include "benchmarks.hpp"
 
 using namespace vsip;
@@ -71,6 +73,17 @@
     
     time = t1.delta();
   }
+
+  void diag()  // Runs the evaluator diagnostic on the expression C = A / B.
+  {
+    length_type const size = 256;  // fixed vector length used for the diagnostic
+
+    Vector<T>   A(size, T());
+    Vector<T>   B(size, T());
+    Vector<T>   C(size);
+
+    vsip::impl::diagnose_eval_list_std(C, A / B);  // NOTE(review): presumably reports how C = A / B is dispatched -- confirm in vsip/opt/diag/eval.hpp
+  }
 };
 
 
Index: benchmarks/conv2d.cpp
===================================================================
--- benchmarks/conv2d.cpp	(revision 177858)
+++ benchmarks/conv2d.cpp	(working copy)
@@ -1,13 +1,13 @@
-/* Copyright (c) 2005, 2006 by CodeSourcery.  All rights reserved.
+/* Copyright (c) 2007 by CodeSourcery.  All rights reserved.
 
    This file is available for license from CodeSourcery, Inc. under the terms
    of a commercial license and under the GPL.  It is not part of the VSIPL++
    reference implementation and is not available under the BSD license.
 */
-/** @file    benchmarks/conv.cpp
+/** @file    benchmarks/conv2d.cpp
     @author  Jules Bergmann
     @date    2005-07-11
-    @brief   VSIPL++ Library: Benchmark for Convolution.
+    @brief   VSIPL++ Library: Benchmark for 2D Convolution.
 
 */
 
@@ -151,8 +151,6 @@
   length_type M    = atoi(loop.param_["m"].c_str());
   length_type N    = atoi(loop.param_["n"].c_str());
 
-  std::cout << "MN: " << MN << std::endl;
-
   if (MN != 0)
     M = N = MN;
 
Index: benchmarks/sal/vthresh.cpp
===================================================================
--- benchmarks/sal/vthresh.cpp	(revision 173875)
+++ benchmarks/sal/vthresh.cpp	(working copy)
@@ -42,6 +42,9 @@
 template <typename T>
 struct t_vthres_sal;
 
+template <typename T>
+struct t_vthr_sal;
+
 template <>
 struct t_vthres_sal<float> : Benchmark_base
 {
@@ -76,12 +79,12 @@
       T* p_result = ext_result.data();
       
       t1.start();
-      marker_start();
+      marker1_start();
       for (size_t l=0; l<loop; ++l)
       {
 	vthresx(p_A,1, &b, p_result,1, size, 0);
       }
-      marker_stop();
+      marker1_stop();
       t1.stop();
     }
     
@@ -96,6 +99,60 @@
 
 
 
+template <>
+struct t_vthr_sal<float> : Benchmark_base  // Benchmark of SAL vthrx on float vectors.
+{
+  typedef float T;
+
+  char* what() { return "t_vthr_sal"; }
+  int ops_per_point(size_t)  { return 1; }
+
+  int riob_per_point(size_t) { return 1*sizeof(T); }
+  int wiob_per_point(size_t) { return 1*sizeof(T); }
+  int mem_per_point(size_t)  { return 2*sizeof(T); }
+
+  void operator()(size_t size, size_t loop, float& time)  // runs vthrx `loop` times on `size` elements; elapsed time returned in `time`
+  {
+    typedef impl::Layout<1, row1_type, Stride_unit_dense, Cmplx_inter_fmt> LP;
+    typedef impl::Fast_block<1, T, LP, Local_map> block_type;
+
+    Vector<T, block_type>   A     (size, T());
+    Vector<T, block_type>   result(size, T());
+    T                       b = T(0.5);  // threshold passed to vthrx
+
+    Rand<T> gen(0, 0);
+    A = gen.randu(size);  // fill the input from the random generator
+
+    vsip::impl::profile::Timer t1;
+
+    { // Control Ext_data scope.
+      impl::Ext_data<block_type> ext_A     (A.block(),      impl::SYNC_IN);
+      impl::Ext_data<block_type> ext_result(result.block(), impl::SYNC_OUT);
+    
+      T* p_A      = ext_A.data();
+      T* p_result = ext_result.data();
+      
+      t1.start();  // only the vthrx loop is timed
+      marker1_start();
+      for (size_t l=0; l<loop; ++l)
+      {
+	vthrx(p_A,1, &b, p_result,1, size, 0);
+      }
+      marker1_stop();
+      t1.stop();
+    }
+    
+    for (index_type i=0; i<size; ++i)  // verify: result(i) == (A(i) >= b ? A(i) : b)
+    {
+      test_assert(equal<T>(result.get(i), (A(i) >= b) ? A(i) : b));
+    }
+    
+    time = t1.delta();
+  }
+};
+
+
+
 template <typename T>
 struct t_vthres_c : Benchmark_base
 {
@@ -161,11 +218,13 @@
   switch (what)
   {
   case  1: loop(t_vthres_sal<float>()); break;
+  case  2: loop(t_vthr_sal<float>()); break;
   case 11: loop(t_vthres_c<float>()); break;
   case  0:
     std::cout
       << "SAL vthres\n"
       << "  -1 -- SAL vthresx (float) Z(i) = A(i) > b ? A(i) : 0\n"
+      << "  -2 -- SAL vthrx   (float) Z(i) = A(i) > b ? A(i) : b\n"
       << " -11 -- C           (float) Z(i) = A(i) > b ? A(i) : 0\n"
       ;
   }
Index: examples/mercury/mcoe-setup.sh
===================================================================
--- examples/mercury/mcoe-setup.sh	(revision 173215)
+++ examples/mercury/mcoe-setup.sh	(working copy)
@@ -132,7 +132,7 @@
   cxxflags="$pflags $toolset_flag --no_implicit_include"
  
   opt_flags="-Ospeed -Onotailrecursion --max_inlining"
-  opt_flags="$opt_flags -DNDEBUG --diag_suppress 177,550"
+  opt_flags="$opt_flags -DNDEBUG --diag_suppress 175,177,550"
   dbg_flags="-g"
 
   ex_off_flags="--no_exceptions"